-rw-r--r--  src/sna/blt.c                |   2
-rw-r--r--  src/sna/gen6_render.c        |  18
-rw-r--r--  src/sna/kgem.c               | 428
-rw-r--r--  src/sna/kgem.h               |  67
-rw-r--r--  src/sna/kgem_debug_gen5.c    |   2
-rw-r--r--  src/sna/sna.h                |  11
-rw-r--r--  src/sna/sna_accel.c          | 164
-rw-r--r--  src/sna/sna_blt.c            |  58
-rw-r--r--  src/sna/sna_io.c             | 447
-rw-r--r--  src/sna/sna_render.c         |  12
-rw-r--r--  src/sna/sna_video.c          |   2
-rw-r--r--  src/sna/sna_video_textured.c |   2
12 files changed, 822 insertions, 391 deletions
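
The bulk of this commit converts kgem's buffer-size bookkeeping from bytes to pages and routes oversized objects past the bucketed caches. The sketch below pulls together the new pieces as they appear in the kgem.h and kgem.c hunks that follow (the size union, the bucket()/num_pages() accessors and the log2 bucket selection); the struct is trimmed to the size field only, so treat it as an illustration of the scheme rather than the full kgem_bo definition.

/* Minimal sketch of the page-based size bookkeeping introduced below.
 * Only the size field of struct kgem_bo is shown; everything else is
 * omitted for brevity. */
#include <stdint.h>

#define PAGE_SIZE 4096
#define NUM_CACHE_BUCKETS 16
#define MAX_CACHE_SIZE (1 << (NUM_CACHE_BUCKETS + 12))

struct kgem_bo {
	union {
		struct {
			uint32_t count:27;	/* size in pages */
			uint32_t bucket:5;	/* cache bucket = log2(pages) */
		} pages;
		uint32_t bytes;			/* only for proxy/upload buffers */
	} size;
	/* ...remaining bo fields omitted in this sketch... */
};

#define bucket(B) ((B)->size.pages.bucket)
#define num_pages(B) ((B)->size.pages.count)

/* Bucket selection: floor(log2(num_pages)), via the x86 bsr instruction,
 * as in the kgem.c hunk. Buckets at or above NUM_CACHE_BUCKETS are treated
 * as "large" and kept on a separate list instead of the bucketed caches. */
static inline unsigned long __fls(unsigned long word)
{
	asm("bsr %1,%0" : "=r" (word) : "rm" (word));
	return word;
}

static inline int cache_bucket(int num_pages)
{
	return __fls(num_pages);
}

static inline int kgem_bo_size(struct kgem_bo *bo)
{
	return PAGE_SIZE * num_pages(bo);
}
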
diff --git a/src/sna/blt.c b/src/sna/blt.c index fb3dd35c..65d586cb 100644 --- a/src/sna/blt.c +++ b/src/sna/blt.c @@ -154,6 +154,8 @@ memcpy_blt(const void *src, void *dst, int bpp, uint8_t *dst_bytes; int byte_width; + assert(src); + assert(dst); assert(width && height); assert(bpp >= 8); diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c index d813d95a..41e05d0e 100644 --- a/src/sna/gen6_render.c +++ b/src/sna/gen6_render.c @@ -1193,6 +1193,7 @@ gen6_bind_bo(struct sna *sna, bo, domains, 0); ss[2] = ((width - 1) << GEN6_SURFACE_WIDTH_SHIFT | (height - 1) << GEN6_SURFACE_HEIGHT_SHIFT); + assert(bo->pitch <= (1 << 18)); ss[3] = (gen6_tiling_bits(bo->tiling) | (bo->pitch - 1) << GEN6_SURFACE_PITCH_SHIFT); ss[4] = 0; @@ -3136,10 +3137,19 @@ fallback: if (!sna_blt_compare_depth(&src->drawable, &dst->drawable)) return false; - return sna_blt_copy_boxes_fallback(sna, alu, - src, src_bo, src_dx, src_dy, - dst, dst_bo, dst_dx, dst_dy, - box, n); + if (sna_blt_copy_boxes_fallback(sna, alu, + src, src_bo, src_dx, src_dy, + dst, dst_bo, dst_dx, dst_dy, + box, n)) + return true; + + return false; +#if 0 + return sna_tiling_copy_boxes(sna, + src, src_bo, src_dx, src_dy, + dst, dst_bo, dst_dx, dst_dy, + box, n); +#endif } if (dst->drawable.depth == src->drawable.depth) { diff --git a/src/sna/kgem.c b/src/sna/kgem.c index 86a43729..208c8f27 100644 --- a/src/sna/kgem.c +++ b/src/sna/kgem.c @@ -45,7 +45,7 @@ #endif static struct kgem_bo * -search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags); +search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags); static inline void _list_del(struct list *list) { @@ -99,7 +99,6 @@ static inline void list_replace(struct list *old, #define DBG(x) ErrorF x #endif -#define PAGE_SIZE 4096 #define PAGE_ALIGN(x) ALIGN(x, PAGE_SIZE) #define MAX_GTT_VMA_CACHE 512 #define MAX_CPU_VMA_CACHE INT16_MAX @@ -120,6 +119,14 @@ struct kgem_partial_bo { static struct kgem_bo *__kgem_freed_bo; static struct drm_i915_gem_exec_object2 _kgem_dummy_exec; +static inline int bytes(struct kgem_bo *bo) +{ + return kgem_bo_size(bo); +} + +#define bucket(B) (B)->size.pages.bucket +#define num_pages(B) (B)->size.pages.count + #ifndef NDEBUG static bool validate_partials(struct kgem *kgem) { @@ -128,10 +135,10 @@ static bool validate_partials(struct kgem *kgem) list_for_each_entry_safe(bo, next, &kgem->partial, base.list) { if (bo->base.list.next == &kgem->partial) return true; - if (bo->base.size - bo->used < next->base.size - next->used) { + if (bytes(&bo->base) - bo->used < bytes(&next->base) - next->used) { ErrorF("this rem: %d, next rem: %d\n", - bo->base.size - bo->used, - next->base.size - next->used); + bytes(&bo->base) - bo->used, + bytes(&next->base) - next->used); goto err; } } @@ -140,7 +147,7 @@ static bool validate_partials(struct kgem *kgem) err: list_for_each_entry(bo, &kgem->partial, base.list) ErrorF("bo: used=%d / %d, rem=%d\n", - bo->used, bo->base.size, bo->base.size - bo->used); + bo->used, bytes(&bo->base), bytes(&bo->base) - bo->used); return false; } #else @@ -312,7 +319,7 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo, assert(!bo->purged); assert(!kgem_busy(kgem, bo->handle)); - assert(length <= bo->size); + assert(length <= bytes(bo)); if (gem_write(kgem->fd, bo->handle, 0, length, data)) return FALSE; @@ -322,17 +329,13 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo, return TRUE; } -static uint32_t gem_create(int fd, int size) +static uint32_t gem_create(int fd, int num_pages) { struct 
drm_i915_gem_create create; -#if DEBUG_KGEM - assert((size & (PAGE_SIZE-1)) == 0); -#endif - VG_CLEAR(create); create.handle = 0; - create.size = size; + create.size = PAGE_SIZE * num_pages; (void)drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE, &create); return create.handle; @@ -415,7 +418,7 @@ static void gem_close(int fd, uint32_t handle) (void)drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &close); } -static inline unsigned long __fls(unsigned long word) +constant inline static unsigned long __fls(unsigned long word) { asm("bsr %1,%0" : "=r" (word) @@ -423,24 +426,21 @@ static inline unsigned long __fls(unsigned long word) return word; } -constant inline static int cache_bucket(int size) +constant inline static int cache_bucket(int num_pages) { - uint32_t order = __fls(size / PAGE_SIZE); - assert(order < NUM_CACHE_BUCKETS); - return order; + return __fls(num_pages); } static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo, - int handle, int size) + int handle, int num_pages) { - assert(size); + assert(num_pages); memset(bo, 0, sizeof(*bo)); bo->refcnt = 1; bo->handle = handle; - bo->size = size; - bo->bucket = cache_bucket(size); - assert(bo->size < 1 << (12 + bo->bucket + 1)); + num_pages(bo) = num_pages; + bucket(bo) = cache_bucket(num_pages); bo->reusable = true; bo->domain = DOMAIN_CPU; list_init(&bo->request); @@ -450,7 +450,7 @@ static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo, return bo; } -static struct kgem_bo *__kgem_bo_alloc(int handle, int size) +static struct kgem_bo *__kgem_bo_alloc(int handle, int num_pages) { struct kgem_bo *bo; @@ -463,7 +463,7 @@ static struct kgem_bo *__kgem_bo_alloc(int handle, int size) return NULL; } - return __kgem_bo_init(bo, handle, size); + return __kgem_bo_init(bo, handle, num_pages); } static struct kgem_request _kgem_static_request; @@ -481,14 +481,14 @@ static struct kgem_request *__kgem_request_alloc(void) return rq; } -static struct list *inactive(struct kgem *kgem, int size) +static struct list *inactive(struct kgem *kgem, int num_pages) { - return &kgem->inactive[cache_bucket(size)]; + return &kgem->inactive[cache_bucket(num_pages)]; } -static struct list *active(struct kgem *kgem, int size, int tiling) +static struct list *active(struct kgem *kgem, int num_pages, int tiling) { - return &kgem->active[cache_bucket(size)][tiling]; + return &kgem->active[cache_bucket(num_pages)][tiling]; } static size_t @@ -575,6 +575,7 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen) list_init(&kgem->partial); list_init(&kgem->requests); list_init(&kgem->flushing); + list_init(&kgem->large); for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) list_init(&kgem->inactive[i]); for (i = 0; i < ARRAY_SIZE(kgem->active); i++) { @@ -658,11 +659,9 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen) * disable dual-stream mode */ kgem->min_alignment = 64; + kgem->max_object_size = kgem->aperture_total / 2; kgem->max_cpu_size = kgem->aperture_total / 2; - if (kgem->max_cpu_size > MAX_OBJECT_SIZE) - kgem->max_cpu_size = MAX_OBJECT_SIZE; - - kgem->max_gpu_size = -1; + kgem->max_gpu_size = MAX_CACHE_SIZE; if (gen < 40) { /* If we have to use fences for blitting, we have to make * sure we can fit them into the aperture. 
@@ -677,6 +676,10 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen) DBG(("%s: max object size (tiled=%d, linear=%d)\n", __FUNCTION__, kgem->max_gpu_size, kgem->max_cpu_size)); + /* Convert the aperture thresholds to pages */ + kgem->aperture_low /= PAGE_SIZE; + kgem->aperture_high /= PAGE_SIZE; + kgem->fence_max = gem_param(kgem, I915_PARAM_NUM_FENCES_AVAIL) - 2; if ((int)kgem->fence_max < 0) kgem->fence_max = 5; /* minimum safe value for all hw */ @@ -811,7 +814,7 @@ kgem_add_handle(struct kgem *kgem, struct kgem_bo *bo) exec->handle = bo->handle; exec->offset = bo->presumed_offset; - kgem->aperture += bo->size; + kgem->aperture += num_pages(bo); return exec; } @@ -875,7 +878,7 @@ static void kgem_bo_release_map(struct kgem *kgem, struct kgem_bo *bo) bo->handle, kgem->vma[type].count)); VG(if (type) VALGRIND_FREELIKE_BLOCK(CPU_MAP(bo->map), 0)); - munmap(CPU_MAP(bo->map), bo->size); + munmap(CPU_MAP(bo->map), bytes(bo)); bo->map = NULL; if (!list_is_empty(&bo->vma)) { @@ -917,16 +920,22 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem, assert(bo->rq == NULL); assert(bo->domain != DOMAIN_GPU); - list_move(&bo->list, &kgem->inactive[bo->bucket]); + if (bucket(bo) >= NUM_CACHE_BUCKETS) { + kgem_bo_free(kgem, bo); + return; + } + + list_move(&bo->list, &kgem->inactive[bucket(bo)]); if (bo->map) { int type = IS_CPU_MAP(bo->map); - if (!type && !kgem_bo_is_mappable(kgem, bo)) { + if (bucket(bo) >= NUM_CACHE_BUCKETS || + (!type && !kgem_bo_is_mappable(kgem, bo))) { list_del(&bo->vma); - munmap(CPU_MAP(bo->map), bo->size); + munmap(CPU_MAP(bo->map), bytes(bo)); bo->map = NULL; } if (bo->map) { - list_move(&bo->vma, &kgem->vma[type].inactive[bo->bucket]); + list_move(&bo->vma, &kgem->vma[type].inactive[bucket(bo)]); kgem->vma[type].count++; } } @@ -1002,8 +1011,14 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) bo->scanout = bo->flush = false; if (bo->rq) { + struct list *cache; + DBG(("%s: handle=%d -> active\n", __FUNCTION__, bo->handle)); - list_add(&bo->list, &kgem->active[bo->bucket][bo->tiling]); + if (bucket(bo) < NUM_CACHE_BUCKETS) + cache = &kgem->active[bucket(bo)][bo->tiling]; + else + cache = &kgem->large; + list_add(&bo->list, cache); return; } @@ -1012,10 +1027,17 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo) if (bo->needs_flush) { if ((bo->needs_flush = kgem_busy(kgem, bo->handle))) { + struct list *cache; + DBG(("%s: handle=%d -> flushing\n", __FUNCTION__, bo->handle)); + list_add(&bo->request, &kgem->flushing); - list_add(&bo->list, &kgem->active[bo->bucket][bo->tiling]); + if (bucket(bo) < NUM_CACHE_BUCKETS) + cache = &kgem->active[bucket(bo)][bo->tiling]; + else + cache = &kgem->large; + list_add(&bo->list, cache); bo->rq = &_kgem_static_request; return; } @@ -1231,7 +1253,7 @@ static void kgem_close_inactive(struct kgem *kgem) static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo) { - int remain = bo->base.size - bo->used; + int remain = bytes(&bo->base) - bo->used; while (bo->base.list.prev != &kgem->partial) { struct kgem_partial_bo *p; @@ -1239,7 +1261,7 @@ static void bubble_sort_partial(struct kgem *kgem, struct kgem_partial_bo *bo) p = list_entry(bo->base.list.prev, struct kgem_partial_bo, base.list); - if (remain <= p->base.size - p->used) + if (remain <= bytes(&p->base) - p->used) break; assert(p->base.list.next == &bo->base.list); @@ -1282,7 +1304,7 @@ static void kgem_finish_partials(struct kgem *kgem) assert(bo->base.rq == kgem->next_request); if 
(bo->used && bo->need_io) { if (bo->base.refcnt == 1 && - bo->used < bo->base.size / 2) { + bo->used < bytes(&bo->base) / 2) { struct kgem_bo *shrink; shrink = search_linear_cache(kgem, @@ -1293,10 +1315,10 @@ static void kgem_finish_partials(struct kgem *kgem) DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n", __FUNCTION__, - bo->used, bo->base.size, shrink->size, + bo->used, bytes(&bo->base), bytes(shrink), bo->base.handle, shrink->handle)); - assert(bo->used <= shrink->size); + assert(bo->used <= bytes(shrink)); gem_write(kgem->fd, shrink->handle, 0, bo->used, bo->mem); @@ -1330,9 +1352,9 @@ static void kgem_finish_partials(struct kgem *kgem) } DBG(("%s: handle=%d, uploading %d/%d\n", - __FUNCTION__, bo->base.handle, bo->used, bo->base.size)); + __FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base))); assert(!kgem_busy(kgem, bo->base.handle)); - assert(bo->used <= bo->base.size); + assert(bo->used <= bytes(&bo->base)); gem_write(kgem->fd, bo->base.handle, 0, bo->used, bo->mem); bo->need_io = 0; @@ -1616,7 +1638,7 @@ void _kgem_submit(struct kgem *kgem) i, kgem->exec[i].handle, (int)kgem->exec[i].offset, - found ? found->size : -1, + found ? bytes(found) : -1, found ? found->tiling : -1, (int)(kgem->exec[i].flags & EXEC_OBJECT_NEEDS_FENCE), found ? found->purged : -1); @@ -1690,7 +1712,7 @@ static void kgem_expire_partial(struct kgem *kgem) continue; DBG(("%s: discarding unused partial buffer: %d/%d, write? %d\n", - __FUNCTION__, bo->used, bo->base.size, bo->write)); + __FUNCTION__, bo->used, bytes(&bo->base), bo->write)); list_del(&bo->base.list); kgem_bo_unref(kgem, &bo->base); } @@ -1773,7 +1795,7 @@ bool kgem_expire_cache(struct kgem *kgem) list_move_tail(&bo->list, &preserve); } else { count++; - size += bo->size; + size += bytes(bo); kgem_bo_free(kgem, bo); DBG(("%s: expiring %d\n", __FUNCTION__, bo->handle)); @@ -1834,28 +1856,31 @@ void kgem_cleanup_cache(struct kgem *kgem) } static struct kgem_bo * -search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags) +search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags) { struct kgem_bo *bo, *first = NULL; bool use_active = (flags & CREATE_INACTIVE) == 0; struct list *cache; + if (num_pages >= MAX_CACHE_SIZE / PAGE_SIZE) + return NULL; + if (!use_active && - list_is_empty(inactive(kgem, size)) && - !list_is_empty(active(kgem, size, I915_TILING_NONE)) && + list_is_empty(inactive(kgem, num_pages)) && + !list_is_empty(active(kgem, num_pages, I915_TILING_NONE)) && !kgem_retire(kgem)) return NULL; if (!use_active && flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { int for_cpu = !!(flags & CREATE_CPU_MAP); - cache = &kgem->vma[for_cpu].inactive[cache_bucket(size)]; + cache = &kgem->vma[for_cpu].inactive[cache_bucket(num_pages)]; list_for_each_entry(bo, cache, vma) { assert(IS_CPU_MAP(bo->map) == for_cpu); - assert(bo->bucket == cache_bucket(size)); + assert(bucket(bo) == cache_bucket(num_pages)); - if (size > bo->size) { + if (num_pages > num_pages(bo)) { DBG(("inactive too small: %d < %d\n", - bo->size, size)); + num_pages(bo), num_pages)); continue; } @@ -1874,8 +1899,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags) bo->tiling = I915_TILING_NONE; bo->pitch = 0; bo->delta = 0; - DBG((" %s: found handle=%d (size=%d) in linear vma cache\n", - __FUNCTION__, bo->handle, bo->size)); + DBG((" %s: found handle=%d (num_pages=%d) in linear vma cache\n", + __FUNCTION__, bo->handle, num_pages(bo))); assert(use_active || bo->domain != DOMAIN_GPU); assert(!bo->needs_flush); 
//assert(!kgem_busy(kgem, bo->handle)); @@ -1883,13 +1908,13 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags) } } - cache = use_active ? active(kgem, size, I915_TILING_NONE) : inactive(kgem, size); + cache = use_active ? active(kgem, num_pages, I915_TILING_NONE) : inactive(kgem, num_pages); list_for_each_entry(bo, cache, list) { assert(bo->refcnt == 0); assert(bo->reusable); assert(!!bo->rq == !!use_active); - if (size > bo->size) + if (num_pages > num_pages(bo)) continue; if (use_active && bo->tiling != I915_TILING_NONE) @@ -1946,8 +1971,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags) assert(bo->tiling == I915_TILING_NONE); bo->pitch = 0; bo->delta = 0; - DBG((" %s: found handle=%d (size=%d) in linear %s cache\n", - __FUNCTION__, bo->handle, bo->size, + DBG((" %s: found handle=%d (num_pages=%d) in linear %s cache\n", + __FUNCTION__, bo->handle, num_pages(bo), use_active ? "active" : "inactive")); assert(use_active || bo->domain != DOMAIN_GPU); assert(!bo->needs_flush || use_active); @@ -1965,8 +1990,8 @@ search_linear_cache(struct kgem *kgem, unsigned int size, unsigned flags) first->pitch = 0; first->delta = 0; - DBG((" %s: found handle=%d (size=%d) in linear %s cache\n", - __FUNCTION__, first->handle, first->size, + DBG((" %s: found handle=%d (num_pages=%d) in linear %s cache\n", + __FUNCTION__, first->handle, num_pages(first), use_active ? "active" : "inactive")); assert(use_active || first->domain != DOMAIN_GPU); assert(!first->needs_flush || use_active); @@ -1990,7 +2015,7 @@ struct kgem_bo *kgem_create_for_name(struct kgem *kgem, uint32_t name) return NULL; DBG(("%s: new handle=%d\n", __FUNCTION__, open_arg.handle)); - bo = __kgem_bo_alloc(open_arg.handle, open_arg.size); + bo = __kgem_bo_alloc(open_arg.handle, open_arg.size / PAGE_SIZE); if (bo == NULL) { gem_close(kgem->fd, open_arg.handle); return NULL; @@ -2007,7 +2032,7 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size) DBG(("%s(%d)\n", __FUNCTION__, size)); - size = PAGE_ALIGN(size); + size = (size + PAGE_SIZE - 1) / PAGE_SIZE; bo = search_linear_cache(kgem, size, CREATE_INACTIVE); if (bo) return kgem_bo_reference(bo); @@ -2019,7 +2044,7 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size) DBG(("%s: new handle=%d\n", __FUNCTION__, handle)); bo = __kgem_bo_alloc(handle, size); if (bo == NULL) { - gem_close(kgem->fd, size); + gem_close(kgem->fd, handle); return NULL; } @@ -2028,8 +2053,6 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size) int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int bpp) { - uint32_t pitch; - if (DBG_NO_TILING) return tiling < 0 ? 
tiling : I915_TILING_NONE; @@ -2058,17 +2081,6 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int } } - /* First check that we can fence the whole object */ - if (tiling && - kgem_surface_size(kgem, false, false, - width, height, bpp, tiling, - &pitch) > kgem->max_gpu_size) { - DBG(("%s: too large (%dx%d) to be fenced, discarding tiling\n", - __FUNCTION__, width, height)); - tiling = I915_TILING_NONE; - goto done; - } - if (tiling < 0) return tiling; @@ -2125,18 +2137,42 @@ done: return tiling; } -bool kgem_can_create_cpu(struct kgem *kgem, +bool kgem_can_create_2d(struct kgem *kgem, int width, int height, int depth) { + int bpp = BitsPerPixel(depth); uint32_t pitch, size; if (depth < 8 || kgem->wedged) return false; size = kgem_surface_size(kgem, false, false, - width, height, BitsPerPixel(depth), + width, height, bpp, + I915_TILING_X, &pitch); + if (size > 0 && size <= kgem->max_object_size) + return true; + + size = kgem_surface_size(kgem, false, false, + width, height, bpp, I915_TILING_NONE, &pitch); - return size > 0 && size < kgem->max_cpu_size; + if (size > 0 && size <= kgem->max_object_size) + return true; + + return false; +} + +bool kgem_can_create_cpu(struct kgem *kgem, + int width, int height, int bpp) +{ + uint32_t pitch, size; + + if (bpp < 8 || kgem->wedged) + return false; + + size = kgem_surface_size(kgem, false, false, + width, height, bpp, + I915_TILING_NONE, &pitch); + return size > 0 && size <= kgem->max_cpu_size; } static bool _kgem_can_create_gpu(struct kgem *kgem, @@ -2179,7 +2215,7 @@ inline int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo) size = 512 * 1024; else size = 1024 * 1024; - while (size < bo->size) + while (size < bytes(bo)) size *= 2; return size; @@ -2213,10 +2249,52 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem, kgem->has_relaxed_fencing, flags & CREATE_SCANOUT, width, height, bpp, tiling, &pitch); - assert(size && size < kgem->max_cpu_size); - assert(tiling == I915_TILING_NONE || size < kgem->max_gpu_size); + assert(size && size <= kgem->max_object_size); + size /= PAGE_SIZE; bucket = cache_bucket(size); + if (bucket >= NUM_CACHE_BUCKETS) { + DBG(("%s: large bo num pages=%d, bucket=%d\n", + __FUNCTION__, size, bucket)); + + if (flags & CREATE_INACTIVE) + goto create; + + tiled_height = kgem_aligned_height(kgem, height, I915_TILING_Y); + untiled_pitch = kgem_untiled_pitch(kgem, + width, bpp, + flags & CREATE_SCANOUT); + + list_for_each_entry(bo, &kgem->large, list) { + assert(!bo->purged); + assert(bo->refcnt == 0); + assert(bo->reusable); + + if (bo->tiling) { + if (bo->pitch < pitch) { + DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n", + bo->tiling, tiling, + bo->pitch, pitch)); + continue; + } + } else + bo->pitch = untiled_pitch; + + if (bo->pitch * tiled_height > bytes(bo)) + continue; + + kgem_bo_remove_from_active(kgem, bo); + + bo->unique_id = kgem_get_unique_id(kgem); + bo->delta = 0; + DBG((" 1:from active: pitch=%d, tiling=%d, handle=%d, id=%d\n", + bo->pitch, bo->tiling, bo->handle, bo->unique_id)); + return kgem_bo_reference(bo); + } + + goto create; + } + if (flags & (CREATE_CPU_MAP | CREATE_GTT_MAP)) { int for_cpu = !!(flags & CREATE_CPU_MAP); if (kgem->has_llc && tiling == I915_TILING_NONE) @@ -2227,16 +2305,16 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem, cache = &kgem->vma[for_cpu].inactive[bucket]; do { list_for_each_entry(bo, cache, vma) { - assert(bo->bucket == bucket); + assert(bucket(bo) == bucket); assert(bo->refcnt == 0); assert(bo->map); 
assert(IS_CPU_MAP(bo->map) == for_cpu); assert(bo->rq == NULL); assert(list_is_empty(&bo->request)); - if (size > bo->size) { + if (size > num_pages(bo)) { DBG(("inactive too small: %d < %d\n", - bo->size, size)); + num_pages(bo), size)); continue; } @@ -2275,13 +2353,14 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem, if (retry > 3) retry = 3; search_again: + assert(bucket < NUM_CACHE_BUCKETS); cache = &kgem->active[bucket][tiling]; if (tiling) { tiled_height = kgem_aligned_height(kgem, height, tiling); list_for_each_entry(bo, cache, list) { assert(!bo->purged); assert(bo->refcnt == 0); - assert(bo->bucket == bucket); + assert(bucket(bo) == bucket); assert(bo->reusable); assert(bo->tiling == tiling); @@ -2292,7 +2371,7 @@ search_again: continue; } - if (bo->pitch * tiled_height > bo->size) + if (bo->pitch * tiled_height > bytes(bo)) continue; kgem_bo_remove_from_active(kgem, bo); @@ -2305,13 +2384,13 @@ search_again: } } else { list_for_each_entry(bo, cache, list) { - assert(bo->bucket == bucket); + assert(bucket(bo) == bucket); assert(!bo->purged); assert(bo->refcnt == 0); assert(bo->reusable); assert(bo->tiling == tiling); - if (bo->size < size) + if (num_pages(bo) < size) continue; kgem_bo_remove_from_active(kgem, bo); @@ -2340,7 +2419,7 @@ search_again: kgem->has_relaxed_fencing, flags & CREATE_SCANOUT, width, height, bpp, tiling, &pitch); - cache = active(kgem, tiled_height, i); + cache = active(kgem, tiled_height / PAGE_SIZE, i); tiled_height = kgem_aligned_height(kgem, height, i); list_for_each_entry(bo, cache, list) { assert(!bo->purged); @@ -2357,7 +2436,7 @@ search_again: } else bo->pitch = untiled_pitch; - if (bo->pitch * tiled_height > bo->size) + if (bo->pitch * tiled_height > bytes(bo)) continue; kgem_bo_remove_from_active(kgem, bo); @@ -2378,13 +2457,14 @@ skip_active_search: retry = 3; search_inactive: /* Now just look for a close match and prefer any currently active */ + assert(bucket < NUM_CACHE_BUCKETS); cache = &kgem->inactive[bucket]; list_for_each_entry_safe(bo, next, cache, list) { - assert(bo->bucket == bucket); + assert(bucket(bo) == bucket); - if (size > bo->size) { + if (size > num_pages(bo)) { DBG(("inactive too small: %d < %d\n", - bo->size, size)); + num_pages(bo), size)); continue; } @@ -2439,6 +2519,7 @@ search_inactive: goto search_inactive; } +create: handle = gem_create(kgem->fd, size); if (handle == 0) return NULL; @@ -2455,7 +2536,7 @@ search_inactive: if (tiling != I915_TILING_NONE) bo->tiling = gem_set_tiling(kgem->fd, handle, tiling, pitch); - assert(bo->size >= bo->pitch * kgem_aligned_height(kgem, height, bo->tiling)); + assert(bytes(bo) >= bo->pitch * kgem_aligned_height(kgem, height, bo->tiling)); DBG((" new pitch=%d, tiling=%d, handle=%d, id=%d\n", bo->pitch, bo->tiling, bo->handle, bo->unique_id)); @@ -2470,9 +2551,9 @@ static void _kgem_bo_delete_partial(struct kgem *kgem, struct kgem_bo *bo) return; DBG(("%s: size=%d, offset=%d, parent used=%d\n", - __FUNCTION__, bo->size, bo->delta, io->used)); + __FUNCTION__, bo->size.bytes, bo->delta, io->used)); - if (bo->delta + bo->size == io->used) { + if (bo->delta + bo->size.bytes == io->used) { io->used = bo->delta; bubble_sort_partial(kgem, io); } @@ -2508,25 +2589,30 @@ bool kgem_check_bo(struct kgem *kgem, ...) 
va_list ap; struct kgem_bo *bo; int num_exec = 0; - int size = 0; + int num_pages = 0; va_start(ap, kgem); while ((bo = va_arg(ap, struct kgem_bo *))) { if (bo->exec) continue; - size += bo->size; + if (bo->proxy) { + bo = bo->proxy; + if (bo->exec) + continue; + } + num_pages += num_pages(bo); num_exec++; } va_end(ap); - if (!size) + if (!num_pages) return true; if (kgem->aperture > kgem->aperture_low) return false; - if (size + kgem->aperture > kgem->aperture_high) + if (num_pages + kgem->aperture > kgem->aperture_high) return false; if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem)) @@ -2541,11 +2627,13 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...) struct kgem_bo *bo; int num_fence = 0; int num_exec = 0; - int size = 0; + int num_pages = 0; int fenced_size = 0; va_start(ap, kgem); while ((bo = va_arg(ap, struct kgem_bo *))) { + if (bo->proxy) + bo = bo->proxy; if (bo->exec) { if (kgem->gen >= 40 || bo->tiling == I915_TILING_NONE) continue; @@ -2558,7 +2646,7 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...) continue; } - size += bo->size; + num_pages += num_pages(bo); num_exec++; if (kgem->gen < 40 && bo->tiling) { fenced_size += kgem_bo_fenced_size(kgem, bo); @@ -2573,13 +2661,13 @@ bool kgem_check_bo_fenced(struct kgem *kgem, ...) if (kgem->nfence + num_fence > kgem->fence_max) return false; - if (!size) + if (!num_pages) return true; if (kgem->aperture > kgem->aperture_low) return false; - if (size + kgem->aperture > kgem->aperture_high) + if (num_pages + kgem->aperture > kgem->aperture_high) return false; if (kgem->nexec + num_exec >= KGEM_EXEC_SIZE(kgem)) @@ -2698,7 +2786,7 @@ static void kgem_trim_vma_cache(struct kgem *kgem, int type, int bucket) assert(bo->rq == NULL); VG(if (type) VALGRIND_FREELIKE_BLOCK(CPU_MAP(bo->map), 0)); - munmap(CPU_MAP(bo->map), bo->size); + munmap(CPU_MAP(bo->map), bytes(bo)); bo->map = NULL; list_del(&bo->vma); kgem->vma[type].count--; @@ -2736,11 +2824,11 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo) ptr = bo->map; if (ptr == NULL) { - assert(bo->size <= kgem->aperture_mappable / 4); + assert(bytes(bo) <= kgem->aperture_mappable / 4); - kgem_trim_vma_cache(kgem, MAP_GTT, bo->bucket); + kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); - ptr = gem_mmap(kgem->fd, bo->handle, bo->size, + ptr = gem_mmap(kgem->fd, bo->handle, bytes(bo), PROT_READ | PROT_WRITE); if (ptr == NULL) return NULL; @@ -2780,8 +2868,8 @@ void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo) if (bo->map) return bo->map; - kgem_trim_vma_cache(kgem, MAP_GTT, bo->bucket); - return bo->map = gem_mmap(kgem->fd, bo->handle, bo->size, + kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo)); + return bo->map = gem_mmap(kgem->fd, bo->handle, bytes(bo), PROT_READ | PROT_WRITE); } @@ -2789,7 +2877,7 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo) { struct drm_i915_gem_mmap mmap_arg; - DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, bo->handle, bo->size)); + DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, bo->handle, bytes(bo))); assert(!bo->purged); assert(list_is_empty(&bo->list)); @@ -2799,18 +2887,19 @@ void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo) if (bo->map) kgem_bo_release_map(kgem, bo); - kgem_trim_vma_cache(kgem, MAP_CPU, bo->bucket); + kgem_trim_vma_cache(kgem, MAP_CPU, bucket(bo)); VG_CLEAR(mmap_arg); mmap_arg.handle = bo->handle; mmap_arg.offset = 0; - mmap_arg.size = bo->size; + mmap_arg.size = bytes(bo); if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) { - assert(0); + ErrorF("%s: failed to mmap %d, %d bytes, 
into CPU domain\n", + __FUNCTION__, bo->handle, bytes(bo)); return NULL; } - VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, bo->size, 0, 1)); + VG(VALGRIND_MALLOCLIKE_BLOCK(mmap_arg.addr_ptr, bytes(bo), 0, 1)); DBG(("%s: caching CPU vma for %d\n", __FUNCTION__, bo->handle)); bo->map = MAKE_CPU_MAP(mmap_arg.addr_ptr); @@ -2876,6 +2965,9 @@ struct kgem_bo *kgem_create_map(struct kgem *kgem, if (!kgem->has_vmap) return NULL; + if (size >= MAX_CACHE_SIZE) + return NULL; + handle = gem_vmap(kgem->fd, ptr, size, read_only); if (handle == 0) return NULL; @@ -2972,6 +3064,7 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target, if (bo == NULL) return NULL; + bo->size.bytes = length; bo->io = target->io; bo->dirty = target->dirty; bo->tiling = target->tiling; @@ -2982,11 +3075,11 @@ struct kgem_bo *kgem_create_proxy(struct kgem_bo *target, return bo; } -static struct kgem_partial_bo *partial_bo_alloc(int size) +static struct kgem_partial_bo *partial_bo_alloc(int num_pages) { struct kgem_partial_bo *bo; - bo = malloc(sizeof(*bo) + 128 + size); + bo = malloc(sizeof(*bo) + 128 + num_pages * PAGE_SIZE); if (bo) { bo->mem = (void *)ALIGN((uintptr_t)bo + sizeof(*bo), 64); bo->mmapped = false; @@ -3010,20 +3103,20 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, !!(flags & KGEM_BUFFER_LAST))); assert(size); /* we should never be asked to create anything TOO large */ - assert(size < kgem->max_cpu_size); + assert(size <= kgem->max_cpu_size); list_for_each_entry(bo, &kgem->partial, base.list) { if (flags == KGEM_BUFFER_LAST && bo->write) { /* We can reuse any write buffer which we can fit */ - if (size <= bo->base.size) { + if (size <= bytes(&bo->base)) { if (bo->base.refcnt == 1 && bo->base.exec) { DBG(("%s: reusing write buffer for read of %d bytes? used=%d, total=%d\n", - __FUNCTION__, size, bo->used, bo->base.size)); + __FUNCTION__, size, bo->used, bytes(&bo->base))); offset = 0; goto done; - } else if (bo->used + size <= bo->base.size) { + } else if (bo->used + size <= bytes(&bo->base)) { DBG(("%s: reusing unfinished write buffer for read of %d bytes? used=%d, total=%d\n", - __FUNCTION__, size, bo->used, bo->base.size)); + __FUNCTION__, size, bo->used, bytes(&bo->base))); offset = bo->used; goto done; } @@ -3037,24 +3130,25 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, continue; } - if (bo->used + size <= bo->base.size) { + if (bo->used + size <= bytes(&bo->base)) { DBG(("%s: reusing partial buffer? 
used=%d + size=%d, total=%d\n", - __FUNCTION__, bo->used, size, bo->base.size)); + __FUNCTION__, bo->used, size, bytes(&bo->base))); offset = bo->used; bo->used += size; goto done; } DBG(("%s: too small (%d < %d)\n", - __FUNCTION__, bo->base.size - bo->used, size)); + __FUNCTION__, bytes(&bo->base) - bo->used, size)); break; } #if !DBG_NO_MAP_UPLOAD /* Be a little more generous and hope to hold fewer mmappings */ alloc = ALIGN(2*size, kgem->partial_buffer_size); - if (alloc >= kgem->max_cpu_size) + if (alloc > kgem->max_gpu_size) alloc = PAGE_ALIGN(size); + alloc /= PAGE_SIZE; if (kgem->has_cpu_bo) { bo = malloc(sizeof(*bo)); if (bo == NULL) @@ -3098,7 +3192,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, bo->base.io = true; bo->mmapped = true; - alloc = bo->base.size; + alloc = num_pages(&bo->base); goto init; } else { bo->base.refcnt = 0; /* for valgrind */ @@ -3107,7 +3201,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, } } - if (alloc > kgem->aperture_mappable / 4) + if (PAGE_SIZE * alloc > kgem->aperture_mappable / 4) flags &= ~KGEM_BUFFER_INPLACE; if ((flags & KGEM_BUFFER_WRITE_INPLACE) == KGEM_BUFFER_WRITE_INPLACE) { @@ -3164,7 +3258,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, bo->mmapped = true; bo->base.refcnt = 1; - alloc = bo->base.size; + alloc = num_pages(&bo->base); goto init; } else { kgem_bo_free(kgem, &bo->base); @@ -3173,11 +3267,11 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, } } #else - alloc = ALIGN(size, 64*1024); + alloc = ALIGN(size, 64*1024) / PAGE_SIZE; #endif /* Be more parsimonious with pwrite/pread buffers */ if ((flags & KGEM_BUFFER_INPLACE) == 0) - alloc = PAGE_ALIGN(size); + alloc = PAGE_ALIGN(size) / PAGE_SIZE; flags &= ~KGEM_BUFFER_INPLACE; old = NULL; @@ -3188,7 +3282,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, if (old) { DBG(("%s: reusing ordinary handle %d for io\n", __FUNCTION__, old->handle)); - alloc = old->size; + alloc = num_pages(old); bo = partial_bo_alloc(alloc); if (bo == NULL) return NULL; @@ -3240,21 +3334,40 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem, } bo->mem = kgem_bo_map__cpu(kgem, &bo->base); - if (bo->mem == NULL) { - kgem_bo_free(kgem, &bo->base); + if (bo->mem != NULL) { + if (flags & KGEM_BUFFER_WRITE) + kgem_bo_sync__cpu(kgem, &bo->base); + + bo->need_io = false; + bo->base.io = true; + bo->mmapped = true; + goto init; + } + + DBG(("%s: failing back to new pwrite buffer\n", __FUNCTION__)); + old = &bo->base; + bo = partial_bo_alloc(alloc); + if (bo == NULL) { + free(old); return NULL; } - if (flags & KGEM_BUFFER_WRITE) - kgem_bo_sync__cpu(kgem, &bo->base); + memcpy(&bo->base, old, sizeof(*old)); + free(old); + + assert(bo->mem); + assert(!bo->mmapped); - bo->need_io = false; + list_init(&bo->base.request); + list_init(&bo->base.vma); + list_init(&bo->base.list); + bo->base.refcnt = 1; + bo->need_io = flags & KGEM_BUFFER_WRITE; bo->base.io = true; - bo->mmapped = true; } init: bo->base.reusable = false; - assert(bo->base.size == alloc); + assert(num_pages(&bo->base) == alloc); assert(!bo->need_io || !bo->base.needs_flush); assert(!bo->need_io || bo->base.domain != DOMAIN_GPU); @@ -3263,12 +3376,12 @@ init: offset = 0; list_add(&bo->base.list, &kgem->partial); - DBG(("%s(size=%d) new handle=%d\n", + DBG(("%s(pages=%d) new handle=%d\n", __FUNCTION__, alloc, bo->base.handle)); done: /* adjust the position within the list to maintain decreasing order */ - alloc = bo->base.size - bo->used; + alloc = bytes(&bo->base) - bo->used; { struct kgem_partial_bo *p, 
*first; @@ -3276,9 +3389,9 @@ done: struct kgem_partial_bo, base.list); while (&p->base.list != &kgem->partial && - alloc < p->base.size - p->used) { + alloc < bytes(&p->base) - p->used) { DBG(("%s: this=%d, right=%d\n", - __FUNCTION__, alloc, p->base.size -p->used)); + __FUNCTION__, alloc, bytes(&p->base) -p->used)); p = list_first_entry(&p->base.list, struct kgem_partial_bo, base.list); @@ -3287,6 +3400,7 @@ done: list_move_tail(&bo->base.list, &p->base.list); assert(validate_partials(kgem)); } + assert(bo->mem); *ret = (char *)bo->mem + offset; return kgem_create_proxy(&bo->base, offset, size); } @@ -3300,6 +3414,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem, int stride; assert(width > 0 && height > 0); + assert(ret != NULL); stride = ALIGN(width, 2) * bpp >> 3; stride = ALIGN(stride, kgem->min_alignment); @@ -3307,8 +3422,12 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem, __FUNCTION__, width, height, bpp, stride)); bo = kgem_create_buffer(kgem, stride * ALIGN(height, 2), flags, ret); - if (bo == NULL) + if (bo == NULL) { + DBG(("%s: allocation failure for upload buffer\n", + __FUNCTION__)); return NULL; + } + assert(*ret != NULL); if (height & 1) { struct kgem_partial_bo *io = (struct kgem_partial_bo *)bo->proxy; @@ -3319,7 +3438,7 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem, */ if (io->used) io->used -= stride; - bo->size -= stride; + bo->size.bytes -= stride; bubble_sort_partial(kgem, io); } @@ -3357,8 +3476,9 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem, void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo) { struct kgem_partial_bo *bo; - uint32_t offset = _bo->delta, length = _bo->size; + uint32_t offset = _bo->delta, length = _bo->size.bytes; + assert(_bo->io); assert(_bo->exec == NULL); if (_bo->proxy) _bo = _bo->proxy; @@ -3461,7 +3581,7 @@ kgem_replace_bo(struct kgem *kgem, assert(src->tiling == I915_TILING_NONE); size = height * pitch; - size = PAGE_ALIGN(size); + size = PAGE_ALIGN(size) / PAGE_SIZE; dst = search_linear_cache(kgem, size, 0); if (dst == NULL) diff --git a/src/sna/kgem.h b/src/sna/kgem.h index 0dc67dac..2631e818 100644 --- a/src/sna/kgem.h +++ b/src/sna/kgem.h @@ -66,10 +66,16 @@ struct kgem_bo { uint32_t handle; uint32_t presumed_offset; uint32_t delta; - uint32_t size:28; - uint32_t bucket:4; -#define MAX_OBJECT_SIZE (1 << 28) - + union { + struct { + uint32_t count:27; +#define PAGE_SIZE 4096 + uint32_t bucket:5; +#define NUM_CACHE_BUCKETS 16 +#define MAX_CACHE_SIZE (1 << (NUM_CACHE_BUCKETS+12)) + } pages; + uint32_t bytes; + } size; uint32_t pitch : 18; /* max 128k */ uint32_t tiling : 2; uint32_t reusable : 1; @@ -100,8 +106,6 @@ enum { NUM_MAP_TYPES, }; -#define NUM_CACHE_BUCKETS 16 - struct kgem { int fd; int wedged; @@ -117,7 +121,10 @@ struct kgem { KGEM_BLT, } mode, ring; - struct list flushing, active[NUM_CACHE_BUCKETS][3], inactive[NUM_CACHE_BUCKETS]; + struct list flushing; + struct list large; + struct list active[NUM_CACHE_BUCKETS][3]; + struct list inactive[NUM_CACHE_BUCKETS]; struct list partial; struct list requests; struct kgem_request *next_request; @@ -154,7 +161,7 @@ struct kgem { uint32_t aperture_total, aperture_high, aperture_low, aperture_mappable; uint32_t aperture, aperture_fenced; uint32_t min_alignment; - uint32_t max_gpu_size, max_cpu_size; + uint32_t max_gpu_size, max_cpu_size, max_object_size; uint32_t partial_buffer_size; void (*context_switch)(struct kgem *kgem, int new_mode); @@ -194,8 +201,9 @@ struct kgem_bo *kgem_upload_source_image(struct kgem *kgem, 
int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int bpp); +bool kgem_can_create_2d(struct kgem *kgem, int width, int height, int depth); bool kgem_can_create_gpu(struct kgem *kgem, int width, int height, int bpp); -bool kgem_can_create_cpu(struct kgem *kgem, int width, int height, int depth); +bool kgem_can_create_cpu(struct kgem *kgem, int width, int height, int bpp); struct kgem_bo * kgem_replace_bo(struct kgem *kgem, @@ -354,11 +362,46 @@ Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo, int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo); +static inline int kgem_bo_size(struct kgem_bo *bo) +{ + assert(!(bo->proxy && bo->io)); + return PAGE_SIZE * bo->size.pages.count; +} + +static inline int kgem_buffer_size(struct kgem_bo *bo) +{ + assert(bo->proxy && bo->io); + return bo->size.bytes; +} + +static inline bool kgem_bo_can_blt(struct kgem *kgem, + struct kgem_bo *bo) +{ + int pitch; + + if (bo->tiling == I915_TILING_Y) { + DBG(("%s: can not blt to handle=%d, tiling=Y\n", + __FUNCTION__, bo->handle)); + return false; + } + + pitch = bo->pitch; + if (kgem->gen >= 40 && bo->tiling) + pitch /= 4; + if (pitch > MAXSHORT) { + DBG(("%s: can not blt to handle=%d, adjusted pitch=%d\n", + __FUNCTION__, pitch)); + return false; + } + + return true; +} + static inline bool kgem_bo_is_mappable(struct kgem *kgem, struct kgem_bo *bo) { DBG_HDR(("%s: domain=%d, offset: %d size: %d\n", - __FUNCTION__, bo->domain, bo->presumed_offset, bo->size)); + __FUNCTION__, bo->domain, bo->presumed_offset, kgem_bo_size(bo))); if (bo->domain == DOMAIN_GTT) return true; @@ -371,9 +414,9 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem, return false; if (!bo->presumed_offset) - return bo->size <= kgem->aperture_mappable / 4; + return kgem_bo_size(bo) <= kgem->aperture_mappable / 4; - return bo->presumed_offset + bo->size <= kgem->aperture_mappable; + return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable; } static inline bool kgem_bo_mapped(struct kgem_bo *bo) diff --git a/src/sna/kgem_debug_gen5.c b/src/sna/kgem_debug_gen5.c index f21220f3..9e7360af 100644 --- a/src/sna/kgem_debug_gen5.c +++ b/src/sna/kgem_debug_gen5.c @@ -79,7 +79,7 @@ static void gen5_update_vertex_buffer(struct kgem *kgem, const uint32_t *data) } else { bo = kgem_debug_get_bo_for_reloc_entry(kgem, reloc); base = kgem_bo_map__debug(kgem, bo); - size = bo->size; + size = kgem_bo_size(bo); } ptr = (char *)base + reloc->delta; diff --git a/src/sna/sna.h b/src/sna/sna.h index 5910daf2..d9ba7735 100644 --- a/src/sna/sna.h +++ b/src/sna/sna.h @@ -92,7 +92,6 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define DEBUG_NO_RENDER 0 #define DEBUG_NO_BLT 0 -#define DEBUG_NO_IO 0 #define DEBUG_FLUSH_BATCH 0 #define DEBUG_FLUSH_SYNC 0 @@ -647,7 +646,7 @@ void sna_read_boxes(struct sna *sna, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy, PixmapPtr dst, int16_t dst_dx, int16_t dst_dy, const BoxRec *box, int n); -void sna_write_boxes(struct sna *sna, PixmapPtr dst, +bool sna_write_boxes(struct sna *sna, PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy, const void *src, int stride, int16_t src_dx, int16_t src_dy, const BoxRec *box, int n); @@ -657,10 +656,10 @@ void sna_write_boxes__xor(struct sna *sna, PixmapPtr dst, const BoxRec *box, int nbox, uint32_t and, uint32_t or); -struct kgem_bo *sna_replace(struct sna *sna, - PixmapPtr pixmap, - struct kgem_bo *bo, - const void *src, int stride); +bool sna_replace(struct sna *sna, + PixmapPtr pixmap, + struct kgem_bo **bo, + const void *src, int stride); struct kgem_bo *sna_replace__xor(struct sna *sna, PixmapPtr pixmap, struct kgem_bo *bo, diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c index f2997d02..ce351131 100644 --- a/src/sna/sna_accel.c +++ b/src/sna/sna_accel.c @@ -243,9 +243,14 @@ sna_pixmap_alloc_cpu(struct sna *sna, if (priv->ptr) goto done; + DBG(("%s: pixmap=%ld\n", __FUNCTION__, pixmap->drawable.serialNumber)); assert(priv->stride); - if (sna->kgem.has_cpu_bo || !priv->gpu) { + if ((sna->kgem.has_cpu_bo || !priv->gpu) && + kgem_can_create_cpu(&sna->kgem, + pixmap->drawable.width, + pixmap->drawable.height, + pixmap->drawable.bitsPerPixel)) { DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__, pixmap->drawable.width, pixmap->drawable.height)); @@ -270,8 +275,11 @@ sna_pixmap_alloc_cpu(struct sna *sna, } } - if (priv->ptr == NULL) + if (priv->ptr == NULL) { + DBG(("%s: allocating ordinary memory for shadow pixels [%d bytes]\n", + __FUNCTION__, priv->stride * pixmap->drawable.height)); priv->ptr = malloc(priv->stride * pixmap->drawable.height); + } assert(priv->ptr); done: @@ -289,7 +297,7 @@ static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv) if (priv->cpu_bo) { DBG(("%s: discarding CPU buffer, handle=%d, size=%d\n", - __FUNCTION__, priv->cpu_bo->handle, priv->cpu_bo->size)); + __FUNCTION__, priv->cpu_bo->handle, kgem_bo_size(priv->cpu_bo))); kgem_bo_destroy(&sna->kgem, priv->cpu_bo); priv->cpu_bo = NULL; @@ -515,10 +523,10 @@ struct sna_pixmap *_sna_pixmap_attach(PixmapPtr pixmap) break; default: - if (!kgem_can_create_gpu(&sna->kgem, - pixmap->drawable.width, - pixmap->drawable.height, - pixmap->drawable.bitsPerPixel)) + if (!kgem_can_create_2d(&sna->kgem, + pixmap->drawable.width, + pixmap->drawable.height, + pixmap->drawable.depth)) return NULL; break; } @@ -669,8 +677,11 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen, DBG(("%s(%d, %d, %d, usage=%x)\n", __FUNCTION__, width, height, depth, usage)); - if (!kgem_can_create_cpu(&sna->kgem, width, height, depth)) + if (!kgem_can_create_2d(&sna->kgem, width, height, depth)) { + DBG(("%s: can not use GPU, just creating shadow\n", + __FUNCTION__)); return create_pixmap(sna, screen, width, height, depth, usage); + } if (!sna->have_render) return create_pixmap(sna, screen, @@ -704,6 +715,8 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen, pad = PixmapBytePad(width, depth); if (pad * height <= 4096) { + DBG(("%s: small buffer [%d], attaching to shadow pixmap\n", + __FUNCTION__, pad * height)); pixmap = create_pixmap(sna, screen, width, height, depth, usage); if (pixmap == NullPixmap) @@ -713,6 +726,9 @@ static PixmapPtr 
sna_create_pixmap(ScreenPtr screen, } else { struct sna_pixmap *priv; + DBG(("%s: creating GPU pixmap %dx%d, stride=%d\n", + __FUNCTION__, width, height, pad)); + pixmap = create_pixmap(sna, screen, 0, 0, depth, usage); if (pixmap == NullPixmap) return NullPixmap; @@ -1609,19 +1625,20 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags) box->x1 <= 0 && box->y1 <= 0 && box->x2 >= pixmap->drawable.width && box->y2 >= pixmap->drawable.height) { - priv->gpu_bo = - sna_replace(sna, pixmap, - priv->gpu_bo, - pixmap->devPrivate.ptr, - pixmap->devKind); + ok = sna_replace(sna, pixmap, + &priv->gpu_bo, + pixmap->devPrivate.ptr, + pixmap->devKind); } else { - sna_write_boxes(sna, pixmap, - priv->gpu_bo, 0, 0, - pixmap->devPrivate.ptr, - pixmap->devKind, - 0, 0, - box, n); + ok = sna_write_boxes(sna, pixmap, + priv->gpu_bo, 0, 0, + pixmap->devPrivate.ptr, + pixmap->devKind, + 0, 0, + box, n); } + if (!ok) + return false; } } @@ -1637,12 +1654,14 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags) pixmap, priv->gpu_bo, 0, 0, box, 1); if (!ok) - sna_write_boxes(sna, pixmap, - priv->gpu_bo, 0, 0, - pixmap->devPrivate.ptr, - pixmap->devKind, - 0, 0, - box, 1); + ok = sna_write_boxes(sna, pixmap, + priv->gpu_bo, 0, 0, + pixmap->devPrivate.ptr, + pixmap->devKind, + 0, 0, + box, 1); + if (!ok) + return false; sna_damage_subtract(&priv->cpu_damage, &r); priv->undamaged = true; @@ -1658,12 +1677,14 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags) pixmap, priv->gpu_bo, 0, 0, box, n); if (!ok) - sna_write_boxes(sna, pixmap, - priv->gpu_bo, 0, 0, - pixmap->devPrivate.ptr, - pixmap->devKind, - 0, 0, - box, n); + ok = sna_write_boxes(sna, pixmap, + priv->gpu_bo, 0, 0, + pixmap->devPrivate.ptr, + pixmap->devKind, + 0, 0, + box, n); + if (!ok) + return false; sna_damage_subtract(&priv->cpu_damage, &r); priv->undamaged = true; @@ -1671,7 +1692,7 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, BoxPtr box, unsigned int flags) } done: - if (!priv->pinned) + if (!priv->pinned && priv->gpu) list_move(&priv->inactive, &sna->active_pixmaps); priv->clear = false; return true; @@ -1811,7 +1832,7 @@ done: use_gpu_bo: priv->clear = false; - if (!priv->pinned) + if (!priv->pinned && priv->gpu) list_move(&priv->inactive, &to_sna_from_pixmap(pixmap)->active_pixmaps); *damage = NULL; @@ -1978,6 +1999,17 @@ sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags) if (!sna_pixmap_move_to_gpu(pixmap, flags)) return NULL; + /* For large bo, try to keep only a single copy around */ + if (!priv->gpu && priv->ptr) { + sna_damage_all(&priv->gpu_damage, + pixmap->drawable.width, + pixmap->drawable.height); + sna_damage_destroy(&priv->cpu_damage); + priv->undamaged = false; + list_del(&priv->list); + sna_pixmap_free_cpu(to_sna_from_pixmap(pixmap), priv); + } + return priv; } @@ -2070,19 +2102,20 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags) if (n == 1 && !priv->pinned && (box->x2 - box->x1) >= pixmap->drawable.width && (box->y2 - box->y1) >= pixmap->drawable.height) { - priv->gpu_bo = - sna_replace(sna, pixmap, - priv->gpu_bo, - pixmap->devPrivate.ptr, - pixmap->devKind); + ok = sna_replace(sna, pixmap, + &priv->gpu_bo, + pixmap->devPrivate.ptr, + pixmap->devKind); } else { - sna_write_boxes(sna, pixmap, + ok = sna_write_boxes(sna, pixmap, priv->gpu_bo, 0, 0, pixmap->devPrivate.ptr, pixmap->devKind, 0, 0, box, n); } + if (!ok) + return NULL; } } @@ -2098,7 +2131,7 @@ done: if (DAMAGE_IS_ALL(priv->gpu_damage)) priv->undamaged = false; 
active: - if (!priv->pinned) + if (!priv->pinned && priv->gpu) list_move(&priv->inactive, &sna->active_pixmaps); priv->clear = false; return priv; @@ -2321,11 +2354,8 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region, !priv->pinned && nbox == 1 && box->x1 <= 0 && box->y1 <= 0 && box->x2 >= pixmap->drawable.width && - box->y2 >= pixmap->drawable.height) { - priv->gpu_bo = - sna_replace(sna, pixmap, priv->gpu_bo, bits, stride); - return TRUE; - } + box->y2 >= pixmap->drawable.height) + return sna_replace(sna, pixmap, &priv->gpu_bo, bits, stride); get_drawable_deltas(drawable, pixmap, &dx, &dy); x += dx + drawable->x; @@ -2341,15 +2371,13 @@ sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region, kgem_bo_destroy(&sna->kgem, src_bo); } - if (!ok && gc->alu == GXcopy) { - sna_write_boxes(sna, pixmap, - priv->gpu_bo, 0, 0, - bits, - stride, - -x, -y, - box, nbox); - ok = TRUE; - } + if (!ok && gc->alu == GXcopy) + ok = sna_write_boxes(sna, pixmap, + priv->gpu_bo, 0, 0, + bits, + stride, + -x, -y, + box, nbox); return ok; } @@ -3213,7 +3241,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc, } } else { dst_priv->clear = false; - if (!dst_priv->pinned) + if (!dst_priv->pinned && dst_priv->gpu) list_move(&dst_priv->inactive, &sna->active_pixmaps); } @@ -3400,10 +3428,10 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc, assert(src_dy + box->y1 + dst_pixmap->drawable.height <= src_pixmap->drawable.height); assert(src_dx + box->x1 + dst_pixmap->drawable.width <= src_pixmap->drawable.width); - dst_priv->gpu_bo = - sna_replace(sna, dst_pixmap, - dst_priv->gpu_bo, - bits, stride); + if (!sna_replace(sna, dst_pixmap, + &dst_priv->gpu_bo, + bits, stride)) + goto fallback; if (!DAMAGE_IS_ALL(dst_priv->gpu_damage)) { sna_damage_destroy(&dst_priv->cpu_damage); @@ -3416,12 +3444,13 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc, } else { DBG(("%s: dst is on the GPU, src is on the CPU, uploading into dst\n", __FUNCTION__)); - sna_write_boxes(sna, dst_pixmap, - dst_priv->gpu_bo, dst_dx, dst_dy, - src_pixmap->devPrivate.ptr, - src_pixmap->devKind, - src_dx, src_dy, - box, n); + if (!sna_write_boxes(sna, dst_pixmap, + dst_priv->gpu_bo, dst_dx, dst_dy, + src_pixmap->devPrivate.ptr, + src_pixmap->devKind, + src_dx, src_dy, + box, n)) + goto fallback; if (!DAMAGE_IS_ALL(dst_priv->gpu_damage)) { RegionTranslate(®ion, dst_dx, dst_dy); @@ -11502,7 +11531,7 @@ static void sna_accel_inactive(struct sna *sna) count = bytes = 0; list_for_each_entry(priv, &sna->inactive_clock[1], inactive) if (!priv->pinned) - count++, bytes += priv->gpu_bo->size; + count++, bytes += kgem_bo_size(priv->gpu_bo); DBG(("%s: trimming %d inactive GPU buffers, %d bytes\n", __FUNCTION__, count, bytes)); @@ -11528,6 +11557,9 @@ static void sna_accel_inactive(struct sna *sna) priv = list_first_entry(&sna->inactive_clock[1], struct sna_pixmap, inactive); + assert(priv->gpu); + assert(priv->gpu_bo); + /* XXX Rather than discarding the GPU buffer here, we * could mark it purgeable and allow the shrinker to * reap its storage only under memory pressure. 
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 535628c0..7efbcf90 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -235,7 +235,7 @@ inline static void sna_blt_fill_one(struct sna *sna,
 	assert(x >= 0);
 	assert(y >= 0);
-	assert((y+height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((y+height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 
 	if (!kgem_check_batch(kgem, 3))
 		sna_blt_fill_begin(sna, blt);
@@ -358,10 +358,10 @@ static void sna_blt_alpha_fixup_one(struct sna *sna,
 	assert(src_x >= 0);
 	assert(src_y >= 0);
-	assert((src_y + height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((src_y + height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 	assert(dst_x >= 0);
 	assert(dst_y >= 0);
-	assert((dst_y + height) * blt->bo[1]->pitch <= blt->bo[1]->size);
+	assert((dst_y + height) * blt->bo[1]->pitch <= kgem_bo_size(blt->bo[1]));
 	assert(width > 0);
 	assert(height > 0);
@@ -409,10 +409,10 @@ static void sna_blt_copy_one(struct sna *sna,
 	assert(src_x >= 0);
 	assert(src_y >= 0);
-	assert((src_y + height) * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert((src_y + height) * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 	assert(dst_x >= 0);
 	assert(dst_y >= 0);
-	assert((dst_y + height) * blt->bo[1]->pitch <= blt->bo[1]->size);
+	assert((dst_y + height) * blt->bo[1]->pitch <= kgem_bo_size(blt->bo[1]));
 	assert(width > 0);
 	assert(height > 0);
@@ -787,7 +787,7 @@ inline static void _sna_blt_fill_box(struct sna *sna,
 	assert(box->x1 >= 0);
 	assert(box->y1 >= 0);
-	assert(box->y2 * blt->bo[0]->pitch <= blt->bo[0]->size);
+	assert(box->y2 * blt->bo[0]->pitch <= kgem_bo_size(blt->bo[0]));
 
 	if (!kgem_check_batch(kgem, 3))
 		sna_blt_fill_begin(sna, blt);
@@ -1106,7 +1106,7 @@ prepare_blt_copy(struct sna *sna,
 	PixmapPtr src = op->u.blt.src_pixmap;
 	struct sna_pixmap *priv = sna_pixmap(src);
 
-	if (priv->gpu_bo->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, priv->gpu_bo))
 		return FALSE;
 
 	if (!kgem_check_bo_fenced(&sna->kgem, priv->gpu_bo, NULL)) {
@@ -1176,9 +1176,8 @@ blt_put_composite(struct sna *sna,
 		data += (src_x - dst_x) * bpp / 8;
 		data += (src_y - dst_y) * pitch;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, dst_priv->gpu_bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		BoxRec box;
 
@@ -1215,9 +1214,8 @@ fastcall static void blt_put_composite_box(struct sna *sna,
 		data += (box->y1 + op->u.blt.sy) * pitch;
 		data += (box->x1 + op->u.blt.sx) * bpp;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, op->dst.bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		sna_write_boxes(sna, op->dst.pixmap, op->dst.bo,
 				op->dst.x, op->dst.y,
@@ -1250,9 +1248,8 @@ static void blt_put_composite_boxes(struct sna *sna,
 		data += (box->y1 + op->u.blt.sy) * pitch;
 		data += (box->x1 + op->u.blt.sx) * bpp;
 
-		dst_priv->gpu_bo =
-			sna_replace(sna, op->dst.pixmap, op->dst.bo,
-				    data, pitch);
+		sna_replace(sna, op->dst.pixmap, &dst_priv->gpu_bo,
+			    data, pitch);
 	} else {
 		sna_write_boxes(sna, op->dst.pixmap, op->dst.bo,
 				op->dst.x, op->dst.y,
@@ -1573,9 +1570,13 @@ sna_blt_composite(struct sna *sna,
 	tmp->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
 	priv = sna_pixmap_move_to_gpu(tmp->dst.pixmap, MOVE_WRITE | MOVE_READ);
 
-	if (priv == NULL || priv->gpu_bo->tiling == I915_TILING_Y) {
-		DBG(("%s: dst not on the gpu or using Y-tiling\n",
-		     __FUNCTION__));
+	if (priv == NULL) {
+		DBG(("%s: dst not attached\n", __FUNCTION__));
+		return FALSE;
+	}
+	if (!kgem_bo_can_blt(&sna->kgem, priv->gpu_bo)) {
+		DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
+		     __FUNCTION__, priv->gpu_bo->tiling, priv->gpu_bo->pitch));
 		return FALSE;
 	}
 
@@ -1747,7 +1748,7 @@ bool sna_blt_fill(struct sna *sna, uint8_t alu,
 	DBG(("%s(alu=%d, pixel=%x, bpp=%d)\n",
 	     __FUNCTION__, alu, pixel, bpp));
 
-	if (bo->tiling == I915_TILING_Y) {
+	if (!kgem_bo_can_blt(&sna->kgem, bo)) {
 		DBG(("%s: rejected due to incompatible Y-tiling\n",
 		     __FUNCTION__));
 		return FALSE;
@@ -1797,10 +1798,10 @@ bool sna_blt_copy(struct sna *sna, uint8_t alu,
 		return FALSE;
 #endif
 
-	if (src->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, src))
 		return FALSE;
 
-	if (dst->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(&sna->kgem, dst))
 		return FALSE;
 
 	if (!sna_blt_copy_init(sna, &op->base.u.blt,
@@ -1926,7 +1927,7 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 	DBG(("%s (%d, %08x, %d) x %d\n",
 	     __FUNCTION__, bpp, pixel, alu, nbox));
 
-	if (bo->tiling == I915_TILING_Y) {
+	if (!kgem_bo_can_blt(kgem, bo)) {
 		DBG(("%s: fallback -- dst uses Y-tiling\n", __FUNCTION__));
 		return FALSE;
 	}
@@ -2020,7 +2021,7 @@ Bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
 			assert(box->x1 >= 0);
 			assert(box->y1 >= 0);
-			assert(box->y2 * bo->pitch <= bo->size);
+			assert(box->y2 * bo->pitch <= kgem_bo_size(bo));
 
 			b = kgem->batch + kgem->nbatch;
 			kgem->nbatch += 3;
@@ -2075,8 +2076,13 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 	    src_bo->tiling, dst_bo->tiling,
 	    src_bo->pitch, dst_bo->pitch));
 
-	if (src_bo->tiling == I915_TILING_Y || dst_bo->tiling == I915_TILING_Y)
+	if (!kgem_bo_can_blt(kgem, src_bo) || !kgem_bo_can_blt(kgem, dst_bo)) {
+		DBG(("%s: cannot blt to src? %d or dst? %d\n",
+		     __FUNCTION__,
+		     kgem_bo_can_blt(kgem, src_bo),
+		     kgem_bo_can_blt(kgem, dst_bo)));
 		return FALSE;
+	}
 
 	cmd = XY_SRC_COPY_BLT_CMD;
 	if (bpp == 32)
@@ -2087,7 +2093,7 @@ Bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
 		cmd |= BLT_SRC_TILED;
 		src_pitch >>= 2;
 	}
-	assert(src_pitch < MAXSHORT);
+	assert(src_pitch <= MAXSHORT);
 
 	br13 = dst_bo->pitch;
 	if (kgem->gen >= 40 && dst_bo->tiling) {
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index f3ca212c..14a79013 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -44,6 +44,27 @@
 
 /* XXX Need to avoid using GTT fenced access for I915_TILING_Y on 855GM */
 
+static Bool
+box_intersect(BoxPtr a, const BoxRec *b)
+{
+	if (a->x1 < b->x1)
+		a->x1 = b->x1;
+	if (a->x2 > b->x2)
+		a->x2 = b->x2;
+	if (a->y1 < b->y1)
+		a->y1 = b->y1;
+	if (a->y2 > b->y2)
+		a->y2 = b->y2;
+
+	return a->x1 < a->x2 && a->y1 < a->y2;
+}
+
+static inline bool must_tile(struct sna *sna, int width, int height)
+{
+	return (width > sna->render.max_3d_size ||
+		height > sna->render.max_3d_size);
+}
+
 static void read_boxes_inplace(struct kgem *kgem,
 			       struct kgem_bo *bo, int16_t src_dx, int16_t src_dy,
 			       PixmapPtr pixmap, int16_t dst_dx, int16_t dst_dy,
@@ -105,13 +126,13 @@ void sna_read_boxes(struct sna *sna,
 	for (n = 0; n < nbox; n++) {
 		if (box[n].x1 + src_dx < 0 || box[n].y1 + src_dy < 0 ||
 		    (box[n].x2 + src_dx) * dst->drawable.bitsPerPixel/8 > src_bo->pitch ||
-		    (box[n].y2 + src_dy) * src_bo->pitch > src_bo->size)
+		    (box[n].y2 + src_dy) * src_bo->pitch > kgem_bo_size(src_bo))
 		{
 			FatalError("source out-of-bounds box[%d]=(%d, %d), (%d, %d) + (%d, %d), pitch=%d, size=%d\n",
 				   n, box[n].x1, box[n].y1,
 				   box[n].x2, box[n].y2,
 				   src_dx, src_dy,
-				   src_bo->pitch, src_bo->size);
+				   src_bo->pitch, kgem_bo_size(src_bo));
 		}
 	}
 #endif
@@ -132,7 +153,6 @@ fallback:
 		return;
 	}
 
-	/* Is it worth detiling? */
 	extents = box[0];
 	for (n = 1; n < nbox; n++) {
 		if (box[n].x1 < extents.x1)
@@ -145,11 +165,16 @@ fallback:
 		if (box[n].y2 > extents.y2)
 			extents.y2 = box[n].y2;
 	}
-	if ((extents.y2 - extents.y1) * src_bo->pitch < 4096)
-		goto fallback;
+	if (kgem_bo_is_mappable(kgem, src_bo)) {
+		/* Is it worth detiling? */
+		if ((extents.y2 - extents.y1) * src_bo->pitch < 4096)
+			goto fallback;
+	}
 
 	/* Try to avoid switching rings... */
-	if (src_bo->tiling == I915_TILING_Y || kgem->ring == KGEM_RENDER) {
+	if (kgem->ring == KGEM_RENDER ||
+	    !kgem_bo_can_blt(kgem, src_bo) ||
+	    must_tile(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
 		PixmapRec tmp;
 
 		tmp.drawable.width = extents.x2 - extents.x1;
@@ -161,38 +186,124 @@ fallback:
 		assert(tmp.drawable.width);
 		assert(tmp.drawable.height);
 
-		dst_bo = kgem_create_buffer_2d(kgem,
-					       tmp.drawable.width,
-					       tmp.drawable.height,
-					       tmp.drawable.bitsPerPixel,
-					       KGEM_BUFFER_LAST,
-					       &ptr);
-		if (!dst_bo)
-			goto fallback;
+		if (must_tile(sna, tmp.drawable.width, tmp.drawable.height)) {
+			BoxRec tile, stack[64], *clipped, *c;
+			int step;
+
+			if (n > ARRAY_SIZE(stack)) {
+				clipped = malloc(sizeof(BoxRec) * n);
+				if (clipped == NULL)
+					goto fallback;
+			} else
+				clipped = stack;
+
+			step = MIN(sna->render.max_3d_size,
+				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
+			DBG(("%s: tiling download, using %dx%d tiles\n",
+			     __FUNCTION__, step, step));
+
+			for (tile.y1 = extents.y1; tile.y1 < extents.y2; tile.y1 = tile.y2) {
+				tile.y2 = tile.y1 + step;
+				if (tile.y2 > extents.y2)
+					tile.y2 = extents.y2;
+
+				for (tile.x1 = extents.x1; tile.x1 < extents.x2; tile.x1 = tile.x2) {
+					tile.x2 = tile.x1 + step;
+					if (tile.x2 > extents.x2)
+						tile.x2 = extents.x2;
+
+					tmp.drawable.width = tile.x2 - tile.x1;
+					tmp.drawable.height = tile.y2 - tile.y1;
+
+					c = clipped;
+					for (n = 0; n < nbox; n++) {
+						*c = box[n];
+						if (!box_intersect(c, &tile))
+							continue;
+
+						DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+						     __FUNCTION__,
+						     c->x1, c->y1,
+						     c->x2, c->y2,
+						     src_dx, src_dy,
+						     c->x1 - tile.x1,
+						     c->y1 - tile.y1));
+						c++;
+					}
+					if (c == clipped)
+						continue;
+
+					dst_bo = kgem_create_buffer_2d(kgem,
+								       tmp.drawable.width,
+								       tmp.drawable.height,
+								       tmp.drawable.bitsPerPixel,
+								       KGEM_BUFFER_LAST,
+								       &ptr);
+					if (!dst_bo)
+						goto fallback;
+
+					if (!sna->render.copy_boxes(sna, GXcopy,
+								    dst, src_bo, src_dx, src_dy,
+								    &tmp, dst_bo, -tile.x1, -tile.y1,
+								    clipped, c-clipped)) {
+						kgem_bo_destroy(&sna->kgem, dst_bo);
+						goto fallback;
+					}
+
+					kgem_bo_submit(&sna->kgem, dst_bo);
+					kgem_buffer_read_sync(kgem, dst_bo);
+
+					while (c-- != clipped) {
+						memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
+							   dst_bo->pitch, dst->devKind,
+							   c->x1 - tile.x1,
+							   c->y1 - tile.y1,
+							   c->x1 + dst_dx,
+							   c->y1 + dst_dy,
+							   c->x2 - c->x1,
+							   c->y2 - c->y1);
+					}
+
+					kgem_bo_destroy(&sna->kgem, dst_bo);
+				}
+			}
 
-		if (!sna->render.copy_boxes(sna, GXcopy,
-					    dst, src_bo, src_dx, src_dy,
-					    &tmp, dst_bo, -extents.x1, -extents.y1,
-					    box, nbox)) {
-			kgem_bo_destroy(&sna->kgem, dst_bo);
-			goto fallback;
-		}
+			if (clipped != stack)
+				free(clipped);
+		} else {
+			dst_bo = kgem_create_buffer_2d(kgem,
						       tmp.drawable.width,
						       tmp.drawable.height,
						       tmp.drawable.bitsPerPixel,
						       KGEM_BUFFER_LAST,
						       &ptr);
+			if (!dst_bo)
+				goto fallback;
+
+			if (!sna->render.copy_boxes(sna, GXcopy,
						    dst, src_bo, src_dx, src_dy,
						    &tmp, dst_bo, -extents.x1, -extents.y1,
						    box, nbox)) {
+				kgem_bo_destroy(&sna->kgem, dst_bo);
+				goto fallback;
+			}
 
-		kgem_bo_submit(&sna->kgem, dst_bo);
-		kgem_buffer_read_sync(kgem, dst_bo);
+			kgem_bo_submit(&sna->kgem, dst_bo);
+			kgem_buffer_read_sync(kgem, dst_bo);
+
+			for (n = 0; n < nbox; n++) {
+				memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
+					   dst_bo->pitch, dst->devKind,
+					   box[n].x1 - extents.x1,
+					   box[n].y1 - extents.y1,
+					   box[n].x1 + dst_dx,
+					   box[n].y1 + dst_dy,
+					   box[n].x2 - box[n].x1,
+					   box[n].y2 - box[n].y1);
+			}
 
-		for (n = 0; n < nbox; n++) {
-			memcpy_blt(ptr, dst->devPrivate.ptr, tmp.drawable.bitsPerPixel,
-				   dst_bo->pitch, dst->devKind,
-				   box[n].x1 - extents.x1,
-				   box[n].y1 - extents.y1,
-				   box[n].x1 + dst_dx,
-				   box[n].y1 + dst_dy,
-				   box[n].x2 - box[n].x1,
-				   box[n].y2 - box[n].y1);
+			kgem_bo_destroy(&sna->kgem, dst_bo);
 		}
-
-		kgem_bo_destroy(&sna->kgem, dst_bo);
 		return;
 	}
 
@@ -270,7 +381,7 @@ fallback:
 			assert(tmp_box[n].x1 + src_dx >= 0);
 			assert((tmp_box[n].x2 + src_dx) * dst->drawable.bitsPerPixel/8 <= src_bo->pitch);
 			assert(tmp_box[n].y1 + src_dy >= 0);
-			assert((tmp_box[n].y2 + src_dy) * src_bo->pitch <= src_bo->size);
+			assert((tmp_box[n].y2 + src_dy) * src_bo->pitch <= kgem_bo_size(src_bo));
 
 			b[0] = cmd;
 			b[1] = br13 | pitch;
@@ -299,7 +410,7 @@ fallback:
 		_kgem_set_mode(kgem, KGEM_BLT);
 		tmp_box += nbox_this_time;
 	} while (1);
-	assert(offset == dst_bo->size);
+	assert(offset == kgem_buffer_size(dst_bo));
 
 	kgem_buffer_read_sync(kgem, dst_bo);
 
@@ -331,12 +442,12 @@ fallback:
 		src += pitch * height;
 	} while (--nbox);
-	assert(src - (char *)ptr == dst_bo->size);
+	assert(src - (char *)ptr == kgem_buffer_size(dst_bo));
 
 	kgem_bo_destroy(kgem, dst_bo);
 	sna->blt_state.fill_bo = 0;
 }
 
-static void write_boxes_inplace(struct kgem *kgem,
+static bool write_boxes_inplace(struct kgem *kgem,
 				const void *src, int stride, int bpp, int16_t src_dx, int16_t src_dy,
 				struct kgem_bo *bo, int16_t dst_dx, int16_t dst_dy,
 				const BoxRec *box, int n)
@@ -346,11 +457,14 @@ static void write_boxes_inplace(struct kgem *kgem,
 	DBG(("%s x %d, handle=%d, tiling=%d\n",
 	     __FUNCTION__, n, bo->handle, bo->tiling));
 
+	if (!kgem_bo_is_mappable(kgem, bo))
+		return false;
+
 	kgem_bo_submit(kgem, bo);
 
 	dst = kgem_bo_map(kgem, bo);
 	if (dst == NULL)
-		return;
+		return false;
 
 	assert(dst != src);
 
@@ -364,7 +478,7 @@ static void write_boxes_inplace(struct kgem *kgem,
 		assert(box->x1 + dst_dx >= 0);
 		assert((box->x2 + dst_dx)*bpp <= 8*bo->pitch);
 		assert(box->y1 + dst_dy >= 0);
-		assert((box->y2 + dst_dy)*bo->pitch <= bo->size);
+		assert((box->y2 + dst_dy)*bo->pitch <= kgem_bo_size(bo));
 
 		assert(box->x1 + src_dx >= 0);
 		assert((box->x2 + src_dx)*bpp <= 8*stride);
@@ -377,6 +491,7 @@ static void write_boxes_inplace(struct kgem *kgem,
 			   box->x2 - box->x1, box->y2 - box->y1);
 		box++;
 	} while (--n);
+	return true;
 }
 
 static bool upload_inplace(struct kgem *kgem,
@@ -384,9 +499,6 @@ static bool upload_inplace(struct kgem *kgem,
 			   const BoxRec *box,
 			   int n, int bpp)
 {
-	if (DEBUG_NO_IO)
-		return kgem_bo_is_mappable(kgem, bo);
-
 	/* If we are writing through the GTT, check first if we might be
 	 * able to almagamate a series of small writes into a single
 	 * operation.
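[Note: the tiled download path added above chops the damage extents into step x step chunks so that each temporary buffer stays within both the 3D pipeline's maximum surface size and the blitter's signed 16-bit pitch field (8*(MAXSHORT&~63)/bpp pixels). The following is a minimal, self-contained sketch of just that traversal, not code from the patch: process_tile() is a hypothetical stand-in for the per-tile copy_boxes/memcpy_blt work, and SHRT_MAX is used in place of MAXSHORT.]

#include <limits.h>
#include <stdio.h>

typedef struct { int x1, y1, x2, y2; } BoxRec;

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Hypothetical stand-in for the per-tile download/upload work. */
static void process_tile(const BoxRec *tile)
{
	printf("tile (%d, %d)-(%d, %d)\n",
	       tile->x1, tile->y1, tile->x2, tile->y2);
}

/* Walk the extents in step x step tiles, clamping the final row and
 * column, rows outer and columns inner, as in the patch. */
static void for_each_tile(const BoxRec *extents, int max_3d_size, int bpp)
{
	/* Largest tile honouring both the render and blitter limits. */
	int step = MIN(max_3d_size, 8 * (SHRT_MAX & ~63) / bpp);
	BoxRec tile;

	for (tile.y1 = extents->y1; tile.y1 < extents->y2; tile.y1 = tile.y2) {
		tile.y2 = MIN(tile.y1 + step, extents->y2);

		for (tile.x1 = extents->x1; tile.x1 < extents->x2; tile.x1 = tile.x2) {
			tile.x2 = MIN(tile.x1 + step, extents->x2);
			process_tile(&tile);
		}
	}
}

int main(void)
{
	BoxRec extents = { 0, 0, 10000, 9000 };
	for_each_tile(&extents, 8192, 32);
	return 0;
}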
@@ -404,13 +516,14 @@ static bool upload_inplace(struct kgem *kgem,
 	return !kgem_bo_map_will_stall(kgem, bo);
 }
 
-void sna_write_boxes(struct sna *sna, PixmapPtr dst,
+bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
 		     struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
 		     const void *src, int stride, int16_t src_dx, int16_t src_dy,
 		     const BoxRec *box, int nbox)
 {
 	struct kgem *kgem = &sna->kgem;
 	struct kgem_bo *src_bo;
+	BoxRec extents;
 	void *ptr;
 	int offset;
 	int n, cmd, br13;
@@ -419,30 +532,30 @@ void sna_write_boxes(struct sna *sna, PixmapPtr dst,
 
 	if (upload_inplace(kgem, dst_bo, box, nbox, dst->drawable.bitsPerPixel)) {
 fallback:
-		write_boxes_inplace(kgem,
-				    src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
-				    dst_bo, dst_dx, dst_dy,
-				    box, nbox);
-		return;
+		return write_boxes_inplace(kgem,
+					   src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
+					   dst_bo, dst_dx, dst_dy,
+					   box, nbox);
 	}
 
-	/* Try to avoid switching rings... */
-	if (dst_bo->tiling == I915_TILING_Y || kgem->ring == KGEM_RENDER) {
-		PixmapRec tmp;
-		BoxRec extents;
+	extents = box[0];
+	for (n = 1; n < nbox; n++) {
+		if (box[n].x1 < extents.x1)
+			extents.x1 = box[n].x1;
+		if (box[n].x2 > extents.x2)
+			extents.x2 = box[n].x2;
 
-		extents = box[0];
-		for (n = 1; n < nbox; n++) {
-			if (box[n].x1 < extents.x1)
-				extents.x1 = box[n].x1;
-			if (box[n].x2 > extents.x2)
-				extents.x2 = box[n].x2;
+		if (box[n].y1 < extents.y1)
+			extents.y1 = box[n].y1;
+		if (box[n].y2 > extents.y2)
+			extents.y2 = box[n].y2;
+	}
 
-			if (box[n].y1 < extents.y1)
-				extents.y1 = box[n].y1;
-			if (box[n].y2 > extents.y2)
-				extents.y2 = box[n].y2;
-		}
+	/* Try to avoid switching rings... */
+	if (kgem->ring == KGEM_RENDER ||
+	    !kgem_bo_can_blt(kgem, dst_bo) ||
+	    must_tile(sna, extents.x2 - extents.x1, extents.y2 - extents.y1)) {
+		PixmapRec tmp;
 
 		tmp.drawable.width = extents.x2 - extents.x1;
 		tmp.drawable.height = extents.y2 - extents.y1;
@@ -453,37 +566,130 @@ fallback:
 		assert(tmp.drawable.width);
 		assert(tmp.drawable.height);
 
-		src_bo = kgem_create_buffer_2d(kgem,
-					       tmp.drawable.width,
-					       tmp.drawable.height,
-					       tmp.drawable.bitsPerPixel,
-					       KGEM_BUFFER_WRITE_INPLACE,
-					       &ptr);
-		if (!src_bo)
-			goto fallback;
+		DBG(("%s: upload (%d, %d)x(%d, %d), max %dx%d\n",
+		     __FUNCTION__,
+		     extents.x1, extents.y1,
+		     tmp.drawable.width, tmp.drawable.height,
+		     sna->render.max_3d_size, sna->render.max_3d_size));
+		if (must_tile(sna, tmp.drawable.width, tmp.drawable.height)) {
+			BoxRec tile, stack[64], *clipped, *c;
+			int step;
+
+			step = MIN(sna->render.max_3d_size,
+				   8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
+			DBG(("%s: tiling upload, using %dx%d tiles\n",
+			     __FUNCTION__, step, step));
+
+			if (n > ARRAY_SIZE(stack)) {
+				clipped = malloc(sizeof(BoxRec) * n);
+				if (clipped == NULL)
+					goto fallback;
+			} else
+				clipped = stack;
+
+			for (tile.y1 = extents.y1; tile.y1 < extents.y2; tile.y1 = tile.y2) {
+				tile.y2 = tile.y1 + step;
+				if (tile.y2 > extents.y2)
+					tile.y2 = extents.y2;
+
+				for (tile.x1 = extents.x1; tile.x1 < extents.x2; tile.x1 = tile.x2) {
+					tile.x2 = tile.x1 + step;
+					if (tile.x2 > extents.x2)
+						tile.x2 = extents.x2;
+
+					tmp.drawable.width = tile.x2 - tile.x1;
+					tmp.drawable.height = tile.y2 - tile.y1;
+
+					src_bo = kgem_create_buffer_2d(kgem,
								       tmp.drawable.width,
								       tmp.drawable.height,
								       tmp.drawable.bitsPerPixel,
								       KGEM_BUFFER_WRITE_INPLACE,
								       &ptr);
+					if (!src_bo)
+						goto fallback;
+
+					c = clipped;
+					for (n = 0; n < nbox; n++) {
+						*c = box[n];
+						if (!box_intersect(c, &tile))
+							continue;
+
+						DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
+						     __FUNCTION__,
+						     c->x1, c->y1,
+						     c->x2, c->y2,
+						     src_dx, src_dy,
+						     c->x1 - tile.x1,
+						     c->y1 - tile.y1));
+						memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
							   stride, src_bo->pitch,
							   c->x1 + src_dx,
							   c->y1 + src_dy,
							   c->x1 - tile.x1,
							   c->y1 - tile.y1,
							   c->x2 - c->x1,
							   c->y2 - c->y1);
+						c++;
+					}
+
+					if (c != clipped)
+						n = sna->render.copy_boxes(sna, GXcopy,
									   &tmp, src_bo, -tile.x1, -tile.y1,
									   dst, dst_bo, dst_dx, dst_dy,
									   clipped, c - clipped);
+					else
+						n = 1;
+
+					kgem_bo_destroy(&sna->kgem, src_bo);
+
+					if (!n)
+						goto fallback;
+				}
+			}
 
-		for (n = 0; n < nbox; n++) {
-			memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
-				   stride, src_bo->pitch,
-				   box[n].x1 + src_dx,
-				   box[n].y1 + src_dy,
-				   box[n].x1 - extents.x1,
-				   box[n].y1 - extents.y1,
-				   box[n].x2 - box[n].x1,
-				   box[n].y2 - box[n].y1);
-		}
+			if (clipped != stack)
+				free(clipped);
+		} else {
+			src_bo = kgem_create_buffer_2d(kgem,
						       tmp.drawable.width,
						       tmp.drawable.height,
						       tmp.drawable.bitsPerPixel,
						       KGEM_BUFFER_WRITE_INPLACE,
						       &ptr);
+			if (!src_bo)
+				goto fallback;
+
+			for (n = 0; n < nbox; n++) {
+				DBG(("%s: box(%d, %d), (%d, %d), src=(%d, %d), dst=(%d, %d)\n",
				     __FUNCTION__,
				     box[n].x1, box[n].y1,
				     box[n].x2, box[n].y2,
				     src_dx, src_dy,
				     box[n].x1 - extents.x1,
				     box[n].y1 - extents.y1));
+				memcpy_blt(src, ptr, tmp.drawable.bitsPerPixel,
					   stride, src_bo->pitch,
					   box[n].x1 + src_dx,
					   box[n].y1 + src_dy,
					   box[n].x1 - extents.x1,
					   box[n].y1 - extents.y1,
					   box[n].x2 - box[n].x1,
					   box[n].y2 - box[n].y1);
+			}
 
-		n = sna->render.copy_boxes(sna, GXcopy,
-					   &tmp, src_bo, -extents.x1, -extents.y1,
-					   dst, dst_bo, dst_dx, dst_dy,
-					   box, nbox);
+			n = sna->render.copy_boxes(sna, GXcopy,
						   &tmp, src_bo, -extents.x1, -extents.y1,
						   dst, dst_bo, dst_dx, dst_dy,
						   box, nbox);
 
-		kgem_bo_destroy(&sna->kgem, src_bo);
+			kgem_bo_destroy(&sna->kgem, src_bo);
 
-		if (!n)
-			goto fallback;
+			if (!n)
+				goto fallback;
+		}
 
-		return;
+		return true;
 	}
 
 	cmd = XY_SRC_COPY_BLT_CMD;
@@ -586,7 +792,7 @@ fallback:
 		box++;
 		offset += pitch * height;
 	} while (--nbox_this_time);
-	assert(offset == src_bo->size);
+	assert(offset == kgem_buffer_size(src_bo));
 
 	if (nbox) {
 		_kgem_submit(kgem);
@@ -597,6 +803,7 @@ fallback:
 	} while (nbox);
 
 	sna->blt_state.fill_bo = 0;
+	return true;
 }
 
 static void
@@ -823,7 +1030,7 @@ fallback:
 		box++;
 		offset += pitch * height;
 	} while (--nbox_this_time);
-	assert(offset == src_bo->size);
+	assert(offset == kgem_buffer_size(src_bo));
 
 	if (nbox) {
 		_kgem_submit(kgem);
@@ -951,11 +1158,12 @@ indirect_replace(struct sna *sna,
 	return ret;
 }
 
-struct kgem_bo *sna_replace(struct sna *sna,
-			    PixmapPtr pixmap,
-			    struct kgem_bo *bo,
-			    const void *src, int stride)
+bool sna_replace(struct sna *sna,
+		 PixmapPtr pixmap,
+		 struct kgem_bo **_bo,
+		 const void *src, int stride)
 {
+	struct kgem_bo *bo = *_bo;
 	struct kgem *kgem = &sna->kgem;
 	void *dst;
 
@@ -968,7 +1176,7 @@ struct kgem_bo *sna_replace(struct sna *sna,
 
 	if ((!kgem_bo_mapped(bo) || bo->rq) &&
 	    indirect_replace(sna, pixmap, bo, src, stride))
-		return bo;
+		return true;
 
 	if (kgem_bo_is_busy(bo)) {
 		struct kgem_bo *new_bo;
@@ -979,26 +1187,26 @@ struct kgem_bo *sna_replace(struct sna *sna,
 					pixmap->drawable.bitsPerPixel,
 					bo->tiling,
 					CREATE_GTT_MAP | CREATE_INACTIVE);
-		if (new_bo) {
-			kgem_bo_destroy(kgem, bo);
+		if (new_bo)
 			bo = new_bo;
-		}
 	}
 
 	if (bo->tiling == I915_TILING_NONE && bo->pitch == stride) {
-		kgem_bo_write(kgem, bo, src,
-			      (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8);
+		if (!kgem_bo_write(kgem, bo, src,
				   (pixmap->drawable.height-1)*stride + pixmap->drawable.width*pixmap->drawable.bitsPerPixel/8))
+			goto err;
 	} else {
 		if (kgem_bo_is_mappable(kgem, bo)) {
 			dst = kgem_bo_map(kgem, bo);
-			if (dst) {
-				memcpy_blt(src, dst, pixmap->drawable.bitsPerPixel,
-					   stride, bo->pitch,
-					   0, 0,
-					   0, 0,
-					   pixmap->drawable.width,
-					   pixmap->drawable.height);
-			}
+			if (!dst)
+				goto err;
+
+			memcpy_blt(src, dst, pixmap->drawable.bitsPerPixel,
				   stride, bo->pitch,
				   0, 0,
				   0, 0,
				   pixmap->drawable.width,
				   pixmap->drawable.height);
 		} else {
 			BoxRec box;
 
@@ -1006,14 +1214,23 @@ struct kgem_bo *sna_replace(struct sna *sna,
 			box.x2 = pixmap->drawable.width;
 			box.y2 = pixmap->drawable.height;
 
-			sna_write_boxes(sna, pixmap,
-					bo, 0, 0,
-					src, stride, 0, 0,
-					&box, 1);
+			if (!sna_write_boxes(sna, pixmap,
					     bo, 0, 0,
					     src, stride, 0, 0,
					     &box, 1))
+				goto err;
 		}
 	}
 
-	return bo;
+	if (bo != *_bo)
+		kgem_bo_destroy(kgem, *_bo);
+	*_bo = bo;
+	return true;
+
+err:
+	if (bo != *_bo)
+		kgem_bo_destroy(kgem, bo);
+	return false;
 }
 
 struct kgem_bo *sna_replace__xor(struct sna *sna,
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index f9151e08..7077f363 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -696,6 +696,11 @@ static int sna_render_picture_downsample(struct sna *sna,
 	DBG(("%s: creating temporary GPU bo %dx%d\n",
 	     __FUNCTION__, width, height));
 
+	if (!sna_pixmap_force_to_gpu(pixmap, MOVE_READ))
+		return sna_render_picture_fixup(sna, picture, channel,
						x, y, ow, oh,
						dst_x, dst_y);
+
 	tmp = screen->CreatePixmap(screen,
 				   width, height,
 				   pixmap->drawable.depth,
@@ -1306,9 +1311,6 @@ do_fixup:
 		return 0;
 	}
 
-	/* XXX Convolution filter? */
-	memset(ptr, 0, channel->bo->size);
-
 	/* Composite in the original format to preserve idiosyncracies */
 	if (picture->format == channel->pict_format)
 		dst = pixman_image_create_bits(picture->format,
@@ -1354,7 +1356,7 @@ do_fixup:
 				      w, h);
 		pixman_image_unref(src);
 	} else {
-		memset(ptr, 0, channel->bo->size);
+		memset(ptr, 0, kgem_buffer_size(channel->bo));
 		dst = src;
 	}
 }
@@ -1528,7 +1530,7 @@ sna_render_composite_redirect(struct sna *sna,
 	if (op->dst.pixmap->drawable.width <= sna->render.max_3d_size) {
 		int y1, y2;
 
-		assert(op->dst.pixmap.drawable.height > sna->render.max_3d_size);
+		assert(op->dst.pixmap->drawable.height > sna->render.max_3d_size);
 		y1 = y + op->dst.y;
 		y2 = y1 + height;
 		y1 &= y1 & (64 - 1);
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index 7b759a7e..cec04733 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -100,7 +100,7 @@ sna_video_buffer(struct sna *sna,
 		 struct sna_video_frame *frame)
 {
 	/* Free the current buffer if we're going to have to reallocate */
-	if (video->buf && video->buf->size < frame->size)
+	if (video->buf && kgem_bo_size(video->buf) < frame->size)
 		sna_video_free_buffers(sna, video);
 
 	if (video->buf == NULL)
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index d99f8847..1aaf9723 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -271,7 +271,7 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
 			return BadAlloc;
 		}
 
-		assert(frame.bo->size >= frame.size);
+		assert(kgem_bo_size(frame.bo) >= frame.size);
 	} else {
 		frame.bo = kgem_create_linear(&sna->kgem, frame.size);
 		if (frame.bo == NULL) {
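[Note: throughout this patch, direct bo->size reads become kgem_bo_size()/kgem_buffer_size() and open-coded tiling == I915_TILING_Y tests become kgem_bo_can_blt(). The helpers below only illustrate that pattern under two assumptions -- that a bo now records its backing store as a page count, and that the blitter rejects Y-tiling as well as pitches beyond its signed 16-bit field (expressed in dwords for tiled surfaces on gen4+). They are a sketch, not the definitions from kgem.h.]

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define MAXSHORT  0x7fff

enum { I915_TILING_NONE, I915_TILING_X, I915_TILING_Y };

struct kgem { int gen; };		/* reduced: only the generation matters here */

struct kgem_bo {			/* reduced: only the fields the checks read */
	uint32_t pitch;
	unsigned tiling;
	struct { struct { unsigned count; } pages; } size;
};

/* Size in bytes recovered from the page count carried by the bo. */
static inline uint32_t kgem_bo_size(const struct kgem_bo *bo)
{
	return PAGE_SIZE * bo->size.pages.count;
}

/* The blitter cannot touch Y-tiled surfaces, and its pitch is a signed
 * 16-bit field (in dwords for tiled surfaces on gen4+), so both limits
 * are folded into a single predicate. */
static inline bool kgem_bo_can_blt(const struct kgem *kgem,
				   const struct kgem_bo *bo)
{
	uint32_t pitch;

	if (bo->tiling == I915_TILING_Y)
		return false;

	pitch = bo->pitch;
	if (kgem->gen >= 40 && bo->tiling)
		pitch >>= 2;

	return pitch <= MAXSHORT;
}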
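[Note: sna_replace() now takes a struct kgem_bo ** and returns a bool, so callers such as blt_put_composite no longer reassign dst_priv->gpu_bo themselves. The shape is commit-on-success: the caller's pointer is only swapped (and the old bo destroyed) once every step has succeeded, while on error the unused replacement is discarded and the caller's buffer is left untouched. A distilled sketch of that ownership pattern, with hypothetical bo_create()/bo_upload() helpers standing in for the kgem calls:]

#include <stdbool.h>
#include <stdlib.h>

struct bo { int dummy; };

static struct bo *bo_create(void) { return calloc(1, sizeof(struct bo)); }
static void bo_destroy(struct bo *bo) { free(bo); }
static bool bo_upload(struct bo *bo) { (void)bo; return true; }

/* Commit-on-success replacement: *_bo changes only if everything worked. */
static bool replace(struct bo **_bo)
{
	struct bo *bo = *_bo;
	struct bo *fresh;

	/* Prefer an idle replacement buffer if we can get one. */
	fresh = bo_create();
	if (fresh)
		bo = fresh;

	if (!bo_upload(bo))
		goto err;

	if (bo != *_bo)
		bo_destroy(*_bo);	/* retire the old buffer */
	*_bo = bo;
	return true;

err:
	if (bo != *_bo)
		bo_destroy(bo);		/* discard the unused replacement */
	return false;
}

int main(void)
{
	struct bo *bo = bo_create();
	bool ok = replace(&bo);
	bo_destroy(bo);
	return ok ? 0 : 1;
}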