8 files changed, 5116 insertions, 0 deletions
diff --git a/lib/libGL/gallium/drivers/Makefile b/lib/libGL/gallium/drivers/Makefile
new file mode 100644
index 000000000..11265f2a7
--- /dev/null
+++ b/lib/libGL/gallium/drivers/Makefile
@@ -0,0 +1,11 @@
+# $OpenBSD: Makefile,v 1.1.1.1 2012/07/13 12:43:34 mpi Exp $
+# Gallium driver subdirectories that are always entered.
+SUBDIR= rbug softpipe trace
+# llvmpipe is added only when XENOCARA_HAVE_LLVM is defined.
+.ifdef XENOCARA_HAVE_LLVM
+SUBDIR+= llvmpipe
+.endif
+# _SUBDIRUSE is not defined here; presumably bsd.xorg.mk supplies the recursion.
+build depend all install clean cleandir: _SUBDIRUSE
+
+.include <bsd.xorg.mk>
diff --git a/lib/libGL/gallium/drivers/Makefile.inc b/lib/libGL/gallium/drivers/Makefile.inc
new file mode 100644
index 000000000..974cc3dab
--- /dev/null
+++ b/lib/libGL/gallium/drivers/Makefile.inc
@@ -0,0 +1,23 @@
+# $OpenBSD: Makefile.inc,v 1.1.1.1 2012/07/13 12:43:34 mpi Exp $
+# Root of the upstream Mesa gallium sources, relative to the current directory.
+GALLIUM=	${.CURDIR}/../../../../../dist/Mesa/src/gallium
+# Header search paths inside the gallium tree.
+CPPFLAGS+= \
+	-I${GALLIUM}/auxiliary \
+	-I${GALLIUM}/include 
+# The default target builds only the PIC archive named after LIB.
+all: lib${LIB}_pic.a
+# _xenocara_obj is not defined in this file (NOTE(review): presumably bsd.xorg.mk).
+obj: _xenocara_obj
+# Nothing is installed; the target only prints a notice.
+install:
+	@echo "Not installing lib${LIB}"
+# Remove the archive and the object files.
+clean: 
+	rm -f lib${LIB}_pic.a ${OBJS}
+# cleandir performs the same work as clean.
+cleandir:	clean
+# Search for sources in the upstream driver directory named after LIB.
+.PATH: ${GALLIUM}/drivers/${LIB}
+# Chain to the Makefile.inc one directory up.
+.include "../Makefile.inc"
diff --git a/lib/libGL/gallium/drivers/llvmpipe/Makefile b/lib/libGL/gallium/drivers/llvmpipe/Makefile
new file mode 100644
index 000000000..5953cebaa
--- /dev/null
+++ b/lib/libGL/gallium/drivers/llvmpipe/Makefile
@@ -0,0 +1,56 @@
+# $OpenBSD: Makefile,v 1.1.1.1 2012/07/13 12:43:34 mpi Exp $
+# Library name; NOTE(review): presumably consumed by ../Makefile.inc - confirm.
+LIB = llvmpipe
+# Driver sources; lp_tile_soa.c is looked up through the generated/ .PATH below.
+SRCS = \
+	lp_bld_alpha.c \
+	lp_bld_blend_aos.c \
+	lp_bld_blend_logicop.c \
+	lp_bld_blend_soa.c \
+	lp_bld_depth.c \
+	lp_bld_interp.c \
+	lp_clear.c \
+	lp_context.c \
+	lp_draw_arrays.c \
+	lp_fence.c \
+	lp_flush.c \
+	lp_jit.c \
+	lp_memory.c \
+	lp_perf.c \
+	lp_query.c \
+	lp_rast.c \
+	lp_rast_debug.c \
+	lp_rast_tri.c \
+	lp_scene.c \
+	lp_scene_queue.c \
+	lp_screen.c \
+	lp_setup.c \
+	lp_setup_line.c \
+	lp_setup_point.c \
+	lp_setup_tri.c \
+	lp_setup_vbuf.c \
+	lp_state_blend.c \
+	lp_state_clip.c \
+	lp_state_derived.c \
+	lp_state_fs.c \
+	lp_state_setup.c \
+	lp_state_gs.c \
+	lp_state_rasterizer.c \
+	lp_state_sampler.c \
+	lp_state_so.c \
+	lp_state_surface.c \
+	lp_state_vertex.c \
+	lp_state_vs.c \
+	lp_surface.c \
+	lp_tex_sample.c \
+	lp_texture.c \
+	lp_tile_image.c \
+	lp_tile_soa.c
+# Let the driver sources include headers from the upstream llvmpipe directory.
+CPPFLAGS+= -I${GALLIUM}/drivers/llvmpipe
+# NOTE(review): lp_test_sincos.c is not in SRCS, so this dependency looks unused.
+lp_test_sincos.o: sse_mathfun.h
+
+.include <bsd.xorg.mk>
+# Also search the local generated/ directory for sources.
+.PATH: ${.CURDIR}/generated
diff --git a/lib/libGL/gallium/drivers/llvmpipe/generated/Makefile b/lib/libGL/gallium/drivers/llvmpipe/generated/Makefile
new file mode 100644
index 000000000..afd593ed7
--- /dev/null
+++ b/lib/libGL/gallium/drivers/llvmpipe/generated/Makefile
@@ -0,0 +1,30 @@
+# $OpenBSD: Makefile,v 1.1.1.1 2012/07/13 12:43:34 mpi Exp $
+# Regenerates lp_tile_soa.c by running the upstream Mesa generator script.
+AUX=	${.CURDIR}/../../../../../../dist/Mesa/src/gallium/auxiliary
+DEV=	${.CURDIR}/../../../../../../dist/Mesa/src/gallium/drivers
+# Interpreter for the generator; PYTHON_VERSION is not set in this file.
+PYTHON=		python${PYTHON_VERSION}
+PYTHON_FLAGS=	-t -O -O
+# Files produced by this Makefile.
+GENERATED=	lp_tile_soa.c
+
+all: ${GENERATED}
+# obj, depend and install have no commands in this directory.
+obj:
+
+depend:
+
+install:
+
+clean distclean:
+	rm -f ${GENERATED}
+# Write to a temporary file first so a failed run never leaves a truncated target.
+lp_tile_soa.c: lp_tile_soa.py u_format_parse.py u_format_pack.py u_format.csv
+	$(PYTHON) $(PYTHON_FLAGS) ${DEV}/llvmpipe/lp_tile_soa.py ${AUX}/util/u_format.csv > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }
+
+.SUFFIXES: .py .csv
+
+.PATH: ${DEV}/llvmpipe
+.PATH: ${AUX}/util
+
+.include <bsd.xorg.mk>
diff --git a/lib/libGL/gallium/drivers/llvmpipe/generated/lp_tile_soa.c b/lib/libGL/gallium/drivers/llvmpipe/generated/lp_tile_soa.c
new file mode 100644
index 000000000..936bad915
--- /dev/null
+++ b/lib/libGL/gallium/drivers/llvmpipe/generated/lp_tile_soa.c
@@ -0,0 +1,4936 @@
+/* This file is autogenerated by lp_tile_soa.py from u_format.csv. Do not edit directly. */
+
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * Pixel format accessor functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "pipe/p_compiler.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_half.h"
+#include "util/u_cpu_detect.h"
+#include "lp_tile_soa.h"
+
+#ifdef DEBUG   /* counters exist only in DEBUG builds */
+unsigned lp_tile_unswizzle_count = 0;   /* NOTE(review): presumably bumped per unswizzle call - not visible here */
+unsigned lp_tile_swizzle_count = 0;     /* NOTE(review): presumably bumped per swizzle call - not visible here */
+#endif
+
+const unsigned char   /* indexed [y][x]; values 0..15 are positions inside a 4x4 pixel quad */
+tile_offset[TILE_VECTOR_HEIGHT][TILE_VECTOR_WIDTH] = {
+   {  0,  1,  4,  5},   /* y = 0 */
+   {  2,  3,  6,  7},   /* y = 1 */
+   {  8,  9, 12, 13},   /* y = 2 */
+   { 10, 11, 14, 15}    /* y = 3 */
+};
+
+/* Inverse of tile_offset: tile_x_offset[i] and tile_y_offset[i] give the x and
+ * y whose tile_offset[y][x] entry is i.  Note: these lookup tables could be
+ * replaced with some bit-twiddling code, but this is a little faster. */
+static unsigned tile_x_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {
+   0, 1, 0, 1, 2, 3, 2, 3,
+   0, 1, 0, 1, 2, 3, 2, 3
+};
+/* y counterpart of tile_x_offset, indexed the same way. */
+static unsigned tile_y_offset[TILE_VECTOR_WIDTH * TILE_VECTOR_HEIGHT] = {
+   0, 0, 1, 1, 0, 0, 1, 1,
+   2, 2, 3, 3, 2, 2, 3, 3
+};
+
+
+#if defined(PIPE_ARCH_SSE)
+
+#include "util/u_sse.h"
+
+/*
+ * Transpose four rows of four 4-byte pixels (x, y, z, w) into four 16-byte
+ * planes: *a receives byte 0 of every pixel, *b byte 1, *c byte 2, *d byte 3.
+ * Within a plane the pixels follow the tile_offset ordering.
+ */
+static ALWAYS_INLINE void
+swz4( const __m128i * restrict x,
+      const __m128i * restrict y,
+      const __m128i * restrict z,
+      const __m128i * restrict w,
+      __m128i * restrict a,
+      __m128i * restrict b,
+      __m128i * restrict c,
+      __m128i * restrict d)
+{
+   /* three interleave passes gather equal byte positions of each row pair */
+   const __m128i xy_lo = _mm_unpacklo_epi8(*x,*y);
+   const __m128i xy_hi = _mm_unpackhi_epi8(*x,*y);
+   const __m128i zw_lo = _mm_unpacklo_epi8(*z,*w);
+   const __m128i zw_hi = _mm_unpackhi_epi8(*z,*w);
+   const __m128i xy_w0 = _mm_unpacklo_epi16(xy_lo,xy_hi);
+   const __m128i xy_w1 = _mm_unpackhi_epi16(xy_lo,xy_hi);
+   const __m128i zw_w0 = _mm_unpacklo_epi16(zw_lo,zw_hi);
+   const __m128i zw_w1 = _mm_unpackhi_epi16(zw_lo,zw_hi);
+   const __m128i xy_c01 = _mm_unpacklo_epi8(xy_w0,xy_w1);
+   const __m128i xy_c23 = _mm_unpackhi_epi8(xy_w0,xy_w1);
+   const __m128i zw_c01 = _mm_unpacklo_epi8(zw_w0,zw_w1);
+   const __m128i zw_c23 = _mm_unpackhi_epi8(zw_w0,zw_w1);
+
+   *a = _mm_unpacklo_epi64(xy_c01,zw_c01);
+   *b = _mm_unpackhi_epi64(xy_c01,zw_c01);
+   *c = _mm_unpacklo_epi64(xy_c23,zw_c23);
+   *d = _mm_unpackhi_epi64(xy_c23,zw_c23);
+}
+
+/*
+ * Inverse of swz4: interleave four 16-byte planes (a, b, c, d supply bytes
+ * 0..3 of every pixel) back into four rows of four 4-byte pixels (x, y, z, w).
+ */
+static ALWAYS_INLINE void
+unswz4( const __m128i * restrict a,
+        const __m128i * restrict b,
+        const __m128i * restrict c,
+        const __m128i * restrict d,
+        __m128i * restrict x,
+        __m128i * restrict y,
+        __m128i * restrict z,
+        __m128i * restrict w)
+{
+   const __m128i ab_lo = _mm_unpacklo_epi8(*a,*b);
+   const __m128i ab_hi = _mm_unpackhi_epi8(*a,*b);
+   const __m128i cd_lo = _mm_unpacklo_epi8(*c,*d);
+   const __m128i cd_hi = _mm_unpackhi_epi8(*c,*d);
+   const __m128i px_0_3 = _mm_unpacklo_epi16(ab_lo,cd_lo);
+   const __m128i px_4_7 = _mm_unpackhi_epi16(ab_lo,cd_lo);
+   const __m128i px_8_11 = _mm_unpacklo_epi16(ab_hi,cd_hi);
+   const __m128i px_12_15 = _mm_unpackhi_epi16(ab_hi,cd_hi);
+
+   *x = _mm_unpacklo_epi64(px_0_3,px_4_7);
+   *y = _mm_unpackhi_epi64(px_0_3,px_4_7);
+   *z = _mm_unpacklo_epi64(px_8_11,px_12_15);
+   *w = _mm_unpackhi_epi64(px_8_11,px_12_15);
+}
+
+/*
+ * SSE2 path: swizzle a TILE_SIZE x TILE_SIZE block of a B8G8R8A8 surface,
+ * starting at pixel (x0, y0), into the 4ub tile at dst.  Each swz4 call turns
+ * a 4x4 pixel quad into four 16-byte planes; source byte 0 (blue) goes to
+ * plane 2, byte 1 to plane 1, byte 2 (red) to plane 0 and byte 3 to plane 3.
+ * NOTE(review): rows are dereferenced as __m128i - confirm 16-byte alignment.
+ */
+static void
+lp_tile_b8g8r8a8_unorm_swizzle_4ub_sse2(uint8_t * restrict dst,
+                                        const uint8_t * restrict src, unsigned src_stride,
+                                        unsigned x0, unsigned y0)
+{
+   const uint8_t *band = src + y0 * src_stride + x0 * sizeof(uint32_t);
+   __m128i *planes = (__m128i *) dst;
+   unsigned x, y;
+
+   for (y = 0; y < TILE_SIZE; y += 4, band += 4 * src_stride) {
+      const uint8_t *quad = band;
+
+      for (x = 0; x < TILE_SIZE; x += 4, quad += sizeof(__m128i), planes += 4) {
+         swz4((const __m128i *) (quad + 0 * src_stride),
+              (const __m128i *) (quad + 1 * src_stride),
+              (const __m128i *) (quad + 2 * src_stride),
+              (const __m128i *) (quad + 3 * src_stride),
+              planes + 2,     /* b */
+              planes + 1,     /* g */
+              planes + 0,     /* r */
+              planes + 3);    /* a */
+      }
+   }
+}
+
+/* Inverse of the SSE2 swizzle: unpack the 4ub tile at src into a B8G8R8A8
+ * surface at pixel (x0, y0); dst_stride is the byte pitch of a dst row.
+ * NOTE(review): rows are stored as __m128i - confirm 16-byte alignment. */
+static void
+lp_tile_b8g8r8a8_unorm_unswizzle_4ub_sse2(const uint8_t * restrict src,
+                                          uint8_t * restrict dst, unsigned dst_stride,
+                                          unsigned x0, unsigned y0)
+{
+   unsigned int x, y;
+   const __m128i *src128 = (const __m128i *) src;
+
+   dst += y0 * dst_stride;
+   dst += x0 * sizeof(uint32_t);
+
+   for (y = 0; y < TILE_SIZE; y += 4) {
+      uint8_t *dst_row = dst;   /* written through below, so not const */
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         unswz4( &src128[2],     /* b */
+                 &src128[1],     /* g */
+                 &src128[0],     /* r */
+                 &src128[3],     /* a */
+                 (__m128i *) (dst_row + 0 * dst_stride),
+                 (__m128i *) (dst_row + 1 * dst_stride),
+                 (__m128i *) (dst_row + 2 * dst_stride),
+                 (__m128i *) (dst_row + 3 * dst_stride));
+         src128 += 4;
+         dst_row += sizeof(__m128i);
+      }
+      dst += 4 * dst_stride;
+   }
+}
+
+#endif /* PIPE_ARCH_SSE */
+
+/* Swizzle a TILE_SIZE x TILE_SIZE block at (x0, y0) of the one-byte "none" format:
+ * red = source byte * 0xff truncated to 8 bits, green = blue = 0, alpha = 255. */
+static void
+lp_tile_none_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*1;
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint8_t r = (uint8_t)(((uint32_t)(texel[0])) * 0xff / 0x1);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = 0; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of a B8G8R8A8_UNORM surface, starting
+ * at pixel (x0, y0), into the 4ub tile at dst.  src_stride is the byte pitch
+ * of one source row; each pixel is stored in memory as the bytes B, G, R, A.
+ */
+static void
+lp_tile_b8g8r8a8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t b = texel[0];
+         const uint8_t g = texel[1];
+         const uint8_t r = texel[2];
+         const uint8_t a = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of a B8G8R8X8_UNORM surface, starting
+ * at pixel (x0, y0), into the 4ub tile at dst (src_stride = byte pitch of a
+ * source row).  Memory bytes per pixel: B, G, R, X; X is skipped, alpha = 255.
+ */
+static void
+lp_tile_b8g8r8x8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t b = texel[0];
+         const uint8_t g = texel[1];
+         const uint8_t r = texel[2];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an A8R8G8B8_UNORM surface, starting
+ * at pixel (x0, y0), into the 4ub tile at dst.  src_stride is the byte pitch
+ * of one source row; each pixel is stored in memory as the bytes A, R, G, B.
+ */
+static void
+lp_tile_a8r8g8b8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t a = texel[0];
+         const uint8_t r = texel[1];
+         const uint8_t g = texel[2];
+         const uint8_t b = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an X8R8G8B8_UNORM surface, starting
+ * at pixel (x0, y0), into the 4ub tile at dst (src_stride = byte pitch of a
+ * source row).  Memory bytes per pixel: X, R, G, B; X is skipped, alpha = 255.
+ */
+static void
+lp_tile_x8r8g8b8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t r = texel[1];
+         const uint8_t g = texel[2];
+         const uint8_t b = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an A8B8G8R8_UNORM surface, starting
+ * at pixel (x0, y0), into the 4ub tile at dst.  src_stride is the byte pitch
+ * of one source row; each pixel is stored in memory as the bytes A, B, G, R.
+ */
+static void
+lp_tile_a8b8g8r8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t a = texel[0];
+         const uint8_t b = texel[1];
+         const uint8_t g = texel[2];
+         const uint8_t r = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an X8B8G8R8_UNORM surface, starting
+ * at pixel (x0, y0), into the 4ub tile at dst (src_stride = byte pitch of a
+ * source row).  Memory bytes per pixel: X, B, G, R; X is skipped, alpha = 255.
+ */
+static void
+lp_tile_x8b8g8r8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t b = texel[1];
+         const uint8_t g = texel[2];
+         const uint8_t r = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an R8G8B8X8_UNORM surface, starting
+ * at pixel (x0, y0), into the 4ub tile at dst (src_stride = byte pitch of a
+ * source row).  Memory bytes per pixel: R, G, B, X; X is skipped, alpha = 255.
+ */
+static void
+lp_tile_r8g8b8x8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t r = texel[0];
+         const uint8_t g = texel[1];
+         const uint8_t b = texel[2];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Swizzle a B5G5R5X1_UNORM block at (x0, y0) into a 4ub tile: 5-bit fields rescaled to 0..255, alpha = 255. */
+static void
+lp_tile_b5g5r5x1_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint16_t *texel = (const uint16_t *)(row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint16_t bits = *texel;
+         const uint8_t b = (uint8_t)(((uint32_t)(bits & 0x1f)) * 0xff / 0x1f);
+         const uint8_t g = (uint8_t)(((uint32_t)((bits >> 5) & 0x1f)) * 0xff / 0x1f);
+         const uint8_t r = (uint8_t)(((uint32_t)((bits >> 10) & 0x1f)) * 0xff / 0x1f);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Swizzle a B5G5R5A1_UNORM block at (x0, y0) into a 4ub tile: 5-bit colour and 1-bit alpha rescaled to 0..255. */
+static void
+lp_tile_b5g5r5a1_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint16_t *texel = (const uint16_t *)(row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint16_t bits = *texel;
+         const uint8_t b = (uint8_t)(((uint32_t)(bits & 0x1f)) * 0xff / 0x1f);
+         const uint8_t g = (uint8_t)(((uint32_t)((bits >> 5) & 0x1f)) * 0xff / 0x1f);
+         const uint8_t r = (uint8_t)(((uint32_t)((bits >> 10) & 0x1f)) * 0xff / 0x1f);
+         const uint8_t a = (uint8_t)(((uint32_t)(bits >> 15)) * 0xff / 0x1);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/* Swizzle a B4G4R4A4_UNORM block at (x0, y0) into a 4ub tile: each 4-bit field is rescaled to 0..255. */
+static void
+lp_tile_b4g4r4a4_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint16_t *texel = (const uint16_t *)(row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint16_t bits = *texel;
+         const uint8_t b = (uint8_t)(((uint32_t)(bits & 0xf)) * 0xff / 0xf);
+         const uint8_t g = (uint8_t)(((uint32_t)((bits >> 4) & 0xf)) * 0xff / 0xf);
+         const uint8_t r = (uint8_t)(((uint32_t)((bits >> 8) & 0xf)) * 0xff / 0xf);
+         const uint8_t a = (uint8_t)(((uint32_t)(bits >> 12)) * 0xff / 0xf);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/* Swizzle a B4G4R4X4_UNORM block at (x0, y0) into a 4ub tile: 4-bit colour rescaled, top 4 bits ignored, alpha = 255. */
+static void
+lp_tile_b4g4r4x4_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint16_t *texel = (const uint16_t *)(row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint16_t bits = *texel;
+         const uint8_t b = (uint8_t)(((uint32_t)(bits & 0xf)) * 0xff / 0xf);
+         const uint8_t g = (uint8_t)(((uint32_t)((bits >> 4) & 0xf)) * 0xff / 0xf);
+         const uint8_t r = (uint8_t)(((uint32_t)((bits >> 8) & 0xf)) * 0xff / 0xf);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Swizzle a B5G6R5_UNORM block at (x0, y0) into a 4ub tile: 5/6/5-bit fields rescaled to 0..255, alpha = 255. */
+static void
+lp_tile_b5g6r5_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint16_t *texel = (const uint16_t *)(row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint16_t bits = *texel;
+         const uint8_t b = (uint8_t)(((uint32_t)(bits & 0x1f)) * 0xff / 0x1f);
+         const uint8_t g = (uint8_t)(((uint32_t)((bits >> 5) & 0x3f)) * 0xff / 0x3f);
+         const uint8_t r = (uint8_t)(((uint32_t)(bits >> 11)) * 0xff / 0x1f);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Swizzle an R10G10B10A2_UNORM block at (x0, y0) into a 4ub tile: colour keeps the top 8 of 10 bits, 2-bit alpha is rescaled. */
+static void
+lp_tile_r10g10b10a2_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint32_t *texel = (const uint32_t *)(row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint32_t bits = *texel;
+         const uint8_t r = (uint8_t)((bits & 0x3ff) >> 2);
+         const uint8_t g = (uint8_t)(((bits >> 10) & 0x3ff) >> 2);
+         const uint8_t b = (uint8_t)(((bits >> 20) & 0x3ff) >> 2);
+         const uint8_t a = (uint8_t)(((uint32_t)(bits >> 30)) * 0xff / 0x3);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/* Swizzle a B10G10R10A2_UNORM block at (x0, y0) into a 4ub tile: colour keeps the top 8 of 10 bits, 2-bit alpha is rescaled. */
+static void
+lp_tile_b10g10r10a2_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint32_t *texel = (const uint32_t *)(row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint32_t bits = *texel;
+         const uint8_t b = (uint8_t)((bits & 0x3ff) >> 2);
+         const uint8_t g = (uint8_t)(((bits >> 10) & 0x3ff) >> 2);
+         const uint8_t r = (uint8_t)(((bits >> 20) & 0x3ff) >> 2);
+         const uint8_t a = (uint8_t)(((uint32_t)(bits >> 30)) * 0xff / 0x3);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/* Swizzle a TILE_SIZE x TILE_SIZE block of an L8_UNORM surface at (x0, y0) into
+ * a 4ub tile: the luminance byte is replicated to r, g and b; alpha = 255. */
+static void
+lp_tile_l8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*1;
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint8_t rgb = texel[0];
+         TILE_PIXEL(dst, x, y, 0) = rgb; /* r */
+         TILE_PIXEL(dst, x, y, 1) = rgb; /* g */
+         TILE_PIXEL(dst, x, y, 2) = rgb; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Swizzle a TILE_SIZE x TILE_SIZE block of an A8_UNORM surface at (x0, y0) into
+ * a 4ub tile: r, g and b are written as 0 and the source byte becomes alpha. */
+static void
+lp_tile_a8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*1;
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint8_t a = texel[0];
+         TILE_PIXEL(dst, x, y, 0) = 0; /* r */
+         TILE_PIXEL(dst, x, y, 1) = 0; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/* Swizzle a TILE_SIZE x TILE_SIZE block of an I8_UNORM surface at (x0, y0) into
+ * a 4ub tile: the intensity byte is replicated to all four channels. */
+static void
+lp_tile_i8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*1;
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint8_t rgba = texel[0];
+         TILE_PIXEL(dst, x, y, 0) = rgba; /* r */
+         TILE_PIXEL(dst, x, y, 1) = rgba; /* g */
+         TILE_PIXEL(dst, x, y, 2) = rgba; /* b */
+         TILE_PIXEL(dst, x, y, 3) = rgba; /* a */
+      }
+   }
+}
+
+/* Swizzle an L4A4_UNORM block at (x0, y0) into a 4ub tile: low nibble = luminance, high nibble = alpha, both rescaled. */
+static void
+lp_tile_l4a4_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*1;
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint8_t bits = *texel;
+         const uint8_t rgb = (uint8_t)(((uint32_t)(bits & 0xf)) * 0xff / 0xf);
+         const uint8_t a = (uint8_t)(((uint32_t)(bits >> 4)) * 0xff / 0xf);
+         TILE_PIXEL(dst, x, y, 0) = rgb; /* r */
+         TILE_PIXEL(dst, x, y, 1) = rgb; /* g */
+         TILE_PIXEL(dst, x, y, 2) = rgb; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/* Swizzle a TILE_SIZE x TILE_SIZE block of an L8A8_UNORM surface at (x0, y0)
+ * into a 4ub tile.  Memory bytes per pixel: L, A; luminance is replicated to
+ * r, g and b.  src_stride is the byte pitch of a source row. */
+static void
+lp_tile_l8a8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*2;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 2) {
+         const uint8_t rgb = texel[0];
+         const uint8_t a = texel[1];
+         TILE_PIXEL(dst, x, y, 0) = rgb; /* r */
+         TILE_PIXEL(dst, x, y, 1) = rgb; /* g */
+         TILE_PIXEL(dst, x, y, 2) = rgb; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/* Swizzle a TILE_SIZE x TILE_SIZE block of an L16_UNORM surface at (x0, y0) into
+ * a 4ub tile: the top 8 bits of each 16-bit value go to r, g, b; alpha = 255. */
+static void
+lp_tile_l16_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint16_t *texel = (const uint16_t *)(row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint8_t rgb = (uint8_t)((*texel) >> 8);
+         TILE_PIXEL(dst, x, y, 0) = rgb; /* r */
+         TILE_PIXEL(dst, x, y, 1) = rgb; /* g */
+         TILE_PIXEL(dst, x, y, 2) = rgb; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Swizzle a TILE_SIZE x TILE_SIZE block of an L8_SRGB surface at (x0, y0) into a
+ * 4ub tile: the byte is copied as-is (no sRGB decoding) to r, g, b; alpha = 255. */
+static void
+lp_tile_l8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*1;
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         const uint8_t rgb = texel[0];
+         TILE_PIXEL(dst, x, y, 0) = rgb; /* r */
+         TILE_PIXEL(dst, x, y, 1) = rgb; /* g */
+         TILE_PIXEL(dst, x, y, 2) = rgb; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Swizzle a TILE_SIZE x TILE_SIZE block of an L8A8_SRGB surface at (x0, y0)
+ * into a 4ub tile.  Memory bytes per pixel: L, A; luminance is copied as-is
+ * (no sRGB decoding) to r, g and b.  src_stride is the source row pitch. */
+static void
+lp_tile_l8a8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*2;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 2) {
+         const uint8_t rgb = texel[0];
+         const uint8_t a = texel[1];
+         TILE_PIXEL(dst, x, y, 0) = rgb; /* r */
+         TILE_PIXEL(dst, x, y, 1) = rgb; /* g */
+         TILE_PIXEL(dst, x, y, 2) = rgb; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an R8G8B8_SRGB surface (3 bytes per
+ * pixel: R, G, B) at (x0, y0) into a 4ub tile; bytes are copied as-is, alpha = 255.
+ */
+static void
+lp_tile_r8g8b8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*3;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 3) {
+         const uint8_t r = texel[0];
+         const uint8_t g = texel[1];
+         const uint8_t b = texel[2];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an R8G8B8A8_SRGB surface at pixel
+ * (x0, y0) into the 4ub tile at dst (src_stride = byte pitch of a source row).
+ * Memory bytes per pixel: R, G, B, A, copied as-is with no sRGB decoding.
+ */
+static void
+lp_tile_r8g8b8a8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t r = texel[0];
+         const uint8_t g = texel[1];
+         const uint8_t b = texel[2];
+         const uint8_t a = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an A8B8G8R8_SRGB surface at pixel
+ * (x0, y0) into the 4ub tile at dst (src_stride = byte pitch of a source row).
+ * Memory bytes per pixel: A, B, G, R, copied as-is with no sRGB decoding.
+ */
+static void
+lp_tile_a8b8g8r8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t a = texel[0];
+         const uint8_t b = texel[1];
+         const uint8_t g = texel[2];
+         const uint8_t r = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of an X8B8G8R8_SRGB surface at pixel
+ * (x0, y0) into the 4ub tile at dst (src_stride = byte pitch of a source row).
+ * Memory bytes: X, B, G, R; X skipped, colour copied undecoded, alpha = 255.
+ */
+static void
+lp_tile_x8b8g8r8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t b = texel[1];
+         const uint8_t g = texel[2];
+         const uint8_t r = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of a B8G8R8A8_SRGB surface at pixel
+ * (x0, y0) into the 4ub tile at dst (src_stride = byte pitch of a source row).
+ * Memory bytes per pixel: B, G, R, A, copied as-is with no sRGB decoding.
+ */
+static void
+lp_tile_b8g8r8a8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t b = texel[0];
+         const uint8_t g = texel[1];
+         const uint8_t r = texel[2];
+         const uint8_t a = texel[3];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of a B8G8R8X8_SRGB surface at pixel
+ * (x0, y0) into the 4ub tile at dst (src_stride = byte pitch of a source row).
+ * Memory bytes: B, G, R, X; X skipped, colour copied undecoded, alpha = 255.
+ */
+static void
+lp_tile_b8g8r8x8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, row += src_stride) {
+      const uint8_t *texel = row + x0*4;
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         const uint8_t b = texel[0];
+         const uint8_t g = texel[1];
+         const uint8_t r = texel[2];
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle one TILE_SIZE x TILE_SIZE block of A8R8G8B8 sRGB pixels.  The block
+ * starts at pixel (x0, y0) of src, whose rows are src_stride bytes apart.
+ * Bytes are stored unconverted, in r, g, b, a order, through TILE_PIXEL.
+ */
+static void
+lp_tile_a8r8g8b8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint8_t *p = row + x0*4;
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t alpha = p[0];
+         const uint8_t red = p[1];
+         const uint8_t green = p[2];
+         const uint8_t blue = p[3];
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = alpha; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle one TILE_SIZE x TILE_SIZE block of X8R8G8B8 sRGB pixels starting
+ * at pixel (x0, y0) of src (rows src_stride bytes apart).  The first source
+ * byte is ignored and alpha is stored as 255; colour bytes are not converted.
+ */
+static void
+lp_tile_x8r8g8b8_srgb_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint8_t *p = row + x0*4;
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t red = p[1];
+         const uint8_t green = p[2];
+         const uint8_t blue = p[3];
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle one TILE_SIZE x TILE_SIZE block of R8S G8S B8U X8U pixels starting
+ * at pixel (x0, y0) of src.  r and g are signed bytes: negatives clamp to 0
+ * (the uint32_t cast used to turn them into garbage), 0..0x7f maps to 0..0xff.
+ * b is copied as an unsigned byte, the fourth byte is skipped, alpha is 255.
+ */
+static void
+lp_tile_r8sg8sb8ux8u_norm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int8_t *p = (const int8_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const int8_t rs = p[0];
+         const int8_t gs = p[1];
+         TILE_PIXEL(dst, i, j, 0) = rs > 0 ? (uint8_t)((uint32_t)rs * 0xff / 0x7f) : 0; /* r */
+         TILE_PIXEL(dst, i, j, 1) = gs > 0 ? (uint8_t)((uint32_t)gs * 0xff / 0x7f) : 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = (uint8_t)p[2]; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R10S G10S B10S A2U pixels at
+ * (x0, y0) of src.  Negative colour fields (bit 9 set) now clamp to 0.
+ */
+static void
+lp_tile_r10sg10sb10sa2u_norm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         const uint32_t pixel = *p;
+         const uint32_t rs = pixel & 0x3ff, gs = (pixel >> 10) & 0x3ff, bs = (pixel >> 20) & 0x3ff;
+         TILE_PIXEL(dst, i, j, 0) = (rs & 0x200) ? 0 : (uint8_t)(rs >> 1); /* r */
+         TILE_PIXEL(dst, i, j, 1) = (gs & 0x200) ? 0 : (uint8_t)(gs >> 1); /* g */
+         TILE_PIXEL(dst, i, j, 2) = (bs & 0x200) ? 0 : (uint8_t)(bs >> 1); /* b */
+         TILE_PIXEL(dst, i, j, 3) = (uint8_t)((pixel >> 30) * 0xff / 0x3); /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R5S G5S B6U pixels at (x0, y0)
+ * of src.  r and g are signed 5-bit fields: negative values (bit 4 set) now
+ * clamp to 0 instead of overflowing uint8_t; b is unsigned, alpha is 255. */
+static void
+lp_tile_r5sg5sb6u_norm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*2);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         const uint32_t pixel = *p;
+         const uint32_t rs = pixel & 0x1f, gs = (pixel >> 5) & 0x1f;
+         TILE_PIXEL(dst, i, j, 0) = (rs & 0x10) ? 0 : (uint8_t)(rs * 0xff / 0xf); /* r */
+         TILE_PIXEL(dst, i, j, 1) = (gs & 0x10) ? 0 : (uint8_t)(gs * 0xff / 0xf); /* g */
+         TILE_PIXEL(dst, i, j, 2) = (uint8_t)((pixel >> 10) * 0xff / 0x3f); /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R10G10B10A2_USCALED pixels at
+ * (x0, y0) of src.  A scaled field holds the value itself, so 0 maps to 0 and
+ * anything >= 1 saturates to 255 (field * 0xff used to wrap in the uint8_t cast).
+ */
+static void
+lp_tile_r10g10b10a2_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         const uint32_t pixel = *p;
+         TILE_PIXEL(dst, i, j, 0) = (pixel & 0x3ff) ? 255 : 0; /* r */
+         TILE_PIXEL(dst, i, j, 1) = ((pixel >> 10) & 0x3ff) ? 255 : 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = ((pixel >> 20) & 0x3ff) ? 255 : 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = (pixel >> 30) ? 255 : 0; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R64_FLOAT pixels at (x0, y0) of
+ * src into dst.  Values are clamped to [0, 1] (NaN -> 0) before scaling, as an
+ * out-of-range double to uint8_t conversion is undefined; g = b = 0, a = 255. */
+static void
+lp_tile_r64_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const double *p = (const double *)(row + x0*8);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = !(*p > 0.0) ? 0 : (*p >= 1.0 ? 255 : (uint8_t)(*p * 0xff)); /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R64G64_FLOAT pixels at (x0, y0)
+ * of src.  Values are clamped to [0, 1] (NaN -> 0) before scaling, as an
+ * out-of-range double to uint8_t conversion is undefined; b = 0, a = 255. */
+static void
+lp_tile_r64g64_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const double *p = (const double *)(row + x0*16);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const double r = p[0];
+         const double g = p[1];
+         TILE_PIXEL(dst, i, j, 0) = !(r > 0.0) ? 0 : (r >= 1.0 ? 255 : (uint8_t)(r * 0xff)); /* r */
+         TILE_PIXEL(dst, i, j, 1) = !(g > 0.0) ? 0 : (g >= 1.0 ? 255 : (uint8_t)(g * 0xff)); /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R64G64B64_FLOAT pixels at (x0, y0)
+ * of src.  Channels are clamped to [0, 1] (NaN -> 0) before scaling; a = 255.
+ */
+static void
+lp_tile_r64g64b64_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const double *p = (const double *)(row + x0*24);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const double r = p[0];
+         const double g = p[1];
+         const double b = p[2];
+         TILE_PIXEL(dst, i, j, 0) = !(r > 0.0) ? 0 : (r >= 1.0 ? 255 : (uint8_t)(r * 0xff)); /* r */
+         TILE_PIXEL(dst, i, j, 1) = !(g > 0.0) ? 0 : (g >= 1.0 ? 255 : (uint8_t)(g * 0xff)); /* g */
+         TILE_PIXEL(dst, i, j, 2) = !(b > 0.0) ? 0 : (b >= 1.0 ? 255 : (uint8_t)(b * 0xff)); /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R64G64B64A64_FLOAT pixels starting
+ * at pixel (x0, y0) of src (rows src_stride bytes apart).  Each channel is
+ * clamped to [0, 1] (NaN -> 0) before scaling by 0xff, since converting an
+ * out-of-range double to uint8_t is undefined behaviour.
+ */
+static void
+lp_tile_r64g64b64a64_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const double *p = (const double *)(row + x0*32);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const double r = p[0], g = p[1];
+         const double b = p[2], a = p[3];
+         TILE_PIXEL(dst, i, j, 0) = !(r > 0.0) ? 0 : (r >= 1.0 ? 255 : (uint8_t)(r * 0xff)); /* r */
+         TILE_PIXEL(dst, i, j, 1) = !(g > 0.0) ? 0 : (g >= 1.0 ? 255 : (uint8_t)(g * 0xff)); /* g */
+         TILE_PIXEL(dst, i, j, 2) = !(b > 0.0) ? 0 : (b >= 1.0 ? 255 : (uint8_t)(b * 0xff)); /* b */
+         TILE_PIXEL(dst, i, j, 3) = !(a > 0.0) ? 0 : (a >= 1.0 ? 255 : (uint8_t)(a * 0xff)); /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32_FLOAT pixels starting at
+ * (x0, y0) of src (rows src_stride bytes apart).  The float is converted with
+ * float_to_ubyte(); g and b are stored as 0 and a as 255. */
+static void
+lp_tile_r32_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const float *p = (const float *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = float_to_ubyte(p[0]); /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32_FLOAT pixels starting at
+ * (x0, y0) of src (rows src_stride bytes apart).  Floats are converted with
+ * float_to_ubyte(); b is stored as 0 and a as 255. */
+static void
+lp_tile_r32g32_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const float *p = (const float *)(row + x0*8);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const uint8_t red = float_to_ubyte(p[0]);
+         const uint8_t green = float_to_ubyte(p[1]);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32_FLOAT pixels starting at
+ * (x0, y0) of src.  Floats go through float_to_ubyte(); a is stored as 255.
+ */
+static void
+lp_tile_r32g32b32_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const float *p = (const float *)(row + x0*12);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const uint8_t red = float_to_ubyte(p[0]);
+         const uint8_t green = float_to_ubyte(p[1]);
+         const uint8_t blue = float_to_ubyte(p[2]);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32A32_FLOAT pixels.  The
+ * block starts at pixel (x0, y0) of src, whose rows are src_stride bytes apart.
+ * Every channel is converted with float_to_ubyte() and stored via TILE_PIXEL.
+ */
+static void
+lp_tile_r32g32b32a32_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const float *p = (const float *)(row + x0*16);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t red = float_to_ubyte(p[0]);
+         const uint8_t green = float_to_ubyte(p[1]);
+         const uint8_t blue = float_to_ubyte(p[2]);
+         const uint8_t alpha = float_to_ubyte(p[3]);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = alpha; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32_UNORM pixels starting at
+ * (x0, y0) of src (rows src_stride bytes apart).  The top 8 bits of the 32-bit
+ * value become r; g and b are stored as 0 and a as 255. */
+static void
+lp_tile_r32_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = (uint8_t)(p[0] >> 24); /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32_UNORM pixels starting at
+ * (x0, y0) of src (rows src_stride bytes apart).  The top 8 bits of each
+ * 32-bit channel are kept; b is stored as 0 and a as 255. */
+static void
+lp_tile_r32g32_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*8);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const uint8_t red = (uint8_t)(p[0] >> 24);
+         const uint8_t green = (uint8_t)(p[1] >> 24);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32_UNORM pixels starting at
+ * (x0, y0) of src.  Each channel keeps its top 8 bits; a is stored as 255.
+ */
+static void
+lp_tile_r32g32b32_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*12);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const uint8_t red = (uint8_t)(p[0] >> 24);
+         const uint8_t green = (uint8_t)(p[1] >> 24);
+         const uint8_t blue = (uint8_t)(p[2] >> 24);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32A32_UNORM pixels.  The
+ * block starts at pixel (x0, y0) of src, whose rows are src_stride bytes apart.
+ * Each 32-bit channel is reduced to its top 8 bits and stored via TILE_PIXEL.
+ */
+static void
+lp_tile_r32g32b32a32_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*16);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t red = (uint8_t)(p[0] >> 24);
+         const uint8_t green = (uint8_t)(p[1] >> 24);
+         const uint8_t blue = (uint8_t)(p[2] >> 24);
+         const uint8_t alpha = (uint8_t)(p[3] >> 24);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = alpha; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32_USCALED pixels at (x0, y0) of
+ * src.  A scaled channel holds the value itself: 0 maps to 0 and anything >= 1
+ * saturates to 255 (value * 0xff used to wrap in uint8_t); g = b = 0, a = 255. */
+static void
+lp_tile_r32_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = p[0] ? 255 : 0; /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32_USCALED pixels at (x0, y0)
+ * of src.  Scaled channels hold the value itself: 0 maps to 0, anything >= 1
+ * saturates to 255 (value * 0xff used to wrap in uint8_t); b = 0, a = 255. */
+static void
+lp_tile_r32g32_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*8);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const uint8_t red = p[0] ? 255 : 0;
+         const uint8_t green = p[1] ? 255 : 0;
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32_USCALED pixels at
+ * (x0, y0) of src.  0 maps to 0, values >= 1 now saturate to 255; a = 255.
+ */
+static void
+lp_tile_r32g32b32_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*12);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const uint8_t red = p[0] ? 255 : 0;
+         const uint8_t green = p[1] ? 255 : 0;
+         const uint8_t blue = p[2] ? 255 : 0;
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32A32_USCALED pixels at
+ * (x0, y0) of src (rows src_stride bytes apart).  A scaled channel holds the
+ * value itself, so 0 maps to 0 and anything >= 1 saturates to 255; the old
+ * value * 0xff product wrapped when cast to uint8_t (2 became 254).
+ */
+static void
+lp_tile_r32g32b32a32_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint32_t *p = (const uint32_t *)(row + x0*16);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t red = p[0] ? 255 : 0, green = p[1] ? 255 : 0;
+         const uint8_t blue = p[2] ? 255 : 0, alpha = p[3] ? 255 : 0;
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = alpha; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32_SNORM pixels at (x0, y0) of
+ * src.  Negative values now clamp to 0 (they used to be shifted and truncated
+ * into 0..255); others keep bits 30..23.  g = b = 0, a = 255. */
+static void
+lp_tile_r32_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int32_t *p = (const int32_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = p[0] < 0 ? 0 : (uint8_t)(p[0] >> 23); /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32_SNORM pixels at (x0, y0)
+ * of src.  Negative channels now clamp to 0 (they used to be shifted and
+ * truncated into 0..255); others keep bits 30..23.  b = 0, a = 255. */
+static void
+lp_tile_r32g32_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int32_t *p = (const int32_t *)(row + x0*8);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const uint8_t red = p[0] < 0 ? 0 : (uint8_t)(p[0] >> 23);
+         const uint8_t green = p[1] < 0 ? 0 : (uint8_t)(p[1] >> 23);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32_SNORM pixels at
+ * (x0, y0) of src.  Negative channels now clamp to 0; alpha is stored as 255.
+ */
+static void
+lp_tile_r32g32b32_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int32_t *p = (const int32_t *)(row + x0*12);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const uint8_t red = p[0] < 0 ? 0 : (uint8_t)(p[0] >> 23);
+         const uint8_t green = p[1] < 0 ? 0 : (uint8_t)(p[1] >> 23);
+         const uint8_t blue = p[2] < 0 ? 0 : (uint8_t)(p[2] >> 23);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32A32_SNORM pixels at
+ * (x0, y0) of src (rows src_stride bytes apart).  Negative channels now clamp
+ * to 0; previously they were right-shifted and truncated, landing anywhere in
+ * 0..255.  Non-negative channels keep bits 30..23 as before.
+ */
+static void
+lp_tile_r32g32b32a32_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int32_t *p = (const int32_t *)(row + x0*16);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t red = p[0] < 0 ? 0 : (uint8_t)(p[0] >> 23);
+         const uint8_t green = p[1] < 0 ? 0 : (uint8_t)(p[1] >> 23);
+         const uint8_t blue = p[2] < 0 ? 0 : (uint8_t)(p[2] >> 23);
+         const uint8_t alpha = p[3] < 0 ? 0 : (uint8_t)(p[3] >> 23);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = alpha; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32_SSCALED pixels at (x0, y0) of
+ * src.  Values <= 0 map to 0 and values >= 1 saturate to 255; negatives used
+ * to wrap through the uint64_t cast.  g = b = 0, a = 255. */
+static void
+lp_tile_r32_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int32_t *p = (const int32_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = p[0] > 0 ? 255 : 0; /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32_SSCALED pixels at (x0, y0)
+ * of src.  Values <= 0 map to 0 and values >= 1 saturate to 255; negatives
+ * used to wrap through the uint64_t cast.  b = 0, a = 255. */
+static void
+lp_tile_r32g32_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int32_t *p = (const int32_t *)(row + x0*8);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const uint8_t red = p[0] > 0 ? 255 : 0;
+         const uint8_t green = p[1] > 0 ? 255 : 0;
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32_SSCALED pixels at
+ * (x0, y0) of src.  Values <= 0 map to 0, >= 1 saturate to 255; a = 255.
+ */
+static void
+lp_tile_r32g32b32_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int32_t *p = (const int32_t *)(row + x0*12);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const uint8_t red = p[0] > 0 ? 255 : 0;
+         const uint8_t green = p[1] > 0 ? 255 : 0;
+         const uint8_t blue = p[2] > 0 ? 255 : 0;
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R32G32B32A32_SSCALED pixels at
+ * (x0, y0) of src (rows src_stride bytes apart).  A scaled channel holds the
+ * value itself, so values <= 0 map to 0 and values >= 1 saturate to 255.  The
+ * old uint64_t cast made negatives huge and the uint8_t cast wrapped the rest.
+ */
+static void
+lp_tile_r32g32b32a32_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const int32_t *p = (const int32_t *)(row + x0*16);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t red = p[0] > 0 ? 255 : 0, green = p[1] > 0 ? 255 : 0;
+         const uint8_t blue = p[2] > 0 ? 255 : 0, alpha = p[3] > 0 ? 255 : 0;
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = alpha; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R16_FLOAT pixels starting at
+ * (x0, y0) of src.  Each 16-bit value goes through util_half_to_float() and
+ * then float_to_ubyte(); g and b are stored as 0 and a as 255. */
+static void
+lp_tile_r16_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*2);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = float_to_ubyte(util_half_to_float(p[0])); /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R16G16_FLOAT pixels starting at
+ * (x0, y0) of src.  Each 16-bit value goes through util_half_to_float() and
+ * then float_to_ubyte(); b is stored as 0 and a as 255. */
+static void
+lp_tile_r16g16_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const uint8_t red = float_to_ubyte(util_half_to_float(p[0]));
+         const uint8_t green = float_to_ubyte(util_half_to_float(p[1]));
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R16G16B16_FLOAT pixels at (x0, y0)
+ * of src via util_half_to_float() and float_to_ubyte(); a is stored as 255.
+ */
+static void
+lp_tile_r16g16b16_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*6);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const uint8_t red = float_to_ubyte(util_half_to_float(p[0]));
+         const uint8_t green = float_to_ubyte(util_half_to_float(p[1]));
+         const uint8_t blue = float_to_ubyte(util_half_to_float(p[2]));
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R16G16B16A16_FLOAT pixels.  The
+ * block starts at pixel (x0, y0) of src, whose rows are src_stride bytes apart.
+ * Each 16-bit value goes through util_half_to_float() then float_to_ubyte().
+ */
+static void
+lp_tile_r16g16b16a16_float_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*8);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t red = float_to_ubyte(util_half_to_float(p[0]));
+         const uint8_t green = float_to_ubyte(util_half_to_float(p[1]));
+         const uint8_t blue = float_to_ubyte(util_half_to_float(p[2]));
+         const uint8_t alpha = float_to_ubyte(util_half_to_float(p[3]));
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = alpha; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R16_UNORM pixels starting at
+ * (x0, y0) of src (rows src_stride bytes apart).  The top 8 bits of the 16-bit
+ * value become r; g and b are stored as 0 and a as 255. */
+static void
+lp_tile_r16_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*2);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = (uint8_t)(p[0] >> 8); /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R16G16_UNORM pixels starting at
+ * (x0, y0) of src (rows src_stride bytes apart).  The top 8 bits of each
+ * 16-bit channel are kept; b is stored as 0 and a as 255. */
+static void
+lp_tile_r16g16_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const uint8_t red = (uint8_t)(p[0] >> 8);
+         const uint8_t green = (uint8_t)(p[1] >> 8);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R16G16B16_UNORM pixels starting at
+ * (x0, y0) of src.  Each channel keeps its top 8 bits; a is stored as 255.
+ */
+static void
+lp_tile_r16g16b16_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*6);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const uint8_t red = (uint8_t)(p[0] >> 8);
+         const uint8_t green = (uint8_t)(p[1] >> 8);
+         const uint8_t blue = (uint8_t)(p[2] >> 8);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R16G16B16A16_UNORM pixels.  The
+ * block starts at pixel (x0, y0) of src, whose rows are src_stride bytes apart.
+ * Each 16-bit channel is reduced to its top 8 bits and stored via TILE_PIXEL.
+ */
+static void
+lp_tile_r16g16b16a16_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*8);
+      for (i = 0; i < TILE_SIZE; ++i, p += 4) {
+         const uint8_t red = (uint8_t)(p[0] >> 8);
+         const uint8_t green = (uint8_t)(p[1] >> 8);
+         const uint8_t blue = (uint8_t)(p[2] >> 8);
+         const uint8_t alpha = (uint8_t)(p[3] >> 8);
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = alpha; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R16_USCALED pixels at (x0, y0) of
+ * src.  A scaled channel holds the value itself: 0 maps to 0 and anything >= 1
+ * saturates to 255 (value * 0xff used to wrap in uint8_t); g = b = 0, a = 255. */
+static void
+lp_tile_r16_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*2);
+      for (i = 0; i < TILE_SIZE; ++i, ++p) {
+         TILE_PIXEL(dst, i, j, 0) = p[0] ? 255 : 0; /* r */
+         TILE_PIXEL(dst, i, j, 1) = 0; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/* Unswizzle a TILE_SIZE x TILE_SIZE block of R16G16_USCALED pixels at (x0, y0)
+ * of src.  Scaled channels hold the value itself: 0 maps to 0, anything >= 1
+ * saturates to 255 (value * 0xff used to wrap in uint8_t); b = 0, a = 255. */
+static void
+lp_tile_r16g16_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*4);
+      for (i = 0; i < TILE_SIZE; ++i, p += 2) {
+         const uint8_t red = p[0] ? 255 : 0;
+         const uint8_t green = p[1] ? 255 : 0;
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = 0; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Unswizzle a TILE_SIZE x TILE_SIZE block of R16G16B16_USCALED pixels at
+ * (x0, y0) of src.  0 maps to 0, values >= 1 now saturate to 255; a = 255.
+ */
+static void
+lp_tile_r16g16b16_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *row = src + y0*src_stride;
+   unsigned i, j;
+   for (j = 0; j < TILE_SIZE; ++j, row += src_stride) {
+      const uint16_t *p = (const uint16_t *)(row + x0*6);
+      for (i = 0; i < TILE_SIZE; ++i, p += 3) {
+         const uint8_t red = p[0] ? 255 : 0;
+         const uint8_t green = p[1] ? 255 : 0;
+         const uint8_t blue = p[2] ? 255 : 0;
+         TILE_PIXEL(dst, i, j, 0) = red; /* r */
+         TILE_PIXEL(dst, i, j, 1) = green; /* g */
+         TILE_PIXEL(dst, i, j, 2) = blue; /* b */
+         TILE_PIXEL(dst, i, j, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16G16B16A16_USCALED pixels
+ * into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a.
+ *
+ * A USCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: 0 stays 0 and every
+ * other value saturates to 255.  A bare "N * 0xff" truncated to 8 bits
+ * would wrap instead (2 -> 254, 3 -> 253, ...).
+ */
+static void
+lp_tile_r16g16b16a16_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const uint16_t *src_pixel = (const uint16_t *)(src_row + x0*8);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* saturate: any non-zero integer is >= 1.0 */
+         const uint8_t r = (*src_pixel++) ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) ? 0xff : 0;
+         const uint8_t b = (*src_pixel++) ? 0xff : 0;
+         const uint8_t a = (*src_pixel++) ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16_SNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (g and b are set to 0, a to 255).
+ *
+ * The 8-bit result is unsigned, so negative inputs are clamped to 0 before
+ * the 15-bit magnitude is reduced to 8 bits.  Shifting the raw signed
+ * value would turn small negative numbers into bright values
+ * (-1 >> 7 truncates to 255).
+ */
+static void
+lp_tile_r16_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int16_t *src_pixel = (const int16_t *)(src_row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const int r_in = *src_pixel++;
+         /* max(v, 0) >> 7 maps 0..32767 onto 0..255 */
+         const uint8_t r = (uint8_t)((r_in > 0 ? r_in : 0) >> 7);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = 0; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16G16_SNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (b is set to 0, a to 255).
+ *
+ * The 8-bit result is unsigned, so negative inputs are clamped to 0 before
+ * the 15-bit magnitude is reduced to 8 bits.  Shifting the raw signed
+ * value would turn small negative numbers into bright values
+ * (-1 >> 7 truncates to 255).
+ */
+static void
+lp_tile_r16g16_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int16_t *src_pixel = (const int16_t *)(src_row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const int r_in = *src_pixel++;
+         const int g_in = *src_pixel++;
+         /* max(v, 0) >> 7 maps 0..32767 onto 0..255 */
+         const uint8_t r = (uint8_t)((r_in > 0 ? r_in : 0) >> 7);
+         const uint8_t g = (uint8_t)((g_in > 0 ? g_in : 0) >> 7);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16G16B16_SNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (a is set to 255).
+ *
+ * The 8-bit result is unsigned, so negative inputs are clamped to 0 before
+ * the 15-bit magnitude is reduced to 8 bits.  Shifting the raw signed
+ * value would turn small negative numbers into bright values
+ * (-1 >> 7 truncates to 255).
+ */
+static void
+lp_tile_r16g16b16_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int16_t *src_pixel = (const int16_t *)(src_row + x0*6);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const int r_in = *src_pixel++;
+         const int g_in = *src_pixel++;
+         const int b_in = *src_pixel++;
+         /* max(v, 0) >> 7 maps 0..32767 onto 0..255 */
+         const uint8_t r = (uint8_t)((r_in > 0 ? r_in : 0) >> 7);
+         const uint8_t g = (uint8_t)((g_in > 0 ? g_in : 0) >> 7);
+         const uint8_t b = (uint8_t)((b_in > 0 ? b_in : 0) >> 7);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16G16B16A16_SNORM pixels
+ * into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a.
+ *
+ * The 8-bit result is unsigned, so negative inputs are clamped to 0 before
+ * the 15-bit magnitude is reduced to 8 bits.  Shifting the raw signed
+ * value would turn small negative numbers into bright values
+ * (-1 >> 7 truncates to 255).
+ */
+static void
+lp_tile_r16g16b16a16_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int16_t *src_pixel = (const int16_t *)(src_row + x0*8);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const int r_in = *src_pixel++;
+         const int g_in = *src_pixel++;
+         const int b_in = *src_pixel++;
+         const int a_in = *src_pixel++;
+         /* max(v, 0) >> 7 maps 0..32767 onto 0..255 */
+         const uint8_t r = (uint8_t)((r_in > 0 ? r_in : 0) >> 7);
+         const uint8_t g = (uint8_t)((g_in > 0 ? g_in : 0) >> 7);
+         const uint8_t b = (uint8_t)((b_in > 0 ? b_in : 0) >> 7);
+         const uint8_t a = (uint8_t)((a_in > 0 ? a_in : 0) >> 7);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16_SSCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (g and b are set to 0, a to 255).
+ *
+ * An SSCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: N <= 0 gives 0 and
+ * N >= 1 gives 255.  Multiplying the sign-extended value by 0xff and
+ * truncating to 8 bits would wrap (2 -> 254) and map -1 to 1.
+ */
+static void
+lp_tile_r16_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int16_t *src_pixel = (const int16_t *)(src_row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* clamp(N, 0, 1) * 255 for an integer N */
+         const uint8_t r = (*src_pixel++) > 0 ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = 0; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16G16_SSCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (b is set to 0, a to 255).
+ *
+ * An SSCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: N <= 0 gives 0 and
+ * N >= 1 gives 255.  Multiplying the sign-extended value by 0xff and
+ * truncating to 8 bits would wrap (2 -> 254) and map -1 to 1.
+ */
+static void
+lp_tile_r16g16_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int16_t *src_pixel = (const int16_t *)(src_row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* clamp(N, 0, 1) * 255 for an integer N */
+         const uint8_t r = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) > 0 ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16G16B16_SSCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (a is set to 255).
+ *
+ * An SSCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: N <= 0 gives 0 and
+ * N >= 1 gives 255.  Multiplying the sign-extended value by 0xff and
+ * truncating to 8 bits would wrap (2 -> 254) and map -1 to 1.
+ */
+static void
+lp_tile_r16g16b16_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int16_t *src_pixel = (const int16_t *)(src_row + x0*6);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* clamp(N, 0, 1) * 255 for an integer N */
+         const uint8_t r = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t b = (*src_pixel++) > 0 ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R16G16B16A16_SSCALED pixels
+ * into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a.
+ *
+ * An SSCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: N <= 0 gives 0 and
+ * N >= 1 gives 255.  Multiplying the sign-extended value by 0xff and
+ * truncating to 8 bits would wrap (2 -> 254) and map -1 to 1.
+ */
+static void
+lp_tile_r16g16b16a16_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int16_t *src_pixel = (const int16_t *)(src_row + x0*8);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* clamp(N, 0, 1) * 255 for an integer N */
+         const uint8_t r = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t b = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t a = (*src_pixel++) > 0 ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Copy a TILE_SIZE x TILE_SIZE block of R8_UNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src (rows are
+ * src_stride bytes apart).  The single source byte becomes r unchanged;
+ * g and b are written as 0 and a as 255.
+ */
+static void
+lp_tile_r8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *line = src + y0*src_stride;
+   unsigned col, row;
+
+   for (row = 0; row < TILE_SIZE; ++row, line += src_stride) {
+      const uint8_t *texel = line + x0*1;
+      for (col = 0; col < TILE_SIZE; ++col, texel += 1) {
+         const uint8_t red = texel[0];
+         TILE_PIXEL(dst, col, row, 0) = red; /* r */
+         TILE_PIXEL(dst, col, row, 1) = 0; /* g */
+         TILE_PIXEL(dst, col, row, 2) = 0; /* b */
+         TILE_PIXEL(dst, col, row, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Copy a TILE_SIZE x TILE_SIZE block of R8G8_UNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src (rows are
+ * src_stride bytes apart).  The two source bytes become r and g unchanged;
+ * b is written as 0 and a as 255.
+ */
+static void
+lp_tile_r8g8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *line = src + y0*src_stride;
+   unsigned col, row;
+
+   for (row = 0; row < TILE_SIZE; ++row, line += src_stride) {
+      const uint8_t *texel = line + x0*2;
+      for (col = 0; col < TILE_SIZE; ++col, texel += 2) {
+         const uint8_t red = texel[0];
+         const uint8_t green = texel[1];
+         TILE_PIXEL(dst, col, row, 0) = red; /* r */
+         TILE_PIXEL(dst, col, row, 1) = green; /* g */
+         TILE_PIXEL(dst, col, row, 2) = 0; /* b */
+         TILE_PIXEL(dst, col, row, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Copy a TILE_SIZE x TILE_SIZE block of R8G8B8_UNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src (rows are
+ * src_stride bytes apart).  The three source bytes become r, g and b
+ * unchanged; a is written as 255.
+ */
+static void
+lp_tile_r8g8b8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *line = src + y0*src_stride;
+   unsigned col, row;
+
+   for (row = 0; row < TILE_SIZE; ++row, line += src_stride) {
+      const uint8_t *texel = line + x0*3;
+      for (col = 0; col < TILE_SIZE; ++col, texel += 3) {
+         const uint8_t red = texel[0];
+         const uint8_t green = texel[1];
+         const uint8_t blue = texel[2];
+         TILE_PIXEL(dst, col, row, 0) = red; /* r */
+         TILE_PIXEL(dst, col, row, 1) = green; /* g */
+         TILE_PIXEL(dst, col, row, 2) = blue; /* b */
+         TILE_PIXEL(dst, col, row, 3) = 255; /* a */
+      }
+   }
+}
+
+/*
+ * Copy a TILE_SIZE x TILE_SIZE block of R8G8B8A8_UNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src (rows are
+ * src_stride bytes apart).  The four source bytes become r, g, b and a
+ * unchanged.
+ */
+static void
+lp_tile_r8g8b8a8_unorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   const uint8_t *line = src + y0*src_stride;
+   unsigned col, row;
+
+   for (row = 0; row < TILE_SIZE; ++row, line += src_stride) {
+      const uint8_t *texel = line + x0*4;
+      for (col = 0; col < TILE_SIZE; ++col, texel += 4) {
+         const uint8_t red = texel[0];
+         const uint8_t green = texel[1];
+         const uint8_t blue = texel[2];
+         const uint8_t alpha = texel[3];
+         TILE_PIXEL(dst, col, row, 0) = red; /* r */
+         TILE_PIXEL(dst, col, row, 1) = green; /* g */
+         TILE_PIXEL(dst, col, row, 2) = blue; /* b */
+         TILE_PIXEL(dst, col, row, 3) = alpha; /* a */
+      }
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8_USCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (g and b are set to 0, a to 255).
+ *
+ * A USCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: 0 stays 0 and every
+ * other value saturates to 255.  A bare "N * 0xff" truncated to 8 bits
+ * would wrap instead (2 -> 254, 3 -> 253, ...).
+ */
+static void
+lp_tile_r8_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const uint8_t *src_pixel = (const uint8_t *)(src_row + x0*1);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* saturate: any non-zero integer is >= 1.0 */
+         const uint8_t r = (*src_pixel++) ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = 0; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8_USCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (b is set to 0, a to 255).
+ *
+ * A USCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: 0 stays 0 and every
+ * other value saturates to 255.  A bare "N * 0xff" truncated to 8 bits
+ * would wrap instead (2 -> 254, 3 -> 253, ...).
+ */
+static void
+lp_tile_r8g8_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const uint8_t *src_pixel = (const uint8_t *)(src_row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* saturate: any non-zero integer is >= 1.0 */
+         const uint8_t r = (*src_pixel++) ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8B8_USCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (a is set to 255).
+ *
+ * A USCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: 0 stays 0 and every
+ * other value saturates to 255.  A bare "N * 0xff" truncated to 8 bits
+ * would wrap instead (2 -> 254, 3 -> 253, ...).
+ */
+static void
+lp_tile_r8g8b8_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const uint8_t *src_pixel = (const uint8_t *)(src_row + x0*3);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* saturate: any non-zero integer is >= 1.0 */
+         const uint8_t r = (*src_pixel++) ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) ? 0xff : 0;
+         const uint8_t b = (*src_pixel++) ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8B8A8_USCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a.
+ *
+ * A USCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: 0 stays 0 and every
+ * other value saturates to 255.  A bare "N * 0xff" truncated to 8 bits
+ * would wrap instead (2 -> 254, 3 -> 253, ...).
+ */
+static void
+lp_tile_r8g8b8a8_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const uint8_t *src_pixel = (const uint8_t *)(src_row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* saturate: any non-zero integer is >= 1.0 */
+         const uint8_t r = (*src_pixel++) ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) ? 0xff : 0;
+         const uint8_t b = (*src_pixel++) ? 0xff : 0;
+         const uint8_t a = (*src_pixel++) ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8_SNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (g and b are set to 0, a to 255).
+ *
+ * The 8-bit result is unsigned, so negative inputs are clamped to 0 and
+ * 0..127 is rescaled to 0..255.  Casting a negative value straight to
+ * uint32_t before scaling would yield an unrelated byte instead of 0.
+ */
+static void
+lp_tile_r8_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int8_t *src_pixel = (const int8_t *)(src_row + x0*1);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const int r_in = *src_pixel++;
+         /* max(v, 0) * 255 / 127 */
+         const uint8_t r = (uint8_t)((uint32_t)(r_in > 0 ? r_in : 0) * 0xff / 0x7f);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = 0; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8_SNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (b is set to 0, a to 255).
+ *
+ * The 8-bit result is unsigned, so negative inputs are clamped to 0 and
+ * 0..127 is rescaled to 0..255.  Casting a negative value straight to
+ * uint32_t before scaling would yield an unrelated byte instead of 0.
+ */
+static void
+lp_tile_r8g8_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int8_t *src_pixel = (const int8_t *)(src_row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const int r_in = *src_pixel++;
+         const int g_in = *src_pixel++;
+         /* max(v, 0) * 255 / 127 */
+         const uint8_t r = (uint8_t)((uint32_t)(r_in > 0 ? r_in : 0) * 0xff / 0x7f);
+         const uint8_t g = (uint8_t)((uint32_t)(g_in > 0 ? g_in : 0) * 0xff / 0x7f);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8B8_SNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (a is set to 255).
+ *
+ * The 8-bit result is unsigned, so negative inputs are clamped to 0 and
+ * 0..127 is rescaled to 0..255.  Casting a negative value straight to
+ * uint32_t before scaling would yield an unrelated byte instead of 0.
+ */
+static void
+lp_tile_r8g8b8_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int8_t *src_pixel = (const int8_t *)(src_row + x0*3);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const int r_in = *src_pixel++;
+         const int g_in = *src_pixel++;
+         const int b_in = *src_pixel++;
+         /* max(v, 0) * 255 / 127 */
+         const uint8_t r = (uint8_t)((uint32_t)(r_in > 0 ? r_in : 0) * 0xff / 0x7f);
+         const uint8_t g = (uint8_t)((uint32_t)(g_in > 0 ? g_in : 0) * 0xff / 0x7f);
+         const uint8_t b = (uint8_t)((uint32_t)(b_in > 0 ? b_in : 0) * 0xff / 0x7f);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8B8A8_SNORM pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a.
+ *
+ * The 8-bit result is unsigned, so negative inputs are clamped to 0 and
+ * 0..127 is rescaled to 0..255.  Casting a negative value straight to
+ * uint32_t before scaling would yield an unrelated byte instead of 0.
+ */
+static void
+lp_tile_r8g8b8a8_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int8_t *src_pixel = (const int8_t *)(src_row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const int r_in = *src_pixel++;
+         const int g_in = *src_pixel++;
+         const int b_in = *src_pixel++;
+         const int a_in = *src_pixel++;
+         /* max(v, 0) * 255 / 127 */
+         const uint8_t r = (uint8_t)((uint32_t)(r_in > 0 ? r_in : 0) * 0xff / 0x7f);
+         const uint8_t g = (uint8_t)((uint32_t)(g_in > 0 ? g_in : 0) * 0xff / 0x7f);
+         const uint8_t b = (uint8_t)((uint32_t)(b_in > 0 ? b_in : 0) * 0xff / 0x7f);
+         const uint8_t a = (uint8_t)((uint32_t)(a_in > 0 ? a_in : 0) * 0xff / 0x7f);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8_SSCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (g and b are set to 0, a to 255).
+ *
+ * An SSCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: N <= 0 gives 0 and
+ * N >= 1 gives 255.  Multiplying the sign-extended value by 0xff and
+ * truncating to 8 bits would wrap (2 -> 254) and map -1 to 1.
+ */
+static void
+lp_tile_r8_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int8_t *src_pixel = (const int8_t *)(src_row + x0*1);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* clamp(N, 0, 1) * 255 for an integer N */
+         const uint8_t r = (*src_pixel++) > 0 ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = 0; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8_SSCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (b is set to 0, a to 255).
+ *
+ * An SSCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: N <= 0 gives 0 and
+ * N >= 1 gives 255.  Multiplying the sign-extended value by 0xff and
+ * truncating to 8 bits would wrap (2 -> 254) and map -1 to 1.
+ */
+static void
+lp_tile_r8g8_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int8_t *src_pixel = (const int8_t *)(src_row + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* clamp(N, 0, 1) * 255 for an integer N */
+         const uint8_t r = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) > 0 ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = 0; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8B8_SSCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a (a is set to 255).
+ *
+ * An SSCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: N <= 0 gives 0 and
+ * N >= 1 gives 255.  Multiplying the sign-extended value by 0xff and
+ * truncating to 8 bits would wrap (2 -> 254) and map -1 to 1.
+ */
+static void
+lp_tile_r8g8b8_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int8_t *src_pixel = (const int8_t *)(src_row + x0*3);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* clamp(N, 0, 1) * 255 for an integer N */
+         const uint8_t r = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t b = (*src_pixel++) > 0 ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R8G8B8A8_SSCALED pixels into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart; results are stored through TILE_PIXEL as 8-bit
+ * r, g, b, a.
+ *
+ * An SSCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: N <= 0 gives 0 and
+ * N >= 1 gives 255.  Multiplying the sign-extended value by 0xff and
+ * truncating to 8 bits would wrap (2 -> 254) and map -1 to 1.
+ */
+static void
+lp_tile_r8g8b8a8_sscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const int8_t *src_pixel = (const int8_t *)(src_row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* clamp(N, 0, 1) * 255 for an integer N */
+         const uint8_t r = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t g = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t b = (*src_pixel++) > 0 ? 0xff : 0;
+         const uint8_t a = (*src_pixel++) > 0 ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = a; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R10G10B10X2_USCALED pixels
+ * into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart.  Each 32-bit pixel carries r in bits 0..9, g in
+ * bits 10..19 and b in bits 20..29; the top two bits are ignored and a is
+ * written as 255.
+ *
+ * A USCALED channel holding the integer N represents the value N.0, so it
+ * is clamped to [0, 1] before being scaled to 0..255: 0 stays 0 and every
+ * other value saturates to 255.  A bare "N * 0xff" truncated to 8 bits
+ * would wrap instead (2 -> 254, 3 -> 253, ...).
+ */
+static void
+lp_tile_r10g10b10x2_uscaled_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         uint32_t pixel = *src_pixel++;
+         /* saturate: any non-zero integer is >= 1.0 */
+         uint8_t r = (pixel & 0x3ff) ? 0xff : 0;
+         uint8_t g = ((pixel >> 10) & 0x3ff) ? 0xff : 0;
+         uint8_t b = ((pixel >> 20) & 0x3ff) ? 0xff : 0;
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+/*
+ * Swizzle a TILE_SIZE x TILE_SIZE block of R10G10B10X2_SNORM pixels
+ * into dst.
+ *
+ * The block starts at pixel (x0, y0) of the image at src, whose rows are
+ * src_stride bytes apart.  Each 32-bit pixel carries r in bits 0..9, g in
+ * bits 10..19 and b in bits 20..29; the top two bits are ignored and a is
+ * written as 255.
+ *
+ * Each field is a 10-bit signed value, so its bit 9 is the sign bit.
+ * Negative fields clamp to 0; non-negative ones (0..511) are reduced to
+ * 0..255 by dropping the low bit.  Treating the field as unsigned would
+ * turn negative inputs into 256..511, which then truncate to arbitrary
+ * bytes.
+ */
+static void
+lp_tile_r10g10b10x2_snorm_swizzle_4ub(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0)
+{
+   unsigned x, y;
+   const uint8_t *src_row = src + y0*src_stride;
+   for (y = 0; y < TILE_SIZE; ++y) {
+      const uint32_t *src_pixel = (const uint32_t *)(src_row + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         const uint32_t pixel = *src_pixel++;
+         const uint32_t r10 = pixel & 0x3ff;
+         const uint32_t g10 = (pixel >> 10) & 0x3ff;
+         const uint32_t b10 = (pixel >> 20) & 0x3ff;
+         /* 0x200 is the sign bit of a 10-bit field */
+         const uint8_t r = (uint8_t)((r10 & 0x200) ? 0 : r10 >> 1);
+         const uint8_t g = (uint8_t)((g10 & 0x200) ? 0 : g10 >> 1);
+         const uint8_t b = (uint8_t)((b10 & 0x200) ? 0 : b10 >> 1);
+         TILE_PIXEL(dst, x, y, 0) = r; /* r */
+         TILE_PIXEL(dst, x, y, 1) = g; /* g */
+         TILE_PIXEL(dst, x, y, 2) = b; /* b */
+         TILE_PIXEL(dst, x, y, 3) = 255; /* a */
+      }
+      src_row += src_stride;
+   }
+}
+
+void
+lp_tile_swizzle_4ub(enum pipe_format format, uint8_t *dst, const void *src, unsigned src_stride, unsigned x, unsigned y)
+{
+   void (*func)(uint8_t *dst, const uint8_t *src, unsigned src_stride, unsigned x0, unsigned y0);
+#ifdef DEBUG
+   lp_tile_swizzle_count += 1;
+#endif
+   switch(format) {
+   case PIPE_FORMAT_NONE:
+      func = lp_tile_none_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+#ifdef PIPE_ARCH_SSE
+      func = util_cpu_caps.has_sse2 ? lp_tile_b8g8r8a8_unorm_swizzle_4ub_sse2 : lp_tile_b8g8r8a8_unorm_swizzle_4ub;
+#else
+      func = lp_tile_b8g8r8a8_unorm_swizzle_4ub;
+#endif
+      break;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      func = lp_tile_b8g8r8x8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      func = lp_tile_a8r8g8b8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      func = lp_tile_x8r8g8b8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
+      func = lp_tile_a8b8g8r8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_X8B8G8R8_UNORM:
+      func = lp_tile_x8b8g8r8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8X8_UNORM:
+      func = lp_tile_r8g8b8x8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B5G5R5X1_UNORM:
+      func = lp_tile_b5g5r5x1_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+      func = lp_tile_b5g5r5a1_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+      func = lp_tile_b4g4r4a4_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B4G4R4X4_UNORM:
+      func = lp_tile_b4g4r4x4_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B5G6R5_UNORM:
+      func = lp_tile_b5g6r5_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10G10B10A2_UNORM:
+      func = lp_tile_r10g10b10a2_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B10G10R10A2_UNORM:
+      func = lp_tile_b10g10r10a2_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_L8_UNORM:
+      func = lp_tile_l8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      func = lp_tile_a8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      func = lp_tile_i8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_L4A4_UNORM:
+      func = lp_tile_l4a4_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_L8A8_UNORM:
+      func = lp_tile_l8a8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_L16_UNORM:
+      func = lp_tile_l16_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_L8_SRGB:
+      func = lp_tile_l8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_L8A8_SRGB:
+      func = lp_tile_l8a8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_SRGB:
+      func = lp_tile_r8g8b8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_SRGB:
+      func = lp_tile_r8g8b8a8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8B8G8R8_SRGB:
+      func = lp_tile_a8b8g8r8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_X8B8G8R8_SRGB:
+      func = lp_tile_x8b8g8r8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B8G8R8A8_SRGB:
+      func = lp_tile_b8g8r8a8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_B8G8R8X8_SRGB:
+      func = lp_tile_b8g8r8x8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8R8G8B8_SRGB:
+      func = lp_tile_a8r8g8b8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_X8R8G8B8_SRGB:
+      func = lp_tile_x8r8g8b8_srgb_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
+      func = lp_tile_r8sg8sb8ux8u_norm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
+      func = lp_tile_r10sg10sb10sa2u_norm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R5SG5SB6U_NORM:
+      func = lp_tile_r5sg5sb6u_norm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10G10B10A2_USCALED:
+      func = lp_tile_r10g10b10a2_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R64_FLOAT:
+      func = lp_tile_r64_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      func = lp_tile_r64g64_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      func = lp_tile_r64g64b64_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      func = lp_tile_r64g64b64a64_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_FLOAT:
+      func = lp_tile_r32_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      func = lp_tile_r32g32_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      func = lp_tile_r32g32b32_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      func = lp_tile_r32g32b32a32_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_UNORM:
+      func = lp_tile_r32_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_UNORM:
+      func = lp_tile_r32g32_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      func = lp_tile_r32g32b32_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      func = lp_tile_r32g32b32a32_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_USCALED:
+      func = lp_tile_r32_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_USCALED:
+      func = lp_tile_r32g32_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      func = lp_tile_r32g32b32_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      func = lp_tile_r32g32b32a32_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_SNORM:
+      func = lp_tile_r32_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_SNORM:
+      func = lp_tile_r32g32_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      func = lp_tile_r32g32b32_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      func = lp_tile_r32g32b32a32_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_SSCALED:
+      func = lp_tile_r32_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      func = lp_tile_r32g32_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      func = lp_tile_r32g32b32_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      func = lp_tile_r32g32b32a32_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_FLOAT:
+      func = lp_tile_r16_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_FLOAT:
+      func = lp_tile_r16g16_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_FLOAT:
+      func = lp_tile_r16g16b16_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_FLOAT:
+      func = lp_tile_r16g16b16a16_float_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_UNORM:
+      func = lp_tile_r16_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_UNORM:
+      func = lp_tile_r16g16_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      func = lp_tile_r16g16b16_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      func = lp_tile_r16g16b16a16_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_USCALED:
+      func = lp_tile_r16_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_USCALED:
+      func = lp_tile_r16g16_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      func = lp_tile_r16g16b16_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      func = lp_tile_r16g16b16a16_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_SNORM:
+      func = lp_tile_r16_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_SNORM:
+      func = lp_tile_r16g16_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      func = lp_tile_r16g16b16_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      func = lp_tile_r16g16b16a16_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_SSCALED:
+      func = lp_tile_r16_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      func = lp_tile_r16g16_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      func = lp_tile_r16g16b16_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      func = lp_tile_r16g16b16a16_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8_UNORM:
+      func = lp_tile_r8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8_UNORM:
+      func = lp_tile_r8g8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      func = lp_tile_r8g8b8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      func = lp_tile_r8g8b8a8_unorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8_USCALED:
+      func = lp_tile_r8_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8_USCALED:
+      func = lp_tile_r8g8_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      func = lp_tile_r8g8b8_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      func = lp_tile_r8g8b8a8_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8_SNORM:
+      func = lp_tile_r8_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8_SNORM:
+      func = lp_tile_r8g8_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      func = lp_tile_r8g8b8_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      func = lp_tile_r8g8b8a8_snorm_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8_SSCALED:
+      func = lp_tile_r8_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      func = lp_tile_r8g8_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      func = lp_tile_r8g8b8_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      func = lp_tile_r8g8b8a8_sscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10G10B10X2_USCALED:
+      func = lp_tile_r10g10b10x2_uscaled_swizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10G10B10X2_SNORM:
+      func = lp_tile_r10g10b10x2_snorm_swizzle_4ub;
+      break;
+   default:
+      debug_printf("%s: unsupported format %s\n", __FUNCTION__, util_format_name(format));
+      return;
+   }
+   func(dst, (const uint8_t *)src, src_stride, x, y);
+}
+
+/* Unswizzle one tile of 4ub SoA data (src) into a linear one-byte-per-pixel
+ * image (dst, dst_stride bytes per row) with the tile origin at pixel (x0, y0).
+ * Only the first channel plane is read.  Each stored byte is r * 1 / 255 in
+ * integer arithmetic, i.e. 1 when the source value is 255 and 0 otherwise.
+ * The other three channel planes of every block are ignored. */
+static void
+lp_tile_none_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *const out = (uint8_t *) dst;
+   const unsigned pitch = dst_stride / 1;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red = block + 0 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint8_t)(((uint32_t)red[k+0]) * 0x1 / 0xff)) << 0);
+            const uint32_t second = (((uint8_t)(((uint32_t)red[k+1]) * 0x1 / 0xff)) << 0);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels (dst, dst_stride
+ * bytes per row, tile origin at pixel (x0, y0)): blue bits 0-7, green 8-15, red 16-23, alpha 24-31. */
+static void
+lp_tile_b8g8r8a8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red   = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = src + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = src + 3 * TILE_C_STRIDE;
+         /* Widen each channel to uint32_t before shifting: a promoted int holding
+          * a value >= 0x80 would overflow (undefined behaviour) when shifted by 24. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((uint32_t)blue[k+0] << 0) | ((uint32_t)green[k+0] << 8) | ((uint32_t)red[k+0] << 16) | ((uint32_t)alpha[k+0] << 24);
+            const uint32_t second = ((uint32_t)blue[k+1] << 0) | ((uint32_t)green[k+1] << 8) | ((uint32_t)red[k+1] << 16) | ((uint32_t)alpha[k+1] << 24);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels
+ * (dst, dst_stride bytes per row, tile origin at pixel (x0, y0)): blue in
+ * bits 0-7, green 8-15, red 16-23; bits 24-31 are written as zero. */
+static void
+lp_tile_b8g8r8x8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((blue[k+0]) << 0) | ((green[k+0]) << 8) | ((red[k+0]) << 16);
+            const uint32_t second = ((blue[k+1]) << 0) | ((green[k+1]) << 8) | ((red[k+1]) << 16);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels (dst, dst_stride
+ * bytes per row, tile origin at pixel (x0, y0)): alpha bits 0-7, red 8-15, green 16-23, blue 24-31. */
+static void
+lp_tile_a8r8g8b8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red   = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = src + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = src + 3 * TILE_C_STRIDE;
+         /* Widen each channel to uint32_t before shifting: a promoted int holding
+          * a value >= 0x80 would overflow (undefined behaviour) when shifted by 24. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((uint32_t)alpha[k+0] << 0) | ((uint32_t)red[k+0] << 8) | ((uint32_t)green[k+0] << 16) | ((uint32_t)blue[k+0] << 24);
+            const uint32_t second = ((uint32_t)alpha[k+1] << 0) | ((uint32_t)red[k+1] << 8) | ((uint32_t)green[k+1] << 16) | ((uint32_t)blue[k+1] << 24);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels
+ * (dst, dst_stride bytes per row, tile origin at pixel (x0, y0)): red in
+ * bits 8-15, green 16-23, blue 24-31; bits 0-7 are written as zero. */
+static void
+lp_tile_x8r8g8b8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red   = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = src + 2 * TILE_C_STRIDE;
+         /* Widen each channel to uint32_t before shifting: a promoted int holding
+          * a value >= 0x80 would overflow (undefined behaviour) when shifted by 24. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((uint32_t)red[k+0] << 8) | ((uint32_t)green[k+0] << 16) | ((uint32_t)blue[k+0] << 24);
+            const uint32_t second = ((uint32_t)red[k+1] << 8) | ((uint32_t)green[k+1] << 16) | ((uint32_t)blue[k+1] << 24);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels (dst, dst_stride
+ * bytes per row, tile origin at pixel (x0, y0)): alpha bits 0-7, blue 8-15, green 16-23, red 24-31. */
+static void
+lp_tile_a8b8g8r8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red   = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = src + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = src + 3 * TILE_C_STRIDE;
+         /* Widen each channel to uint32_t before shifting: a promoted int holding
+          * a value >= 0x80 would overflow (undefined behaviour) when shifted by 24. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((uint32_t)alpha[k+0] << 0) | ((uint32_t)blue[k+0] << 8) | ((uint32_t)green[k+0] << 16) | ((uint32_t)red[k+0] << 24);
+            const uint32_t second = ((uint32_t)alpha[k+1] << 0) | ((uint32_t)blue[k+1] << 8) | ((uint32_t)green[k+1] << 16) | ((uint32_t)red[k+1] << 24);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels
+ * (dst, dst_stride bytes per row, tile origin at pixel (x0, y0)): blue in
+ * bits 8-15, green 16-23, red 24-31; bits 0-7 are written as zero. */
+static void
+lp_tile_x8b8g8r8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red   = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = src + 2 * TILE_C_STRIDE;
+         /* Widen each channel to uint32_t before shifting: a promoted int holding
+          * a value >= 0x80 would overflow (undefined behaviour) when shifted by 24. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((uint32_t)blue[k+0] << 8) | ((uint32_t)green[k+0] << 16) | ((uint32_t)red[k+0] << 24);
+            const uint32_t second = ((uint32_t)blue[k+1] << 8) | ((uint32_t)green[k+1] << 16) | ((uint32_t)red[k+1] << 24);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels
+ * (dst, dst_stride bytes per row, tile origin at pixel (x0, y0)): red in
+ * bits 0-7, green 8-15, blue 16-23; bits 24-31 are written as zero. */
+static void
+lp_tile_r8g8b8x8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((red[k+0]) << 0) | ((green[k+0]) << 8) | ((blue[k+0]) << 16);
+            const uint32_t second = ((red[k+1]) << 0) | ((green[k+1]) << 8) | ((blue[k+1]) << 16);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 16-bit pixels
+ * (dst, dst_stride bytes per row, tile origin at pixel (x0, y0)): the top five
+ * bits of blue, green and red go to bits 0-4, 5-9 and 10-14; bit 15 is zero. */
+static void
+lp_tile_b5g5r5x1_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint16_t *const out = (uint16_t *) dst;
+   const unsigned pitch = dst_stride / 2;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint16_t)(blue[k+0] >> 3)) << 0) | (((uint16_t)(green[k+0] >> 3)) << 5) | (((uint16_t)(red[k+0] >> 3)) << 10);
+            const uint32_t second = (((uint16_t)(blue[k+1] >> 3)) << 0) | (((uint16_t)(green[k+1] >> 3)) << 5) | (((uint16_t)(red[k+1] >> 3)) << 10);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 16-bit pixels (dst, dst_stride bytes per row,
+ * tile origin at pixel (x0, y0)): top 5 bits of blue/green/red at bits 0/5/10, top alpha bit at bit 15. */
+static void
+lp_tile_b5g5r5a1_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint16_t *const out = (uint16_t *) dst;
+   const unsigned pitch = dst_stride / 2;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = block + 3 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint16_t)(blue[k+0] >> 3)) << 0) | (((uint16_t)(green[k+0] >> 3)) << 5) | (((uint16_t)(red[k+0] >> 3)) << 10) | (((uint16_t)(alpha[k+0] >> 7)) << 15);
+            const uint32_t second = (((uint16_t)(blue[k+1] >> 3)) << 0) | (((uint16_t)(green[k+1] >> 3)) << 5) | (((uint16_t)(red[k+1] >> 3)) << 10) | (((uint16_t)(alpha[k+1] >> 7)) << 15);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 16-bit pixels (dst, dst_stride bytes per row,
+ * tile origin at pixel (x0, y0)): top 4 bits of blue/green/red/alpha at bits 0/4/8/12. */
+static void
+lp_tile_b4g4r4a4_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint16_t *const out = (uint16_t *) dst;
+   const unsigned pitch = dst_stride / 2;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = block + 3 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint16_t)(blue[k+0] >> 4)) << 0) | (((uint16_t)(green[k+0] >> 4)) << 4) | (((uint16_t)(red[k+0] >> 4)) << 8) | (((uint16_t)(alpha[k+0] >> 4)) << 12);
+            const uint32_t second = (((uint16_t)(blue[k+1] >> 4)) << 0) | (((uint16_t)(green[k+1] >> 4)) << 4) | (((uint16_t)(red[k+1] >> 4)) << 8) | (((uint16_t)(alpha[k+1] >> 4)) << 12);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 16-bit pixels
+ * (dst, dst_stride bytes per row, tile origin at pixel (x0, y0)): the top four
+ * bits of blue, green and red go to bits 0-3, 4-7 and 8-11; bits 12-15 are zero. */
+static void
+lp_tile_b4g4r4x4_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint16_t *const out = (uint16_t *) dst;
+   const unsigned pitch = dst_stride / 2;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint16_t)(blue[k+0] >> 4)) << 0) | (((uint16_t)(green[k+0] >> 4)) << 4) | (((uint16_t)(red[k+0] >> 4)) << 8);
+            const uint32_t second = (((uint16_t)(blue[k+1] >> 4)) << 0) | (((uint16_t)(green[k+1] >> 4)) << 4) | (((uint16_t)(red[k+1] >> 4)) << 8);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 16-bit pixels
+ * (dst, dst_stride bytes per row, tile origin at pixel (x0, y0)): top 5 bits of
+ * blue at bits 0-4, top 6 bits of green at 5-10, top 5 bits of red at 11-15. */
+static void
+lp_tile_b5g6r5_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint16_t *const out = (uint16_t *) dst;
+   const unsigned pitch = dst_stride / 2;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint16_t)(blue[k+0] >> 3)) << 0) | (((uint16_t)(green[k+0] >> 2)) << 5) | (((uint16_t)(red[k+0] >> 3)) << 11);
+            const uint32_t second = (((uint16_t)(blue[k+1] >> 3)) << 0) | (((uint16_t)(green[k+1] >> 2)) << 5) | (((uint16_t)(red[k+1] >> 3)) << 11);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels (dst, dst_stride bytes per row, tile origin
+ * at pixel (x0, y0)): red/green/blue rescaled from 0..0xff to 0..0x3ff at bits 0/10/20, top 2 alpha bits at bit 30. */
+static void
+lp_tile_r10g10b10a2_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = block + 3 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint32_t)(((uint32_t)red[k+0]) * 0x3ff / 0xff)) << 0) | (((uint32_t)(((uint32_t)green[k+0]) * 0x3ff / 0xff)) << 10) | (((uint32_t)(((uint32_t)blue[k+0]) * 0x3ff / 0xff)) << 20) | (((uint32_t)(alpha[k+0] >> 6)) << 30);
+            const uint32_t second = (((uint32_t)(((uint32_t)red[k+1]) * 0x3ff / 0xff)) << 0) | (((uint32_t)(((uint32_t)green[k+1]) * 0x3ff / 0xff)) << 10) | (((uint32_t)(((uint32_t)blue[k+1]) * 0x3ff / 0xff)) << 20) | (((uint32_t)(alpha[k+1] >> 6)) << 30);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA colour data (src) into linear 32-bit pixels (dst, dst_stride bytes per row, tile origin
+ * at pixel (x0, y0)): blue/green/red rescaled from 0..0xff to 0..0x3ff at bits 0/10/20, top 2 alpha bits at bit 30. */
+static void
+lp_tile_b10g10r10a2_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint32_t *const out = (uint32_t *) dst;
+   const unsigned pitch = dst_stride / 4;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *red   = block + 0 * TILE_C_STRIDE;
+         const uint8_t *green = block + 1 * TILE_C_STRIDE;
+         const uint8_t *blue  = block + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = block + 3 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint32_t)(((uint32_t)blue[k+0]) * 0x3ff / 0xff)) << 0) | (((uint32_t)(((uint32_t)green[k+0]) * 0x3ff / 0xff)) << 10) | (((uint32_t)(((uint32_t)red[k+0]) * 0x3ff / 0xff)) << 20) | (((uint32_t)(alpha[k+0] >> 6)) << 30);
+            const uint32_t second = (((uint32_t)(((uint32_t)blue[k+1]) * 0x3ff / 0xff)) << 0) | (((uint32_t)(((uint32_t)green[k+1]) * 0x3ff / 0xff)) << 10) | (((uint32_t)(((uint32_t)red[k+1]) * 0x3ff / 0xff)) << 20) | (((uint32_t)(alpha[k+1] >> 6)) << 30);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA data (src) into a linear one-byte-per-pixel
+ * image (dst, dst_stride bytes per row) with the tile origin at pixel (x0, y0).
+ * The stored byte is copied unchanged from the third channel plane (index 2,
+ * the blue plane) of each block; the remaining three channel planes are not
+ * read at all. */
+static void
+lp_tile_l8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *const out = (uint8_t *) dst;
+   const unsigned pitch = dst_stride / 1;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *lum = block + 2 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((lum[k+0]) << 0);
+            const uint32_t second = ((lum[k+1]) << 0);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA data (src) into a linear one-byte-per-pixel
+ * image (dst, dst_stride bytes per row) with the tile origin at pixel (x0, y0).
+ * The stored byte is copied unchanged from the fourth channel plane (index 3,
+ * the alpha plane) of each block; the remaining three channel planes are not
+ * read at all. */
+static void
+lp_tile_a8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *const out = (uint8_t *) dst;
+   const unsigned pitch = dst_stride / 1;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *alpha = block + 3 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((alpha[k+0]) << 0);
+            const uint32_t second = ((alpha[k+1]) << 0);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA data (src) into a linear one-byte-per-pixel
+ * image (dst, dst_stride bytes per row) with the tile origin at pixel (x0, y0).
+ * The stored intensity byte is copied unchanged from the fourth channel plane
+ * (index 3, the alpha plane) of each block; the remaining three channel planes
+ * are not read at all. */
+static void
+lp_tile_i8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *const out = (uint8_t *) dst;
+   const unsigned pitch = dst_stride / 1;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *alpha = block + 3 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((alpha[k+0]) << 0);
+            const uint32_t second = ((alpha[k+1]) << 0);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA data (src) into a linear one-byte-per-pixel
+ * image (dst, dst_stride bytes per row) with the tile origin at pixel (x0, y0).
+ * The low nibble holds the top four bits of the third channel plane (index 2)
+ * and the high nibble the top four bits of the fourth plane (index 3). */
+static void
+lp_tile_l4a4_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *const out = (uint8_t *) dst;
+   const unsigned pitch = dst_stride / 1;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *lum   = block + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = block + 3 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint8_t)(lum[k+0] >> 4)) << 0) | (((uint8_t)(alpha[k+0] >> 4)) << 4);
+            const uint32_t second = (((uint8_t)(lum[k+1] >> 4)) << 0) | (((uint8_t)(alpha[k+1] >> 4)) << 4);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA data (src) into linear 16-bit pixels (dst,
+ * dst_stride bytes per row) with the tile origin at pixel (x0, y0).  Bits 0-7
+ * come from the third channel plane (index 2) and bits 8-15 from the fourth
+ * plane (index 3); both are copied without rescaling. */
+static void
+lp_tile_l8a8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint16_t *const out = (uint16_t *) dst;
+   const unsigned pitch = dst_stride / 2;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *lum   = block + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = block + 3 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = ((lum[k+0]) << 0) | ((alpha[k+0]) << 8);
+            const uint32_t second = ((lum[k+1]) << 0) | ((alpha[k+1]) << 8);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Unswizzle one tile of 4ub SoA data (src) into linear 16-bit pixels (dst,
+ * dst_stride bytes per row) with the tile origin at pixel (x0, y0).  Each
+ * pixel is the third channel plane (index 2) rescaled from 0..0xff to
+ * 0..0xffff by multiplying with 0xffff and dividing by 0xff; the remaining
+ * three channel planes are not read at all. */
+static void
+lp_tile_l16_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint16_t *const out = (uint16_t *) dst;
+   const unsigned pitch = dst_stride / 2;   /* destination row pitch in pixels */
+   const uint8_t *block = src;              /* channel planes of the current block */
+   unsigned by, bx, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, block += TILE_X_STRIDE) {
+         const uint8_t *lum = block + 2 * TILE_C_STRIDE;
+         /* Pixels k and k+1 are stored side by side, so emit them in pairs. */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * pitch + (x0 + bx + tile_x_offset[k]);
+            const uint32_t first = (((uint16_t)(((uint32_t)lum[k+0]) * 0xffff / 0xff)) << 0);
+            const uint32_t second = (((uint16_t)(((uint32_t)lum[k+1]) * 0xffff / 0xff)) << 0);
+            out[at + 0] = first;
+            out[at + 1] = second;
+         }
+      }
+   }
+}
+
+/* Copy channel 2 of every tile pixel, unconverted, to a linear 1-byte-per-pixel image at pixel (x0, y0). */
+static void
+lp_tile_l8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*1);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 1) {
+         texel[0] = TILE_PIXEL(src, col, row, 2);
+      }
+   }
+}
+
+/* Copy channels 2 and 3 of every tile pixel, unconverted, to a linear 2-byte-per-pixel image at pixel (x0, y0). */
+static void
+lp_tile_l8a8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*2);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 2) {
+         texel[0] = TILE_PIXEL(src, col, row, 2);
+         texel[1] = TILE_PIXEL(src, col, row, 3);
+      }
+   }
+}
+
+/* Copy channels 0, 1, 2 of every tile pixel, unconverted, to a linear 3-byte-per-pixel image at pixel (x0, y0). */
+static void
+lp_tile_r8g8b8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*3);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 3) {
+         texel[0] = TILE_PIXEL(src, col, row, 0);
+         texel[1] = TILE_PIXEL(src, col, row, 1);
+         texel[2] = TILE_PIXEL(src, col, row, 2);
+      }
+   }
+}
+
+/* Copy channels 0, 1, 2, 3 of every tile pixel, unconverted, to a linear 4-byte-per-pixel image at pixel (x0, y0). */
+static void
+lp_tile_r8g8b8a8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 4) {
+         texel[0] = TILE_PIXEL(src, col, row, 0);
+         texel[1] = TILE_PIXEL(src, col, row, 1);
+         texel[2] = TILE_PIXEL(src, col, row, 2);
+         texel[3] = TILE_PIXEL(src, col, row, 3);
+      }
+   }
+}
+
+/* Copy channels 3, 2, 1, 0 (in that byte order) of every tile pixel, unconverted, to a linear 4-byte-per-pixel image. */
+static void
+lp_tile_a8b8g8r8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 4) {
+         texel[0] = TILE_PIXEL(src, col, row, 3);
+         texel[1] = TILE_PIXEL(src, col, row, 2);
+         texel[2] = TILE_PIXEL(src, col, row, 1);
+         texel[3] = TILE_PIXEL(src, col, row, 0);
+      }
+   }
+}
+
+/* Copy channels 2, 1, 0 of every tile pixel, unconverted, into bytes 1..3 of a linear 4-byte-per-pixel image. */
+static void
+lp_tile_x8b8g8r8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 4) {
+         /* texel[0] is skipped: the destination byte keeps its previous value */
+         texel[1] = TILE_PIXEL(src, col, row, 2);
+         texel[2] = TILE_PIXEL(src, col, row, 1);
+         texel[3] = TILE_PIXEL(src, col, row, 0);
+      }
+   }
+}
+
+/* Copy channels 2, 1, 0, 3 (in that byte order) of every tile pixel, unconverted, to a linear 4-byte-per-pixel image. */
+static void
+lp_tile_b8g8r8a8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 4) {
+         texel[0] = TILE_PIXEL(src, col, row, 2);
+         texel[1] = TILE_PIXEL(src, col, row, 1);
+         texel[2] = TILE_PIXEL(src, col, row, 0);
+         texel[3] = TILE_PIXEL(src, col, row, 3);
+      }
+   }
+}
+
+/* Copy channels 2, 1, 0 of every tile pixel, unconverted, into bytes 0..2 of a linear 4-byte-per-pixel image. */
+static void
+lp_tile_b8g8r8x8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 4) {
+         texel[0] = TILE_PIXEL(src, col, row, 2);
+         texel[1] = TILE_PIXEL(src, col, row, 1);
+         texel[2] = TILE_PIXEL(src, col, row, 0);
+         /* texel[3] is skipped: the destination byte keeps its previous value */
+      }
+   }
+}
+
+/* Copy channels 3, 0, 1, 2 (in that byte order) of every tile pixel, unconverted, to a linear 4-byte-per-pixel image. */
+static void
+lp_tile_a8r8g8b8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   uint8_t *line = dst + y0*dst_stride;   /* start of the current destination row */
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, texel += 4) {
+         texel[0] = TILE_PIXEL(src, col, row, 3);
+         texel[1] = TILE_PIXEL(src, col, row, 0);
+         texel[2] = TILE_PIXEL(src, col, row, 1);
+         texel[3] = TILE_PIXEL(src, col, row, 2);
+      }
+   }
+}
+
+static void
+lp_tile_x8r8g8b8_srgb_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store one tile at (x0, y0) of dst as 4-byte pixels: bytes 1..3 receive channels 0, 1, 2 unmodified. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint8_t *out = (uint8_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         /* out[0] is skipped (never written) */
+         out[1] = TILE_PIXEL(src, col, row, 0);
+         out[2] = TILE_PIXEL(src, col, row, 1);
+         out[3] = TILE_PIXEL(src, col, row, 2);
+      }
+   }
+}
+
+static void
+lp_tile_r8sg8sb8ux8u_norm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store one tile at (x0, y0) as 4-byte pixels: channels 0 and 1 are halved (>> 1), channel 2 is copied. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int8_t *out = (int8_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = (int8_t)(TILE_PIXEL(src, col, row, 0) >> 1);
+         out[1] = (int8_t)(TILE_PIXEL(src, col, row, 1) >> 1);
+         out[2] = TILE_PIXEL(src, col, row, 2);
+         /* out[3] is skipped (never written) */
+      }
+   }
+}
+
+static void
+lp_tile_r10sg10sb10sa2u_norm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Pack one tile at (x0, y0) into 32-bit pixels: channels 0..2 at bits 0, 10, 20 and channel 3 at bit 30. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint32_t *out = (uint32_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, ++out) {
+         /* colour bytes are rescaled from [0, 0xff] to [0, 0x1ff]; channel 3 keeps only its top two bits */
+         const uint32_t r = (uint32_t)(((uint32_t)TILE_PIXEL(src, col, row, 0)) * 0x1ff / 0xff);
+         const uint32_t g = (uint32_t)(((uint32_t)TILE_PIXEL(src, col, row, 1)) * 0x1ff / 0xff);
+         const uint32_t b = (uint32_t)(((uint32_t)TILE_PIXEL(src, col, row, 2)) * 0x1ff / 0xff);
+         const uint32_t a = (uint32_t)(TILE_PIXEL(src, col, row, 3) >> 6);
+         *out = r | (g << 10) | (b << 20) | (a << 30);
+      }
+   }
+}
+
+static void
+lp_tile_r5sg5sb6u_norm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Pack one tile at (x0, y0) into 16-bit pixels: channel 0 at bit 0, channel 1 at bit 5, channel 2 at bit 10. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*2);
+      for (col = 0; col < TILE_SIZE; ++col, ++out) {
+         /* channels 0 and 1 keep their top 4 bits, channel 2 its top 6 bits */
+         const uint16_t r = (uint16_t)(TILE_PIXEL(src, col, row, 0) >> 4);
+         const uint16_t g = (uint16_t)(TILE_PIXEL(src, col, row, 1) >> 4);
+         const uint16_t b = (uint16_t)(TILE_PIXEL(src, col, row, 2) >> 2);
+         *out = (uint16_t)(r | (g << 5) | (b << 10));
+      }
+   }
+}
+
+static void
+lp_tile_r10g10b10a2_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Pack each tile pixel into one uint32_t of dst: r at bit 0, g at bit 10, b at bit 20, a at bit 30. */
+   const unsigned pitch = dst_stride / 4;   /* row pitch of dst counted in uint32_t pixels */
+   uint32_t *out = (uint32_t *) dst;
+   unsigned int bx, by, k;
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      const unsigned ybase = y0 + by;
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const unsigned xbase = x0 + bx;
+         const uint8_t *rp = src + 0 * TILE_C_STRIDE;
+         const uint8_t *gp = src + 1 * TILE_C_STRIDE;
+         const uint8_t *bp = src + 2 * TILE_C_STRIDE;
+         const uint8_t *ap = src + 3 * TILE_C_STRIDE;
+         (void) rp; (void) gp; (void) bp; (void) ap; /* silence warnings */
+         /* v * 0x1 / 0xff is an integer division: a field becomes 1 only for a 0xff source byte, else 0 */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (ybase + tile_y_offset[k]) * pitch + (xbase + tile_x_offset[k]);
+            const uint32_t even = (((uint32_t)(((uint32_t)rp[k+0]) * 0x1 / 0xff)) << 0) | (((uint32_t)(((uint32_t)gp[k+0]) * 0x1 / 0xff)) << 10) | (((uint32_t)(((uint32_t)bp[k+0]) * 0x1 / 0xff)) << 20) | (((uint32_t)(((uint32_t)ap[k+0]) * 0x1 / 0xff)) << 30);
+            const uint32_t odd = (((uint32_t)(((uint32_t)rp[k+1]) * 0x1 / 0xff)) << 0) | (((uint32_t)(((uint32_t)gp[k+1]) * 0x1 / 0xff)) << 10) | (((uint32_t)(((uint32_t)bp[k+1]) * 0x1 / 0xff)) << 20) | (((uint32_t)(((uint32_t)ap[k+1]) * 0x1 / 0xff)) << 30);
+            out[at + 0] = even;
+            out[at + 1] = odd;
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r64_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channel 0 of one tile at (x0, y0) as one double per pixel; the byte is scaled by 1.0f/0xff first. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      double *out = (double *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, ++out) {
+         out[0] = (double)(TILE_PIXEL(src, col, row, 0) * (1.0f/0xff));
+      }
+   }
+}
+
+static void
+lp_tile_r64g64_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0 and 1 of one tile at (x0, y0) as two doubles per pixel, each byte scaled by 1.0f/0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      double *out = (double *)(line + x0*16);
+      for (col = 0; col < TILE_SIZE; ++col, out += 2) {
+         out[0] = (double)(TILE_PIXEL(src, col, row, 0) * (1.0f/0xff));
+         out[1] = (double)(TILE_PIXEL(src, col, row, 1) * (1.0f/0xff));
+      }
+   }
+}
+
+static void
+lp_tile_r64g64b64_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 of one tile at (x0, y0) as three doubles per pixel, each byte scaled by 1.0f/0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      double *out = (double *)(line + x0*24);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = (double)(TILE_PIXEL(src, col, row, 0) * (1.0f/0xff));
+         out[1] = (double)(TILE_PIXEL(src, col, row, 1) * (1.0f/0xff));
+         out[2] = (double)(TILE_PIXEL(src, col, row, 2) * (1.0f/0xff));
+      }
+   }
+}
+
+static void
+lp_tile_r64g64b64a64_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 of one tile at (x0, y0) as four doubles per pixel, each byte scaled by 1.0f/0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      double *out = (double *)(line + x0*32);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = (double)(TILE_PIXEL(src, col, row, 0) * (1.0f/0xff));
+         out[1] = (double)(TILE_PIXEL(src, col, row, 1) * (1.0f/0xff));
+         out[2] = (double)(TILE_PIXEL(src, col, row, 2) * (1.0f/0xff));
+         out[3] = (double)(TILE_PIXEL(src, col, row, 3) * (1.0f/0xff));
+      }
+   }
+}
+
+static void
+lp_tile_r32_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channel 0 of one tile at (x0, y0) as one float per pixel, converted with ubyte_to_float(). */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      float *out = (float *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, ++out) {
+         out[0] = ubyte_to_float(TILE_PIXEL(src, col, row, 0));
+      }
+   }
+}
+
+static void
+lp_tile_r32g32_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0 and 1 of one tile at (x0, y0) as two floats per pixel, converted with ubyte_to_float(). */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      float *out = (float *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, out += 2) {
+         out[0] = ubyte_to_float(TILE_PIXEL(src, col, row, 0));
+         out[1] = ubyte_to_float(TILE_PIXEL(src, col, row, 1));
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 of one tile at (x0, y0) as three floats per pixel, converted with ubyte_to_float(). */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      float *out = (float *)(line + x0*12);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = ubyte_to_float(TILE_PIXEL(src, col, row, 0));
+         out[1] = ubyte_to_float(TILE_PIXEL(src, col, row, 1));
+         out[2] = ubyte_to_float(TILE_PIXEL(src, col, row, 2));
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32a32_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 of one tile at (x0, y0) as four floats per pixel, converted with ubyte_to_float(). */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      float *out = (float *)(line + x0*16);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = ubyte_to_float(TILE_PIXEL(src, col, row, 0));
+         out[1] = ubyte_to_float(TILE_PIXEL(src, col, row, 1));
+         out[2] = ubyte_to_float(TILE_PIXEL(src, col, row, 2));
+         out[3] = ubyte_to_float(TILE_PIXEL(src, col, row, 3));
+      }
+   }
+}
+
+static void
+lp_tile_r32_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store the first channel plane of one tile as one uint32_t per pixel, rescaled from 0xff to 0xffffffff. */
+   const unsigned pitch = dst_stride / 4;   /* row pitch of dst counted in uint32_t pixels */
+   uint32_t *out = (uint32_t *) dst;
+   unsigned int bx, by, k;
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      const unsigned ybase = y0 + by;
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const unsigned xbase = x0 + bx;
+         const uint8_t *rp = src + 0 * TILE_C_STRIDE;
+         const uint8_t *gp = src + 1 * TILE_C_STRIDE;
+         const uint8_t *bp = src + 2 * TILE_C_STRIDE;
+         const uint8_t *ap = src + 3 * TILE_C_STRIDE;
+         (void) rp; (void) gp; (void) bp; (void) ap; /* silence warnings */
+         /* entries k and k+1 go to adjacent destination pixels; the product is formed in 64 bits */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (ybase + tile_y_offset[k]) * pitch + (xbase + tile_x_offset[k]);
+            const uint32_t even = (uint32_t)(((uint64_t)rp[k+0]) * 0xffffffff / 0xff);
+            const uint32_t odd = (uint32_t)(((uint64_t)rp[k+1]) * 0xffffffff / 0xff);
+            out[at + 0] = even;
+            out[at + 1] = odd;
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r32g32_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0 and 1 of one tile at (x0, y0) as two uint32_t per pixel, rescaled from 0xff to 0xffffffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint32_t *out = (uint32_t *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, out += 2) {
+         out[0] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0xffffffff / 0xff);
+         out[1] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0xffffffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 of one tile at (x0, y0) as three uint32_t per pixel, rescaled from 0xff to 0xffffffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint32_t *out = (uint32_t *)(line + x0*12);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0xffffffff / 0xff);
+         out[1] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0xffffffff / 0xff);
+         out[2] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 2)) * 0xffffffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32a32_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 of one tile at (x0, y0) as four uint32_t per pixel, rescaled from 0xff to 0xffffffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint32_t *out = (uint32_t *)(line + x0*16);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0xffffffff / 0xff);
+         out[1] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0xffffffff / 0xff);
+         out[2] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 2)) * 0xffffffff / 0xff);
+         out[3] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 3)) * 0xffffffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store the first channel plane of one tile as one uint32_t per pixel holding byte * 0x1 / 0xff. */
+   const unsigned pitch = dst_stride / 4;   /* row pitch of dst counted in uint32_t pixels */
+   uint32_t *out = (uint32_t *) dst;
+   unsigned int bx, by, k;
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      const unsigned ybase = y0 + by;
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const unsigned xbase = x0 + bx;
+         const uint8_t *rp = src + 0 * TILE_C_STRIDE;
+         const uint8_t *gp = src + 1 * TILE_C_STRIDE;
+         const uint8_t *bp = src + 2 * TILE_C_STRIDE;
+         const uint8_t *ap = src + 3 * TILE_C_STRIDE;
+         (void) rp; (void) gp; (void) bp; (void) ap; /* silence warnings */
+         /* integer division: the stored value is 1 only for a 0xff source byte, otherwise 0 */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (ybase + tile_y_offset[k]) * pitch + (xbase + tile_x_offset[k]);
+            const uint32_t even = (uint32_t)(((uint64_t)rp[k+0]) * 0x1 / 0xff);
+            const uint32_t odd = (uint32_t)(((uint64_t)rp[k+1]) * 0x1 / 0xff);
+            out[at + 0] = even;
+            out[at + 1] = odd;
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r32g32_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0 and 1 as two uint32_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint32_t *out = (uint32_t *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, out += 2) {
+         out[0] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+         out[1] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 as three uint32_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint32_t *out = (uint32_t *)(line + x0*12);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+         out[1] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x1 / 0xff);
+         out[2] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 2)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32a32_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 as four uint32_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint32_t *out = (uint32_t *)(line + x0*16);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+         out[1] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x1 / 0xff);
+         out[2] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 2)) * 0x1 / 0xff);
+         out[3] = (uint32_t)(((uint64_t)TILE_PIXEL(src, col, row, 3)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channel 0 of one tile at (x0, y0) as one int32_t per pixel, rescaled from 0xff to 0x7fffffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int32_t *out = (int32_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, ++out) {
+         out[0] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x7fffffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0 and 1 of one tile at (x0, y0) as two int32_t per pixel, rescaled from 0xff to 0x7fffffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int32_t *out = (int32_t *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, out += 2) {
+         out[0] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x7fffffff / 0xff);
+         out[1] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x7fffffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 of one tile at (x0, y0) as three int32_t per pixel, rescaled from 0xff to 0x7fffffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int32_t *out = (int32_t *)(line + x0*12);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x7fffffff / 0xff);
+         out[1] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x7fffffff / 0xff);
+         out[2] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 2)) * 0x7fffffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32a32_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 of one tile at (x0, y0) as four int32_t per pixel, rescaled from 0xff to 0x7fffffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int32_t *out = (int32_t *)(line + x0*16);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x7fffffff / 0xff);
+         out[1] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x7fffffff / 0xff);
+         out[2] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 2)) * 0x7fffffff / 0xff);
+         out[3] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 3)) * 0x7fffffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channel 0 as one int32_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int32_t *out = (int32_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, ++out) {
+         out[0] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0 and 1 as two int32_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int32_t *out = (int32_t *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, out += 2) {
+         out[0] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+         out[1] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 as three int32_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int32_t *out = (int32_t *)(line + x0*12);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+         out[1] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x1 / 0xff);
+         out[2] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 2)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r32g32b32a32_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 as four int32_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int32_t *out = (int32_t *)(line + x0*16);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+         out[1] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 1)) * 0x1 / 0xff);
+         out[2] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 2)) * 0x1 / 0xff);
+         out[3] = (int32_t)(((uint64_t)TILE_PIXEL(src, col, row, 3)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channel 0 as one 16-bit value per pixel: byte * (1.0f/0xff) passed through util_float_to_half(). */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*2);
+      for (col = 0; col < TILE_SIZE; ++col, ++out) {
+         out[0] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 0) * (1.0f/0xff)));
+      }
+   }
+}
+
+static void
+lp_tile_r16g16_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0 and 1 as two 16-bit values per pixel: byte * (1.0f/0xff) through util_float_to_half(). */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, out += 2) {
+         out[0] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 0) * (1.0f/0xff)));
+         out[1] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 1) * (1.0f/0xff)));
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 as three 16-bit values per pixel: byte * (1.0f/0xff) through util_float_to_half(). */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*6);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 0) * (1.0f/0xff)));
+         out[1] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 1) * (1.0f/0xff)));
+         out[2] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 2) * (1.0f/0xff)));
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16a16_float_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 as four 16-bit values per pixel: byte * (1.0f/0xff) through util_float_to_half(). */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 0) * (1.0f/0xff)));
+         out[1] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 1) * (1.0f/0xff)));
+         out[2] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 2) * (1.0f/0xff)));
+         out[3] = util_float_to_half((float)(TILE_PIXEL(src, col, row, 3) * (1.0f/0xff)));
+      }
+   }
+}
+
+static void
+lp_tile_r16_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store the first channel plane of one tile as one uint16_t per pixel, rescaled from 0xff to 0xffff. */
+   const unsigned pitch = dst_stride / 2;   /* row pitch of dst counted in uint16_t pixels */
+   uint16_t *out = (uint16_t *) dst;
+   unsigned int bx, by, k;
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      const unsigned ybase = y0 + by;
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const unsigned xbase = x0 + bx;
+         const uint8_t *rp = src + 0 * TILE_C_STRIDE;
+         const uint8_t *gp = src + 1 * TILE_C_STRIDE;
+         const uint8_t *bp = src + 2 * TILE_C_STRIDE;
+         const uint8_t *ap = src + 3 * TILE_C_STRIDE;
+         (void) rp; (void) gp; (void) bp; (void) ap; /* silence warnings */
+         /* entries k and k+1 go to adjacent destination pixels located via the tile offset tables */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (ybase + tile_y_offset[k]) * pitch + (xbase + tile_x_offset[k]);
+            const uint32_t even = (uint16_t)(((uint32_t)rp[k+0]) * 0xffff / 0xff);
+            const uint32_t odd = (uint16_t)(((uint32_t)rp[k+1]) * 0xffff / 0xff);
+            out[at + 0] = even;
+            out[at + 1] = odd;
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r16g16_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Convert one tile of 8-bit channel planes (src) into packed 32-bit pixels of dst, starting at
+    * pixel (x0, y0); dst_stride is the destination row size in bytes.  Each output pixel holds the
+    * r byte rescaled from 0xff to 0xffff in bits 0..15 and the g byte, rescaled the same way, in
+    * bits 16..31.  The tile is walked in TILE_VECTOR_WIDTH x TILE_VECTOR_HEIGHT blocks. */
+   const unsigned pitch = dst_stride / 4;   /* row pitch of dst counted in uint32_t pixels */
+   uint32_t *out = (uint32_t *) dst;
+   unsigned int bx, by, k;
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      const unsigned ybase = y0 + by;
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const unsigned xbase = x0 + bx;
+         const uint8_t *rp = src + 0 * TILE_C_STRIDE;
+         const uint8_t *gp = src + 1 * TILE_C_STRIDE;
+         /* widen to uint32_t before "<< 16": a uint16_t promotes to int, where the shift can overflow */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (ybase + tile_y_offset[k]) * pitch + (xbase + tile_x_offset[k]);
+            const uint32_t even = ((uint32_t)(uint16_t)(((uint32_t)rp[k+0]) * 0xffff / 0xff)) | (((uint32_t)(uint16_t)(((uint32_t)gp[k+0]) * 0xffff / 0xff)) << 16);
+            const uint32_t odd = ((uint32_t)(uint16_t)(((uint32_t)rp[k+1]) * 0xffff / 0xff)) | (((uint32_t)(uint16_t)(((uint32_t)gp[k+1]) * 0xffff / 0xff)) << 16);
+            out[at + 0] = even;
+            out[at + 1] = odd;
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 of one tile at (x0, y0) as three uint16_t per pixel, rescaled from 0xff to 0xffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*6);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 0)) * 0xffff / 0xff);
+         out[1] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 1)) * 0xffff / 0xff);
+         out[2] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 2)) * 0xffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16a16_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 of one tile at (x0, y0) as four uint16_t per pixel, rescaled from 0xff to 0xffff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 0)) * 0xffff / 0xff);
+         out[1] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 1)) * 0xffff / 0xff);
+         out[2] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 2)) * 0xffff / 0xff);
+         out[3] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 3)) * 0xffff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store the first channel plane of one tile as one uint16_t per pixel holding byte * 0x1 / 0xff. */
+   const unsigned pitch = dst_stride / 2;   /* row pitch of dst counted in uint16_t pixels */
+   uint16_t *out = (uint16_t *) dst;
+   unsigned int bx, by, k;
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      const unsigned ybase = y0 + by;
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const unsigned xbase = x0 + bx;
+         const uint8_t *rp = src + 0 * TILE_C_STRIDE;
+         const uint8_t *gp = src + 1 * TILE_C_STRIDE;
+         const uint8_t *bp = src + 2 * TILE_C_STRIDE;
+         const uint8_t *ap = src + 3 * TILE_C_STRIDE;
+         (void) rp; (void) gp; (void) bp; (void) ap; /* silence warnings */
+         /* integer division: the stored value is 1 only for a 0xff source byte, otherwise 0 */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (ybase + tile_y_offset[k]) * pitch + (xbase + tile_x_offset[k]);
+            const uint32_t even = (uint16_t)(((uint32_t)rp[k+0]) * 0x1 / 0xff);
+            const uint32_t odd = (uint16_t)(((uint32_t)rp[k+1]) * 0x1 / 0xff);
+            out[at + 0] = even;
+            out[at + 1] = odd;
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r16g16_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Pack each tile pixel into one uint32_t of dst: r * 0x1 / 0xff at bit 0 and g * 0x1 / 0xff at bit 16. */
+   const unsigned pitch = dst_stride / 4;   /* row pitch of dst counted in uint32_t pixels */
+   uint32_t *out = (uint32_t *) dst;
+   unsigned int bx, by, k;
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      const unsigned ybase = y0 + by;
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const unsigned xbase = x0 + bx;
+         const uint8_t *rp = src + 0 * TILE_C_STRIDE;
+         const uint8_t *gp = src + 1 * TILE_C_STRIDE;
+         const uint8_t *bp = src + 2 * TILE_C_STRIDE;
+         const uint8_t *ap = src + 3 * TILE_C_STRIDE;
+         (void) rp; (void) gp; (void) bp; (void) ap; /* silence warnings */
+         /* integer division: each 16-bit field is 1 only for a 0xff source byte, otherwise 0 */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (ybase + tile_y_offset[k]) * pitch + (xbase + tile_x_offset[k]);
+            const uint32_t even = ((uint16_t)(((uint32_t)rp[k+0]) * 0x1 / 0xff)) | (((uint16_t)(((uint32_t)gp[k+0]) * 0x1 / 0xff)) << 16);
+            const uint32_t odd = ((uint16_t)(((uint32_t)rp[k+1]) * 0x1 / 0xff)) | (((uint16_t)(((uint32_t)gp[k+1]) * 0x1 / 0xff)) << 16);
+            out[at + 0] = even;
+            out[at + 1] = odd;
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 as three uint16_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*6);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+         out[1] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 1)) * 0x1 / 0xff);
+         out[2] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 2)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16a16_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..3 as four uint16_t per pixel; byte * 0x1 / 0xff yields 1 only for 0xff, else 0. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      uint16_t *out = (uint16_t *)(line + x0*8);
+      for (col = 0; col < TILE_SIZE; ++col, out += 4) {
+         out[0] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 0)) * 0x1 / 0xff);
+         out[1] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 1)) * 0x1 / 0xff);
+         out[2] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 2)) * 0x1 / 0xff);
+         out[3] = (uint16_t)(((uint32_t)TILE_PIXEL(src, col, row, 3)) * 0x1 / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channel 0 of one tile at (x0, y0) as one int16_t per pixel, rescaled from 0xff to 0x7fff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int16_t *out = (int16_t *)(line + x0*2);
+      for (col = 0; col < TILE_SIZE; ++col, ++out) {
+         out[0] = (int16_t)(((uint32_t)TILE_PIXEL(src, col, row, 0)) * 0x7fff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16g16_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0 and 1 of one tile at (x0, y0) as two int16_t per pixel, rescaled from 0xff to 0x7fff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int16_t *out = (int16_t *)(line + x0*4);
+      for (col = 0; col < TILE_SIZE; ++col, out += 2) {
+         out[0] = (int16_t)(((uint32_t)TILE_PIXEL(src, col, row, 0)) * 0x7fff / 0xff);
+         out[1] = (int16_t)(((uint32_t)TILE_PIXEL(src, col, row, 1)) * 0x7fff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Store channels 0..2 of one tile at (x0, y0) as three int16_t per pixel, rescaled from 0xff to 0x7fff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned col, row;
+   for (row = 0; row < TILE_SIZE; ++row, line += dst_stride) {
+      int16_t *out = (int16_t *)(line + x0*6);
+      for (col = 0; col < TILE_SIZE; ++col, out += 3) {
+         out[0] = (int16_t)(((uint32_t)TILE_PIXEL(src, col, row, 0)) * 0x7fff / 0xff);
+         out[1] = (int16_t)(((uint32_t)TILE_PIXEL(src, col, row, 1)) * 0x7fff / 0xff);
+         out[2] = (int16_t)(((uint32_t)TILE_PIXEL(src, col, row, 2)) * 0x7fff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16a16_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-3 as four int16_t per texel (8 bytes), each rescaled by 0x7fff / 0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int16_t *texel = (int16_t *)(line + x0*8);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         texel[0] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) * 0x7fff / 0xff);
+         texel[1] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) * 0x7fff / 0xff);
+         texel[2] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 2)) * 0x7fff / 0xff);
+         texel[3] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 3)) * 0x7fff / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channel 0 as one int16_t per texel; value / 0xff is 1 only at 0xff, assuming byte-valued TILE_PIXEL. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int16_t *texel = (int16_t *)(line + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         texel[0] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16g16_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-1 as two int16_t per texel (4 bytes); each is the channel value divided by 0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int16_t *texel = (int16_t *)(line + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 2) {
+         texel[0] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+         texel[1] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-2 as three int16_t per texel (6 bytes); each is the channel value divided by 0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int16_t *texel = (int16_t *)(line + x0*6);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 3) {
+         texel[0] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+         texel[1] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) / 0xff);
+         texel[2] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 2)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r16g16b16a16_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-3 as four int16_t per texel (8 bytes); each is the channel value divided by 0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int16_t *texel = (int16_t *)(line + x0*8);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         texel[0] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+         texel[1] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) / 0xff);
+         texel[2] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 2)) / 0xff);
+         texel[3] = (int16_t)(((uint32_t)TILE_PIXEL(src, x, y, 3)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /*
+    * Copy the first channel plane of a tile into one-byte texels of dst,
+    * walking the tile in TILE_VECTOR_WIDTH x TILE_VECTOR_HEIGHT blocks.
+    * src advances by TILE_X_STRIDE after every block; the remaining
+    * channel planes are not read.
+    */
+   uint8_t *texels = (uint8_t *) dst;
+   const unsigned texel_stride = dst_stride / 1;   /* row pitch in texels */
+   unsigned int bx, by, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red = src + 0 * TILE_C_STRIDE;
+         const unsigned top = y0 + by;
+         const unsigned left = x0 + bx;
+         /* entries k and k+1 land on adjacent texels, so one offset lookup serves both */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (top + tile_y_offset[k]) * texel_stride + (left + tile_x_offset[k]);
+            texels[at + 0] = red[k+0];
+            texels[at + 1] = red[k+1];
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r8g8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /*
+    * Pack the first two channel planes of a tile into 16-bit texels of dst
+    * (plane 0 in bits 0-7, plane 1 in bits 8-15), walking the tile in
+    * TILE_VECTOR_WIDTH x TILE_VECTOR_HEIGHT blocks.
+    */
+   uint16_t *texels = (uint16_t *) dst;
+   const unsigned texel_stride = dst_stride / 2;   /* row pitch in 16-bit texels */
+   unsigned int bx, by, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const unsigned top = y0 + by;
+         const unsigned left = x0 + bx;
+         /* entries k and k+1 land on adjacent texels, so one offset lookup serves both */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (top + tile_y_offset[k]) * texel_stride + (left + tile_x_offset[k]);
+            texels[at + 0] = (uint16_t)(red[k+0] | (green[k+0] << 8));
+            texels[at + 1] = (uint16_t)(red[k+1] | (green[k+1] << 8));
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r8g8b8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Copy tile channels 0-2 unchanged into three consecutive bytes per texel of dst. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*3);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 3) {
+         texel[0] = TILE_PIXEL(src, x, y, 0);
+         texel[1] = TILE_PIXEL(src, x, y, 1);
+         texel[2] = TILE_PIXEL(src, x, y, 2);
+      }
+   }
+}
+
+static void
+lp_tile_r8g8b8a8_unorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /*
+    * Pack the four channel planes of a tile into 32-bit texels of dst
+    * (plane 0 in bits 0-7, plane 1 in 8-15, plane 2 in 16-23, plane 3 in
+    * 24-31), walking the tile in TILE_VECTOR_WIDTH x TILE_VECTOR_HEIGHT blocks.
+    */
+   uint32_t *texels = (uint32_t *) dst;
+   const unsigned texel_stride = dst_stride / 4;   /* row pitch in 32-bit texels */
+   unsigned int bx, by, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const uint8_t *blue = src + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = src + 3 * TILE_C_STRIDE;
+         /* widen to uint32_t before shifting: a byte >= 0x80 promoted to int and shifted by 24 overflows int */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (y0 + by + tile_y_offset[k]) * texel_stride + (x0 + bx + tile_x_offset[k]);
+            texels[at + 0] = (uint32_t)red[k+0] | ((uint32_t)green[k+0] << 8) | ((uint32_t)blue[k+0] << 16) | ((uint32_t)alpha[k+0] << 24);
+            texels[at + 1] = (uint32_t)red[k+1] | ((uint32_t)green[k+1] << 8) | ((uint32_t)blue[k+1] << 16) | ((uint32_t)alpha[k+1] << 24);
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r8_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /*
+    * Write one byte per texel of dst from the first channel plane of a tile,
+    * walking the tile in TILE_VECTOR_WIDTH x TILE_VECTOR_HEIGHT blocks.
+    * Each source byte is integer-divided by 0xff, so the stored value is
+    * 1 for 0xff and 0 for anything else.
+    */
+   uint8_t *texels = (uint8_t *) dst;
+   const unsigned texel_stride = dst_stride / 1;   /* row pitch in texels */
+   unsigned int bx, by, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red = src + 0 * TILE_C_STRIDE;
+         const unsigned top = y0 + by;
+         const unsigned left = x0 + bx;
+         /* entries k and k+1 land on adjacent texels, so one offset lookup serves both */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (top + tile_y_offset[k]) * texel_stride + (left + tile_x_offset[k]);
+            texels[at + 0] = (uint8_t)((uint32_t)red[k+0] / 0xff);
+            texels[at + 1] = (uint8_t)((uint32_t)red[k+1] / 0xff);
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r8g8_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /*
+    * Pack the first two channel planes of a tile into 16-bit texels of dst
+    * (plane 0 in bits 0-7, plane 1 in bits 8-15). Each source byte is
+    * integer-divided by 0xff first, so every field is 1 for 0xff, else 0.
+    */
+   uint16_t *texels = (uint16_t *) dst;
+   const unsigned texel_stride = dst_stride / 2;   /* row pitch in 16-bit texels */
+   unsigned int bx, by, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const unsigned top = y0 + by;
+         const unsigned left = x0 + bx;
+         /* entries k and k+1 land on adjacent texels, so one offset lookup serves both */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (top + tile_y_offset[k]) * texel_stride + (left + tile_x_offset[k]);
+            texels[at + 0] = (uint16_t)(((uint32_t)red[k+0] / 0xff) | (((uint32_t)green[k+0] / 0xff) << 8));
+            texels[at + 1] = (uint16_t)(((uint32_t)red[k+1] / 0xff) | (((uint32_t)green[k+1] / 0xff) << 8));
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r8g8b8_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-2 as three bytes per texel; each is the channel value divided by 0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      uint8_t *texel = (uint8_t *)(line + x0*3);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 3) {
+         texel[0] = (uint8_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+         texel[1] = (uint8_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) / 0xff);
+         texel[2] = (uint8_t)(((uint32_t)TILE_PIXEL(src, x, y, 2)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r8g8b8a8_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /*
+    * Pack the four channel planes of a tile into 32-bit texels of dst, one byte
+    * field per plane; each source byte is divided by 0xff (1 for 0xff, else 0).
+    */
+   uint32_t *texels = (uint32_t *) dst;
+   const unsigned texel_stride = dst_stride / 4;   /* row pitch in 32-bit texels */
+   unsigned int bx, by, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const uint8_t *blue = src + 2 * TILE_C_STRIDE;
+         const uint8_t *alpha = src + 3 * TILE_C_STRIDE;
+         const unsigned top = y0 + by;
+         const unsigned left = x0 + bx;
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (top + tile_y_offset[k]) * texel_stride + (left + tile_x_offset[k]);
+            texels[at + 0] = ((uint32_t)red[k+0] / 0xff) | (((uint32_t)green[k+0] / 0xff) << 8) | (((uint32_t)blue[k+0] / 0xff) << 16) | (((uint32_t)alpha[k+0] / 0xff) << 24);
+            texels[at + 1] = ((uint32_t)red[k+1] / 0xff) | (((uint32_t)green[k+1] / 0xff) << 8) | (((uint32_t)blue[k+1] / 0xff) << 16) | (((uint32_t)alpha[k+1] / 0xff) << 24);
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r8_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channel 0 as one int8_t per texel; the value is shifted right by one bit before the cast. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int8_t *texel = (int8_t *)(line + x0*1);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         texel[0] = (int8_t)(TILE_PIXEL(src, x, y, 0) >> 1);
+      }
+   }
+}
+
+static void
+lp_tile_r8g8_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-1 as two int8_t per texel; each value is shifted right by one bit before the cast. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int8_t *texel = (int8_t *)(line + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 2) {
+         texel[0] = (int8_t)(TILE_PIXEL(src, x, y, 0) >> 1);
+         texel[1] = (int8_t)(TILE_PIXEL(src, x, y, 1) >> 1);
+      }
+   }
+}
+
+static void
+lp_tile_r8g8b8_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-2 as three int8_t per texel; each value is shifted right by one bit before the cast. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int8_t *texel = (int8_t *)(line + x0*3);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 3) {
+         texel[0] = (int8_t)(TILE_PIXEL(src, x, y, 0) >> 1);
+         texel[1] = (int8_t)(TILE_PIXEL(src, x, y, 1) >> 1);
+         texel[2] = (int8_t)(TILE_PIXEL(src, x, y, 2) >> 1);
+      }
+   }
+}
+
+static void
+lp_tile_r8g8b8a8_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-3 as four int8_t per texel; each value is shifted right by one bit before the cast. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int8_t *texel = (int8_t *)(line + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         texel[0] = (int8_t)(TILE_PIXEL(src, x, y, 0) >> 1);
+         texel[1] = (int8_t)(TILE_PIXEL(src, x, y, 1) >> 1);
+         texel[2] = (int8_t)(TILE_PIXEL(src, x, y, 2) >> 1);
+         texel[3] = (int8_t)(TILE_PIXEL(src, x, y, 3) >> 1);
+      }
+   }
+}
+
+static void
+lp_tile_r8_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channel 0 as one int8_t per texel; value / 0xff is 1 only at 0xff, assuming byte-valued TILE_PIXEL. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int8_t *texel = (int8_t *)(line + x0*1);
+      for (x = 0; x < TILE_SIZE; ++x, ++texel) {
+         texel[0] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r8g8_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-1 as two int8_t per texel; each is the channel value divided by 0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int8_t *texel = (int8_t *)(line + x0*2);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 2) {
+         texel[0] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+         texel[1] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r8g8b8_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-2 as three int8_t per texel; each is the channel value divided by 0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int8_t *texel = (int8_t *)(line + x0*3);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 3) {
+         texel[0] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+         texel[1] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) / 0xff);
+         texel[2] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 2)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r8g8b8a8_sscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Emit tile channels 0-3 as four int8_t per texel; each is the channel value divided by 0xff. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      int8_t *texel = (int8_t *)(line + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x, texel += 4) {
+         texel[0] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) / 0xff);
+         texel[1] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) / 0xff);
+         texel[2] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 2)) / 0xff);
+         texel[3] = (int8_t)(((uint32_t)TILE_PIXEL(src, x, y, 3)) / 0xff);
+      }
+   }
+}
+
+static void
+lp_tile_r10g10b10x2_uscaled_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /*
+    * Pack the first three channel planes of a tile into 32-bit texels of dst at bit
+    * offsets 0, 10 and 20; each source byte is divided by 0xff (1 for 0xff, else 0).
+    */
+   uint32_t *texels = (uint32_t *) dst;
+   const unsigned texel_stride = dst_stride / 4;   /* row pitch in 32-bit texels */
+   unsigned int bx, by, k;
+
+   for (by = 0; by < TILE_SIZE; by += TILE_VECTOR_HEIGHT) {
+      for (bx = 0; bx < TILE_SIZE; bx += TILE_VECTOR_WIDTH, src += TILE_X_STRIDE) {
+         const uint8_t *red = src + 0 * TILE_C_STRIDE;
+         const uint8_t *green = src + 1 * TILE_C_STRIDE;
+         const uint8_t *blue = src + 2 * TILE_C_STRIDE;
+         const unsigned top = y0 + by;
+         const unsigned left = x0 + bx;
+         /* the fourth plane is not read and bits 30-31 of every texel stay zero */
+         for (k = 0; k < TILE_C_STRIDE; k += 2) {
+            const unsigned at = (top + tile_y_offset[k]) * texel_stride + (left + tile_x_offset[k]);
+            texels[at + 0] = ((uint32_t)red[k+0] / 0xff) | (((uint32_t)green[k+0] / 0xff) << 10) | (((uint32_t)blue[k+0] / 0xff) << 20);
+            texels[at + 1] = ((uint32_t)red[k+1] / 0xff) | (((uint32_t)green[k+1] / 0xff) << 10) | (((uint32_t)blue[k+1] / 0xff) << 20);
+         }
+      }
+   }
+}
+
+static void
+lp_tile_r10g10b10x2_snorm_unswizzle_4ub(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0)
+{
+   /* Pack tile channels 0-2 into one uint32_t per texel at bit offsets 0, 10 and 20. */
+   uint8_t *line = dst + y0*dst_stride;
+   unsigned x, y;
+   for (y = 0; y < TILE_SIZE; ++y, line += dst_stride) {
+      uint32_t *texel = (uint32_t *)(line + x0*4);
+      for (x = 0; x < TILE_SIZE; ++x) {
+         /* rescale by 0x1ff / 0xff: a byte input spans 0..0x1ff, so bit 9 of each field stays clear */
+         const uint32_t red = (uint32_t)(((uint32_t)TILE_PIXEL(src, x, y, 0)) * 0x1ff / 0xff);
+         const uint32_t green = (uint32_t)(((uint32_t)TILE_PIXEL(src, x, y, 1)) * 0x1ff / 0xff);
+         const uint32_t blue = (uint32_t)(((uint32_t)TILE_PIXEL(src, x, y, 2)) * 0x1ff / 0xff);
+         texel[x] = red | (green << 10) | (blue << 20);
+      }
+   }
+}
+
+void
+lp_tile_unswizzle_4ub(enum pipe_format format, const uint8_t *src, void *dst, unsigned dst_stride, unsigned x, unsigned y)
+{ /* Pick the lp_tile_*_unswizzle_4ub converter matching `format` and run it once on (src, dst, dst_stride, x, y). */
+   void (*func)(const uint8_t *src, uint8_t *dst, unsigned dst_stride, unsigned x0, unsigned y0); /* converter selected by the switch below */
+#ifdef DEBUG
+   lp_tile_unswizzle_count += 1; /* call counter, maintained only in DEBUG builds */
+#endif
+   switch(format) {
+   case PIPE_FORMAT_NONE:
+      func = lp_tile_none_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+#ifdef PIPE_ARCH_SSE
+      func = util_cpu_caps.has_sse2 ? lp_tile_b8g8r8a8_unorm_unswizzle_4ub_sse2 : lp_tile_b8g8r8a8_unorm_unswizzle_4ub; /* SSE2 variant only when the CPU caps report it */
+#else
+      func = lp_tile_b8g8r8a8_unorm_unswizzle_4ub;
+#endif
+      break;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      func = lp_tile_b8g8r8x8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      func = lp_tile_a8r8g8b8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      func = lp_tile_x8r8g8b8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
+      func = lp_tile_a8b8g8r8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_X8B8G8R8_UNORM:
+      func = lp_tile_x8b8g8r8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8X8_UNORM:
+      func = lp_tile_r8g8b8x8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B5G5R5X1_UNORM:
+      func = lp_tile_b5g5r5x1_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B5G5R5A1_UNORM:
+      func = lp_tile_b5g5r5a1_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B4G4R4A4_UNORM:
+      func = lp_tile_b4g4r4a4_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B4G4R4X4_UNORM:
+      func = lp_tile_b4g4r4x4_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B5G6R5_UNORM:
+      func = lp_tile_b5g6r5_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10G10B10A2_UNORM:
+      func = lp_tile_r10g10b10a2_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B10G10R10A2_UNORM:
+      func = lp_tile_b10g10r10a2_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_L8_UNORM:
+      func = lp_tile_l8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8_UNORM:
+      func = lp_tile_a8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_I8_UNORM:
+      func = lp_tile_i8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_L4A4_UNORM:
+      func = lp_tile_l4a4_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_L8A8_UNORM:
+      func = lp_tile_l8a8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_L16_UNORM:
+      func = lp_tile_l16_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_L8_SRGB:
+      func = lp_tile_l8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_L8A8_SRGB:
+      func = lp_tile_l8a8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_SRGB:
+      func = lp_tile_r8g8b8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_SRGB:
+      func = lp_tile_r8g8b8a8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8B8G8R8_SRGB:
+      func = lp_tile_a8b8g8r8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_X8B8G8R8_SRGB:
+      func = lp_tile_x8b8g8r8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B8G8R8A8_SRGB:
+      func = lp_tile_b8g8r8a8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_B8G8R8X8_SRGB:
+      func = lp_tile_b8g8r8x8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_A8R8G8B8_SRGB:
+      func = lp_tile_a8r8g8b8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_X8R8G8B8_SRGB:
+      func = lp_tile_x8r8g8b8_srgb_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
+      func = lp_tile_r8sg8sb8ux8u_norm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
+      func = lp_tile_r10sg10sb10sa2u_norm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R5SG5SB6U_NORM:
+      func = lp_tile_r5sg5sb6u_norm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10G10B10A2_USCALED:
+      func = lp_tile_r10g10b10a2_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R64_FLOAT:
+      func = lp_tile_r64_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      func = lp_tile_r64g64_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      func = lp_tile_r64g64b64_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      func = lp_tile_r64g64b64a64_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_FLOAT:
+      func = lp_tile_r32_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      func = lp_tile_r32g32_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      func = lp_tile_r32g32b32_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      func = lp_tile_r32g32b32a32_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_UNORM:
+      func = lp_tile_r32_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_UNORM:
+      func = lp_tile_r32g32_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      func = lp_tile_r32g32b32_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      func = lp_tile_r32g32b32a32_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_USCALED:
+      func = lp_tile_r32_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_USCALED:
+      func = lp_tile_r32g32_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      func = lp_tile_r32g32b32_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      func = lp_tile_r32g32b32a32_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_SNORM:
+      func = lp_tile_r32_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_SNORM:
+      func = lp_tile_r32g32_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      func = lp_tile_r32g32b32_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      func = lp_tile_r32g32b32a32_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32_SSCALED:
+      func = lp_tile_r32_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      func = lp_tile_r32g32_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      func = lp_tile_r32g32b32_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      func = lp_tile_r32g32b32a32_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_FLOAT:
+      func = lp_tile_r16_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_FLOAT:
+      func = lp_tile_r16g16_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_FLOAT:
+      func = lp_tile_r16g16b16_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_FLOAT:
+      func = lp_tile_r16g16b16a16_float_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_UNORM:
+      func = lp_tile_r16_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_UNORM:
+      func = lp_tile_r16g16_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      func = lp_tile_r16g16b16_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      func = lp_tile_r16g16b16a16_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_USCALED:
+      func = lp_tile_r16_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_USCALED:
+      func = lp_tile_r16g16_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      func = lp_tile_r16g16b16_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      func = lp_tile_r16g16b16a16_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_SNORM:
+      func = lp_tile_r16_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_SNORM:
+      func = lp_tile_r16g16_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      func = lp_tile_r16g16b16_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      func = lp_tile_r16g16b16a16_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16_SSCALED:
+      func = lp_tile_r16_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      func = lp_tile_r16g16_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      func = lp_tile_r16g16b16_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      func = lp_tile_r16g16b16a16_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8_UNORM:
+      func = lp_tile_r8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8_UNORM:
+      func = lp_tile_r8g8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      func = lp_tile_r8g8b8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      func = lp_tile_r8g8b8a8_unorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8_USCALED:
+      func = lp_tile_r8_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8_USCALED:
+      func = lp_tile_r8g8_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      func = lp_tile_r8g8b8_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      func = lp_tile_r8g8b8a8_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8_SNORM:
+      func = lp_tile_r8_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8_SNORM:
+      func = lp_tile_r8g8_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      func = lp_tile_r8g8b8_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      func = lp_tile_r8g8b8a8_snorm_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8_SSCALED:
+      func = lp_tile_r8_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      func = lp_tile_r8g8_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      func = lp_tile_r8g8b8_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      func = lp_tile_r8g8b8a8_sscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10G10B10X2_USCALED:
+      func = lp_tile_r10g10b10x2_uscaled_unswizzle_4ub;
+      break;
+   case PIPE_FORMAT_R10G10B10X2_SNORM:
+      func = lp_tile_r10g10b10x2_snorm_unswizzle_4ub;
+      break;
+   default: /* no converter for this format: report it and return without writing to dst */
+      debug_printf("%s: unsupported format %s\n", __FUNCTION__, util_format_name(format));
+      return;
+   }
+   func(src, (uint8_t *)dst, dst_stride, x, y); /* x and y are forwarded as the converter's x0 and y0 */
+}
+
diff --git a/lib/libGL/gallium/drivers/rbug/Makefile b/lib/libGL/gallium/drivers/rbug/Makefile
new file mode 100644
index 000000000..b38ab1195
--- /dev/null
+++ b/lib/libGL/gallium/drivers/rbug/Makefile
@@ -0,0 +1,11 @@
+# $OpenBSD: Makefile,v 1.1.1.1 2012/07/13 12:43:34 mpi Exp $
+# Names the rbug driver library and lists the C sources that make it up.
+LIB=	rbug
+
+SRCS=	rbug_core.c \
+	rbug_context.c \
+	rbug_objects.c \
+	rbug_screen.c
+
+# No rules are defined here; the build logic is whatever <bsd.xorg.mk> provides.
+.include <bsd.xorg.mk>
diff --git a/lib/libGL/gallium/drivers/softpipe/Makefile b/lib/libGL/gallium/drivers/softpipe/Makefile
new file mode 100644
index 000000000..7dd0edc8d
--- /dev/null
+++ b/lib/libGL/gallium/drivers/softpipe/Makefile
@@ -0,0 +1,37 @@
+# $OpenBSD: Makefile,v 1.1.1.1 2012/07/13 12:43:34 mpi Exp $
+# Names the softpipe driver library and lists its C sources; rules come from the .include below.
+LIB = softpipe
+
+SRCS = \
+	sp_fs_exec.c \
+	sp_fs_sse.c \
+	sp_clear.c \
+	sp_fence.c \
+	sp_flush.c \
+	sp_query.c \
+	sp_context.c \
+	sp_draw_arrays.c \
+	sp_prim_vbuf.c \
+	sp_quad_pipe.c \
+	sp_quad_stipple.c \
+	sp_quad_depth_test.c \
+	sp_quad_fs.c \
+	sp_quad_blend.c \
+	sp_screen.c \
+	sp_setup.c \
+	sp_state_blend.c \
+	sp_state_clip.c \
+	sp_state_derived.c \
+	sp_state_sampler.c \
+	sp_state_shader.c \
+	sp_state_so.c \
+	sp_state_rasterizer.c \
+	sp_state_surface.c \
+	sp_state_vertex.c \
+	sp_texture.c \
+	sp_tex_sample.c \
+	sp_tex_tile_cache.c \
+	sp_tile_cache.c \
+	sp_surface.c
+
+.include <bsd.xorg.mk>
diff --git a/lib/libGL/gallium/drivers/trace/Makefile b/lib/libGL/gallium/drivers/trace/Makefile
new file mode 100644
index 000000000..14e36a840
--- /dev/null
+++ b/lib/libGL/gallium/drivers/trace/Makefile
@@ -0,0 +1,12 @@
+# $OpenBSD: Makefile,v 1.1.1.1 2012/07/13 12:43:34 mpi Exp $
+# Names the trace driver library and lists the C sources that make it up.
+LIB=	trace
+
+SRCS=	tr_context.c \
+	tr_dump.c \
+	tr_dump_state.c \
+	tr_screen.c \
+	tr_texture.c
+
+# No rules are defined here; the build logic is whatever <bsd.xorg.mk> provides.
+.include <bsd.xorg.mk>