Update to pixman 0.32.4. Tested by naddy@ and ajacoutot@

author: Matthieu Herrb <matthieu@cvs.openbsd.org> 2013-12-01 20:34:21 +0000
committer: Matthieu Herrb <matthieu@cvs.openbsd.org> 2013-12-01 20:34:21 +0000
commit: a5bbbb8d49b940f16b8fca367fa3e2cc5489f862 (patch)
tree: 79a94b85f71cf67460335388866fdcdf78942987 /lib
parent: 8a0bfc9a32ee1e84d7274040f2902b8e1b459d0f (diff)
47 files changed, 3883 insertions, 1926 deletions
diff --git a/lib/pixman/Makefile.bsd-wrapper b/lib/pixman/Makefile.bsd-wrapper
index 50edd691d..186518b08 100644
--- a/lib/pixman/Makefile.bsd-wrapper
+++ b/lib/pixman/Makefile.bsd-wrapper
@@ -1,8 +1,8 @@
-# $OpenBSD: Makefile.bsd-wrapper,v 1.21 2013/08/13 07:07:28 guenther Exp $
+# $OpenBSD: Makefile.bsd-wrapper,v 1.22 2013/12/01 20:34:20 matthieu Exp $
 
 .include <bsd.own.mk>
 
-SHARED_LIBS=	pixman-1 31.0
+SHARED_LIBS=	pixman-1 32.4
 
 .if ${MACHINE_ARCH} == arm
 CONFIGURE_ARGS +=  --disable-arm-simd --disable-arm-neon
diff --git a/lib/pixman/Makefile.in b/lib/pixman/Makefile.in
index 6cfa1d14d..d9c9d1294 100644
--- a/lib/pixman/Makefile.in
+++ b/lib/pixman/Makefile.in
@@ -205,7 +205,7 @@ GREP = @GREP@
 GTK_CFLAGS = @GTK_CFLAGS@
 GTK_LIBS = @GTK_LIBS@
 HAVE_LIBPNG = @HAVE_LIBPNG@
-HAVE_PTHREAD_SETSPECIFIC = @HAVE_PTHREAD_SETSPECIFIC@
+HAVE_PTHREADS = @HAVE_PTHREADS@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -251,6 +251,7 @@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
 PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
 PNG_CFLAGS = @PNG_CFLAGS@
 PNG_LIBS = @PNG_LIBS@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
 PTHREAD_LDFLAGS = @PTHREAD_LDFLAGS@
 PTHREAD_LIBS = @PTHREAD_LIBS@
 RANLIB = @RANLIB@
@@ -259,6 +260,7 @@ SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 SSE2_CFLAGS = @SSE2_CFLAGS@
 SSE2_LDFLAGS = @SSE2_LDFLAGS@
+SSSE3_CFLAGS = @SSSE3_CFLAGS@
 STRIP = @STRIP@
 TESTPROGS_EXTRA_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
 TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR = @TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR@
diff --git a/lib/pixman/config.h.in b/lib/pixman/config.h.in
index d26107ff8..fbca39084 100644
--- a/lib/pixman/config.h.in
+++ b/lib/pixman/config.h.in
@@ -6,6 +6,9 @@
 /* Whether we have alarm() */
 #undef HAVE_ALARM
 
+/* Whether the compiler supports __builtin_clz */
+#undef HAVE_BUILTIN_CLZ
+
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #undef HAVE_DLFCN_H
 
@@ -48,8 +51,8 @@
 /* Whether we have posix_memalign() */
 #undef HAVE_POSIX_MEMALIGN
 
-/* Whether pthread_setspecific() is supported */
-#undef HAVE_PTHREAD_SETSPECIFIC
+/* Whether pthreads is supported */
+#undef HAVE_PTHREADS
 
 /* Whether we have sigaction() */
 #undef HAVE_SIGACTION
@@ -142,6 +145,9 @@
 /* use SSE2 compiler intrinsics */
 #undef USE_SSE2
 
+/* use SSSE3 compiler intrinsics */
+#undef USE_SSSE3
+
 /* use VMX compiler intrinsics */
 #undef USE_VMX
 
diff --git a/lib/pixman/configure b/lib/pixman/configure
index f68eac867..78dadb34f 100644
--- a/lib/pixman/configure
+++ b/lib/pixman/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for pixman 0.30.2.
+# Generated by GNU Autoconf 2.69 for pixman 0.32.4.
 #
 # Report bugs to <pixman@lists.freedesktop.org>.
 #
@@ -590,8 +590,8 @@ MAKEFLAGS=
 # Identity of this package.
 PACKAGE_NAME='pixman'
 PACKAGE_TARNAME='pixman'
-PACKAGE_VERSION='0.30.2'
-PACKAGE_STRING='pixman 0.30.2'
+PACKAGE_VERSION='0.32.4'
+PACKAGE_STRING='pixman 0.32.4'
 PACKAGE_BUGREPORT='pixman@lists.freedesktop.org'
 PACKAGE_URL=''
 
@@ -639,9 +639,10 @@ HAVE_LIBPNG
 PNG_LIBS
 PNG_CFLAGS
 TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR
+PTHREAD_CFLAGS
 PTHREAD_LIBS
 PTHREAD_LDFLAGS
-HAVE_PTHREAD_SETSPECIFIC
+HAVE_PTHREADS
 TOOLCHAIN_SUPPORTS__THREAD
 HAVE_GTK_FALSE
 HAVE_GTK_TRUE
@@ -665,12 +666,15 @@ USE_ARM_SIMD_TRUE
 USE_VMX_FALSE
 USE_VMX_TRUE
 VMX_CFLAGS
+SSSE3_CFLAGS
 SSE2_LDFLAGS
 SSE2_CFLAGS
 MMX_LDFLAGS
 MMX_CFLAGS
 IWMMXT_CFLAGS
 LS_CFLAGS
+USE_SSSE3_FALSE
+USE_SSSE3_TRUE
 USE_SSE2_FALSE
 USE_SSE2_TRUE
 USE_X86_MMX_FALSE
@@ -815,6 +819,7 @@ enable_openmp
 enable_loongson_mmi
 enable_mmx
 enable_sse2
+enable_ssse3
 enable_vmx
 enable_arm_simd
 enable_arm_neon
@@ -1385,7 +1390,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures pixman 0.30.2 to adapt to many kinds of systems.
+\`configure' configures pixman 0.32.4 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1455,7 +1460,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of pixman 0.30.2:";;
+     short | recursive ) echo "Configuration of pixman 0.32.4:";;
    esac
   cat <<\_ACEOF
 
@@ -1478,6 +1483,7 @@ Optional Features:
   --disable-loongson-mmi  disable Loongson MMI fast paths
   --disable-mmx           disable x86 MMX fast paths
   --disable-sse2          disable SSE2 fast paths
+  --disable-ssse3         disable SSSE3 fast paths
   --disable-vmx           disable VMX fast paths
   --disable-arm-simd      disable ARM SIMD fast paths
   --disable-arm-neon      disable ARM NEON fast paths
@@ -1589,7 +1595,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-pixman configure 0.30.2
+pixman configure 0.32.4
 generated by GNU Autoconf 2.69
 
 Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2187,7 +2193,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by pixman $as_me 0.30.2, which was
+It was created by pixman $as_me 0.32.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   $ $0 $@
@@ -3011,7 +3017,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='pixman'
- VERSION='0.30.2'
+ VERSION='0.32.4'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -12195,13 +12201,13 @@ fi
 
 
 
-LT_VERSION_INFO="30:2:30"
+LT_VERSION_INFO="32:4:32"
 
 PIXMAN_VERSION_MAJOR=0
 
-PIXMAN_VERSION_MINOR=30
+PIXMAN_VERSION_MINOR=32
 
-PIXMAN_VERSION_MICRO=2
+PIXMAN_VERSION_MICRO=4
 
 
 
@@ -12255,6 +12261,53 @@ rm -f core conftest.err conftest.$ac_objext \
 	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_yesno" >&5
 $as_echo "$_yesno" >&6; }
 
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports -Wdeclaration-after-statement" >&5
+$as_echo_n "checking whether the compiler supports -Wdeclaration-after-statement... " >&6; }
+		save_CFLAGS="$CFLAGS"
+	save_LDFLAGS="$LDFLAGS"
+	save_LIBS="$LIBS"
+	CFLAGS=""
+	LDFLAGS=""
+	LIBS=""
+	CFLAGS="$WERROR -Wdeclaration-after-statement"
+	CFLAGS="$save_CFLAGS $CFLAGS"
+	LDFLAGS="$save_LDFLAGS $LDFLAGS"
+	LIBS="$save_LIBS $LIBS"
+	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+		 int main(int c, char **v) { (void)c; (void)v; return 0; }
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=yes
+else
+  pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+	if test "x$pixman_cc_stderr" != "x"; then
+		pixman_cc_flag=no
+	fi
+
+	if test "x$pixman_cc_flag" = "xyes"; then
+		_yesno=yes
+	else
+		_yesno=no
+	fi
+	CFLAGS="$save_CFLAGS"
+	LDFLAGS="$save_LDFLAGS"
+	LIBS="$save_LIBS"
+
+	if test "x$_yesno" = xyes; then
+	   CFLAGS="$CFLAGS -Wdeclaration-after-statement"
+	fi
+	{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $_yesno" >&5
+$as_echo "$_yesno" >&6; }
+
 	{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports -fno-strict-aliasing" >&5
 $as_echo_n "checking whether the compiler supports -fno-strict-aliasing... " >&6; }
 		save_CFLAGS="$CFLAGS"
@@ -12675,6 +12728,12 @@ int main () {
         : "y" (v), "K" (5)
     );
 
+    /* Some versions of clang will choke on this */
+    asm ("pmulhuw %1, %0\n\t"
+	: "+y" (w)
+	: "y" (v)
+    );
+
     return _mm_cvtsi64_si32 (v);
 }
 _ACEOF
@@ -12793,6 +12852,69 @@ else
 fi
 
 
+
+if test "x$SSSE3_CFLAGS" = "x" ; then
+    SSSE3_CFLAGS="-mssse3 -Winline"
+fi
+
+have_ssse3_intrinsics=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use SSSE3 intrinsics" >&5
+$as_echo_n "checking whether to use SSSE3 intrinsics... " >&6; }
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSSE3_CFLAGS $CFLAGS"
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+int main () {
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    c = _mm_maddubs_epi16 (a, b);
+    return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+  have_ssse3_intrinsics=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+CFLAGS=$xserver_save_CFLAGS
+
+# Check whether --enable-ssse3 was given.
+if test "${enable_ssse3+set}" = set; then :
+  enableval=$enable_ssse3; enable_ssse3=$enableval
+else
+  enable_ssse3=auto
+fi
+
+
+if test $enable_ssse3 = no ; then
+   have_ssse3_intrinsics=disabled
+fi
+
+if test $have_ssse3_intrinsics = yes ; then
+
+$as_echo "#define USE_SSSE3 1" >>confdefs.h
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $have_ssse3_intrinsics" >&5
+$as_echo "$have_ssse3_intrinsics" >&6; }
+if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
+   as_fn_error $? "SSSE3 intrinsics not detected" "$LINENO" 5
+fi
+
+ if test $have_ssse3_intrinsics = yes; then
+  USE_SSSE3_TRUE=
+  USE_SSSE3_FALSE='#'
+else
+  USE_SSSE3_TRUE='#'
+  USE_SSSE3_FALSE=
+fi
+
+
 case $host_os in
    solaris*)
       # When building 32-bit binaries, apply a mapfile to ensure that the
@@ -12837,6 +12959,7 @@ esac
 
 
 
+
 if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
     VMX_CFLAGS="-faltivec"
 else
@@ -13060,8 +13183,8 @@ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 #ifndef __IWMMXT__
 #error "IWMMXT not enabled (with -march=iwmmxt)"
 #endif
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5))
-#error "Need GCC >= 4.5 for IWMMXT intrinsics"
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+#error "Need GCC >= 4.8 for IWMMXT intrinsics"
 #endif
 #include <mmintrin.h>
 int main () {
@@ -13933,13 +14056,12 @@ fi
 
 
 
-if test $ac_cv_tls = none ; then
-    support_for_pthread_setspecific=no
+support_for_pthreads=no
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthread_setspecific" >&5
-$as_echo_n "checking for pthread_setspecific... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for pthreads" >&5
+$as_echo_n "checking for pthreads... " >&6; }
 
-        if test "z$support_for_pthread_setspecific" != "zyes"; then
+    if test "z$support_for_pthreads" != "zyes"; then
 		save_CFLAGS="$CFLAGS"
 	save_LDFLAGS="$LDFLAGS"
 	save_LIBS="$LIBS"
@@ -14004,7 +14126,7 @@ rm -f core conftest.err conftest.$ac_objext \
 		PTHREAD_CFLAGS="$CFLAGS"
 		 PTHREAD_LIBS="$LIBS"
 		 PTHREAD_LDFLAGS="$LDFLAGS"
-		 support_for_pthread_setspecific=yes
+		 support_for_pthreads=yes
 	else
 		:
 	fi
@@ -14014,7 +14136,7 @@ rm -f core conftest.err conftest.$ac_objext \
 
     fi
 
-        if test "z$support_for_pthread_setspecific" != "zyes"; then
+    if test "z$support_for_pthreads" != "zyes"; then
 		save_CFLAGS="$CFLAGS"
 	save_LDFLAGS="$LDFLAGS"
 	save_LIBS="$LIBS"
@@ -14079,7 +14201,7 @@ rm -f core conftest.err conftest.$ac_objext \
 		PTHREAD_CFLAGS="$CFLAGS"
 		 PTHREAD_LIBS="$LIBS"
 		 PTHREAD_LDFLAGS="$LDFLAGS"
-		 support_for_pthread_setspecific=yes
+		 support_for_pthreads=yes
 	else
 		:
 	fi
@@ -14089,7 +14211,7 @@ rm -f core conftest.err conftest.$ac_objext \
 
     fi
 
-        if test "z$support_for_pthread_setspecific" != "zyes"; then
+    if test "z$support_for_pthreads" != "zyes"; then
 		save_CFLAGS="$CFLAGS"
 	save_LDFLAGS="$LDFLAGS"
 	save_LIBS="$LIBS"
@@ -14154,7 +14276,7 @@ rm -f core conftest.err conftest.$ac_objext \
 		PTHREAD_CFLAGS="$CFLAGS"
 		 PTHREAD_LIBS="$LIBS"
 		 PTHREAD_LDFLAGS="$LDFLAGS"
-		 support_for_pthread_setspecific=yes
+		 support_for_pthreads=yes
 	else
 		:
 	fi
@@ -14164,7 +14286,7 @@ rm -f core conftest.err conftest.$ac_objext \
 
     fi
 
-        if test "z$support_for_pthread_setspecific" != "zyes"; then
+    if test "z$support_for_pthreads" != "zyes"; then
 		save_CFLAGS="$CFLAGS"
 	save_LDFLAGS="$LDFLAGS"
 	save_LIBS="$LIBS"
@@ -14229,7 +14351,7 @@ rm -f core conftest.err conftest.$ac_objext \
 		PTHREAD_CFLAGS="$CFLAGS"
 		 PTHREAD_LIBS="$LIBS"
 		 PTHREAD_LDFLAGS="$LDFLAGS"
-		 support_for_pthread_setspecific=yes
+		 support_for_pthreads=yes
 	else
 		:
 	fi
@@ -14240,17 +14362,19 @@ rm -f core conftest.err conftest.$ac_objext \
     fi
 
 
-    if test $support_for_pthread_setspecific = yes; then
-	CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+if test $support_for_pthreads = yes; then
 
-$as_echo "#define HAVE_PTHREAD_SETSPECIFIC /**/" >>confdefs.h
+$as_echo "#define HAVE_PTHREADS /**/" >>confdefs.h
 
+    if test $ac_cv_tls = none ; then
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
     fi
-
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $support_for_pthread_setspecific" >&5
-$as_echo "$support_for_pthread_setspecific" >&6; };
 fi
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $support_for_pthreads" >&5
+$as_echo "$support_for_pthreads" >&6; }
+
+
 
 
 
@@ -14319,6 +14443,32 @@ fi
 $as_echo "$support_for_float128" >&6; }
 
 
+support_for_builtin_clz=no
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_clz" >&5
+$as_echo_n "checking for __builtin_clz... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+unsigned int x = 11; int main (void) { return __builtin_clz(x); }
+
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  support_for_builtin_clz=yes
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+
+if test x$support_for_builtin_clz = xyes; then
+
+$as_echo "#define HAVE_BUILTIN_CLZ /**/" >>confdefs.h
+
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $support_for_builtin_clz" >&5
+$as_echo "$support_for_builtin_clz" >&6; }
+
+
 # Check whether --enable-libpng was given.
 if test "${enable_libpng+set}" = set; then :
   enableval=$enable_libpng; have_libpng=$enableval
@@ -14653,6 +14803,10 @@ if test -z "${USE_SSE2_TRUE}" && test -z "${USE_SSE2_FALSE}"; then
   as_fn_error $? "conditional \"USE_SSE2\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
 fi
+if test -z "${USE_SSSE3_TRUE}" && test -z "${USE_SSSE3_FALSE}"; then
+  as_fn_error $? "conditional \"USE_SSSE3\" was never defined.
+Usually this means the macro was only invoked conditionally." "$LINENO" 5
+fi
 if test -z "${USE_VMX_TRUE}" && test -z "${USE_VMX_FALSE}"; then
   as_fn_error $? "conditional \"USE_VMX\" was never defined.
 Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -15078,7 +15232,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by pixman $as_me 0.30.2, which was
+This file was extended by pixman $as_me 0.32.4, which was
 generated by GNU Autoconf 2.69.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -15144,7 +15298,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-pixman config.status 0.30.2
+pixman config.status 0.32.4
 configured by $0, generated by GNU Autoconf 2.69,
   with options \\"\$ac_cs_config\\"
 
diff --git a/lib/pixman/configure.ac b/lib/pixman/configure.ac
index 25c6c5b7d..67d082cca 100644
--- a/lib/pixman/configure.ac
+++ b/lib/pixman/configure.ac
@@ -53,8 +53,8 @@ AC_PREREQ([2.57])
 #
 
 m4_define([pixman_major], 0)
-m4_define([pixman_minor], 30)
-m4_define([pixman_micro], 2)
+m4_define([pixman_minor], 32)
+m4_define([pixman_micro], 4)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 
@@ -183,6 +183,7 @@ AC_SUBST(LT_VERSION_INFO)
 # Check for dependencies
 
 PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
 PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
 
 dnl =========================================================================
@@ -355,6 +356,12 @@ int main () {
         : "y" (v), "K" (5)
     );
 
+    /* Some versions of clang will choke on this */
+    asm ("pmulhuw %1, %0\n\t"
+	: "+y" (w)
+	: "y" (v)
+    );
+
     return _mm_cvtsi64_si32 (v);
 }]])], have_mmx_intrinsics=yes)
 CFLAGS=$xserver_save_CFLAGS
@@ -437,6 +444,50 @@ fi
 AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
 
 dnl ===========================================================================
+dnl Check for SSSE3
+
+if test "x$SSSE3_CFLAGS" = "x" ; then
+    SSSE3_CFLAGS="-mssse3 -Winline"
+fi
+
+have_ssse3_intrinsics=no
+AC_MSG_CHECKING(whether to use SSSE3 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSSE3_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+int main () {
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    c = _mm_maddubs_epi16 (a, b);
+    return 0;
+}]])], have_ssse3_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(ssse3,
+   [AC_HELP_STRING([--disable-ssse3],
+                   [disable SSSE3 fast paths])],
+   [enable_ssse3=$enableval], [enable_ssse3=auto])
+
+if test $enable_ssse3 = no ; then
+   have_ssse3_intrinsics=disabled
+fi
+
+if test $have_ssse3_intrinsics = yes ; then
+   AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_ssse3_intrinsics)
+if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
+   AC_MSG_ERROR([SSSE3 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes)
+
+dnl ===========================================================================
 dnl Other special flags needed when building code using MMX or SSE instructions
 case $host_os in
    solaris*)
@@ -471,6 +522,7 @@ AC_SUBST(MMX_CFLAGS)
 AC_SUBST(MMX_LDFLAGS)
 AC_SUBST(SSE2_CFLAGS)
 AC_SUBST(SSE2_LDFLAGS)
+AC_SUBST(SSSE3_CFLAGS)
 
 dnl ===========================================================================
 dnl Check for VMX/Altivec
@@ -631,8 +683,8 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
 #ifndef __IWMMXT__
 #error "IWMMXT not enabled (with -march=iwmmxt)"
 #endif
-#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5))
-#error "Need GCC >= 4.5 for IWMMXT intrinsics"
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+#error "Need GCC >= 4.8 for IWMMXT intrinsics"
 #endif
 #include <mmintrin.h>
 int main () {
@@ -916,38 +968,39 @@ main ()
 ]]))
 
 AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
-    if test "z$support_for_pthread_setspecific" != "zyes"; then
+    if test "z$support_for_pthreads" != "zyes"; then
 	PIXMAN_LINK_WITH_ENV(
 		[$1], [pthread_test_program],
 		[PTHREAD_CFLAGS="$CFLAGS"
 		 PTHREAD_LIBS="$LIBS"
 		 PTHREAD_LDFLAGS="$LDFLAGS"
-		 support_for_pthread_setspecific=yes])
+		 support_for_pthreads=yes])
     fi
 ])
 
-if test $ac_cv_tls = none ; then
-    support_for_pthread_setspecific=no
+support_for_pthreads=no
 
-    AC_MSG_CHECKING(for pthread_setspecific)
+AC_MSG_CHECKING(for pthreads)
 
-    PIXMAN_CHECK_PTHREAD([CFLAGS=""; LIBS="-L/usr/X11R6/lib -lpthread-stubs"])
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
-    PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
+PIXMAN_CHECK_PTHREAD([CFLAGS=""; LIBS="-L/usr/X11R6/lib -lpthread-stubs"])
+PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
+PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
+PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
     
-    if test $support_for_pthread_setspecific = yes; then
-	CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
-	AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+if test $support_for_pthreads = yes; then
+    AC_DEFINE([HAVE_PTHREADS], [], [Whether pthreads is supported])
+    if test $ac_cv_tls = none ; then
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
     fi
-
-    AC_MSG_RESULT($support_for_pthread_setspecific);
 fi
 
+AC_MSG_RESULT($support_for_pthreads)
+
 AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
-AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
+AC_SUBST(HAVE_PTHREADS)
 AC_SUBST(PTHREAD_LDFLAGS)
 AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(PTHREAD_CFLAGS)
 
 dnl =====================================
 dnl __attribute__((constructor))
@@ -992,6 +1045,22 @@ fi
 
 AC_MSG_RESULT($support_for_float128)
 
+dnl =====================================
+dnl __builtin_clz
+
+support_for_builtin_clz=no
+
+AC_MSG_CHECKING(for __builtin_clz)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+unsigned int x = 11; int main (void) { return __builtin_clz(x); }
+]])], support_for_builtin_clz=yes)
+
+if test x$support_for_builtin_clz = xyes; then
+   AC_DEFINE([HAVE_BUILTIN_CLZ], [], [Whether the compiler supports __builtin_clz])
+fi
+
+AC_MSG_RESULT($support_for_builtin_clz)
+
 dnl ==================
 dnl libpng
 
diff --git a/lib/pixman/demos/Makefile.am b/lib/pixman/demos/Makefile.am
index 9be9ab670..e04743d7f 100644
--- a/lib/pixman/demos/Makefile.am
+++ b/lib/pixman/demos/Makefile.am
@@ -1,3 +1,5 @@
+EXTRA_DIST = parrot.c parrot.jpg scale.ui
+
 if HAVE_GTK
 
 AM_CFLAGS = $(OPENMP_CFLAGS)
@@ -28,8 +30,6 @@ DEMOS =				\
 	srgb-test		\
 	scale
 
-EXTRA_DIST = parrot.c parrot.jpg scale.ui
-
 gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
 alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
 composite_test_SOURCES = composite-test.c $(GTK_UTILS)
diff --git a/lib/pixman/demos/Makefile.in b/lib/pixman/demos/Makefile.in
index 277649371..1b1ce9d7a 100644
--- a/lib/pixman/demos/Makefile.in
+++ b/lib/pixman/demos/Makefile.in
@@ -337,7 +337,7 @@ GREP = @GREP@
 GTK_CFLAGS = @GTK_CFLAGS@
 GTK_LIBS = @GTK_LIBS@
 HAVE_LIBPNG = @HAVE_LIBPNG@
-HAVE_PTHREAD_SETSPECIFIC = @HAVE_PTHREAD_SETSPECIFIC@
+HAVE_PTHREADS = @HAVE_PTHREADS@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -383,6 +383,7 @@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
 PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
 PNG_CFLAGS = @PNG_CFLAGS@
 PNG_LIBS = @PNG_LIBS@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
 PTHREAD_LDFLAGS = @PTHREAD_LDFLAGS@
 PTHREAD_LIBS = @PTHREAD_LIBS@
 RANLIB = @RANLIB@
@@ -391,6 +392,7 @@ SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 SSE2_CFLAGS = @SSE2_CFLAGS@
 SSE2_LDFLAGS = @SSE2_LDFLAGS@
+SSSE3_CFLAGS = @SSSE3_CFLAGS@
 STRIP = @STRIP@
 TESTPROGS_EXTRA_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
 TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR = @TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR@
@@ -449,6 +451,7 @@ target_alias = @target_alias@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
+EXTRA_DIST = parrot.c parrot.jpg scale.ui
 @HAVE_GTK_TRUE@AM_CFLAGS = $(OPENMP_CFLAGS)
 @HAVE_GTK_TRUE@AM_LDFLAGS = $(OPENMP_CFLAGS)
 @HAVE_GTK_TRUE@LDADD = $(top_builddir)/pixman/libpixman-1.la -lm $(GTK_LIBS) $(PNG_LIBS)
@@ -475,7 +478,6 @@ top_srcdir = @top_srcdir@
 @HAVE_GTK_TRUE@	srgb-test		\
 @HAVE_GTK_TRUE@	scale
 
-@HAVE_GTK_TRUE@EXTRA_DIST = parrot.c parrot.jpg scale.ui
 @HAVE_GTK_TRUE@gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@composite_test_SOURCES = composite-test.c $(GTK_UTILS)
diff --git a/lib/pixman/demos/scale.c b/lib/pixman/demos/scale.c
index 869ada12b..d00307e44 100644
--- a/lib/pixman/demos/scale.c
+++ b/lib/pixman/demos/scale.c
@@ -103,8 +103,8 @@ compute_extents (pixman_f_transform_t *trans, double *sx, double *sy)
 
 typedef struct
 {
-    char		name [20];
-    pixman_kernel_t	value;
+    char	name [20];
+    int		value;
 } named_int_t;
 
 static const named_int_t filters[] =
@@ -127,7 +127,7 @@ static const named_int_t repeats[] =
     { "Pad",                    PIXMAN_REPEAT_PAD },
 };
 
-static pixman_kernel_t
+static int
 get_value (app_t *app, const named_int_t table[], const char *box_name)
 {
     GtkComboBox *box = GTK_COMBO_BOX (get_widget (app, box_name));
diff --git a/lib/pixman/demos/scale.ui b/lib/pixman/demos/scale.ui
index b3450d34d..ee985dd1c 100644
--- a/lib/pixman/demos/scale.ui
+++ b/lib/pixman/demos/scale.ui
@@ -24,7 +24,7 @@
     <property name="page_size">10</property>
   </object>
   <object class="GtkAdjustment" id="subsample_adjustment">
-    <property name="lower">1</property>
+    <property name="lower">0</property>
     <property name="upper">12</property>
     <property name="step_increment">1</property>
     <property name="page_increment">1</property>
diff --git a/lib/pixman/pixman/Makefile.am b/lib/pixman/pixman/Makefile.am
index b9ea75424..b376d9aeb 100644
--- a/lib/pixman/pixman/Makefile.am
+++ b/lib/pixman/pixman/Makefile.am
@@ -52,6 +52,18 @@ libpixman_1_la_LIBADD += libpixman-sse2.la
 ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
 endif
 
+# ssse3 code
+if USE_SSSE3
+noinst_LTLIBRARIES += libpixman-ssse3.la
+libpixman_ssse3_la_SOURCES = \
+	pixman-ssse3.c
+libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS)
+libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-ssse3.la
+
+ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
+endif
+
 # arm simd code
 if USE_ARM_SIMD
 noinst_LTLIBRARIES += libpixman-arm-simd.la
diff --git a/lib/pixman/pixman/Makefile.in b/lib/pixman/pixman/Makefile.in
index ec79ebf6d..26c5fbe4f 100644
--- a/lib/pixman/pixman/Makefile.in
+++ b/lib/pixman/pixman/Makefile.in
@@ -69,24 +69,29 @@ DIST_COMMON = $(libpixmaninclude_HEADERS) $(srcdir)/Makefile.am \
 @USE_SSE2_TRUE@am__append_7 = $(SSE2_LDFLAGS)
 @USE_SSE2_TRUE@am__append_8 = libpixman-sse2.la
 
+# ssse3 code
+@USE_SSSE3_TRUE@am__append_9 = libpixman-ssse3.la
+@USE_SSSE3_TRUE@am__append_10 = $(SSSE3_LDFLAGS)
+@USE_SSSE3_TRUE@am__append_11 = libpixman-ssse3.la
+
 # arm simd code
-@USE_ARM_SIMD_TRUE@am__append_9 = libpixman-arm-simd.la
-@USE_ARM_SIMD_TRUE@am__append_10 = libpixman-arm-simd.la
+@USE_ARM_SIMD_TRUE@am__append_12 = libpixman-arm-simd.la
+@USE_ARM_SIMD_TRUE@am__append_13 = libpixman-arm-simd.la
 
 # arm neon code
-@USE_ARM_NEON_TRUE@am__append_11 = libpixman-arm-neon.la
-@USE_ARM_NEON_TRUE@am__append_12 = libpixman-arm-neon.la
-@USE_ARM_IWMMXT_TRUE@am__append_13 = libpixman-iwmmxt.la
-@USE_ARM_IWMMXT_TRUE@am__append_14 = libpixman-iwmmxt.la
+@USE_ARM_NEON_TRUE@am__append_14 = libpixman-arm-neon.la
+@USE_ARM_NEON_TRUE@am__append_15 = libpixman-arm-neon.la
+@USE_ARM_IWMMXT_TRUE@am__append_16 = libpixman-iwmmxt.la
+@USE_ARM_IWMMXT_TRUE@am__append_17 = libpixman-iwmmxt.la
 
 # mips dspr2 code
-@USE_MIPS_DSPR2_TRUE@am__append_15 = libpixman-mips-dspr2.la
-@USE_MIPS_DSPR2_TRUE@am__append_16 = libpixman-mips-dspr2.la
+@USE_MIPS_DSPR2_TRUE@am__append_18 = libpixman-mips-dspr2.la
+@USE_MIPS_DSPR2_TRUE@am__append_19 = libpixman-mips-dspr2.la
 
 # loongson code
-@USE_LOONGSON_MMI_TRUE@am__append_17 = libpixman-loongson-mmi.la
-@USE_LOONGSON_MMI_TRUE@am__append_18 = $(LS_LDFLAGS)
-@USE_LOONGSON_MMI_TRUE@am__append_19 = libpixman-loongson-mmi.la
+@USE_LOONGSON_MMI_TRUE@am__append_20 = libpixman-loongson-mmi.la
+@USE_LOONGSON_MMI_TRUE@am__append_21 = $(LS_LDFLAGS)
+@USE_LOONGSON_MMI_TRUE@am__append_22 = libpixman-loongson-mmi.la
 subdir = pixman
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/configure.ac
@@ -127,8 +132,9 @@ am__installdirs = "$(DESTDIR)$(libdir)" \
 	"$(DESTDIR)$(libpixmanincludedir)"
 LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES)
 libpixman_1_la_DEPENDENCIES = $(am__append_3) $(am__append_5) \
-	$(am__append_8) $(am__append_10) $(am__append_12) \
-	$(am__append_14) $(am__append_16) $(am__append_19)
+	$(am__append_8) $(am__append_11) $(am__append_13) \
+	$(am__append_15) $(am__append_17) $(am__append_19) \
+	$(am__append_22)
 am__objects_1 = pixman.lo pixman-access.lo pixman-access-accessors.lo \
 	pixman-bits-image.lo pixman-combine32.lo \
 	pixman-combine-float.lo pixman-conical-gradient.lo \
@@ -216,6 +222,16 @@ libpixman_sse2_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
 	$(libpixman_sse2_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
 	-o $@
 @USE_SSE2_TRUE@am_libpixman_sse2_la_rpath =
+libpixman_ssse3_la_LIBADD =
+am__libpixman_ssse3_la_SOURCES_DIST = pixman-ssse3.c
+@USE_SSSE3_TRUE@am_libpixman_ssse3_la_OBJECTS =  \
+@USE_SSSE3_TRUE@	libpixman_ssse3_la-pixman-ssse3.lo
+libpixman_ssse3_la_OBJECTS = $(am_libpixman_ssse3_la_OBJECTS)
+libpixman_ssse3_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(libpixman_ssse3_la_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+@USE_SSSE3_TRUE@am_libpixman_ssse3_la_rpath =
 libpixman_vmx_la_LIBADD =
 am__libpixman_vmx_la_SOURCES_DIST = pixman-vmx.c pixman-combine32.h
 @USE_VMX_TRUE@am_libpixman_vmx_la_OBJECTS =  \
@@ -275,7 +291,8 @@ SOURCES = $(libpixman_1_la_SOURCES) $(libpixman_arm_neon_la_SOURCES) \
 	$(libpixman_iwmmxt_la_SOURCES) \
 	$(libpixman_loongson_mmi_la_SOURCES) \
 	$(libpixman_mips_dspr2_la_SOURCES) $(libpixman_mmx_la_SOURCES) \
-	$(libpixman_sse2_la_SOURCES) $(libpixman_vmx_la_SOURCES)
+	$(libpixman_sse2_la_SOURCES) $(libpixman_ssse3_la_SOURCES) \
+	$(libpixman_vmx_la_SOURCES)
 DIST_SOURCES = $(libpixman_1_la_SOURCES) \
 	$(am__libpixman_arm_neon_la_SOURCES_DIST) \
 	$(am__libpixman_arm_simd_la_SOURCES_DIST) \
@@ -284,6 +301,7 @@ DIST_SOURCES = $(libpixman_1_la_SOURCES) \
 	$(am__libpixman_mips_dspr2_la_SOURCES_DIST) \
 	$(am__libpixman_mmx_la_SOURCES_DIST) \
 	$(am__libpixman_sse2_la_SOURCES_DIST) \
+	$(am__libpixman_ssse3_la_SOURCES_DIST) \
 	$(am__libpixman_vmx_la_SOURCES_DIST)
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
@@ -326,7 +344,7 @@ GREP = @GREP@
 GTK_CFLAGS = @GTK_CFLAGS@
 GTK_LIBS = @GTK_LIBS@
 HAVE_LIBPNG = @HAVE_LIBPNG@
-HAVE_PTHREAD_SETSPECIFIC = @HAVE_PTHREAD_SETSPECIFIC@
+HAVE_PTHREADS = @HAVE_PTHREADS@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -372,6 +390,7 @@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
 PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
 PNG_CFLAGS = @PNG_CFLAGS@
 PNG_LIBS = @PNG_LIBS@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
 PTHREAD_LDFLAGS = @PTHREAD_LDFLAGS@
 PTHREAD_LIBS = @PTHREAD_LIBS@
 RANLIB = @RANLIB@
@@ -380,6 +399,7 @@ SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 SSE2_CFLAGS = @SSE2_CFLAGS@
 SSE2_LDFLAGS = @SSE2_LDFLAGS@
+SSSE3_CFLAGS = @SSSE3_CFLAGS@
 STRIP = @STRIP@
 TESTPROGS_EXTRA_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
 TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR = @TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR@
@@ -484,17 +504,17 @@ libpixman_headers = \
 lib_LTLIBRARIES = libpixman-1.la
 libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) \
 	-no-undefined @PTHREAD_LDFLAGS@ $(am__append_2) \
-	$(am__append_7) $(am__append_18)
+	$(am__append_7) $(am__append_10) $(am__append_21)
 libpixman_1_la_LIBADD = @PTHREAD_LIBS@ -lm $(am__append_3) \
-	$(am__append_5) $(am__append_8) $(am__append_10) \
-	$(am__append_12) $(am__append_14) $(am__append_16) \
-	$(am__append_19)
+	$(am__append_5) $(am__append_8) $(am__append_11) \
+	$(am__append_13) $(am__append_15) $(am__append_17) \
+	$(am__append_19) $(am__append_22)
 libpixman_1_la_SOURCES = $(libpixman_sources) $(libpixman_headers)
 libpixmanincludedir = $(includedir)/pixman-1
 libpixmaninclude_HEADERS = pixman.h pixman-version.h
 noinst_LTLIBRARIES = $(am__append_1) $(am__append_4) $(am__append_6) \
-	$(am__append_9) $(am__append_11) $(am__append_13) \
-	$(am__append_15) $(am__append_17)
+	$(am__append_9) $(am__append_12) $(am__append_14) \
+	$(am__append_16) $(am__append_18) $(am__append_20)
 EXTRA_DIST = \
 	Makefile.win32			\
 	pixman-region.c			\
@@ -517,6 +537,11 @@ EXTRA_DIST = \
 
 @USE_SSE2_TRUE@libpixman_sse2_la_CFLAGS = $(SSE2_CFLAGS)
 @USE_SSE2_TRUE@ASM_CFLAGS_sse2 = $(SSE2_CFLAGS)
+@USE_SSSE3_TRUE@libpixman_ssse3_la_SOURCES = \
+@USE_SSSE3_TRUE@	pixman-ssse3.c
+
+@USE_SSSE3_TRUE@libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS)
+@USE_SSSE3_TRUE@ASM_CFLAGS_ssse3 = $(SSSE3_CFLAGS)
 @USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_SOURCES = \
 @USE_ARM_SIMD_TRUE@	pixman-arm-simd.c	\
 @USE_ARM_SIMD_TRUE@	pixman-arm-common.h	\
@@ -650,6 +675,8 @@ libpixman-mmx.la: $(libpixman_mmx_la_OBJECTS) $(libpixman_mmx_la_DEPENDENCIES) $
 	$(AM_V_CCLD)$(libpixman_mmx_la_LINK) $(am_libpixman_mmx_la_rpath) $(libpixman_mmx_la_OBJECTS) $(libpixman_mmx_la_LIBADD) $(LIBS)
 libpixman-sse2.la: $(libpixman_sse2_la_OBJECTS) $(libpixman_sse2_la_DEPENDENCIES) $(EXTRA_libpixman_sse2_la_DEPENDENCIES) 
 	$(AM_V_CCLD)$(libpixman_sse2_la_LINK) $(am_libpixman_sse2_la_rpath) $(libpixman_sse2_la_OBJECTS) $(libpixman_sse2_la_LIBADD) $(LIBS)
+libpixman-ssse3.la: $(libpixman_ssse3_la_OBJECTS) $(libpixman_ssse3_la_DEPENDENCIES) $(EXTRA_libpixman_ssse3_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libpixman_ssse3_la_LINK) $(am_libpixman_ssse3_la_rpath) $(libpixman_ssse3_la_OBJECTS) $(libpixman_ssse3_la_LIBADD) $(LIBS)
 libpixman-vmx.la: $(libpixman_vmx_la_OBJECTS) $(libpixman_vmx_la_DEPENDENCIES) $(EXTRA_libpixman_vmx_la_DEPENDENCIES) 
 	$(AM_V_CCLD)$(libpixman_vmx_la_LINK) $(am_libpixman_vmx_la_rpath) $(libpixman_vmx_la_OBJECTS) $(libpixman_vmx_la_LIBADD) $(LIBS)
 
@@ -662,6 +689,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_loongson_mmi_la-pixman-mmx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_mmx_la-pixman-mmx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_sse2_la-pixman-sse2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_vmx_la-pixman-vmx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-access-accessors.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-access.Plo@am__quote@
@@ -767,6 +795,13 @@ libpixman_sse2_la-pixman-sse2.lo: pixman-sse2.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_sse2_la_CFLAGS) $(CFLAGS) -c -o libpixman_sse2_la-pixman-sse2.lo `test -f 'pixman-sse2.c' || echo '$(srcdir)/'`pixman-sse2.c
 
+libpixman_ssse3_la-pixman-ssse3.lo: pixman-ssse3.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_ssse3_la_CFLAGS) $(CFLAGS) -MT libpixman_ssse3_la-pixman-ssse3.lo -MD -MP -MF $(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Tpo -c -o libpixman_ssse3_la-pixman-ssse3.lo `test -f 'pixman-ssse3.c' || echo '$(srcdir)/'`pixman-ssse3.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Tpo $(DEPDIR)/libpixman_ssse3_la-pixman-ssse3.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='pixman-ssse3.c' object='libpixman_ssse3_la-pixman-ssse3.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_ssse3_la_CFLAGS) $(CFLAGS) -c -o libpixman_ssse3_la-pixman-ssse3.lo `test -f 'pixman-ssse3.c' || echo '$(srcdir)/'`pixman-ssse3.c
+
 libpixman_vmx_la-pixman-vmx.lo: pixman-vmx.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_vmx_la_CFLAGS) $(CFLAGS) -MT libpixman_vmx_la-pixman-vmx.lo -MD -MP -MF $(DEPDIR)/libpixman_vmx_la-pixman-vmx.Tpo -c -o libpixman_vmx_la-pixman-vmx.lo `test -f 'pixman-vmx.c' || echo '$(srcdir)/'`pixman-vmx.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_vmx_la-pixman-vmx.Tpo $(DEPDIR)/libpixman_vmx_la-pixman-vmx.Plo
diff --git a/lib/pixman/pixman/Makefile.win32 b/lib/pixman/pixman/Makefile.win32
index 57ed7a5dc..7b64033bc 100644
--- a/lib/pixman/pixman/Makefile.win32
+++ b/lib/pixman/pixman/Makefile.win32
@@ -14,8 +14,14 @@ ifeq ($(SSE2_VAR),)
 SSE2_VAR=on
 endif
 
+SSSE3_VAR = $(SSSE3)
+ifeq ($(SSSE3_VAR),)
+SSSE3_VAR=on
+endif
+
 MMX_CFLAGS = -DUSE_X86_MMX -w14710 -w14714
 SSE2_CFLAGS = -DUSE_SSE2
+SSSE3_CFLAGS = -DUSE_SSSE3
 
 # MMX compilation flags
 ifeq ($(MMX_VAR),on)
@@ -29,10 +35,16 @@ PIXMAN_CFLAGS += $(SSE2_CFLAGS)
 libpixman_sources += pixman-sse2.c
 endif
 
+# SSSE3 compilation flags
+ifeq ($(SSSE3_VAR),on)
+PIXMAN_CFLAGS += $(SSSE3_CFLAGS)
+libpixman_sources += pixman-ssse3.c
+endif
+
 OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libpixman_sources))
 
 # targets
-all: inform informMMX informSSE2 $(CFG_VAR)/$(LIBRARY).lib
+all: inform informMMX informSSE2 informSSSE3 $(CFG_VAR)/$(LIBRARY).lib
 
 informMMX:
 ifneq ($(MMX),off)
@@ -60,9 +72,22 @@ endif
 endif
 endif
 
+informSSSE3:
+ifneq ($(SSSE3),off)
+ifneq ($(SSSE3),on)
+ifneq ($(SSSE3),)
+	@echo "Invalid specified SSE option : "$(SSSE3)"."
+	@echo
+	@echo "Possible choices for SSSE3 are 'on' or 'off'"
+	@exit 1
+endif
+	@echo "Setting SSSE3 flag to default value 'on'... (use SSSE3=on or SSSE3=off)"
+endif
+endif
+
 
 # pixman linking
 $(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS)
 	@$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
 
-.PHONY: all informMMX informSSE2
+.PHONY: all informMMX informSSE2 informSSSE3
diff --git a/lib/pixman/pixman/pixman-access.c b/lib/pixman/pixman/pixman-access.c
index b5c8e4017..4f0642d77 100644
--- a/lib/pixman/pixman/pixman-access.c
+++ b/lib/pixman/pixman/pixman-access.c
@@ -294,14 +294,14 @@ convert_pixel (pixman_format_code_t from, pixman_format_code_t to, uint32_t pixe
 }
 
 static force_inline uint32_t
-convert_pixel_to_a8r8g8b8 (pixman_image_t *image,
+convert_pixel_to_a8r8g8b8 (bits_image_t *image,
 			   pixman_format_code_t format,
 			   uint32_t pixel)
 {
     if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY		||
 	PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
     {
-	return image->bits.indexed->rgba[pixel];
+	return image->indexed->rgba[pixel];
     }
     else
     {
@@ -332,7 +332,7 @@ convert_pixel_from_a8r8g8b8 (pixman_image_t *image,
 }
 
 static force_inline uint32_t
-fetch_and_convert_pixel (pixman_image_t	*	image,
+fetch_and_convert_pixel (bits_image_t *		image,
 			 const uint8_t *	bits,
 			 int			offset,
 			 pixman_format_code_t	format)
@@ -417,7 +417,7 @@ convert_and_store_pixel (bits_image_t *		image,
 
 #define MAKE_ACCESSORS(format)						\
     static void								\
-    fetch_scanline_ ## format (pixman_image_t *image,			\
+    fetch_scanline_ ## format (bits_image_t *image,			\
 			       int	       x,			\
 			       int             y,			\
 			       int             width,			\
@@ -425,7 +425,7 @@ convert_and_store_pixel (bits_image_t *		image,
 			       const uint32_t *mask)			\
     {									\
 	uint8_t *bits =							\
-	    (uint8_t *)(image->bits.bits + y * image->bits.rowstride);	\
+	    (uint8_t *)(image->bits + y * image->rowstride);		\
 	int i;								\
 									\
 	for (i = 0; i < width; ++i)					\
@@ -461,8 +461,8 @@ convert_and_store_pixel (bits_image_t *		image,
 	uint8_t *bits =							\
 	    (uint8_t *)(image->bits + line * image->rowstride);		\
 									\
-	return fetch_and_convert_pixel ((pixman_image_t *)image,	\
-					bits, offset, PIXMAN_ ## format); \
+	return fetch_and_convert_pixel (				\
+	    image, bits, offset, PIXMAN_ ## format);			\
     }									\
 									\
     static const void *const __dummy__ ## format
@@ -583,14 +583,14 @@ to_srgb (float f)
 }
 
 static void
-fetch_scanline_a8r8g8b8_sRGB_float (pixman_image_t *image,
+fetch_scanline_a8r8g8b8_sRGB_float (bits_image_t *  image,
 				    int             x,
 				    int             y,
 				    int             width,
 				    uint32_t *      b,
 				    const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
@@ -612,14 +612,14 @@ fetch_scanline_a8r8g8b8_sRGB_float (pixman_image_t *image,
 
 /* Expects a float buffer */
 static void
-fetch_scanline_a2r10g10b10_float (pixman_image_t *image,
+fetch_scanline_a2r10g10b10_float (bits_image_t *  image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  uint32_t *      b,
 				  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
@@ -643,14 +643,14 @@ fetch_scanline_a2r10g10b10_float (pixman_image_t *image,
 
 /* Expects a float buffer */
 static void
-fetch_scanline_x2r10g10b10_float (pixman_image_t *image,
+fetch_scanline_x2r10g10b10_float (bits_image_t   *image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  uint32_t *      b,
 				  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = (uint32_t *)bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
@@ -673,14 +673,14 @@ fetch_scanline_x2r10g10b10_float (pixman_image_t *image,
 
 /* Expects a float buffer */
 static void
-fetch_scanline_a2b10g10r10_float (pixman_image_t *image,
+fetch_scanline_a2b10g10r10_float (bits_image_t   *image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  uint32_t *      b,
 				  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
@@ -704,14 +704,14 @@ fetch_scanline_a2b10g10r10_float (pixman_image_t *image,
 
 /* Expects a float buffer */
 static void
-fetch_scanline_x2b10g10r10_float (pixman_image_t *image,
+fetch_scanline_x2b10g10r10_float (bits_image_t   *image,
 				  int             x,
 				  int             y,
 				  int             width,
 				  uint32_t *      b,
 				  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = (uint32_t *)bits + x;
     const uint32_t *end = pixel + width;
     argb_t *buffer = (argb_t *)b;
@@ -733,14 +733,14 @@ fetch_scanline_x2b10g10r10_float (pixman_image_t *image,
 }
 
 static void
-fetch_scanline_yuy2 (pixman_image_t *image,
+fetch_scanline_yuy2 (bits_image_t   *image,
                      int             x,
                      int             line,
                      int             width,
                      uint32_t *      buffer,
                      const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + image->bits.rowstride * line;
+    const uint32_t *bits = image->bits + image->rowstride * line;
     int i;
     
     for (i = 0; i < width; i++)
@@ -767,7 +767,7 @@ fetch_scanline_yuy2 (pixman_image_t *image,
 }
 
 static void
-fetch_scanline_yv12 (pixman_image_t *image,
+fetch_scanline_yv12 (bits_image_t   *image,
                      int             x,
                      int             line,
                      int             width,
@@ -1121,30 +1121,30 @@ store_scanline_generic_float (bits_image_t *  image,
 }
 
 static void
-fetch_scanline_generic_float (pixman_image_t *image,
+fetch_scanline_generic_float (bits_image_t *  image,
 			      int	      x,
 			      int	      y,
 			      int	      width,
 			      uint32_t *      buffer,
 			      const uint32_t *mask)
 {
-    image->bits.fetch_scanline_32 (image, x, y, width, buffer, NULL);
+    image->fetch_scanline_32 (image, x, y, width, buffer, NULL);
 
-    pixman_expand_to_float ((argb_t *)buffer, buffer, image->bits.format, width);
+    pixman_expand_to_float ((argb_t *)buffer, buffer, image->format, width);
 }
 
 /* The 32_sRGB paths should be deleted after narrow processing
  * is no longer invoked for formats that are considered wide.
  * (Also see fetch_pixel_generic_lossy_32) */
 static void
-fetch_scanline_a8r8g8b8_32_sRGB (pixman_image_t *image,
+fetch_scanline_a8r8g8b8_32_sRGB (bits_image_t   *image,
                                  int             x,
                                  int             y,
                                  int             width,
                                  uint32_t       *buffer,
                                  const uint32_t *mask)
 {
-    const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
+    const uint32_t *bits = image->bits + y * image->rowstride;
     const uint32_t *pixel = (uint32_t *)bits + x;
     const uint32_t *end = pixel + width;
     uint32_t tmp;
diff --git a/lib/pixman/pixman/pixman-bits-image.c b/lib/pixman/pixman/pixman-bits-image.c
index 75a39a115..f9121a365 100644
--- a/lib/pixman/pixman/pixman-bits-image.c
+++ b/lib/pixman/pixman/pixman-bits-image.c
@@ -137,221 +137,6 @@ bits_image_fetch_pixel_bilinear (bits_image_t   *image,
     return bilinear_interpolation (tl, tr, bl, br, distx, disty);
 }
 
-static uint32_t *
-bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
-					  const uint32_t *mask)
-{
-
-    pixman_image_t * ima = iter->image;
-    int              offset = iter->x;
-    int              line = iter->y++;
-    int              width = iter->width;
-    uint32_t *       buffer = iter->buffer;
-
-    bits_image_t *bits = &ima->bits;
-    pixman_fixed_t x_top, x_bottom, x;
-    pixman_fixed_t ux_top, ux_bottom, ux;
-    pixman_vector_t v;
-    uint32_t top_mask, bottom_mask;
-    uint32_t *top_row;
-    uint32_t *bottom_row;
-    uint32_t *end;
-    uint32_t zero[2] = { 0, 0 };
-    uint32_t one = 1;
-    int y, y1, y2;
-    int disty;
-    int mask_inc;
-    int w;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (bits->common.transform, &v))
-	return iter->buffer;
-
-    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
-    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
-
-    y = v.vector[1] - pixman_fixed_1/2;
-    disty = pixman_fixed_to_bilinear_weight (y);
-
-    /* Load the pointers to the first and second lines from the source
-     * image that bilinear code must read.
-     *
-     * The main trick in this code is about the check if any line are
-     * outside of the image;
-     *
-     * When I realize that a line (any one) is outside, I change
-     * the pointer to a dummy area with zeros. Once I change this, I
-     * must be sure the pointer will not change, so I set the
-     * variables to each pointer increments inside the loop.
-     */
-    y1 = pixman_fixed_to_int (y);
-    y2 = y1 + 1;
-
-    if (y1 < 0 || y1 >= bits->height)
-    {
-	top_row = zero;
-	x_top = 0;
-	ux_top = 0;
-    }
-    else
-    {
-	top_row = bits->bits + y1 * bits->rowstride;
-	x_top = x;
-	ux_top = ux;
-    }
-
-    if (y2 < 0 || y2 >= bits->height)
-    {
-	bottom_row = zero;
-	x_bottom = 0;
-	ux_bottom = 0;
-    }
-    else
-    {
-	bottom_row = bits->bits + y2 * bits->rowstride;
-	x_bottom = x;
-	ux_bottom = ux;
-    }
-
-    /* Instead of checking whether the operation uses the mast in
-     * each loop iteration, verify this only once and prepare the
-     * variables to make the code smaller inside the loop.
-     */
-    if (!mask)
-    {
-        mask_inc = 0;
-        mask = &one;
-    }
-    else
-    {
-        /* If have a mask, prepare the variables to check it */
-        mask_inc = 1;
-    }
-
-    /* If both are zero, then the whole thing is zero */
-    if (top_row == zero && bottom_row == zero)
-    {
-	memset (buffer, 0, width * sizeof (uint32_t));
-	return iter->buffer;
-    }
-    else if (bits->format == PIXMAN_x8r8g8b8)
-    {
-	if (top_row == zero)
-	{
-	    top_mask = 0;
-	    bottom_mask = 0xff000000;
-	}
-	else if (bottom_row == zero)
-	{
-	    top_mask = 0xff000000;
-	    bottom_mask = 0;
-	}
-	else
-	{
-	    top_mask = 0xff000000;
-	    bottom_mask = 0xff000000;
-	}
-    }
-    else
-    {
-	top_mask = 0;
-	bottom_mask = 0;
-    }
-
-    end = buffer + width;
-
-    /* Zero fill to the left of the image */
-    while (buffer < end && x < pixman_fixed_minus_1)
-    {
-	*buffer++ = 0;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Left edge
-     */
-    while (buffer < end && x < 0)
-    {
-	uint32_t tr, br;
-	int32_t distx;
-
-	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
-	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-	distx = pixman_fixed_to_bilinear_weight (x);
-
-	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
-
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Main part */
-    w = pixman_int_to_fixed (bits->width - 1);
-
-    while (buffer < end  &&  x < w)
-    {
-	if (*mask)
-	{
-	    uint32_t tl, tr, bl, br;
-	    int32_t distx;
-
-	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
-	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
-
-	    distx = pixman_fixed_to_bilinear_weight (x);
-
-	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
-	}
-
-	buffer++;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Right Edge */
-    w = pixman_int_to_fixed (bits->width);
-    while (buffer < end  &&  x < w)
-    {
-	if (*mask)
-	{
-	    uint32_t tl, bl;
-	    int32_t distx;
-
-	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
-	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
-
-	    distx = pixman_fixed_to_bilinear_weight (x);
-
-	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
-	}
-
-	buffer++;
-	x += ux;
-	x_top += ux_top;
-	x_bottom += ux_bottom;
-	mask += mask_inc;
-    }
-
-    /* Zero fill to the left of the image */
-    while (buffer < end)
-	*buffer++ = 0;
-
-    return iter->buffer;
-}
-
 static force_inline uint32_t
 bits_image_fetch_pixel_convolution (bits_image_t   *image,
 				    pixman_fixed_t  x,
@@ -720,472 +505,6 @@ bits_image_fetch_general (pixman_iter_t  *iter,
     return buffer;
 }
 
-typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
-
-static force_inline void
-bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
-					       int              offset,
-					       int              line,
-					       int              width,
-					       uint32_t *       buffer,
-					       const uint32_t * mask,
-
-					       convert_pixel_t	convert_pixel,
-					       pixman_format_code_t	format,
-					       pixman_repeat_t	repeat_mode)
-{
-    bits_image_t *bits = &image->bits;
-    pixman_fixed_t *params = image->common.filter_params;
-    int cwidth = pixman_fixed_to_int (params[0]);
-    int cheight = pixman_fixed_to_int (params[1]);
-    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
-    int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
-    int x_phase_bits = pixman_fixed_to_int (params[2]);
-    int y_phase_bits = pixman_fixed_to_int (params[3]);
-    int x_phase_shift = 16 - x_phase_bits;
-    int y_phase_shift = 16 - y_phase_bits;
-    pixman_fixed_t vx, vy;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    int k;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    vx = v.vector[0];
-    vy = v.vector[1];
-
-    for (k = 0; k < width; ++k)
-    {
-	pixman_fixed_t *y_params;
-	int satot, srtot, sgtot, sbtot;
-	pixman_fixed_t x, y;
-	int32_t x1, x2, y1, y2;
-	int32_t px, py;
-	int i, j;
-
-	if (mask && !mask[k])
-	    goto next;
-
-	/* Round x and y to the middle of the closest phase before continuing. This
-	 * ensures that the convolution matrix is aligned right, since it was
-	 * positioned relative to a particular phase (and not relative to whatever
-	 * exact fraction we happen to get here).
-	 */
-	x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1);
-	y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1);
-
-	px = (x & 0xffff) >> x_phase_shift;
-	py = (y & 0xffff) >> y_phase_shift;
-
-	x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
-	y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
-	x2 = x1 + cwidth;
-	y2 = y1 + cheight;
-
-	satot = srtot = sgtot = sbtot = 0;
-
-	y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight;
-
-	for (i = y1; i < y2; ++i)
-	{
-	    pixman_fixed_t fy = *y_params++;
-
-	    if (fy)
-	    {
-		pixman_fixed_t *x_params = params + 4 + px * cwidth;
-
-		for (j = x1; j < x2; ++j)
-		{
-		    pixman_fixed_t fx = *x_params++;
-		    int rx = j;
-		    int ry = i;
-		    
-		    if (fx)
-		    {
-			pixman_fixed_t f;
-			uint32_t pixel, mask;
-			uint8_t *row;
-
-			mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-			if (repeat_mode != PIXMAN_REPEAT_NONE)
-			{
-			    repeat (repeat_mode, &rx, bits->width);
-			    repeat (repeat_mode, &ry, bits->height);
-
-			    row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
-			    pixel = convert_pixel (row, rx) | mask;
-			}
-			else
-			{
-			    if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height)
-			    {
-				pixel = 0;
-			    }
-			    else
-			    {
-				row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
-				pixel = convert_pixel (row, rx) | mask;
-			    }
-			}
-
-			f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16;
-			srtot += (int)RED_8 (pixel) * f;
-			sgtot += (int)GREEN_8 (pixel) * f;
-			sbtot += (int)BLUE_8 (pixel) * f;
-			satot += (int)ALPHA_8 (pixel) * f;
-		    }
-		}
-	    }
-	}
-
-	satot = (satot + 0x8000) >> 16;
-	srtot = (srtot + 0x8000) >> 16;
-	sgtot = (sgtot + 0x8000) >> 16;
-	sbtot = (sbtot + 0x8000) >> 16;
-
-	satot = CLIP (satot, 0, 0xff);
-	srtot = CLIP (srtot, 0, 0xff);
-	sgtot = CLIP (sgtot, 0, 0xff);
-	sbtot = CLIP (sbtot, 0, 0xff);
-
-	buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0);
-
-    next:
-	vx += ux;
-	vy += uy;
-    }
-}
-
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-static force_inline void
-bits_image_fetch_bilinear_affine (pixman_image_t * image,
-				  int              offset,
-				  int              line,
-				  int              width,
-				  uint32_t *       buffer,
-				  const uint32_t * mask,
-
-				  convert_pixel_t	convert_pixel,
-				  pixman_format_code_t	format,
-				  pixman_repeat_t	repeat_mode)
-{
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    bits_image_t *bits = &image->bits;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	int x1, y1, x2, y2;
-	uint32_t tl, tr, bl, br;
-	int32_t distx, disty;
-	int width = image->bits.width;
-	int height = image->bits.height;
-	const uint8_t *row1;
-	const uint8_t *row2;
-
-	if (mask && !mask[i])
-	    goto next;
-
-	x1 = x - pixman_fixed_1 / 2;
-	y1 = y - pixman_fixed_1 / 2;
-
-	distx = pixman_fixed_to_bilinear_weight (x1);
-	disty = pixman_fixed_to_bilinear_weight (y1);
-
-	y1 = pixman_fixed_to_int (y1);
-	y2 = y1 + 1;
-	x1 = pixman_fixed_to_int (x1);
-	x2 = x1 + 1;
-
-	if (repeat_mode != PIXMAN_REPEAT_NONE)
-	{
-	    uint32_t mask;
-
-	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-	    repeat (repeat_mode, &x1, width);
-	    repeat (repeat_mode, &y1, height);
-	    repeat (repeat_mode, &x2, width);
-	    repeat (repeat_mode, &y2, height);
-
-	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-
-	    tl = convert_pixel (row1, x1) | mask;
-	    tr = convert_pixel (row1, x2) | mask;
-	    bl = convert_pixel (row2, x1) | mask;
-	    br = convert_pixel (row2, x2) | mask;
-	}
-	else
-	{
-	    uint32_t mask1, mask2;
-	    int bpp;
-
-	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
-	     * which means if you use it in expressions, those
-	     * expressions become unsigned themselves. Since
-	     * the variables below can be negative in some cases,
-	     * that will lead to crashes on 64 bit architectures.
-	     *
-	     * So this line makes sure bpp is signed
-	     */
-	    bpp = PIXMAN_FORMAT_BPP (format);
-
-	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
-	    {
-		buffer[i] = 0;
-		goto next;
-	    }
-
-	    if (y2 == 0)
-	    {
-		row1 = zero;
-		mask1 = 0;
-	    }
-	    else
-	    {
-		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
-		row1 += bpp / 8 * x1;
-
-		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-	    }
-
-	    if (y1 == height - 1)
-	    {
-		row2 = zero;
-		mask2 = 0;
-	    }
-	    else
-	    {
-		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
-		row2 += bpp / 8 * x1;
-
-		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-	    }
-
-	    if (x2 == 0)
-	    {
-		tl = 0;
-		bl = 0;
-	    }
-	    else
-	    {
-		tl = convert_pixel (row1, 0) | mask1;
-		bl = convert_pixel (row2, 0) | mask2;
-	    }
-
-	    if (x1 == width - 1)
-	    {
-		tr = 0;
-		br = 0;
-	    }
-	    else
-	    {
-		tr = convert_pixel (row1, 1) | mask1;
-		br = convert_pixel (row2, 1) | mask2;
-	    }
-	}
-
-	buffer[i] = bilinear_interpolation (
-	    tl, tr, bl, br, distx, disty);
-
-    next:
-	x += ux;
-	y += uy;
-    }
-}
-
-static force_inline void
-bits_image_fetch_nearest_affine (pixman_image_t * image,
-				 int              offset,
-				 int              line,
-				 int              width,
-				 uint32_t *       buffer,
-				 const uint32_t * mask,
-				 
-				 convert_pixel_t	convert_pixel,
-				 pixman_format_code_t	format,
-				 pixman_repeat_t	repeat_mode)
-{
-    pixman_fixed_t x, y;
-    pixman_fixed_t ux, uy;
-    pixman_vector_t v;
-    bits_image_t *bits = &image->bits;
-    int i;
-
-    /* reference point is the center of the pixel */
-    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
-    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
-    v.vector[2] = pixman_fixed_1;
-
-    if (!pixman_transform_point_3d (image->common.transform, &v))
-	return;
-
-    ux = image->common.transform->matrix[0][0];
-    uy = image->common.transform->matrix[1][0];
-
-    x = v.vector[0];
-    y = v.vector[1];
-
-    for (i = 0; i < width; ++i)
-    {
-	int width, height, x0, y0;
-	const uint8_t *row;
-
-	if (mask && !mask[i])
-	    goto next;
-	
-	width = image->bits.width;
-	height = image->bits.height;
-	x0 = pixman_fixed_to_int (x - pixman_fixed_e);
-	y0 = pixman_fixed_to_int (y - pixman_fixed_e);
-
-	if (repeat_mode == PIXMAN_REPEAT_NONE &&
-	    (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
-	{
-	    buffer[i] = 0;
-	}
-	else
-	{
-	    uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
-
-	    if (repeat_mode != PIXMAN_REPEAT_NONE)
-	    {
-		repeat (repeat_mode, &x0, width);
-		repeat (repeat_mode, &y0, height);
-	    }
-
-	    row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
-
-	    buffer[i] = convert_pixel (row, x0) | mask;
-	}
-
-    next:
-	x += ux;
-	y += uy;
-    }
-}
-
-static force_inline uint32_t
-convert_a8r8g8b8 (const uint8_t *row, int x)
-{
-    return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_x8r8g8b8 (const uint8_t *row, int x)
-{
-    return *(((uint32_t *)row) + x);
-}
-
-static force_inline uint32_t
-convert_a8 (const uint8_t *row, int x)
-{
-    return *(row + x) << 24;
-}
-
-static force_inline uint32_t
-convert_r5g6b5 (const uint8_t *row, int x)
-{
-    return convert_0565_to_0888 (*((uint16_t *)row + x));
-}
-
-#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode)  \
-    static uint32_t *							\
-    bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t   *iter, \
-							    const uint32_t * mask) \
-    {									\
-	bits_image_fetch_separable_convolution_affine (                 \
-	    iter->image,                                                \
-	    iter->x, iter->y++,                                         \
-	    iter->width,                                                \
-	    iter->buffer, mask,                                         \
-	    convert_ ## format,                                         \
-	    PIXMAN_ ## format,                                          \
-	    repeat_mode);                                               \
-									\
-	return iter->buffer;                                            \
-    }
-
-#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
-    static uint32_t *							\
-    bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t   *iter,	\
-					       const uint32_t * mask)	\
-    {									\
-	bits_image_fetch_bilinear_affine (iter->image,			\
-					  iter->x, iter->y++,		\
-					  iter->width,			\
-					  iter->buffer, mask,		\
-					  convert_ ## format,		\
-					  PIXMAN_ ## format,		\
-					  repeat_mode);			\
-	return iter->buffer;						\
-    }
-
-#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
-    static uint32_t *							\
-    bits_image_fetch_nearest_affine_ ## name (pixman_iter_t   *iter,	\
-					      const uint32_t * mask)	\
-    {									\
-	bits_image_fetch_nearest_affine (iter->image,			\
-					 iter->x, iter->y++,		\
-					 iter->width,			\
-					 iter->buffer, mask,		\
-					 convert_ ## format,		\
-					 PIXMAN_ ## format,		\
-					 repeat_mode);			\
-	return iter->buffer;						\
-    }
-
-#define MAKE_FETCHERS(name, format, repeat_mode)			\
-    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
-    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)			\
-    MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode)
-
-MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
-MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
-MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
-MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
-MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
-
 static void
 replicate_pixel_32 (bits_image_t *   bits,
 		    int              x,
@@ -1253,9 +572,9 @@ bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
 	w = MIN (width, image->width - x);
 
 	if (wide)
-	    image->fetch_scanline_float ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	    image->fetch_scanline_float (image, x, y, w, buffer, NULL);
 	else
-	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	    image->fetch_scanline_32 (image, x, y, w, buffer, NULL);
 
 	width -= w;
 	buffer += w * (wide? 4 : 1);
@@ -1301,9 +620,9 @@ bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
 	w = MIN (width, image->width - x);
 
 	if (wide)
-	    image->fetch_scanline_float ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	    image->fetch_scanline_float (image, x, y, w, buffer, NULL);
 	else
-	    image->fetch_scanline_32 ((pixman_image_t *)image, x, y, w, buffer, NULL);
+	    image->fetch_scanline_32 (image, x, y, w, buffer, NULL);
 
 	buffer += w * (wide? 4 : 1);
 	x += w;
@@ -1381,92 +700,6 @@ static const fetcher_info_t fetcher_info[] =
       bits_image_fetch_untransformed_float
     },
 
-#define FAST_BILINEAR_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_X_UNIT_POSITIVE		|				\
-     FAST_PATH_Y_UNIT_ZERO		|				\
-     FAST_PATH_NONE_REPEAT		|				\
-     FAST_PATH_BILINEAR_FILTER)
-
-    { PIXMAN_a8r8g8b8,
-      FAST_BILINEAR_FLAGS,
-      bits_image_fetch_bilinear_no_repeat_8888,
-      _pixman_image_get_scanline_generic_float
-    },
-
-    { PIXMAN_x8r8g8b8,
-      FAST_BILINEAR_FLAGS,
-      bits_image_fetch_bilinear_no_repeat_8888,
-      _pixman_image_get_scanline_generic_float
-    },
-
-#define GENERAL_BILINEAR_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_BILINEAR_FILTER)
-
-#define GENERAL_NEAREST_FLAGS						\
-    (FAST_PATH_NO_ALPHA_MAP		|				\
-     FAST_PATH_NO_ACCESSORS		|				\
-     FAST_PATH_HAS_TRANSFORM		|				\
-     FAST_PATH_AFFINE_TRANSFORM		|				\
-     FAST_PATH_NEAREST_FILTER)
-
-#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS				\
-    (FAST_PATH_NO_ALPHA_MAP            |				\
-     FAST_PATH_NO_ACCESSORS            |				\
-     FAST_PATH_HAS_TRANSFORM           |				\
-     FAST_PATH_AFFINE_TRANSFORM        |				\
-     FAST_PATH_SEPARABLE_CONVOLUTION_FILTER)
-    
-#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)   \
-    { PIXMAN_ ## format,                                               \
-      GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
-      bits_image_fetch_separable_convolution_affine_ ## name,          \
-      _pixman_image_get_scanline_generic_float			       \
-    },
-
-#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
-    { PIXMAN_ ## format,						\
-      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
-      bits_image_fetch_bilinear_affine_ ## name,			\
-      _pixman_image_get_scanline_generic_float				\
-    },
-
-#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
-    { PIXMAN_ ## format,						\
-      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
-      bits_image_fetch_nearest_affine_ ## name,				\
-      _pixman_image_get_scanline_generic_float				\
-    },
-
-#define AFFINE_FAST_PATHS(name, format, repeat)				\
-    SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)	\
-    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
-    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
-    
-    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
-    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
-    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
-    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
-    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
-    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
-    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
-    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
-    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
-    AFFINE_FAST_PATHS (none_a8, a8, NONE)
-    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
-    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
-    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
-    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
-    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
-    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
-
     /* Affine, no alpha */
     { PIXMAN_any,
       (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
@@ -1528,7 +761,7 @@ dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
     int             width  = iter->width;
     uint32_t *	    buffer = iter->buffer;
 
-    image->bits.fetch_scanline_32 (image, x, y, width, buffer, mask);
+    image->bits.fetch_scanline_32 (&image->bits, x, y, width, buffer, mask);
     if (image->common.alpha_map)
     {
 	uint32_t *alpha;
@@ -1541,8 +774,7 @@ dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
 	    y -= image->common.alpha_origin_y;
 
 	    image->common.alpha_map->fetch_scanline_32 (
-		(pixman_image_t *)image->common.alpha_map,
-		x, y, width, alpha, mask);
+		image->common.alpha_map, x, y, width, alpha, mask);
 
 	    for (i = 0; i < width; ++i)
 	    {
@@ -1567,7 +799,7 @@ dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
     argb_t *	    buffer = (argb_t *)iter->buffer;
 
     image->fetch_scanline_float (
-	(pixman_image_t *)image, x, y, width, (uint32_t *)buffer, mask);
+	image, x, y, width, (uint32_t *)buffer, mask);
     if (image->common.alpha_map)
     {
 	argb_t *alpha;
@@ -1580,8 +812,7 @@ dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
 	    y -= image->common.alpha_origin_y;
 
 	    image->common.alpha_map->fetch_scanline_float (
-		(pixman_image_t *)image->common.alpha_map,
-		x, y, width, (uint32_t *)alpha, mask);
+		image->common.alpha_map, x, y, width, (uint32_t *)alpha, mask);
 
 	    for (i = 0; i < width; ++i)
 		buffer[i].a = alpha[i].a;
diff --git a/lib/pixman/pixman/pixman-combine32.c b/lib/pixman/pixman/pixman-combine32.c
index 3ac7576bd..450114a52 100644
--- a/lib/pixman/pixman/pixman-combine32.c
+++ b/lib/pixman/pixman/pixman-combine32.c
@@ -142,12 +142,12 @@ combine_mask (const uint32_t *src, const uint32_t *mask, int i)
 static void
 combine_clear (pixman_implementation_t *imp,
                pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
                int                      width)
 {
-    memset (dest, 0, width * sizeof(uint32_t));
+    memset (dest, 0, width * sizeof (uint32_t));
 }
 
 static void
@@ -155,7 +155,7 @@ combine_dst (pixman_implementation_t *imp,
 	     pixman_op_t	      op,
 	     uint32_t *		      dest,
 	     const uint32_t *	      src,
-	     const uint32_t *          mask,
+	     const uint32_t *         mask,
 	     int		      width)
 {
     return;
@@ -164,9 +164,9 @@ combine_dst (pixman_implementation_t *imp,
 static void
 combine_src_u (pixman_implementation_t *imp,
                pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
                int                      width)
 {
     int i;
@@ -189,9 +189,9 @@ combine_src_u (pixman_implementation_t *imp,
 static void
 combine_over_u (pixman_implementation_t *imp,
                 pixman_op_t              op,
-                uint32_t *                dest,
-                const uint32_t *          src,
-                const uint32_t *          mask,
+                uint32_t *               dest,
+                const uint32_t *         src,
+                const uint32_t *         mask,
                 int                      width)
 {
     int i;
@@ -254,9 +254,9 @@ combine_over_u (pixman_implementation_t *imp,
 static void
 combine_over_reverse_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
-                        uint32_t *                dest,
-                        const uint32_t *          src,
-                        const uint32_t *          mask,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
                         int                      width)
 {
     int i;
@@ -274,9 +274,9 @@ combine_over_reverse_u (pixman_implementation_t *imp,
 static void
 combine_in_u (pixman_implementation_t *imp,
               pixman_op_t              op,
-              uint32_t *                dest,
-              const uint32_t *          src,
-              const uint32_t *          mask,
+              uint32_t *               dest,
+              const uint32_t *         src,
+              const uint32_t *         mask,
               int                      width)
 {
     int i;
@@ -293,9 +293,9 @@ combine_in_u (pixman_implementation_t *imp,
 static void
 combine_in_reverse_u (pixman_implementation_t *imp,
                       pixman_op_t              op,
-                      uint32_t *                dest,
-                      const uint32_t *          src,
-                      const uint32_t *          mask,
+                      uint32_t *               dest,
+                      const uint32_t *         src,
+                      const uint32_t *         mask,
                       int                      width)
 {
     int i;
@@ -313,9 +313,9 @@ combine_in_reverse_u (pixman_implementation_t *imp,
 static void
 combine_out_u (pixman_implementation_t *imp,
                pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
                int                      width)
 {
     int i;
@@ -332,9 +332,9 @@ combine_out_u (pixman_implementation_t *imp,
 static void
 combine_out_reverse_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
-                       uint32_t *                dest,
-                       const uint32_t *          src,
-                       const uint32_t *          mask,
+                       uint32_t *               dest,
+                       const uint32_t *         src,
+                       const uint32_t *         mask,
                        int                      width)
 {
     int i;
@@ -352,9 +352,9 @@ combine_out_reverse_u (pixman_implementation_t *imp,
 static void
 combine_atop_u (pixman_implementation_t *imp,
                 pixman_op_t              op,
-                uint32_t *                dest,
-                const uint32_t *          src,
-                const uint32_t *          mask,
+                uint32_t *               dest,
+                const uint32_t *         src,
+                const uint32_t *         mask,
                 int                      width)
 {
     int i;
@@ -374,9 +374,9 @@ combine_atop_u (pixman_implementation_t *imp,
 static void
 combine_atop_reverse_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
-                        uint32_t *                dest,
-                        const uint32_t *          src,
-                        const uint32_t *          mask,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
                         int                      width)
 {
     int i;
@@ -396,9 +396,9 @@ combine_atop_reverse_u (pixman_implementation_t *imp,
 static void
 combine_xor_u (pixman_implementation_t *imp,
                pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
                int                      width)
 {
     int i;
@@ -418,9 +418,9 @@ combine_xor_u (pixman_implementation_t *imp,
 static void
 combine_add_u (pixman_implementation_t *imp,
                pixman_op_t              op,
-               uint32_t *                dest,
-               const uint32_t *          src,
-               const uint32_t *          mask,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
                int                      width)
 {
     int i;
@@ -437,9 +437,9 @@ combine_add_u (pixman_implementation_t *imp,
 static void
 combine_saturate_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
-                    uint32_t *                dest,
-                    const uint32_t *          src,
-                    const uint32_t *          mask,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
                     int                      width)
 {
     int i;
@@ -463,39 +463,66 @@ combine_saturate_u (pixman_implementation_t *imp,
     }
 }
 
+
 /*
  * PDF blend modes:
+ *
  * The following blend modes have been taken from the PDF ISO 32000
  * specification, which at this point in time is available from
- * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
- * The relevant chapters are 11.3.5 and 11.3.6.
+ *
+ *     http://www.adobe.com/devnet/pdf/pdf_reference.html
+ *
+ * The specific documents of interest are the PDF spec itself:
+ *
+ *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf
+ *
+ * chapters 11.3.5 and 11.3.6 and a later supplement for Adobe Acrobat
+ * 9.1 and Reader 9.1:
+ *
+ *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000_1.pdf
+ *
+ * that clarifies the specifications for blend modes ColorDodge and
+ * ColorBurn.
+ *
  * The formula for computing the final pixel color given in 11.3.6 is:
- * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
- * with B() being the blend function.
- * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs
- *
- * These blend modes should match the SVG filter draft specification, as
- * it has been designed to mirror ISO 32000. Note that at the current point
- * no released draft exists that shows this, as the formulas have not been
- * updated yet after the release of ISO 32000.
- *
- * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and
- * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an
- * argument. Note that this implementation operates on premultiplied colors,
- * while the PDF specification does not. Therefore the code uses the formula
- * Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as)
+ *
+ *     αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
+ *
+ * with B() is the blend function. When B(Cb, Cs) = Cs, this formula
+ * reduces to the regular OVER operator.
+ *
+ * Cs and Cb are not premultiplied, so in our implementation we instead
+ * use:
+ *
+ *     cr = (1 – αs) × cb  +  (1 – αb) × cs  +  αb × αs × B (cb/αb, cs/αs)
+ *
+ * where cr, cs, and cb are premultiplied colors, and where the
+ *
+ *     αb × αs × B(cb/αb, cs/αs)
+ *
+ * part is first arithmetically simplified under the assumption that αb
+ * and αs are not 0, and then updated to produce a meaningful result when
+ * they are.
+ *
+ * For all the blend mode operators, the alpha channel is given by
+ *
+ *     αr = αs + αb + αb × αs
  */
 
 /*
  * Multiply
- * B(Dca, ad, Sca, as) = Dca.Sca
+ *
+ *      ad * as * B(d / ad, s / as)
+ *    = ad * as * d/ad * s/as
+ *    = d * s
+ *
  */
 static void
 combine_multiply_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
-                    uint32_t *                dest,
-                    const uint32_t *          src,
-                    const uint32_t *          mask,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
                     int                      width)
 {
     int i;
@@ -519,9 +546,9 @@ combine_multiply_u (pixman_implementation_t *imp,
 static void
 combine_multiply_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
-                     uint32_t *                dest,
-                     const uint32_t *          src,
-                     const uint32_t *          mask,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
                      int                      width)
 {
     int i;
@@ -548,13 +575,14 @@ combine_multiply_ca (pixman_implementation_t *imp,
     static void								\
     combine_ ## name ## _u (pixman_implementation_t *imp,		\
 			    pixman_op_t              op,		\
-                            uint32_t *                dest,		\
-			    const uint32_t *          src,		\
-			    const uint32_t *          mask,		\
+                            uint32_t *               dest,		\
+			    const uint32_t *         src,		\
+			    const uint32_t *         mask,		\
 			    int                      width)		\
     {									\
 	int i;								\
-	for (i = 0; i < width; ++i) {					\
+	for (i = 0; i < width; ++i)					\
+	{								\
 	    uint32_t s = combine_mask (src, mask, i);			\
 	    uint32_t d = *(dest + i);					\
 	    uint8_t sa = ALPHA_8 (s);					\
@@ -577,13 +605,14 @@ combine_multiply_ca (pixman_implementation_t *imp,
     static void								\
     combine_ ## name ## _ca (pixman_implementation_t *imp,		\
 			     pixman_op_t              op,		\
-                             uint32_t *                dest,		\
-			     const uint32_t *          src,		\
-			     const uint32_t *          mask,		\
-			     int                     width)		\
+                             uint32_t *               dest,		\
+			     const uint32_t *         src,		\
+			     const uint32_t *         mask,		\
+			     int                      width)		\
     {									\
 	int i;								\
-	for (i = 0; i < width; ++i) {					\
+	for (i = 0; i < width; ++i)					\
+	{								\
 	    uint32_t m = *(mask + i);					\
 	    uint32_t s = *(src + i);					\
 	    uint32_t d = *(dest + i);					\
@@ -608,49 +637,69 @@ combine_multiply_ca (pixman_implementation_t *imp,
 
 /*
  * Screen
- * B(Dca, ad, Sca, as) = Dca.sa + Sca.da - Dca.Sca
+ *
+ *      ad * as * B(d/ad, s/as)
+ *    = ad * as * (d/ad + s/as - s/as * d/ad)
+ *    = ad * s + as * d - s * d
  */
 static inline uint32_t
-blend_screen (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
+blend_screen (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
 {
-    return DIV_ONE_UN8 (sca * da + dca * sa - sca * dca);
+    return DIV_ONE_UN8 (s * ad + d * as - s * d);
 }
 
 PDF_SEPARABLE_BLEND_MODE (screen)
 
 /*
  * Overlay
- * B(Dca, Da, Sca, Sa) =
- *   if 2.Dca < Da
- *     2.Sca.Dca
- *   otherwise
- *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * Hardlight (s, d)
+ *   = if (d / ad < 0.5)
+ *         as * ad * Multiply (s/as, 2 * d/ad)
+ *     else
+ *         as * ad * Screen (s/as, 2 * d / ad - 1)
+ *   = if (d < 0.5 * ad)
+ *         as * ad * s/as * 2 * d /ad
+ *     else
+ *         as * ad * (s/as + 2 * d / ad - 1 - s / as * (2 * d / ad - 1))
+ *   = if (2 * d < ad)
+ *         2 * s * d
+ *     else
+ *         ad * s + 2 * as * d - as * ad - ad * s * (2 * d / ad - 1)
+ *   = if (2 * d < ad)
+ *         2 * s * d
+ *     else
+ *         as * ad - 2 * (ad - d) * (as - s)
  */
 static inline uint32_t
-blend_overlay (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
+blend_overlay (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
 {
-    uint32_t rca;
+    uint32_t r;
 
-    if (2 * dca < da)
-	rca = 2 * sca * dca;
+    if (2 * d < ad)
+	r = 2 * s * d;
     else
-	rca = sa * da - 2 * (da - dca) * (sa - sca);
-    return DIV_ONE_UN8 (rca);
+	r = as * ad - 2 * (ad - d) * (as - s);
+
+    return DIV_ONE_UN8 (r);
 }
 
 PDF_SEPARABLE_BLEND_MODE (overlay)
 
 /*
  * Darken
- * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * MIN(d/ad, s/as)
+ *   = MIN (as * d, ad * s)
  */
 static inline uint32_t
-blend_darken (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
+blend_darken (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
 {
-    uint32_t s, d;
+    s = ad * s;
+    d = as * d;
 
-    s = sca * da;
-    d = dca * sa;
     return DIV_ONE_UN8 (s > d ? d : s);
 }
 
@@ -658,15 +707,17 @@ PDF_SEPARABLE_BLEND_MODE (darken)
 
 /*
  * Lighten
- * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * MAX(d/ad, s/as)
+ *   = MAX (as * d, ad * s)
  */
 static inline uint32_t
-blend_lighten (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
+blend_lighten (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
 {
-    uint32_t s, d;
-
-    s = sca * da;
-    d = dca * sa;
+    s = ad * s;
+    d = as * d;
+    
     return DIV_ONE_UN8 (s > d ? s : d);
 }
 
@@ -674,152 +725,197 @@ PDF_SEPARABLE_BLEND_MODE (lighten)
 
 /*
  * Color dodge
- * B(Dca, Da, Sca, Sa) =
- *   if Dca == 0
- *     0
- *   if Sca == Sa
- *     Sa.Da
- *   otherwise
- *     Sa.Da. min (1, Dca / Da / (1 - Sca/Sa))
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if d/ad = 0
+ *         ad * as * 0
+ *     else if (d/ad >= (1 - s/as)
+ *         ad * as * 1
+ *     else
+ *         ad * as * ((d/ad) / (1 - s/as))
+ *   = if d = 0
+ *         0
+ *     elif as * d >= ad * (as - s)
+ *         ad * as
+ *     else
+ *         as * (as * d / (as - s))
+ *
  */
 static inline uint32_t
-blend_color_dodge (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
-{
-    if (sca >= sa)
-    {
-	return dca == 0 ? 0 : DIV_ONE_UN8 (sa * da);
-    }
+blend_color_dodge (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    if (d == 0)
+        return 0;
+    else if (as * d >= ad * (as - s))
+	return DIV_ONE_UN8 (as * ad);
+    else if (as - s == 0)
+        return DIV_ONE_UN8 (as * ad);
     else
-    {
-	uint32_t rca = dca * sa / (sa - sca);
-	return DIV_ONE_UN8 (sa * MIN (rca, da));
-    }
+        return DIV_ONE_UN8 (as * ((d * as) / ((as - s))));
 }
 
 PDF_SEPARABLE_BLEND_MODE (color_dodge)
 
 /*
  * Color burn
- * B(Dca, Da, Sca, Sa) =
- *   if Dca == Da
- *     Sa.Da
- *   if Sca == 0
- *     0
- *   otherwise
- *     Sa.Da.(1 - min (1, (1 - Dca/Da).Sa / Sca))
+ *
+ * We modify the first clause "if d = 1" to "if d >= 1" since with
+ * premultiplied colors d > 1 can actually happen.
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if d/ad >= 1
+ *         ad * as * 1
+ *     elif (1 - d/ad) >= s/as
+ *         ad * as * 0
+ *     else
+ *         ad * as * (1 - ((1 - d/ad) / (s/as)))
+ *   = if d >= ad
+ *         ad * as
+ *     elif as * ad - as * d >= ad * s
+ *         0
+ *     else
+ *         ad * as  - as * as * (ad - d) / s
  */
 static inline uint32_t
-blend_color_burn (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
-{
-    if (sca == 0)
-    {
-	return dca < da ? 0 : DIV_ONE_UN8 (sa * da);
-    }
+blend_color_burn (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    if (d >= ad)
+	return DIV_ONE_UN8 (ad * as);
+    else if (as * ad - as * d >= ad * s)
+	return 0;
+    else if (s == 0)
+	return 0;
     else
-    {
-	uint32_t rca = (da - dca) * sa / sca;
-	return DIV_ONE_UN8 (sa * (MAX (rca, da) - rca));
-    }
+	return DIV_ONE_UN8 (ad * as - (as * as * (ad - d)) / s);
 }
 
 PDF_SEPARABLE_BLEND_MODE (color_burn)
 
 /*
  * Hard light
- * B(Dca, Da, Sca, Sa) =
- *   if 2.Sca < Sa
- *     2.Sca.Dca
- *   otherwise
- *     Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if (s/as <= 0.5)
+ *         ad * as * Multiply (d/ad, 2 * s/as)
+ *     else
+ *         ad * as * Screen (d/ad, 2 * s/as - 1)
+ *   = if 2 * s <= as
+ *         ad * as * d/ad * 2 * s / as
+ *     else
+ *         ad * as * (d/ad + (2 * s/as - 1) + d/ad * (2 * s/as - 1))
+ *   = if 2 * s <= as
+ *         2 * s * d
+ *     else
+ *         as * ad - 2 * (ad - d) * (as - s)
  */
 static inline uint32_t
-blend_hard_light (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
+blend_hard_light (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
 {
-    if (2 * sca < sa)
-	return DIV_ONE_UN8 (2 * sca * dca);
+    if (2 * s < as)
+	return DIV_ONE_UN8 (2 * s * d);
     else
-	return DIV_ONE_UN8 (sa * da - 2 * (da - dca) * (sa - sca));
+	return DIV_ONE_UN8 (as * ad - 2 * (ad - d) * (as - s));
 }
 
 PDF_SEPARABLE_BLEND_MODE (hard_light)
 
 /*
  * Soft light
- * B(Dca, Da, Sca, Sa) =
- *   if (2.Sca <= Sa)
- *     Dca.(Sa - (1 - Dca/Da).(2.Sca - Sa))
- *   otherwise if Dca.4 <= Da
- *     Dca.(Sa + (2.Sca - Sa).((16.Dca/Da - 12).Dca/Da + 3)
- *   otherwise
- *     (Dca.Sa + (SQRT (Dca/Da).Da - Dca).(2.Sca - Sa))
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if (s/as <= 0.5)
+ *         ad * as * (d/ad - (1 - 2 * s/as) * d/ad * (1 - d/ad))
+ *     else if (d/ad <= 0.25)
+ *         ad * as * (d/ad + (2 * s/as - 1) * ((((16 * d/ad - 12) * d/ad + 4) * d/ad) - d/ad))
+ *     else
+ *         ad * as * (d/ad + (2 * s/as - 1) * sqrt (d/ad))
+ *   = if (2 * s <= as)
+ *         d * as - d * (ad - d) * (as - 2 * s) / ad;
+ *     else if (4 * d <= ad)
+ *         (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3);
+ *     else
+ *         d * as + (sqrt (d * ad) - d) * (2 * s - as);
  */
 static inline uint32_t
-blend_soft_light (uint32_t dca_org,
-		  uint32_t da_org,
-		  uint32_t sca_org,
-		  uint32_t sa_org)
-{
-    double dca = dca_org * (1.0 / MASK);
-    double da = da_org * (1.0 / MASK);
-    double sca = sca_org * (1.0 / MASK);
-    double sa = sa_org * (1.0 / MASK);
-    double rca;
-
-    if (2 * sca < sa)
+blend_soft_light (uint32_t d_org,
+		  uint32_t ad_org,
+		  uint32_t s_org,
+		  uint32_t as_org)
+{
+    double d = d_org * (1.0 / MASK);
+    double ad = ad_org * (1.0 / MASK);
+    double s = s_org * (1.0 / MASK);
+    double as = as_org * (1.0 / MASK);
+    double r;
+
+    if (2 * s < as)
     {
-	if (da == 0)
-	    rca = dca * sa;
+	if (ad == 0)
+	    r = d * as;
 	else
-	    rca = dca * sa - dca * (da - dca) * (sa - 2 * sca) / da;
+	    r = d * as - d * (ad - d) * (as - 2 * s) / ad;
     }
-    else if (da == 0)
+    else if (ad == 0)
     {
-	rca = 0;
+	r = 0;
     }
-    else if (4 * dca <= da)
+    else if (4 * d <= ad)
     {
-	rca = dca * sa +
-	    (2 * sca - sa) * dca * ((16 * dca / da - 12) * dca / da + 3);
+	r = d * as +
+	    (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3);
     }
     else
     {
-	rca = dca * sa + (sqrt (dca * da) - dca) * (2 * sca - sa);
+	r = d * as + (sqrt (d * ad) - d) * (2 * s - as);
     }
-    return rca * MASK + 0.5;
+    return r * MASK + 0.5;
 }
 
 PDF_SEPARABLE_BLEND_MODE (soft_light)
 
 /*
  * Difference
- * B(Dca, Da, Sca, Sa) = abs (Dca.Sa - Sca.Da)
+ *
+ *     ad * as * B(s/as, d/ad)
+ *   = ad * as * abs (s/as - d/ad)
+ *   = if (s/as <= d/ad)
+ *         ad * as * (d/ad - s/as)
+ *     else
+ *         ad * as * (s/as - d/ad)
+ *   = if (ad * s <= as * d)
+ *        as * d - ad * s
+ *     else
+ *        ad * s - as * d
  */
 static inline uint32_t
-blend_difference (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
+blend_difference (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
 {
-    uint32_t dcasa = dca * sa;
-    uint32_t scada = sca * da;
+    uint32_t das = d * as;
+    uint32_t sad = s * ad;
 
-    if (scada < dcasa)
-	return DIV_ONE_UN8 (dcasa - scada);
+    if (sad < das)
+	return DIV_ONE_UN8 (das - sad);
     else
-	return DIV_ONE_UN8 (scada - dcasa);
+	return DIV_ONE_UN8 (sad - das);
 }
 
 PDF_SEPARABLE_BLEND_MODE (difference)
 
 /*
  * Exclusion
- * B(Dca, Da, Sca, Sa) = (Sca.Da + Dca.Sa - 2.Sca.Dca)
+ *
+ *     ad * as * B(s/as, d/ad)
+ *   = ad * as * (d/ad + s/as - 2 * d/ad * s/as)
+ *   = as * d + ad * s - 2 * s * d
  */
 
 /* This can be made faster by writing it directly and not using
  * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */
 
 static inline uint32_t
-blend_exclusion (uint32_t dca, uint32_t da, uint32_t sca, uint32_t sa)
+blend_exclusion (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
 {
-    return DIV_ONE_UN8 (sca * da + dca * sa - 2 * dca * sca);
+    return DIV_ONE_UN8 (s * ad + d * as - 2 * d * s);
 }
 
 PDF_SEPARABLE_BLEND_MODE (exclusion)
@@ -834,103 +930,70 @@ PDF_SEPARABLE_BLEND_MODE (exclusion)
  * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
  *
  * clip_color (C):
- *   l = LUM (C)
- *   min = Cmin
- *   max = Cmax
- *   if n < 0.0
- *     C = l + ( ( ( C – l ) × l ) ⁄ ( l – min ) )
- *   if x > 1.0
- *     C = l + ( ( ( C – l ) × ( 1 – l ) ) ⁄ ( max – l ) )
- *   return C
+ *     l = LUM (C)
+ *     min = Cmin
+ *     max = Cmax
+ *     if n < 0.0
+ *         C = l + (((C – l) × l) ⁄ (l – min))
+ *     if x > 1.0
+ *         C = l + (((C – l) × (1 – l) ) ⁄ (max – l))
+ *     return C
  *
  * set_lum (C, l):
- *   d = l – LUM (C)
- *   C += d
- *   return clip_color (C)
+ *     d = l – LUM (C)
+ *     C += d
+ *     return clip_color (C)
  *
  * SAT (C) = CH_MAX (C) - CH_MIN (C)
  *
  * set_sat (C, s):
- *  if Cmax > Cmin
- *    Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
- *    Cmax = s
- *  else
- *    Cmid = Cmax = 0.0
- *  Cmin = 0.0
- *  return C
+ *     if Cmax > Cmin
+ *         Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
+ *         Cmax = s
+ *     else
+ *         Cmid = Cmax = 0.0
+ *         Cmin = 0.0
+ *     return C
  */
 
 /* For premultiplied colors, we need to know what happens when C is
  * multiplied by a real number. LUM and SAT are linear:
  *
- *    LUM (r × C) = r × LUM (C)		SAT (r * C) = r * SAT (C)
+ *     LUM (r × C) = r × LUM (C)	SAT (r * C) = r * SAT (C)
  *
  * If we extend clip_color with an extra argument a and change
  *
- *        if x >= 1.0
+ *     if x >= 1.0
  *
  * into
  *
- *        if x >= a
+ *     if x >= a
  *
  * then clip_color is also linear:
  *
- *    r * clip_color (C, a) = clip_color (r_c, ra);
+ *     r * clip_color (C, a) = clip_color (r * C, r * a);
  *
  * for positive r.
  *
  * Similarly, we can extend set_lum with an extra argument that is just passed
  * on to clip_color:
  *
- *   r * set_lum ( C, l, a)
+ *       r * set_lum (C, l, a)
  *
- *   = r × clip_color ( C + l - LUM (C), a)
+ *     = r × clip_color (C + l - LUM (C), a)
  *
- *   = clip_color ( r * C + r × l - r * LUM (C), r * a)
+ *     = clip_color (r * C + r × l - r * LUM (C), r * a)
  *
- *   = set_lum ( r * C, r * l, r * a)
+ *     = set_lum (r * C, r * l, r * a)
  *
  * Finally, set_sat:
  *
- *    r * set_sat (C, s) = set_sat (x * C, r * s)
+ *       r * set_sat (C, s) = set_sat (x * C, r * s)
  *
  * The above holds for all non-zero x, because the x'es in the fraction for
  * C_mid cancel out. Specifically, it holds for x = r:
  *
- *    r * set_sat (C, s) = set_sat (r_c, rs)
- *
- */
-
-/* So, for the non-separable PDF blend modes, we have (using s, d for
- * non-premultiplied colors, and S, D for premultiplied:
- *
- *   Color:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
- *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
- *
- *
- *   Luminosity:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1)
- *   = set_lum (a_s * D, a_d * LUM(S), a_s * a_d)
- *
- *
- *   Saturation:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
- *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
- *                                        a_s * LUM (D), a_s * a_d)
- *   = set_lum (set_sat (a_s * D, a_d * SAT (S), a_s * LUM (D), a_s * a_d))
- *
- *   Hue:
- *
- *     a_s * a_d * B(s, d)
- *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
- *   = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d)
+ *       r * set_sat (C, s) = set_sat (r * C, r * s)
  *
  */
 
@@ -942,11 +1005,11 @@ PDF_SEPARABLE_BLEND_MODE (exclusion)
 #define PDF_NON_SEPARABLE_BLEND_MODE(name)				\
     static void								\
     combine_ ## name ## _u (pixman_implementation_t *imp,		\
-			    pixman_op_t op,				\
-                            uint32_t *dest,				\
-			    const uint32_t *src,				\
-			    const uint32_t *mask,			\
-			    int width)					\
+			    pixman_op_t		     op,		\
+                            uint32_t *               dest,		\
+			    const uint32_t *         src,		\
+			    const uint32_t *         mask,		\
+			    int                      width)		\
     {									\
 	int i;								\
 	for (i = 0; i < width; ++i)					\
@@ -958,7 +1021,7 @@ PDF_SEPARABLE_BLEND_MODE (exclusion)
 	    uint8_t da = ALPHA_8 (d);					\
 	    uint8_t ida = ~da;						\
 	    uint32_t result;						\
-	    uint32_t sc[3], dc[3], c[3];					\
+	    uint32_t sc[3], dc[3], c[3];				\
             								\
 	    result = d;							\
 	    UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (result, isa, s, ida);	\
@@ -1104,80 +1167,94 @@ set_sat (uint32_t dest[3], uint32_t src[3], uint32_t sat)
     }
 }
 
-/*
- * Hue:
- * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb))
+/* Hue:
+ *
+ *       as * ad * B(s/as, d/as)
+ *     = as * ad * set_lum (set_sat (s/as, SAT (d/ad)), LUM (d/ad), 1)
+ *     = set_lum (set_sat (ad * s, as * SAT (d)), as * LUM (d), as * ad)
+ *
  */
 static inline void
-blend_hsl_hue (uint32_t c[3],
-               uint32_t dc[3],
-               uint32_t da,
-               uint32_t sc[3],
-               uint32_t sa)
+blend_hsl_hue (uint32_t r[3],
+               uint32_t d[3],
+               uint32_t ad,
+               uint32_t s[3],
+               uint32_t as)
 {
-    c[0] = sc[0] * da;
-    c[1] = sc[1] * da;
-    c[2] = sc[2] * da;
-    set_sat (c, c, SAT (dc) * sa);
-    set_lum (c, c, sa * da, LUM (dc) * sa);
+    r[0] = s[0] * ad;
+    r[1] = s[1] * ad;
+    r[2] = s[2] * ad;
+    set_sat (r, r, SAT (d) * as);
+    set_lum (r, r, as * ad, LUM (d) * as);
 }
 
 PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue)
 
-/*
- * Saturation:
- * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb))
+/* 
+ * Saturation
+ *
+ *     as * ad * B(s/as, d/ad)
+ *   = as * ad * set_lum (set_sat (d/ad, SAT (s/as)), LUM (d/ad), 1)
+ *   = set_lum (as * ad * set_sat (d/ad, SAT (s/as)),
+ *                                       as * LUM (d), as * ad)
+ *   = set_lum (set_sat (as * d, ad * SAT (s), as * LUM (d), as * ad))
  */
 static inline void
-blend_hsl_saturation (uint32_t c[3],
-                      uint32_t dc[3],
-                      uint32_t da,
-                      uint32_t sc[3],
-                      uint32_t sa)
+blend_hsl_saturation (uint32_t r[3],
+                      uint32_t d[3],
+                      uint32_t ad,
+                      uint32_t s[3],
+                      uint32_t as)
 {
-    c[0] = dc[0] * sa;
-    c[1] = dc[1] * sa;
-    c[2] = dc[2] * sa;
-    set_sat (c, c, SAT (sc) * da);
-    set_lum (c, c, sa * da, LUM (dc) * sa);
+    r[0] = d[0] * as;
+    r[1] = d[1] * as;
+    r[2] = d[2] * as;
+    set_sat (r, r, SAT (s) * ad);
+    set_lum (r, r, as * ad, LUM (d) * as);
 }
 
 PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation)
 
-/*
- * Color:
- * B(Cb, Cs) = set_lum (Cs, LUM (Cb))
+/* 
+ * Color
+ *
+ *     as * ad * B(s/as, d/as)
+ *   = as * ad * set_lum (s/as, LUM (d/ad), 1)
+ *   = set_lum (s * ad, as * LUM (d), as * ad)
  */
 static inline void
-blend_hsl_color (uint32_t c[3],
-                 uint32_t dc[3],
-                 uint32_t da,
-                 uint32_t sc[3],
-                 uint32_t sa)
+blend_hsl_color (uint32_t r[3],
+                 uint32_t d[3],
+                 uint32_t ad,
+                 uint32_t s[3],
+                 uint32_t as)
 {
-    c[0] = sc[0] * da;
-    c[1] = sc[1] * da;
-    c[2] = sc[2] * da;
-    set_lum (c, c, sa * da, LUM (dc) * sa);
+    r[0] = s[0] * ad;
+    r[1] = s[1] * ad;
+    r[2] = s[2] * ad;
+    set_lum (r, r, as * ad, LUM (d) * as);
 }
 
 PDF_NON_SEPARABLE_BLEND_MODE (hsl_color)
 
 /*
- * Luminosity:
- * B(Cb, Cs) = set_lum (Cb, LUM (Cs))
+ * Luminosity
+ *
+ *     as * ad * B(s/as, d/ad)
+ *   = as * ad * set_lum (d/ad, LUM (s/as), 1)
+ *   = set_lum (as * d, ad * LUM (s), as * ad)
  */
 static inline void
-blend_hsl_luminosity (uint32_t c[3],
-                      uint32_t dc[3],
-                      uint32_t da,
-                      uint32_t sc[3],
-                      uint32_t sa)
+blend_hsl_luminosity (uint32_t r[3],
+                      uint32_t d[3],
+                      uint32_t ad,
+                      uint32_t s[3],
+                      uint32_t as)
 {
-    c[0] = dc[0] * sa;
-    c[1] = dc[1] * sa;
-    c[2] = dc[2] * sa;
-    set_lum (c, c, sa * da, LUM (sc) * da);
+    r[0] = d[0] * as;
+    r[1] = d[1] * as;
+    r[2] = d[2] * as;
+    set_lum (r, r, as * ad, LUM (s) * ad);
 }
 
 PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity)
@@ -1194,7 +1271,7 @@ PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity)
  * come from each of the four areas of the picture -- areas covered by neither
  * A nor B, areas covered only by A, areas covered only by B and finally
  * areas covered by both A and B.
- * 
+ *
  * Disjoint			Conjoint
  * Fa		Fb		Fa		Fb
  * (0,0,0,0)	0		0		0		0
diff --git a/lib/pixman/pixman/pixman-compiler.h b/lib/pixman/pixman/pixman-compiler.h
index 9b190b422..2489adc38 100644
--- a/lib/pixman/pixman/pixman-compiler.h
+++ b/lib/pixman/pixman/pixman-compiler.h
@@ -178,7 +178,7 @@
 #   define PIXMAN_GET_THREAD_LOCAL(name)				\
     (&name)
 
-#elif defined(HAVE_PTHREAD_SETSPECIFIC)
+#elif defined(HAVE_PTHREADS)
 
 #include <pthread.h>
 
diff --git a/lib/pixman/pixman/pixman-fast-path.c b/lib/pixman/pixman/pixman-fast-path.c
index 247aea645..c6e43de10 100644
--- a/lib/pixman/pixman/pixman-fast-path.c
+++ b/lib/pixman/pixman/pixman-fast-path.c
@@ -2263,87 +2263,1022 @@ fast_write_back_r5g6b5 (pixman_iter_t *iter)
 
 typedef struct
 {
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-    pixman_iter_write_back_t	write_back;
-} fetcher_info_t;
+    int		y;
+    uint64_t *	buffer;
+} line_t;
 
-static const fetcher_info_t fetchers[] =
+typedef struct
 {
-    { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
-    { PIXMAN_null }
-};
+    line_t		lines[2];
+    pixman_fixed_t	y;
+    pixman_fixed_t	x;
+    uint64_t		data[1];
+} bilinear_info_t;
 
-static pixman_bool_t
-fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+fetch_horizontal (bits_image_t *image, line_t *line,
+		  int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
 {
-    pixman_image_t *image = iter->image;
+    uint32_t *bits = image->bits + y * image->rowstride;
+    int i;
 
-#define FLAGS								\
-    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
-     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+    for (i = 0; i < n; ++i)
+    {
+	int x0 = pixman_fixed_to_int (x);
+	int x1 = x0 + 1;
+	int32_t dist_x;
+
+	uint32_t left = *(bits + x0);
+	uint32_t right = *(bits + x1);
+
+	dist_x = pixman_fixed_to_bilinear_weight (x);
+	dist_x <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
+#if SIZEOF_LONG <= 4
+	{
+	    uint32_t lag, rag, ag;
+	    uint32_t lrb, rrb, rb;
+
+	    lag = (left & 0xff00ff00) >> 8;
+	    rag = (right & 0xff00ff00) >> 8;
+	    ag = (lag << 8) + dist_x * (rag - lag);
+
+	    lrb = (left & 0x00ff00ff);
+	    rrb = (right & 0x00ff00ff);
+	    rb = (lrb << 8) + dist_x * (rrb - lrb);
+
+	    *((uint32_t *)(line->buffer + i)) = ag;
+	    *((uint32_t *)(line->buffer + i) + 1) = rb;
+	}
+#else
+	{
+	    uint64_t lagrb, ragrb;
+	    uint32_t lag, rag;
+	    uint32_t lrb, rrb;
+
+	    lag = (left & 0xff00ff00);
+	    lrb = (left & 0x00ff00ff);
+	    rag = (right & 0xff00ff00);
+	    rrb = (right & 0x00ff00ff);
+	    lagrb = (((uint64_t)lag) << 24) | lrb;
+	    ragrb = (((uint64_t)rag) << 24) | rrb;
+
+	    line->buffer[i] = (lagrb << 8) + dist_x * (ragrb - lagrb);
+	}
+#endif
+
+	x += ux;
+    }
+
+    line->y = y;
+}
+
+static uint32_t *
+fast_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_fixed_t fx, ux;
+    bilinear_info_t *info = iter->data;
+    line_t *line0, *line1;
+    int y0, y1;
+    int32_t dist_y;
+    int i;
+
+    fx = info->x;
+    ux = iter->image->common.transform->matrix[0][0];
+
+    y0 = pixman_fixed_to_int (info->y);
+    y1 = y0 + 1;
+    dist_y = pixman_fixed_to_bilinear_weight (info->y);
+    dist_y <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
+    line0 = &info->lines[y0 & 0x01];
+    line1 = &info->lines[y1 & 0x01];
+
+    if (line0->y != y0)
+    {
+	fetch_horizontal (
+	    &iter->image->bits, line0, y0, fx, ux, iter->width);
+    }
+
+    if (line1->y != y1)
+    {
+	fetch_horizontal (
+	    &iter->image->bits, line1, y1, fx, ux, iter->width);
+    }
+
+    for (i = 0; i < iter->width; ++i)
+    {
+#if SIZEOF_LONG <= 4
+	uint32_t ta, tr, tg, tb;
+	uint32_t ba, br, bg, bb;
+	uint32_t tag, trb;
+	uint32_t bag, brb;
+	uint32_t a, r, g, b;
+
+	tag = *((uint32_t *)(line0->buffer + i));
+	trb = *((uint32_t *)(line0->buffer + i) + 1);
+	bag = *((uint32_t *)(line1->buffer + i));
+	brb = *((uint32_t *)(line1->buffer + i) + 1);
+
+	ta = tag >> 16;
+	ba = bag >> 16;
+	a = (ta << 8) + dist_y * (ba - ta);
+
+	tr = trb >> 16;
+	br = brb >> 16;
+	r = (tr << 8) + dist_y * (br - tr);
+
+	tg = tag & 0xffff;
+	bg = bag & 0xffff;
+	g = (tg << 8) + dist_y * (bg - tg);
+	
+	tb = trb & 0xffff;
+	bb = brb & 0xffff;
+	b = (tb << 8) + dist_y * (bb - tb);
+
+	a = (a <<  8) & 0xff000000;
+	r = (r <<  0) & 0x00ff0000;
+	g = (g >>  8) & 0x0000ff00;
+	b = (b >> 16) & 0x000000ff;
+#else
+	uint64_t top = line0->buffer[i];
+	uint64_t bot = line1->buffer[i];
+	uint64_t tar = (top & 0xffff0000ffff0000ULL) >> 16;
+	uint64_t bar = (bot & 0xffff0000ffff0000ULL) >> 16;
+	uint64_t tgb = (top & 0x0000ffff0000ffffULL);
+	uint64_t bgb = (bot & 0x0000ffff0000ffffULL);
+	uint64_t ar, gb;
+	uint32_t a, r, g, b;
+
+	ar = (tar << 8) + dist_y * (bar - tar);
+	gb = (tgb << 8) + dist_y * (bgb - tgb);
+
+	a = ((ar >> 24) & 0xff000000);
+	r = ((ar >>  0) & 0x00ff0000);
+	g = ((gb >> 40) & 0x0000ff00);
+	b = ((gb >> 16) & 0x000000ff);
+#endif
+
+	iter->buffer[i] = a | r | g | b;
+    }
+
+    info->y += iter->image->common.transform->matrix[1][1];
+
+    return iter->buffer;
+}
+
+static void
+bilinear_cover_iter_fini (pixman_iter_t *iter)
+{
+    free (iter->data);
+}
+
+static void
+fast_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
+{
+    int width = iter->width;
+    bilinear_info_t *info;
+    pixman_vector_t v;
+
+    /* Reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (iter->image->common.transform, &v))
+	goto fail;
+
+    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t));
+    if (!info)
+	goto fail;
+
+    info->x = v.vector[0] - pixman_fixed_1 / 2;
+    info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+    /* It is safe to set the y coordinates to -1 initially
+     * because COVER_CLIP_BILINEAR ensures that we will only
+     * be asked to fetch lines in the [0, height) interval
+     */
+    info->lines[0].y = -1;
+    info->lines[0].buffer = &(info->data[0]);
+    info->lines[1].y = -1;
+    info->lines[1].buffer = &(info->data[width]);
+
+    iter->get_scanline = fast_fetch_bilinear_cover;
+    iter->fini = bilinear_cover_iter_fini;
+
+    iter->data = info;
+    return;
+
+fail:
+    /* Something went wrong, either a bad matrix or OOM; in such cases,
+     * we don't guarantee any particular rendering.
+     */
+    _pixman_log_error (
+	FUNC, "Allocation failure or bad matrix, skipping rendering\n");
+    
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+    iter->fini = NULL;
+}
+
+static uint32_t *
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
+					  const uint32_t *mask)
+{
+
+    pixman_image_t * ima = iter->image;
+    int              offset = iter->x;
+    int              line = iter->y++;
+    int              width = iter->width;
+    uint32_t *       buffer = iter->buffer;
+
+    bits_image_t *bits = &ima->bits;
+    pixman_fixed_t x_top, x_bottom, x;
+    pixman_fixed_t ux_top, ux_bottom, ux;
+    pixman_vector_t v;
+    uint32_t top_mask, bottom_mask;
+    uint32_t *top_row;
+    uint32_t *bottom_row;
+    uint32_t *end;
+    uint32_t zero[2] = { 0, 0 };
+    uint32_t one = 1;
+    int y, y1, y2;
+    int disty;
+    int mask_inc;
+    int w;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
+    if (!pixman_transform_point_3d (bits->common.transform, &v))
+	return iter->buffer;
+
+    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+    y = v.vector[1] - pixman_fixed_1/2;
+    disty = pixman_fixed_to_bilinear_weight (y);
+
+    /* Load the pointers to the first and second lines from the source
+     * image that bilinear code must read.
+     *
+     * The main trick in this code is about the check if any line are
+     * outside of the image;
+     *
+     * When I realize that a line (any one) is outside, I change
+     * the pointer to a dummy area with zeros. Once I change this, I
+     * must be sure the pointer will not change, so I set the
+     * variables to each pointer increments inside the loop.
+     */
+    y1 = pixman_fixed_to_int (y);
+    y2 = y1 + 1;
+
+    if (y1 < 0 || y1 >= bits->height)
+    {
+	top_row = zero;
+	x_top = 0;
+	ux_top = 0;
+    }
+    else
     {
-	const fetcher_info_t *f;
+	top_row = bits->bits + y1 * bits->rowstride;
+	x_top = x;
+	ux_top = ux;
+    }
 
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+    if (y2 < 0 || y2 >= bits->height)
+    {
+	bottom_row = zero;
+	x_bottom = 0;
+	ux_bottom = 0;
+    }
+    else
+    {
+	bottom_row = bits->bits + y2 * bits->rowstride;
+	x_bottom = x;
+	ux_bottom = ux;
+    }
+
+    /* Instead of checking whether the operation uses the mast in
+     * each loop iteration, verify this only once and prepare the
+     * variables to make the code smaller inside the loop.
+     */
+    if (!mask)
+    {
+        mask_inc = 0;
+        mask = &one;
+    }
+    else
+    {
+        /* If have a mask, prepare the variables to check it */
+        mask_inc = 1;
+    }
+
+    /* If both are zero, then the whole thing is zero */
+    if (top_row == zero && bottom_row == zero)
+    {
+	memset (buffer, 0, width * sizeof (uint32_t));
+	return iter->buffer;
+    }
+    else if (bits->format == PIXMAN_x8r8g8b8)
+    {
+	if (top_row == zero)
 	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
+	    top_mask = 0;
+	    bottom_mask = 0xff000000;
+	}
+	else if (bottom_row == zero)
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0;
+	}
+	else
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0xff000000;
+	}
+    }
+    else
+    {
+	top_mask = 0;
+	bottom_mask = 0;
+    }
 
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
+    end = buffer + width;
 
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
+    /* Zero fill to the left of the image */
+    while (buffer < end && x < pixman_fixed_minus_1)
+    {
+	*buffer++ = 0;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Left edge
+     */
+    while (buffer < end && x < 0)
+    {
+	uint32_t tr, br;
+	int32_t distx;
+
+	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	distx = pixman_fixed_to_bilinear_weight (x);
+
+	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Main part */
+    w = pixman_int_to_fixed (bits->width - 1);
+
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, tr, bl, br;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	    distx = pixman_fixed_to_bilinear_weight (x);
+
+	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
 	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
     }
 
-    return FALSE;
+    /* Right Edge */
+    w = pixman_int_to_fixed (bits->width);
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, bl;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+	    distx = pixman_fixed_to_bilinear_weight (x);
+
+	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Zero fill to the left of the image */
+    while (buffer < end)
+	*buffer++ = 0;
+
+    return iter->buffer;
 }
 
-static pixman_bool_t
-fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+static force_inline void
+bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
+					       int              offset,
+					       int              line,
+					       int              width,
+					       uint32_t *       buffer,
+					       const uint32_t * mask,
+
+					       convert_pixel_t	convert_pixel,
+					       pixman_format_code_t	format,
+					       pixman_repeat_t	repeat_mode)
 {
-    pixman_image_t *image = iter->image;
+    bits_image_t *bits = &image->bits;
+    pixman_fixed_t *params = image->common.filter_params;
+    int cwidth = pixman_fixed_to_int (params[0]);
+    int cheight = pixman_fixed_to_int (params[1]);
+    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
+    int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
+    int x_phase_bits = pixman_fixed_to_int (params[2]);
+    int y_phase_bits = pixman_fixed_to_int (params[3]);
+    int x_phase_shift = 16 - x_phase_bits;
+    int y_phase_shift = 16 - y_phase_bits;
+    pixman_fixed_t vx, vy;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int k;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
 
-    if ((iter->iter_flags & ITER_NARROW)		&&
-	(iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS)
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    vx = v.vector[0];
+    vy = v.vector[1];
+
+    for (k = 0; k < width; ++k)
     {
-	const fetcher_info_t *f;
+	pixman_fixed_t *y_params;
+	int satot, srtot, sgtot, sbtot;
+	pixman_fixed_t x, y;
+	int32_t x1, x2, y1, y2;
+	int32_t px, py;
+	int i, j;
+
+	if (mask && !mask[k])
+	    goto next;
+
+	/* Round x and y to the middle of the closest phase before continuing. This
+	 * ensures that the convolution matrix is aligned right, since it was
+	 * positioned relative to a particular phase (and not relative to whatever
+	 * exact fraction we happen to get here).
+	 */
+	x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1);
+	y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1);
 
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	px = (x & 0xffff) >> x_phase_shift;
+	py = (y & 0xffff) >> y_phase_shift;
+
+	x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+	y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+	x2 = x1 + cwidth;
+	y2 = y1 + cheight;
+
+	satot = srtot = sgtot = sbtot = 0;
+
+	y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight;
+
+	for (i = y1; i < y2; ++i)
 	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
+	    pixman_fixed_t fy = *y_params++;
 
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
+	    if (fy)
+	    {
+		pixman_fixed_t *x_params = params + 4 + px * cwidth;
 
-		if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
-		    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+		for (j = x1; j < x2; ++j)
 		{
-		    iter->get_scanline = fast_dest_fetch_noop;
+		    pixman_fixed_t fx = *x_params++;
+		    int rx = j;
+		    int ry = i;
+		    
+		    if (fx)
+		    {
+			pixman_fixed_t f;
+			uint32_t pixel, mask;
+			uint8_t *row;
+
+			mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+			if (repeat_mode != PIXMAN_REPEAT_NONE)
+			{
+			    repeat (repeat_mode, &rx, bits->width);
+			    repeat (repeat_mode, &ry, bits->height);
+
+			    row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+			    pixel = convert_pixel (row, rx) | mask;
+			}
+			else
+			{
+			    if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height)
+			    {
+				pixel = 0;
+			    }
+			    else
+			    {
+				row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+				pixel = convert_pixel (row, rx) | mask;
+			    }
+			}
+
+			f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16;
+			srtot += (int)RED_8 (pixel) * f;
+			sgtot += (int)GREEN_8 (pixel) * f;
+			sbtot += (int)BLUE_8 (pixel) * f;
+			satot += (int)ALPHA_8 (pixel) * f;
+		    }
 		}
-		else
-		{
-		    iter->get_scanline = f->get_scanline;
-		}
-		iter->write_back = f->write_back;
-		return TRUE;
 	    }
 	}
+
+	satot = (satot + 0x8000) >> 16;
+	srtot = (srtot + 0x8000) >> 16;
+	sgtot = (sgtot + 0x8000) >> 16;
+	sbtot = (sbtot + 0x8000) >> 16;
+
+	satot = CLIP (satot, 0, 0xff);
+	srtot = CLIP (srtot, 0, 0xff);
+	sgtot = CLIP (sgtot, 0, 0xff);
+	sbtot = CLIP (sbtot, 0, 0xff);
+
+	buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0);
+
+    next:
+	vx += ux;
+	vy += uy;
     }
-    return FALSE;
 }
 
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static force_inline void
+bits_image_fetch_bilinear_affine (pixman_image_t * image,
+				  int              offset,
+				  int              line,
+				  int              width,
+				  uint32_t *       buffer,
+				  const uint32_t * mask,
+
+				  convert_pixel_t	convert_pixel,
+				  pixman_format_code_t	format,
+				  pixman_repeat_t	repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	int x1, y1, x2, y2;
+	uint32_t tl, tr, bl, br;
+	int32_t distx, disty;
+	int width = image->bits.width;
+	int height = image->bits.height;
+	const uint8_t *row1;
+	const uint8_t *row2;
+
+	if (mask && !mask[i])
+	    goto next;
+
+	x1 = x - pixman_fixed_1 / 2;
+	y1 = y - pixman_fixed_1 / 2;
+
+	distx = pixman_fixed_to_bilinear_weight (x1);
+	disty = pixman_fixed_to_bilinear_weight (y1);
+
+	y1 = pixman_fixed_to_int (y1);
+	y2 = y1 + 1;
+	x1 = pixman_fixed_to_int (x1);
+	x2 = x1 + 1;
+
+	if (repeat_mode != PIXMAN_REPEAT_NONE)
+	{
+	    uint32_t mask;
+
+	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+	    repeat (repeat_mode, &x1, width);
+	    repeat (repeat_mode, &y1, height);
+	    repeat (repeat_mode, &x2, width);
+	    repeat (repeat_mode, &y2, height);
+
+	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+	    tl = convert_pixel (row1, x1) | mask;
+	    tr = convert_pixel (row1, x2) | mask;
+	    bl = convert_pixel (row2, x1) | mask;
+	    br = convert_pixel (row2, x2) | mask;
+	}
+	else
+	{
+	    uint32_t mask1, mask2;
+	    int bpp;
+
+	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+	     * which means if you use it in expressions, those
+	     * expressions become unsigned themselves. Since
+	     * the variables below can be negative in some cases,
+	     * that will lead to crashes on 64 bit architectures.
+	     *
+	     * So this line makes sure bpp is signed
+	     */
+	    bpp = PIXMAN_FORMAT_BPP (format);
+
+	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
+	    {
+		buffer[i] = 0;
+		goto next;
+	    }
+
+	    if (y2 == 0)
+	    {
+		row1 = zero;
+		mask1 = 0;
+	    }
+	    else
+	    {
+		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+		row1 += bpp / 8 * x1;
+
+		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+
+	    if (y1 == height - 1)
+	    {
+		row2 = zero;
+		mask2 = 0;
+	    }
+	    else
+	    {
+		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+		row2 += bpp / 8 * x1;
+
+		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+
+	    if (x2 == 0)
+	    {
+		tl = 0;
+		bl = 0;
+	    }
+	    else
+	    {
+		tl = convert_pixel (row1, 0) | mask1;
+		bl = convert_pixel (row2, 0) | mask2;
+	    }
+
+	    if (x1 == width - 1)
+	    {
+		tr = 0;
+		br = 0;
+	    }
+	    else
+	    {
+		tr = convert_pixel (row1, 1) | mask1;
+		br = convert_pixel (row2, 1) | mask2;
+	    }
+	}
+
+	buffer[i] = bilinear_interpolation (
+	    tl, tr, bl, br, distx, disty);
+
+    next:
+	x += ux;
+	y += uy;
+    }
+}
+
+static force_inline void
+bits_image_fetch_nearest_affine (pixman_image_t * image,
+				 int              offset,
+				 int              line,
+				 int              width,
+				 uint32_t *       buffer,
+				 const uint32_t * mask,
+				 
+				 convert_pixel_t	convert_pixel,
+				 pixman_format_code_t	format,
+				 pixman_repeat_t	repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	int width, height, x0, y0;
+	const uint8_t *row;
+
+	if (mask && !mask[i])
+	    goto next;
+	
+	width = image->bits.width;
+	height = image->bits.height;
+	x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+	y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+	if (repeat_mode == PIXMAN_REPEAT_NONE &&
+	    (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
+	{
+	    buffer[i] = 0;
+	}
+	else
+	{
+	    uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+	    if (repeat_mode != PIXMAN_REPEAT_NONE)
+	    {
+		repeat (repeat_mode, &x0, width);
+		repeat (repeat_mode, &y0, height);
+	    }
+
+	    row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+
+	    buffer[i] = convert_pixel (row, x0) | mask;
+	}
+
+    next:
+	x += ux;
+	y += uy;
+    }
+}
+
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+    return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+    return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+    return *(row + x) << 24;
+}
+
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+    return convert_0565_to_0888 (*((uint16_t *)row + x));
+}
+
+#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode)  \
+    static uint32_t *							\
+    bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t   *iter, \
+							    const uint32_t * mask) \
+    {									\
+	bits_image_fetch_separable_convolution_affine (                 \
+	    iter->image,                                                \
+	    iter->x, iter->y++,                                         \
+	    iter->width,                                                \
+	    iter->buffer, mask,                                         \
+	    convert_ ## format,                                         \
+	    PIXMAN_ ## format,                                          \
+	    repeat_mode);                                               \
+									\
+	return iter->buffer;                                            \
+    }
+
+#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
+    static uint32_t *							\
+    bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t   *iter,	\
+					       const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_bilinear_affine (iter->image,			\
+					  iter->x, iter->y++,		\
+					  iter->width,			\
+					  iter->buffer, mask,		\
+					  convert_ ## format,		\
+					  PIXMAN_ ## format,		\
+					  repeat_mode);			\
+	return iter->buffer;						\
+    }
+
+#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
+    static uint32_t *							\
+    bits_image_fetch_nearest_affine_ ## name (pixman_iter_t   *iter,	\
+					      const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_nearest_affine (iter->image,			\
+					 iter->x, iter->y++,		\
+					 iter->width,			\
+					 iter->buffer, mask,		\
+					 convert_ ## format,		\
+					 PIXMAN_ ## format,		\
+					 repeat_mode);			\
+	return iter->buffer;						\
+    }
+
+#define MAKE_FETCHERS(name, format, repeat_mode)			\
+    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
+    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)			\
+    MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode)
+
+MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
+
+#define IMAGE_FLAGS							\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t fast_iters[] = 
+{
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW | ITER_SRC,
+      _pixman_iter_init_bits_stride, fast_fetch_r5g6b5, NULL },
+
+    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST,
+      _pixman_iter_init_bits_stride,
+      fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
+    
+    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST | ITER_IGNORE_RGB | ITER_IGNORE_ALPHA,
+      _pixman_iter_init_bits_stride,
+      fast_dest_fetch_noop, fast_write_back_r5g6b5 },
+
+    { PIXMAN_a8r8g8b8,
+      (FAST_PATH_STANDARD_FLAGS			|
+       FAST_PATH_SCALE_TRANSFORM		|
+       FAST_PATH_BILINEAR_FILTER		|
+       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
+      ITER_NARROW | ITER_SRC,
+      fast_bilinear_cover_iter_init,
+      NULL, NULL
+    },
+
+#define FAST_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_X_UNIT_POSITIVE		|				\
+     FAST_PATH_Y_UNIT_ZERO		|				\
+     FAST_PATH_NONE_REPEAT		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+    { PIXMAN_a8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      ITER_NARROW | ITER_SRC,
+      NULL, bits_image_fetch_bilinear_no_repeat_8888, NULL
+    },
+
+    { PIXMAN_x8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      ITER_NARROW | ITER_SRC,
+      NULL, bits_image_fetch_bilinear_no_repeat_8888, NULL
+    },
+
+#define GENERAL_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+#define GENERAL_NEAREST_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_NEAREST_FILTER)
+
+#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS				\
+    (FAST_PATH_NO_ALPHA_MAP            |				\
+     FAST_PATH_NO_ACCESSORS            |				\
+     FAST_PATH_HAS_TRANSFORM           |				\
+     FAST_PATH_AFFINE_TRANSFORM        |				\
+     FAST_PATH_SEPARABLE_CONVOLUTION_FILTER)
+    
+#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)   \
+    { PIXMAN_ ## format,						\
+      GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
+      ITER_NARROW | ITER_SRC,						\
+      NULL, bits_image_fetch_separable_convolution_affine_ ## name, NULL \
+    },
+
+#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      ITER_NARROW | ITER_SRC,						\
+      NULL, bits_image_fetch_bilinear_affine_ ## name, NULL,		\
+    },
+
+#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      ITER_NARROW | ITER_SRC,						\
+      NULL, bits_image_fetch_nearest_affine_ ## name, NULL		\
+    },
+
+#define AFFINE_FAST_PATHS(name, format, repeat)				\
+    SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)	\
+    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+    
+    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
+    AFFINE_FAST_PATHS (none_a8, a8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
+    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
+    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
+    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
+    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
+
+    { PIXMAN_null },
+};
 
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
@@ -2351,8 +3286,7 @@ _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
 
     imp->fill = fast_path_fill;
-    imp->src_iter_init = fast_src_iter_init;
-    imp->dest_iter_init = fast_dest_iter_init;
+    imp->iter_info = fast_iters;
 
     return imp;
 }
diff --git a/lib/pixman/pixman/pixman-filter.c b/lib/pixman/pixman/pixman-filter.c
index 5ff7b6eaa..b2bf53fed 100644
--- a/lib/pixman/pixman/pixman-filter.c
+++ b/lib/pixman/pixman/pixman-filter.c
@@ -275,7 +275,7 @@ create_1d_filter (int             *width,
 	    }
 
 	    total += c;
-            *p++ = (pixman_fixed_t)(c * 65535.0 + 0.5);
+            *p++ = (pixman_fixed_t)(c * 65536.0 + 0.5);
         }
 
 	/* Normalize */
diff --git a/lib/pixman/pixman/pixman-general.c b/lib/pixman/pixman/pixman-general.c
index 93a1b9acf..a653fa71a 100644
--- a/lib/pixman/pixman/pixman-general.c
+++ b/lib/pixman/pixman/pixman-general.c
@@ -37,43 +37,47 @@
 #include <string.h>
 #include "pixman-private.h"
 
-static pixman_bool_t
-general_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+general_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *info)
 {
     pixman_image_t *image = iter->image;
 
-    if (image->type == LINEAR)
-	_pixman_linear_gradient_iter_init (image, iter);
-    else if (image->type == RADIAL)
+    switch (image->type)
+    {
+    case BITS:
+        if ((iter->iter_flags & ITER_SRC) == ITER_SRC)
+            _pixman_bits_image_src_iter_init (image, iter);
+        else
+            _pixman_bits_image_dest_iter_init (image, iter);
+        break;
+
+    case LINEAR:
+        _pixman_linear_gradient_iter_init (image, iter);
+        break;
+
+    case RADIAL:
 	_pixman_radial_gradient_iter_init (image, iter);
-    else if (image->type == CONICAL)
+        break;
+
+    case CONICAL:
 	_pixman_conical_gradient_iter_init (image, iter);
-    else if (image->type == BITS)
-	_pixman_bits_image_src_iter_init (image, iter);
-    else if (image->type == SOLID)
+        break;
+
+    case SOLID:
         _pixman_log_error (FUNC, "Solid image not handled by noop");
-    else         
-	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+        break;
 
-    return TRUE;
+    default:
+	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+        break;
+    }
 }
 
-static pixman_bool_t
-general_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static const pixman_iter_info_t general_iters[] =
 {
-    if (iter->image->type == BITS)
-    {
-	_pixman_bits_image_dest_iter_init (iter->image, iter);
-
-	return TRUE;
-    }
-    else
-    {
-	_pixman_log_error (FUNC, "Trying to write to a non-writable image");
-
-	return FALSE;
-    }
-}
+    { PIXMAN_any, 0, 0, general_iter_init, NULL, NULL },
+    { PIXMAN_null },
+};
 
 typedef struct op_info_t op_info_t;
 struct op_info_t
@@ -110,13 +114,13 @@ general_composite_rect  (pixman_implementation_t *imp,
                          pixman_composite_info_t *info)
 {
     PIXMAN_COMPOSITE_ARGS (info);
-    uint64_t stack_scanline_buffer[(SCANLINE_BUFFER_LENGTH * 3 + 7) / 8];
+    uint8_t stack_scanline_buffer[3 * SCANLINE_BUFFER_LENGTH];
     uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
     uint8_t *src_buffer, *mask_buffer, *dest_buffer;
     pixman_iter_t src_iter, mask_iter, dest_iter;
     pixman_combine_32_func_t compose;
     pixman_bool_t component_alpha;
-    iter_flags_t narrow, src_iter_flags;
+    iter_flags_t width_flag, src_iter_flags;
     int Bpp;
     int i;
 
@@ -124,28 +128,36 @@ general_composite_rect  (pixman_implementation_t *imp,
 	(!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
 	(dest_image->common.flags & FAST_PATH_NARROW_FORMAT))
     {
-	narrow = ITER_NARROW;
+	width_flag = ITER_NARROW;
 	Bpp = 4;
     }
     else
     {
-	narrow = 0;
+	width_flag = ITER_WIDE;
 	Bpp = 16;
     }
 
-    if (width * Bpp > SCANLINE_BUFFER_LENGTH)
+#define ALIGN(addr)							\
+    ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+    src_buffer = ALIGN (scanline_buffer);
+    mask_buffer = ALIGN (src_buffer + width * Bpp);
+    dest_buffer = ALIGN (mask_buffer + width * Bpp);
+
+    if (ALIGN (dest_buffer + width * Bpp) >
+	    scanline_buffer + sizeof (stack_scanline_buffer))
     {
-	scanline_buffer = pixman_malloc_abc (width, 3, Bpp);
+	scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);
 
 	if (!scanline_buffer)
 	    return;
-    }
 
-    src_buffer = scanline_buffer;
-    mask_buffer = src_buffer + width * Bpp;
-    dest_buffer = mask_buffer + width * Bpp;
+	src_buffer = ALIGN (scanline_buffer);
+	mask_buffer = ALIGN (src_buffer + width * Bpp);
+	dest_buffer = ALIGN (mask_buffer + width * Bpp);
+    }
 
-    if (!narrow)
+    if (width_flag == ITER_WIDE)
     {
 	/* To make sure there aren't any NANs in the buffers */
 	memset (src_buffer, 0, width * Bpp);
@@ -154,11 +166,12 @@ general_composite_rect  (pixman_implementation_t *imp,
     }
     
     /* src iter */
-    src_iter_flags = narrow | op_flags[op].src;
+    src_iter_flags = width_flag | op_flags[op].src | ITER_SRC;
 
-    _pixman_implementation_src_iter_init (imp->toplevel, &src_iter, src_image,
-					  src_x, src_y, width, height,
-					  src_buffer, src_iter_flags, info->src_flags);
+    _pixman_implementation_iter_init (imp->toplevel, &src_iter, src_image,
+                                      src_x, src_y, width, height,
+                                      src_buffer, src_iter_flags,
+                                      info->src_flags);
 
     /* mask iter */
     if ((src_iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
@@ -176,17 +189,19 @@ general_composite_rect  (pixman_implementation_t *imp,
         mask_image->common.component_alpha    &&
         PIXMAN_FORMAT_RGB (mask_image->bits.format);
 
-    _pixman_implementation_src_iter_init (
-	imp->toplevel, &mask_iter, mask_image, mask_x, mask_y, width, height,
-	mask_buffer, narrow | (component_alpha? 0 : ITER_IGNORE_RGB), info->mask_flags);
+    _pixman_implementation_iter_init (
+	imp->toplevel, &mask_iter,
+	mask_image, mask_x, mask_y, width, height, mask_buffer,
+	ITER_SRC | width_flag | (component_alpha? 0 : ITER_IGNORE_RGB),
+	info->mask_flags);
 
     /* dest iter */
-    _pixman_implementation_dest_iter_init (
+    _pixman_implementation_iter_init (
 	imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height,
-	dest_buffer, narrow | op_flags[op].dst, info->dest_flags);
+	dest_buffer, ITER_DEST | width_flag | op_flags[op].dst, info->dest_flags);
 
     compose = _pixman_implementation_lookup_combiner (
-	imp->toplevel, op, component_alpha, narrow);
+	imp->toplevel, op, component_alpha, width_flag != ITER_WIDE);
 
     for (i = 0; i < height; ++i)
     {
@@ -201,6 +216,13 @@ general_composite_rect  (pixman_implementation_t *imp,
 	dest_iter.write_back (&dest_iter);
     }
 
+    if (src_iter.fini)
+	src_iter.fini (&src_iter);
+    if (mask_iter.fini)
+	mask_iter.fini (&mask_iter);
+    if (dest_iter.fini)
+	dest_iter.fini (&dest_iter);
+    
     if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
 	free (scanline_buffer);
 }
@@ -219,8 +241,7 @@ _pixman_implementation_create_general (void)
     _pixman_setup_combiner_functions_32 (imp);
     _pixman_setup_combiner_functions_float (imp);
 
-    imp->src_iter_init = general_src_iter_init;
-    imp->dest_iter_init = general_dest_iter_init;
+    imp->iter_info = general_iters;
 
     return imp;
 }
diff --git a/lib/pixman/pixman/pixman-glyph.c b/lib/pixman/pixman/pixman-glyph.c
index 5a271b64b..96a349ab4 100644
--- a/lib/pixman/pixman/pixman-glyph.c
+++ b/lib/pixman/pixman/pixman-glyph.c
@@ -391,6 +391,9 @@ box32_intersect (pixman_box32_t *dest,
     return dest->x2 > dest->x1 && dest->y2 > dest->y1;
 }
 
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
 PIXMAN_EXPORT void
 pixman_composite_glyphs_no_mask (pixman_op_t            op,
 				 pixman_image_t        *src,
@@ -630,6 +633,9 @@ out:
  *   - Trim the mask to the destination clip/image?
  *   - Trim composite region based on sources, when the op ignores 0s.
  */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
 PIXMAN_EXPORT void
 pixman_composite_glyphs (pixman_op_t            op,
 			 pixman_image_t        *src,
diff --git a/lib/pixman/pixman/pixman-image.c b/lib/pixman/pixman/pixman-image.c
index 65041b43b..1ff1a4974 100644
--- a/lib/pixman/pixman/pixman-image.c
+++ b/lib/pixman/pixman/pixman-image.c
@@ -502,8 +502,10 @@ compute_image_info (pixman_image_t *image)
 	break;
     }
 
-    /* Alpha map */
-    if (!image->common.alpha_map)
+    /* Alpha maps are only supported for BITS images, so it's always
+     * safe to ignore their presense for non-BITS images
+     */
+    if (!image->common.alpha_map || image->type != BITS)
     {
 	flags |= FAST_PATH_NO_ALPHA_MAP;
     }
@@ -918,12 +920,15 @@ _pixman_image_get_solid (pixman_implementation_t *imp,
 	pixman_iter_t iter;
 
     otherwise:
-	_pixman_implementation_src_iter_init (
+	_pixman_implementation_iter_init (
 	    imp, &iter, image, 0, 0, 1, 1,
 	    (uint8_t *)&result,
-	    ITER_NARROW, image->common.flags);
+	    ITER_NARROW | ITER_SRC, image->common.flags);
 	
 	result = *iter.get_scanline (&iter, NULL);
+
+	if (iter.fini)
+	    iter.fini (&iter);
     }
 
     /* If necessary, convert RGB <--> BGR. */
diff --git a/lib/pixman/pixman/pixman-implementation.c b/lib/pixman/pixman/pixman-implementation.c
index cfb82bb1f..588405451 100644
--- a/lib/pixman/pixman/pixman-implementation.c
+++ b/lib/pixman/pixman/pixman-implementation.c
@@ -285,18 +285,26 @@ _pixman_implementation_fill (pixman_implementation_t *imp,
     return FALSE;
 }
 
-pixman_bool_t
-_pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
-				      pixman_iter_t             *iter,
-				      pixman_image_t		*image,
-				      int			 x,
-				      int			 y,
-				      int			 width,
-				      int			 height,
-				      uint8_t			*buffer,
-				      iter_flags_t		 iter_flags,
-				      uint32_t                   image_flags)
+static uint32_t *
+get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
 {
+    return NULL;
+}
+
+void
+_pixman_implementation_iter_init (pixman_implementation_t *imp,
+                                  pixman_iter_t           *iter,
+                                  pixman_image_t          *image,
+                                  int                      x,
+                                  int                      y,
+                                  int                      width,
+                                  int                      height,
+                                  uint8_t                 *buffer,
+                                  iter_flags_t             iter_flags,
+                                  uint32_t                 image_flags)
+{
+    pixman_format_code_t format;
+
     iter->image = image;
     iter->buffer = (uint32_t *)buffer;
     iter->x = x;
@@ -305,48 +313,40 @@ _pixman_implementation_src_iter_init (pixman_implementation_t	*imp,
     iter->height = height;
     iter->iter_flags = iter_flags;
     iter->image_flags = image_flags;
+    iter->fini = NULL;
 
-    while (imp)
+    if (!iter->image)
     {
-	if (imp->src_iter_init && (*imp->src_iter_init) (imp, iter))
-	    return TRUE;
-
-	imp = imp->fallback;
+	iter->get_scanline = get_scanline_null;
+	return;
     }
 
-    return FALSE;
-}
-
-pixman_bool_t
-_pixman_implementation_dest_iter_init (pixman_implementation_t	*imp,
-				       pixman_iter_t            *iter,
-				       pixman_image_t		*image,
-				       int			 x,
-				       int			 y,
-				       int			 width,
-				       int			 height,
-				       uint8_t			*buffer,
-				       iter_flags_t		 iter_flags,
-				       uint32_t                  image_flags)
-{
-    iter->image = image;
-    iter->buffer = (uint32_t *)buffer;
-    iter->x = x;
-    iter->y = y;
-    iter->width = width;
-    iter->height = height;
-    iter->iter_flags = iter_flags;
-    iter->image_flags = image_flags;
+    format = iter->image->common.extended_format_code;
 
     while (imp)
     {
-	if (imp->dest_iter_init && (*imp->dest_iter_init) (imp, iter))
-	    return TRUE;
-
-	imp = imp->fallback;
+        if (imp->iter_info)
+        {
+            const pixman_iter_info_t *info;
+
+            for (info = imp->iter_info; info->format != PIXMAN_null; ++info)
+            {
+                if ((info->format == PIXMAN_any || info->format == format) &&
+                    (info->image_flags & image_flags) == info->image_flags &&
+                    (info->iter_flags & iter_flags) == info->iter_flags)
+                {
+                    iter->get_scanline = info->get_scanline;
+                    iter->write_back = info->write_back;
+
+                    if (info->initializer)
+                        info->initializer (iter, info);
+                    return;
+                }
+            }
+        }
+
+        imp = imp->fallback;
     }
-
-    return FALSE;
 }
 
 pixman_bool_t
diff --git a/lib/pixman/pixman/pixman-matrix.c b/lib/pixman/pixman/pixman-matrix.c
index cacc05cc8..4032c137a 100644
--- a/lib/pixman/pixman/pixman-matrix.c
+++ b/lib/pixman/pixman/pixman-matrix.c
@@ -37,8 +37,7 @@
 static force_inline int
 count_leading_zeros (uint32_t x)
 {
-#if defined(__GNUC__) && \
-	(__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#ifdef HAVE_BUILTIN_CLZ
     return __builtin_clz (x);
 #else
     int n = 0;
diff --git a/lib/pixman/pixman/pixman-mmx.c b/lib/pixman/pixman/pixman-mmx.c
index 14790c029..f9a92ce09 100644
--- a/lib/pixman/pixman/pixman-mmx.c
+++ b/lib/pixman/pixman/pixman-mmx.c
@@ -301,6 +301,29 @@ negate (__m64 mask)
     return _mm_xor_si64 (mask, MC (4x00ff));
 }
 
+/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
+ * and maps its result to the same range.
+ *
+ * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
+ * Notation, Notation, Notation", the first of which is
+ *
+ *   prod(a, b) = (a * b + 128) / 255.
+ *
+ * By approximating the division by 255 as 257/65536 it can be replaced by a
+ * multiply and a right shift. This is the implementation that we use in
+ * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
+ * 3DNow!, and unavailable at the time of the book's publication) to perform
+ * the multiplication and right shift in a single operation.
+ *
+ *   prod(a, b) = ((a * b + 128) * 257) >> 16.
+ *
+ * A third way (how pix_multiply() was implemented prior to 14208344) exists
+ * also that performs the multiplication by 257 with adds and shifts.
+ *
+ * Where temp = a * b + 128
+ *
+ *   prod(a, b) = (temp + (temp >> 8)) >> 8.
+ */
 static force_inline __m64
 pix_multiply (__m64 a, __m64 b)
 {
@@ -3538,7 +3561,6 @@ mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
 #define BILINEAR_DECLARE_VARIABLES						\
     const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
     const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
-    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
     const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
     const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
     const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
@@ -3557,36 +3579,16 @@ do {										\
     __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
     __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
     __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
-    vx += unit_x;								\
-    if (BILINEAR_INTERPOLATION_BITS < 8)					\
-    {										\
-	/* calculate horizontal weights */					\
-	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
+    /* calculate horizontal weights */						\
+    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
 			  _mm_srli_pi16 (mm_x,					\
 					 16 - BILINEAR_INTERPOLATION_BITS)));	\
-	/* horizontal interpolation */						\
-	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
-	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
-	lo = _mm_madd_pi16 (p, mm_wh);						\
-	hi = _mm_madd_pi16 (q, mm_wh);						\
-    }										\
-    else									\
-    {										\
-	/* calculate horizontal weights */					\
-	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
-					16 - BILINEAR_INTERPOLATION_BITS));	\
-	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
-					16 - BILINEAR_INTERPOLATION_BITS);	\
-	/* horizontal interpolation */						\
-	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
-	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
-	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
-	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
-	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
-			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
-	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
-			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
-    }										\
+    /* horizontal interpolation */						\
+    __m64 p = _mm_unpacklo_pi16 (lo, hi);					\
+    __m64 q = _mm_unpackhi_pi16 (lo, hi);					\
+    vx += unit_x;								\
+    lo = _mm_madd_pi16 (p, mm_wh);						\
+    hi = _mm_madd_pi16 (q, mm_wh);						\
     mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
     /* shift and pack the result */						\
     hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
@@ -3899,52 +3901,23 @@ mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
-    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
-    { PIXMAN_a8,		mmx_fetch_a8 },
-    { PIXMAN_null }
-};
-
-static pixman_bool_t
-mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
-{
-    pixman_image_t *image = iter->image;
-
-#define FLAGS								\
+#define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
-	}
-    }
-
-    return FALSE;
-}
+static const pixman_iter_info_t mmx_iters[] = 
+{
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
+    },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
+    },
+    { PIXMAN_null },
+};
 
 static const pixman_fast_path_t mmx_fast_paths[] =
 {
@@ -4074,7 +4047,7 @@ _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
     imp->blt = mmx_blt;
     imp->fill = mmx_fill;
 
-    imp->src_iter_init = mmx_src_iter_init;
+    imp->iter_info = mmx_iters;
 
     return imp;
 }
diff --git a/lib/pixman/pixman/pixman-noop.c b/lib/pixman/pixman/pixman-noop.c
index e39996d9d..e59890492 100644
--- a/lib/pixman/pixman/pixman-noop.c
+++ b/lib/pixman/pixman/pixman-noop.c
@@ -37,12 +37,6 @@ noop_composite (pixman_implementation_t *imp,
     return;
 }
 
-static void
-dest_write_back_direct (pixman_iter_t *iter)
-{
-    iter->buffer += iter->image->bits.rowstride;
-}
-
 static uint32_t *
 noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
 {
@@ -53,110 +47,102 @@ noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
     return result;
 }
 
-static uint32_t *
-get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
-{
-    return NULL;
+static void
+noop_init_solid_narrow (pixman_iter_t *iter,
+			const pixman_iter_info_t *info)
+{ 
+    pixman_image_t *image = iter->image;
+    uint32_t *buffer = iter->buffer;
+    uint32_t *end = buffer + iter->width;
+    uint32_t color;
+
+    if (iter->image->type == SOLID)
+	color = image->solid.color_32;
+    else
+	color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
+
+    while (buffer < end)
+	*(buffer++) = color;
 }
 
-static pixman_bool_t
-noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+noop_init_solid_wide (pixman_iter_t *iter,
+		      const pixman_iter_info_t *info)
 {
     pixman_image_t *image = iter->image;
+    argb_t *buffer = (argb_t *)iter->buffer;
+    argb_t *end = buffer + iter->width;
+    argb_t color;
 
-#define FLAGS						\
-    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
-
-    if (!image)
-    {
-	iter->get_scanline = get_scanline_null;
-    }
-    else if ((iter->iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
-	     (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
-    {
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-    }
-    else if (image->common.extended_format_code == PIXMAN_solid		&&
-	     (iter->image->type == SOLID ||
-	      (iter->image_flags & FAST_PATH_NO_ALPHA_MAP)))
-    {
-	if (iter->iter_flags & ITER_NARROW)
-	{
-	    uint32_t *buffer = iter->buffer;
-	    uint32_t *end = buffer + iter->width;
-	    uint32_t color;
-
-	    if (image->type == SOLID)
-		color = image->solid.color_32;
-	    else
-		color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
-
-	    while (buffer < end)
-		*(buffer++) = color;
-	}
-	else
-	{
-	    argb_t *buffer = (argb_t *)iter->buffer;
-	    argb_t *end = buffer + iter->width;
-	    argb_t color;
-
-	    if (image->type == SOLID)
-		color = image->solid.color_float;
-	    else
-		color = image->bits.fetch_pixel_float (&image->bits, 0, 0);
-
-	    while (buffer < end)
-		*(buffer++) = color;
-	}
-
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-    }
-    else if (image->common.extended_format_code == PIXMAN_a8r8g8b8	&&
-	     (iter->iter_flags & ITER_NARROW)				&&
-	     (iter->image_flags & FLAGS) == FLAGS			&&
-	     iter->x >= 0 && iter->y >= 0				&&
-	     iter->x + iter->width <= image->bits.width			&&
-	     iter->y + iter->height <= image->bits.height)
-    {
-	iter->buffer =
-	    image->bits.bits + iter->y * image->bits.rowstride + iter->x;
-
-	iter->get_scanline = noop_get_scanline;
-    }
+    if (iter->image->type == SOLID)
+	color = image->solid.color_float;
     else
-    {
-	return FALSE;
-    }
+	color = image->bits.fetch_pixel_float (&image->bits, 0, 0);
 
-    return TRUE;
+    while (buffer < end)
+	*(buffer++) = color;
 }
 
-static pixman_bool_t
-noop_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+static void
+noop_init_direct_buffer (pixman_iter_t *iter, const pixman_iter_info_t *info)
 {
     pixman_image_t *image = iter->image;
-    uint32_t image_flags = iter->image_flags;
-    uint32_t iter_flags = iter->iter_flags;
-    
-    if ((image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS	&&
-	(iter_flags & ITER_NARROW) == ITER_NARROW				&&
-	((image->common.extended_format_code == PIXMAN_a8r8g8b8)	||
-	 (image->common.extended_format_code == PIXMAN_x8r8g8b8 &&
-	  (iter_flags & (ITER_LOCALIZED_ALPHA)))))
-    {
-	iter->buffer = image->bits.bits + iter->y * image->bits.rowstride + iter->x;
-
-	iter->get_scanline = _pixman_iter_get_scanline_noop;
-	iter->write_back = dest_write_back_direct;
-
-	return TRUE;
-    }
-    else
-    {
-	return FALSE;
-    }
+
+    iter->buffer =
+	image->bits.bits + iter->y * image->bits.rowstride + iter->x;
 }
 
+static void
+dest_write_back_direct (pixman_iter_t *iter)
+{
+    iter->buffer += iter->image->bits.rowstride;
+}
+
+static const pixman_iter_info_t noop_iters[] =
+{
+    /* Source iters */
+    { PIXMAN_any,
+      0, ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_SRC,
+      NULL,
+      _pixman_iter_get_scanline_noop,
+      NULL
+    },
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP, ITER_NARROW | ITER_SRC,
+      noop_init_solid_narrow,
+      _pixman_iter_get_scanline_noop,
+      NULL,
+    },
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP, ITER_WIDE | ITER_SRC,
+      noop_init_solid_wide,
+      _pixman_iter_get_scanline_noop,
+      NULL
+    },
+    { PIXMAN_a8r8g8b8,
+      FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |
+          FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,
+      ITER_NARROW | ITER_SRC,
+      noop_init_direct_buffer,
+      noop_get_scanline,
+      NULL
+    },
+    /* Dest iters */
+    { PIXMAN_a8r8g8b8,
+      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST,
+      noop_init_direct_buffer,
+      _pixman_iter_get_scanline_noop,
+      dest_write_back_direct
+    },
+    { PIXMAN_x8r8g8b8,
+      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST | ITER_LOCALIZED_ALPHA,
+      noop_init_direct_buffer,
+      _pixman_iter_get_scanline_noop,
+      dest_write_back_direct
+    },
+    { PIXMAN_null },
+};
+
 static const pixman_fast_path_t noop_fast_paths[] =
 {
     { PIXMAN_OP_DST, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, noop_composite },
@@ -169,8 +155,7 @@ _pixman_implementation_create_noop (pixman_implementation_t *fallback)
     pixman_implementation_t *imp =
 	_pixman_implementation_create (fallback, noop_fast_paths);
  
-    imp->src_iter_init = noop_src_iter_init;
-    imp->dest_iter_init = noop_dest_iter_init;
+    imp->iter_info = noop_iters;
 
     return imp;
 }
diff --git a/lib/pixman/pixman/pixman-private.h b/lib/pixman/pixman/pixman-private.h
index 6d9c05321..6ca13b216 100644
--- a/lib/pixman/pixman/pixman-private.h
+++ b/lib/pixman/pixman/pixman-private.h
@@ -57,7 +57,7 @@ struct argb_t
     float b;
 };
 
-typedef void (*fetch_scanline_t) (pixman_image_t *image,
+typedef void (*fetch_scanline_t) (bits_image_t   *image,
 				  int             x,
 				  int             y,
 				  int             width,
@@ -209,10 +209,12 @@ union pixman_image
 typedef struct pixman_iter_t pixman_iter_t;
 typedef uint32_t *(* pixman_iter_get_scanline_t) (pixman_iter_t *iter, const uint32_t *mask);
 typedef void      (* pixman_iter_write_back_t)   (pixman_iter_t *iter);
+typedef void	  (* pixman_iter_fini_t)	 (pixman_iter_t *iter);
 
 typedef enum
 {
-    ITER_NARROW =		(1 << 0),
+    ITER_NARROW =               (1 << 0),
+    ITER_WIDE =                 (1 << 1),
 
     /* "Localized alpha" is when the alpha channel is used only to compute
      * the alpha value of the destination. This means that the computation
@@ -229,9 +231,15 @@ typedef enum
      * we can treat it as if it were ARGB, which means in some cases we can
      * avoid copying it to a temporary buffer.
      */
-    ITER_LOCALIZED_ALPHA =	(1 << 1),
-    ITER_IGNORE_ALPHA =		(1 << 2),
-    ITER_IGNORE_RGB =		(1 << 3)
+    ITER_LOCALIZED_ALPHA =	(1 << 2),
+    ITER_IGNORE_ALPHA =		(1 << 3),
+    ITER_IGNORE_RGB =		(1 << 4),
+
+    /* These indicate whether the iterator is for a source
+     * or a destination image
+     */
+    ITER_SRC =			(1 << 5),
+    ITER_DEST =			(1 << 6)
 } iter_flags_t;
 
 struct pixman_iter_t
@@ -248,6 +256,7 @@ struct pixman_iter_t
     /* These function pointers are initialized by the implementation */
     pixman_iter_get_scanline_t	get_scanline;
     pixman_iter_write_back_t	write_back;
+    pixman_iter_fini_t          fini;
 
     /* These fields are scratch data that implementations can use */
     void *			data;
@@ -255,6 +264,19 @@ struct pixman_iter_t
     int				stride;
 };
 
+typedef struct pixman_iter_info_t pixman_iter_info_t;
+typedef void (* pixman_iter_initializer_t) (pixman_iter_t *iter,
+                                            const pixman_iter_info_t *info);
+struct pixman_iter_info_t
+{
+    pixman_format_code_t	format;
+    uint32_t			image_flags;
+    iter_flags_t		iter_flags;
+    pixman_iter_initializer_t	initializer;
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+};
+
 void
 _pixman_bits_image_setup_accessors (bits_image_t *image);
 
@@ -454,8 +476,6 @@ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
 					     int                      width,
 					     int                      height,
 					     uint32_t                 filler);
-typedef pixman_bool_t (*pixman_iter_init_func_t) (pixman_implementation_t *imp,
-						  pixman_iter_t           *iter);
 
 void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
 void _pixman_setup_combiner_functions_float (pixman_implementation_t *imp);
@@ -477,11 +497,10 @@ struct pixman_implementation_t
     pixman_implementation_t *	toplevel;
     pixman_implementation_t *	fallback;
     const pixman_fast_path_t *	fast_paths;
+    const pixman_iter_info_t *  iter_info;
 
     pixman_blt_func_t		blt;
     pixman_fill_func_t		fill;
-    pixman_iter_init_func_t     src_iter_init;
-    pixman_iter_init_func_t     dest_iter_init;
 
     pixman_combine_32_func_t	combine_32[PIXMAN_N_OPERATORS];
     pixman_combine_32_func_t	combine_32_ca[PIXMAN_N_OPERATORS];
@@ -542,29 +561,17 @@ _pixman_implementation_fill (pixman_implementation_t *imp,
                              int                      height,
                              uint32_t                 filler);
 
-pixman_bool_t
-_pixman_implementation_src_iter_init (pixman_implementation_t       *imp,
-				      pixman_iter_t                 *iter,
-				      pixman_image_t                *image,
-				      int                            x,
-				      int                            y,
-				      int                            width,
-				      int                            height,
-				      uint8_t                       *buffer,
-				      iter_flags_t                   flags,
-				      uint32_t                       image_flags);
-
-pixman_bool_t
-_pixman_implementation_dest_iter_init (pixman_implementation_t       *imp,
-				       pixman_iter_t                 *iter,
-				       pixman_image_t                *image,
-				       int                            x,
-				       int                            y,
-				       int                            width,
-				       int                            height,
-				       uint8_t                       *buffer,
-				       iter_flags_t                   flags,
-				       uint32_t                       image_flags);
+void
+_pixman_implementation_iter_init (pixman_implementation_t       *imp,
+                                  pixman_iter_t                 *iter,
+                                  pixman_image_t                *image,
+                                  int                            x,
+                                  int                            y,
+                                  int                            width,
+                                  int                            height,
+                                  uint8_t                       *buffer,
+                                  iter_flags_t                   flags,
+                                  uint32_t                       image_flags);
 
 /* Specific implementations */
 pixman_implementation_t *
@@ -586,6 +593,11 @@ pixman_implementation_t *
 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback);
 #endif
 
+#ifdef USE_SSSE3
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback);
+#endif
+
 #ifdef USE_ARM_SIMD
 pixman_implementation_t *
 _pixman_implementation_create_arm_simd (pixman_implementation_t *fallback);
@@ -647,6 +659,9 @@ _pixman_compute_composite_region32 (pixman_region32_t * region,
 uint32_t *
 _pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
 
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info);
+
 /* These "formats" all have depth 0, so they
  * will never clash with any real ones
  */
@@ -777,6 +792,9 @@ pixman_malloc_ab (unsigned int n, unsigned int b);
 void *
 pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c);
 
+void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c);
+
 pixman_bool_t
 _pixman_multiply_overflows_size (size_t a, size_t b);
 
diff --git a/lib/pixman/pixman/pixman-sse2.c b/lib/pixman/pixman/pixman-sse2.c
index 8a82eda7e..a6e780815 100644
--- a/lib/pixman/pixman/pixman-sse2.c
+++ b/lib/pixman/pixman/pixman-sse2.c
@@ -30,6 +30,9 @@
 #include <config.h>
 #endif
 
+/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
+#define PSHUFD_IS_FAST 0
+
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
 #include <emmintrin.h> /* for SSE2 intrinsics */
 #include "pixman-private.h"
@@ -5554,77 +5557,134 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
-#if BILINEAR_INTERPOLATION_BITS < 8
+#if PSHUFD_IS_FAST
+
+/***********************************************************************************/
+
 # define BILINEAR_DECLARE_VARIABLES						\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
     const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
-    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
-					  unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+					   unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4);		\
     const __m128i xmm_zero = _mm_setzero_si128 ();				\
-    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
-				   vx, -(vx + 1), vx, -(vx + 1))
-#else
+    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\
+				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\
+				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\
+				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\
+    __m128i xmm_wh_state;
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\
+do {										\
+    int phase = phase_;								\
+    __m128i xmm_wh, xmm_a, xmm_b;						\
+    /* fetch 2x2 pixel block into sse2 registers */				\
+    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
+    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
+    vx += unit_x;								\
+    /* vertical interpolation */						\
+    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
+    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
+    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);						\
+    /* calculate horizontal weights */						\
+    if (phase <= 0)								\
+    {										\
+	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
+	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);		\
+	phase = 0;								\
+    }										\
+    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\
+							   phase, phase));	\
+    /* horizontal interpolation */						\
+    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
+		xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);		\
+    /* shift the result */							\
+    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
+} while (0)
+
+#else /************************************************************************/
+
 # define BILINEAR_DECLARE_VARIABLES						\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
-    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
-    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
-					  -unit_x, -unit_x, -unit_x, -unit_x);	\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+					  unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4);		\
     const __m128i xmm_zero = _mm_setzero_si128 ();				\
-    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,				\
-				   -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
-#endif
+    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
+				   vx, -(vx + 1), vx, -(vx + 1))
 
-#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\
 do {										\
-    __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
+    __m128i xmm_wh, xmm_a, xmm_b;						\
     /* fetch 2x2 pixel block into sse2 registers */				\
-    __m128i tltr = _mm_loadl_epi64 (						\
-			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
-    __m128i blbr = _mm_loadl_epi64 (						\
-			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
+    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
+    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
+    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\
     vx += unit_x;								\
     /* vertical interpolation */						\
-    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
-					xmm_wt),				\
-		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
-					xmm_wb));				\
-    if (BILINEAR_INTERPOLATION_BITS < 8)					\
-    {										\
-	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
+    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
+    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
+    /* calculate horizontal weights */						\
+    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\
 					16 - BILINEAR_INTERPOLATION_BITS));	\
-	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
-	/* horizontal interpolation */						\
-	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
-		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
-    }										\
-    else									\
-    {										\
-	/* calculate horizontal weights */					\
-	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
-					16 - BILINEAR_INTERPOLATION_BITS));	\
-	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
-	/* horizontal interpolation */						\
-	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
-	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
-	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
-			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
-    }										\
-    /* shift and pack the result */						\
-    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
-    a = _mm_packs_epi32 (a, a);							\
-    a = _mm_packus_epi16 (a, a);						\
-    pix = _mm_cvtsi128_si32 (a);						\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
+    /* horizontal interpolation */						\
+    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\
+    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\
+    /* shift the result */							\
+    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
 } while (0)
 
+/***********************************************************************************/
+
+#endif
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix);					\
+do {										\
+	__m128i xmm_pix;							\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\
+	xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\
+	xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\
+	pix = _mm_cvtsi128_si32 (xmm_pix);					\
+} while(0)
+
+#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix);					\
+do {										\
+	__m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\
+	xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\
+	xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\
+	pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\
+} while(0)
+
 #define BILINEAR_SKIP_ONE_PIXEL()						\
 do {										\
     vx += unit_x;								\
-    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
+} while(0)
+
+#define BILINEAR_SKIP_FOUR_PIXELS()						\
+do {										\
+    vx += unit_x * 4;								\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);					\
 } while(0)
 
+/***********************************************************************************/
+
 static force_inline void
 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
 					     const uint32_t * mask,
@@ -5633,24 +5693,28 @@ scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
 					     int32_t          w,
 					     int              wt,
 					     int              wb,
-					     pixman_fixed_t   vx,
-					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   vx_,
+					     pixman_fixed_t   unit_x_,
 					     pixman_fixed_t   max_vx,
 					     pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
 
-    while ((w -= 4) >= 0)
+    while (w && ((uintptr_t)dst & 15))
     {
 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
 	*dst++ = pix1;
-	*dst++ = pix2;
-	*dst++ = pix3;
-	*dst++ = pix4;
+	w--;
+    }
+
+    while ((w -= 4) >= 0) {
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+	_mm_store_si128 ((__m128i *)dst, xmm_src);
+	dst += 4;
     }
 
     if (w & 2)
@@ -5687,6 +5751,66 @@ FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
 			       NORMAL, FLAG_NONE)
 
 static force_inline void
+scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx_,
+					     pixman_fixed_t   unit_x_,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2;
+
+    while (w && ((uintptr_t)dst & 15))
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst++ = pix1 | 0xFF000000;
+	w--;
+    }
+
+    while ((w -= 4) >= 0) {
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+	_mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
+	dst += 4;
+    }
+
+    if (w & 2)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	*dst++ = pix1 | 0xFF000000;
+	*dst++ = pix2 | 0xFF000000;
+    }
+
+    if (w & 1)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst = pix1 | 0xFF000000;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+static force_inline void
 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
 					      const uint32_t * mask,
 					      const uint32_t * src_top,
@@ -5694,13 +5818,15 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
 					      int32_t          w,
 					      int              wt,
 					      int              wb,
-					      pixman_fixed_t   vx,
-					      pixman_fixed_t   unit_x,
+					      pixman_fixed_t   vx_,
+					      pixman_fixed_t   unit_x_,
 					      pixman_fixed_t   max_vx,
 					      pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
 
     while (w && ((uintptr_t)dst & 15))
     {
@@ -5722,12 +5848,7 @@ scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
 	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
 	__m128i xmm_alpha_hi, xmm_alpha_lo;
 
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
-	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
 	if (!is_zero (xmm_src))
 	{
@@ -5794,13 +5915,15 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 						int32_t          w,
 						int              wt,
 						int              wb,
-						pixman_fixed_t   vx,
-						pixman_fixed_t   unit_x,
+						pixman_fixed_t   vx_,
+						pixman_fixed_t   unit_x_,
 						pixman_fixed_t   max_vx,
 						pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1, pix2;
     uint32_t m;
 
     while (w && ((uintptr_t)dst & 15))
@@ -5851,12 +5974,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 
 	if (m)
 	{
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
-
-	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
+	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
 	    if (m == 0xffffffff && is_opaque (xmm_src))
 	    {
@@ -5883,10 +6001,7 @@ scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
 	}
 	else
 	{
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
-	    BILINEAR_SKIP_ONE_PIXEL ();
+	    BILINEAR_SKIP_FOUR_PIXELS ();
 	}
 
 	w -= 4;
@@ -5958,13 +6073,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
 						int32_t          w,
 						int              wt,
 						int              wb,
-						pixman_fixed_t   vx,
-						pixman_fixed_t   unit_x,
+						pixman_fixed_t   vx_,
+						pixman_fixed_t   unit_x_,
 						pixman_fixed_t   max_vx,
 						pixman_bool_t    zero_src)
 {
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
     BILINEAR_DECLARE_VARIABLES;
-    uint32_t pix1, pix2, pix3, pix4;
+    uint32_t pix1;
     __m128i xmm_mask;
 
     if (zero_src || (*mask >> 24) == 0)
@@ -5994,19 +6111,15 @@ scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
 
     while (w >= 4)
     {
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
-	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
 
-	if (pix1 | pix2 | pix3 | pix4)
+	if (!is_zero (xmm_src))
 	{
-	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+	    __m128i xmm_src_lo, xmm_src_hi;
 	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
 	    __m128i xmm_alpha_lo, xmm_alpha_hi;
 
-	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
-
 	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
@@ -6194,6 +6307,13 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
 
+    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
@@ -6340,54 +6460,25 @@ sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
-typedef struct
-{
-    pixman_format_code_t	format;
-    pixman_iter_get_scanline_t	get_scanline;
-} fetcher_info_t;
-
-static const fetcher_info_t fetchers[] =
-{
-    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
-    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
-    { PIXMAN_a8,		sse2_fetch_a8 },
-    { PIXMAN_null }
-};
-
-static pixman_bool_t
-sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
-{
-    pixman_image_t *image = iter->image;
-
-#define FLAGS								\
+#define IMAGE_FLAGS							\
     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
-    if ((iter->iter_flags & ITER_NARROW)			&&
-	(iter->image_flags & FLAGS) == FLAGS)
-    {
-	const fetcher_info_t *f;
-
-	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
-	{
-	    if (image->common.extended_format_code == f->format)
-	    {
-		uint8_t *b = (uint8_t *)image->bits.bits;
-		int s = image->bits.rowstride * 4;
-
-		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
-		iter->stride = s;
-
-		iter->get_scanline = f->get_scanline;
-		return TRUE;
-	    }
-	}
-    }
-
-    return FALSE;
-}
+static const pixman_iter_info_t sse2_iters[] = 
+{
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
+    },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
+    },
+    { PIXMAN_null },
+};
 
-#if defined(__GNUC__) // && !defined(__x86_64__) && !defined(__amd64__)
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
 __attribute__((__force_align_arg_pointer__))
 #endif
 pixman_implementation_t *
@@ -6443,7 +6534,7 @@ _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
     imp->blt = sse2_blt;
     imp->fill = sse2_fill;
 
-    imp->src_iter_init = sse2_src_iter_init;
+    imp->iter_info = sse2_iters;
 
     return imp;
 }
diff --git a/lib/pixman/pixman/pixman-ssse3.c b/lib/pixman/pixman/pixman-ssse3.c
new file mode 100644
index 000000000..680d6b95a
--- /dev/null
+++ b/lib/pixman/pixman/pixman-ssse3.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright © 2013 Soren Sandmann Pedersen
+ * Copyright © 2013 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann (soren.sandmann@gmail.com)
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include "pixman-private.h"
+#include "pixman-inlines.h"
+
+typedef struct
+{
+    int		y;
+    uint64_t *	buffer;
+} line_t;
+
+typedef struct
+{
+    line_t		lines[2];
+    pixman_fixed_t	y;
+    pixman_fixed_t	x;
+    uint64_t		data[1];
+} bilinear_info_t;
+
+static void
+ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
+			int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
+{
+    uint32_t *bits = image->bits + y * image->rowstride;
+    __m128i vx = _mm_set_epi16 (
+	- (x + 1), x, - (x + 1), x,
+	- (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
+    __m128i vux = _mm_set_epi16 (
+	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
+	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
+    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
+    __m128i *b = (__m128i *)line->buffer;
+    __m128i vrl0, vrl1;
+
+    while ((n -= 2) >= 0)
+    {
+	__m128i vw, vr, s;
+
+	vrl1 = _mm_loadl_epi64 (
+	    (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
+	/* vrl1: R1, L1 */
+
+    final_pixel:
+	vrl0 = _mm_loadl_epi64 (
+	    (__m128i *)(bits + pixman_fixed_to_int (x)));
+	/* vrl0: R0, L0 */
+
+	/* The weights are based on vx which is a vector of 
+	 *
+	 *    - (x + 1), x, - (x + 1), x,
+	 *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
+	 *
+	 * so the 16 bit weights end up like this:
+	 *
+	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 *
+	 * and after shifting and packing, we get these bytes:
+	 *
+	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *
+	 * which means the first and the second input pixel 
+	 * have to be interleaved like this:
+	 *
+	 *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+	 *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+	 *
+	 * before maddubsw can be used.
+	 */
+
+	vw = _mm_add_epi16 (
+	    vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
+	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 */
+
+	vw = _mm_packus_epi16 (vw, vw);
+	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 */
+	vx = _mm_add_epi16 (vx, vux);
+
+	x += 2 * ux;
+
+	vr = _mm_unpacklo_epi16 (vrl1, vrl0);
+	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
+
+	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
+	/* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
+
+	vr = _mm_unpackhi_epi8 (vr, s);
+	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+	 *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+	 */
+
+	vr = _mm_maddubs_epi16 (vr, vw);
+
+	/* When the weight is 0, the inverse weight is
+	 * 128 which can't be represented in a signed byte.
+	 * As a result maddubsw computes the following:
+	 *
+	 *     r = l * -128 + r * 0
+	 *
+	 * rather than the desired
+	 *
+	 *     r = l * 128 + r * 0
+	 *
+	 * We fix this by taking the absolute value of the
+	 * result.
+	 */
+	vr = _mm_abs_epi16 (vr);
+
+	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
+	_mm_store_si128 (b++, vr);
+    }
+
+    if (n == -1)
+    {
+	vrl1 = _mm_setzero_si128();
+	goto final_pixel;
+    }
+
+    line->y = y;
+}
+
+static uint32_t *
+ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_fixed_t fx, ux;
+    bilinear_info_t *info = iter->data;
+    line_t *line0, *line1;
+    int y0, y1;
+    int32_t dist_y;
+    __m128i vw;
+    int i;
+
+    fx = info->x;
+    ux = iter->image->common.transform->matrix[0][0];
+
+    y0 = pixman_fixed_to_int (info->y);
+    y1 = y0 + 1;
+
+    line0 = &info->lines[y0 & 0x01];
+    line1 = &info->lines[y1 & 0x01];
+
+    if (line0->y != y0)
+    {
+	ssse3_fetch_horizontal (
+	    &iter->image->bits, line0, y0, fx, ux, iter->width);
+    }
+
+    if (line1->y != y1)
+    {
+	ssse3_fetch_horizontal (
+	    &iter->image->bits, line1, y1, fx, ux, iter->width);
+    }
+
+    dist_y = pixman_fixed_to_bilinear_weight (info->y);
+    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
+
+    vw = _mm_set_epi16 (
+	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
+
+    for (i = 0; i + 3 < iter->width; i += 4)
+    {
+	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
+	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
+	__m128i r0, r1, tmp, p;
+
+	r0 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot0, top0), vw);
+	tmp = _mm_cmplt_epi16 (bot0, top0);
+	tmp = _mm_and_si128 (tmp, vw);
+	r0 = _mm_sub_epi16 (r0, tmp);
+	r0 = _mm_add_epi16 (r0, top0);
+	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+
+	r1 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot1, top1), vw);
+	tmp = _mm_cmplt_epi16 (bot1, top1);
+	tmp = _mm_and_si128 (tmp, vw);
+	r1 = _mm_sub_epi16 (r1, tmp);
+	r1 = _mm_add_epi16 (r1, top1);
+	r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
+	r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
+
+	p = _mm_packus_epi16 (r0, r1);
+
+	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
+    }
+
+    while (i < iter->width)
+    {
+	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+	__m128i r0, tmp, p;
+
+	r0 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot0, top0), vw);
+	tmp = _mm_cmplt_epi16 (bot0, top0);
+	tmp = _mm_and_si128 (tmp, vw);
+	r0 = _mm_sub_epi16 (r0, tmp);
+	r0 = _mm_add_epi16 (r0, top0);
+	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+
+	p = _mm_packus_epi16 (r0, r0);
+
+	if (iter->width - i == 1)
+	{
+	    *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
+	    i++;
+	}
+	else
+	{
+	    _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
+	    i += 2;
+	}
+    }
+    
+    info->y += iter->image->common.transform->matrix[1][1];
+
+    return iter->buffer;
+}
+
+static void
+ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
+{
+    free (iter->data);
+}
+
+static void
+ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
+{
+    int width = iter->width;
+    bilinear_info_t *info;
+    pixman_vector_t v;
+
+    /* Reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (iter->image->common.transform, &v))
+	goto fail;
+
+    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
+    if (!info)
+	goto fail;
+
+    info->x = v.vector[0] - pixman_fixed_1 / 2;
+    info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+#define ALIGN(addr)							\
+    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+    /* It is safe to set the y coordinates to -1 initially
+     * because COVER_CLIP_BILINEAR ensures that we will only
+     * be asked to fetch lines in the [0, height) interval
+     */
+    info->lines[0].y = -1;
+    info->lines[0].buffer = ALIGN (&(info->data[0]));
+    info->lines[1].y = -1;
+    info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
+
+    iter->get_scanline = ssse3_fetch_bilinear_cover;
+    iter->fini = ssse3_bilinear_cover_iter_fini;
+
+    iter->data = info;
+    return;
+
+fail:
+    /* Something went wrong, either a bad matrix or OOM; in such cases,
+     * we don't guarantee any particular rendering.
+     */
+    _pixman_log_error (
+	FUNC, "Allocation failure or bad matrix, skipping rendering\n");
+    
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+    iter->fini = NULL;
+}
+
+static const pixman_iter_info_t ssse3_iters[] = 
+{
+    { PIXMAN_a8r8g8b8,
+      (FAST_PATH_STANDARD_FLAGS			|
+       FAST_PATH_SCALE_TRANSFORM		|
+       FAST_PATH_BILINEAR_FILTER		|
+       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
+      ITER_NARROW | ITER_SRC,
+      ssse3_bilinear_cover_iter_init,
+      NULL, NULL
+    },
+
+    { PIXMAN_null },
+};
+
+static const pixman_fast_path_t ssse3_fast_paths[] =
+{
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (fallback, ssse3_fast_paths);
+
+    imp->iter_info = ssse3_iters;
+
+    return imp;
+}
diff --git a/lib/pixman/pixman/pixman-utils.c b/lib/pixman/pixman/pixman-utils.c
index f31171f6d..4a3a835c4 100644
--- a/lib/pixman/pixman/pixman-utils.c
+++ b/lib/pixman/pixman/pixman-utils.c
@@ -49,6 +49,15 @@ _pixman_addition_overflows_int (unsigned int a, unsigned int b)
 }
 
 void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c)
+{
+    if (!b || a >= INT32_MAX / b || (a * b) > INT32_MAX - c)
+	return NULL;
+
+    return malloc (a * b + c);
+}
+
+void *
 pixman_malloc_ab (unsigned int a,
                   unsigned int b)
 {
@@ -214,6 +223,17 @@ _pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info)
+{
+    pixman_image_t *image = iter->image;
+    uint8_t *b = (uint8_t *)image->bits.bits;
+    int s = image->bits.rowstride * 4;
+
+    iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (info->format) / 8;
+    iter->stride = s;
+}
+
 #define N_TMP_BOXES (16)
 
 pixman_bool_t
diff --git a/lib/pixman/pixman/pixman-vmx.c b/lib/pixman/pixman/pixman-vmx.c
index f629003ab..c33631c0e 100644
--- a/lib/pixman/pixman/pixman-vmx.c
+++ b/lib/pixman/pixman/pixman-vmx.c
@@ -134,15 +134,11 @@ over (vector unsigned int src,
     source ## _mask = vec_lvsl (0, source);
 
 #define COMPUTE_SHIFT_MASKS(dest, source)				\
-    dest ## _mask = vec_lvsl (0, dest);					\
-    source ## _mask = vec_lvsl (0, source);				\
-    store_mask = vec_lvsr (0, dest);
+    source ## _mask = vec_lvsl (0, source);
 
 #define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
     mask ## _mask = vec_lvsl (0, mask);					\
-    dest ## _mask = vec_lvsl (0, dest);					\
-    source ## _mask = vec_lvsl (0, source);				\
-    store_mask = vec_lvsr (0, dest);
+    source ## _mask = vec_lvsl (0, source);
 
 /* notice you have to declare temp vars...
  * Note: tmp3 and tmp4 must remain untouched!
@@ -151,23 +147,17 @@ over (vector unsigned int src,
 #define LOAD_VECTORS(dest, source)			  \
     tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
     tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
-    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
     v ## source = (typeof(v ## source))			  \
 	vec_perm (tmp1, tmp2, source ## _mask);		  \
-    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
-    v ## dest = (typeof(v ## dest))			  \
-	vec_perm (tmp3, tmp4, dest ## _mask);
+    v ## dest = (typeof(v ## dest))vec_ld (0, dest);
 
 #define LOAD_VECTORSC(dest, source, mask)		  \
     tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
     tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
-    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
     v ## source = (typeof(v ## source))			  \
 	vec_perm (tmp1, tmp2, source ## _mask);		  \
-    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
     tmp1 = (typeof(tmp1))vec_ld (0, mask);		  \
-    v ## dest = (typeof(v ## dest))			  \
-	vec_perm (tmp3, tmp4, dest ## _mask);		  \
+    v ## dest = (typeof(v ## dest))vec_ld (0, dest);	  \
     tmp2 = (typeof(tmp2))vec_ld (15, mask);		  \
     v ## mask = (typeof(v ## mask))			  \
 	vec_perm (tmp1, tmp2, mask ## _mask);
@@ -178,11 +168,7 @@ over (vector unsigned int src,
                                 splat_alpha (v ## mask));
 
 #define STORE_VECTOR(dest)						\
-    edges = vec_perm (tmp4, tmp3, dest ## _mask);			\
-    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
-    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
-    vec_st ((vector unsigned int) tmp3, 15, dest);			\
-    vec_st ((vector unsigned int) tmp1, 0, dest);
+    vec_st ((vector unsigned int) v ## dest, 0, dest);
 
 static void
 vmx_combine_over_u_no_mask (uint32_t *      dest,
@@ -191,8 +177,19 @@ vmx_combine_over_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -230,8 +227,23 @@ vmx_combine_over_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -286,8 +298,18 @@ vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -324,8 +346,21 @@ vmx_combine_over_reverse_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -379,8 +414,17 @@ vmx_combine_in_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8 (s, a);
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -415,8 +459,20 @@ vmx_combine_in_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -468,8 +524,18 @@ vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t d = *dest;
+	uint32_t a = ALPHA_8 (*src++);
+
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -505,8 +571,21 @@ vmx_combine_in_reverse_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t d = *dest;
+	uint32_t a = *src++;
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (a);
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -559,8 +638,18 @@ vmx_combine_out_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (~(*dest));
+
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -596,8 +685,20 @@ vmx_combine_out_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (~(*dest));
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -649,8 +750,18 @@ vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t d = *dest;
+	uint32_t a = ALPHA_8 (~(*src++));
+
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -687,8 +798,21 @@ vmx_combine_out_reverse_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t d = *dest;
+	uint32_t a = *src++;
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (~a);
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -741,8 +865,20 @@ vmx_combine_atop_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -781,8 +917,25 @@ vmx_combine_atop_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -840,8 +993,20 @@ vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_a = ALPHA_8 (s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -880,8 +1045,25 @@ vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_a;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_a = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -939,8 +1121,20 @@ vmx_combine_xor_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_ia = ALPHA_8 (~s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
 
@@ -979,8 +1173,25 @@ vmx_combine_xor_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_ia;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1038,8 +1249,18 @@ vmx_combine_add_u_no_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKS (dest, src);
     /* printf ("%s\n",__PRETTY_FUNCTION__); */
@@ -1074,8 +1295,20 @@ vmx_combine_add_u_mask (uint32_t *      dest,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, src_mask, mask_mask, store_mask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1130,8 +1363,18 @@ vmx_combine_src_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+
+	UN8x4_MUL_UN8x4 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1170,8 +1413,22 @@ vmx_combine_over_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1214,8 +1471,21 @@ vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ida = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1257,8 +1527,20 @@ vmx_combine_in_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t da = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1299,8 +1581,20 @@ vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (*src++);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1342,8 +1636,21 @@ vmx_combine_out_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1386,8 +1693,21 @@ vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, ~a);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1430,8 +1750,23 @@ vmx_combine_atop_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask, vsrca;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1481,8 +1816,23 @@ vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1529,8 +1879,23 @@ vmx_combine_xor_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
@@ -1577,8 +1942,20 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
 {
     int i;
     vector unsigned int vdest, vsrc, vmask;
-    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
-	dest_mask, mask_mask, src_mask, store_mask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_ADD_UN8x4 (s, d);
+
+	*dest++ = s;
+	width--;
+    }
 
     COMPUTE_SHIFT_MASKC (dest, src, mask);
 
diff --git a/lib/pixman/pixman/pixman-x86.c b/lib/pixman/pixman/pixman-x86.c
index 57e4d1f35..05297c476 100644
--- a/lib/pixman/pixman/pixman-x86.c
+++ b/lib/pixman/pixman/pixman-x86.c
@@ -25,7 +25,7 @@
 
 #include "pixman-private.h"
 
-#if defined(USE_X86_MMX) || defined (USE_SSE2)
+#if defined(USE_X86_MMX) || defined (USE_SSE2) || defined (USE_SSSE3)
 
 /* The CPU detection code needs to be in a file not compiled with
  * "-mmmx -msse", as gcc would generate CMOV instructions otherwise
@@ -39,7 +39,8 @@ typedef enum
     X86_MMX_EXTENSIONS		= (1 << 1),
     X86_SSE			= (1 << 2) | X86_MMX_EXTENSIONS,
     X86_SSE2			= (1 << 3),
-    X86_CMOV			= (1 << 4)
+    X86_CMOV			= (1 << 4),
+    X86_SSSE3			= (1 << 5)
 } cpu_features_t;
 
 #ifdef HAVE_GETISAX
@@ -64,6 +65,8 @@ detect_cpu_features (void)
 	    features |= X86_SSE;
 	if (result & AV_386_SSE2)
 	    features |= X86_SSE2;
+	if (result & AV_386_SSSE3)
+	    features |= X86_SSSE3;
     }
 
     return features;
@@ -167,6 +170,8 @@ detect_cpu_features (void)
 	features |= X86_SSE;
     if (d & (1 << 26))
 	features |= X86_SSE2;
+    if (c & (1 << 9))
+	features |= X86_SSSE3;
 
     /* Check for AMD specific features */
     if ((features & X86_MMX) && !(features & X86_SSE))
@@ -222,6 +227,7 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp)
 {
 #define MMX_BITS  (X86_MMX | X86_MMX_EXTENSIONS)
 #define SSE2_BITS (X86_MMX | X86_MMX_EXTENSIONS | X86_SSE | X86_SSE2)
+#define SSSE3_BITS (X86_SSE | X86_SSE2 | X86_SSSE3)
 
 #ifdef USE_X86_MMX
     if (!_pixman_disabled ("mmx") && have_feature (MMX_BITS))
@@ -233,5 +239,10 @@ _pixman_x86_get_implementations (pixman_implementation_t *imp)
 	imp = _pixman_implementation_create_sse2 (imp);
 #endif
 
+#ifdef USE_SSSE3
+    if (!_pixman_disabled ("ssse3") && have_feature (SSSE3_BITS))
+	imp = _pixman_implementation_create_ssse3 (imp);
+#endif
+
     return imp;
 }
diff --git a/lib/pixman/pixman/pixman.c b/lib/pixman/pixman/pixman.c
index 184f0c4e6..9555ceaaf 100644
--- a/lib/pixman/pixman/pixman.c
+++ b/lib/pixman/pixman/pixman.c
@@ -605,7 +605,7 @@ pixman_image_composite32 (pixman_op_t      op,
     else
     {
 	mask_format = PIXMAN_null;
-	info.mask_flags = FAST_PATH_IS_OPAQUE;
+	info.mask_flags = FAST_PATH_IS_OPAQUE | FAST_PATH_NO_ALPHA_MAP;
     }
 
     dest_format = dest->common.extended_format_code;
diff --git a/lib/pixman/pixman/pixman.h b/lib/pixman/pixman/pixman.h
index 7ff9fb52a..509ba5e53 100644
--- a/lib/pixman/pixman/pixman.h
+++ b/lib/pixman/pixman/pixman.h
@@ -1030,7 +1030,7 @@ struct pixman_triangle
 #define pixman_trapezoid_valid(t)				   \
     ((t)->left.p1.y != (t)->left.p2.y &&			   \
      (t)->right.p1.y != (t)->right.p2.y &&			   \
-     (int) ((t)->bottom - (t)->top) > 0)
+     ((t)->bottom > (t)->top))
 
 struct pixman_span_fix
 {
diff --git a/lib/pixman/test-driver b/lib/pixman/test-driver
new file mode 100644
index 000000000..32bf39e83
--- /dev/null
+++ b/lib/pixman/test-driver
@@ -0,0 +1,127 @@
+#! /bin/sh
+# test-driver - basic testsuite driver script.
+
+scriptversion=2012-06-27.10; # UTC
+
+# Copyright (C) 2011-2013 Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that program.
+
+# This file is maintained in Automake, please report
+# bugs to <bug-automake@gnu.org> or send patches to
+# <automake-patches@gnu.org>.
+
+# Make unconditional expansion of undefined variables an error.  This
+# helps a lot in preventing typo-related bugs.
+set -u
+
+usage_error ()
+{
+  echo "$0: $*" >&2
+  print_usage >&2
+  exit 2
+}
+
+print_usage ()
+{
+  cat <<END
+Usage:
+  test-driver --test-name=NAME --log-file=PATH --trs-file=PATH
+              [--expect-failure={yes|no}] [--color-tests={yes|no}]
+              [--enable-hard-errors={yes|no}] [--] TEST-SCRIPT
+The '--test-name', '--log-file' and '--trs-file' options are mandatory.
+END
+}
+
+# TODO: better error handling in option parsing (in particular, ensure
+# TODO: $log_file, $trs_file and $test_name are defined).
+test_name= # Used for reporting.
+log_file=  # Where to save the output of the test script.
+trs_file=  # Where to save the metadata of the test run.
+expect_failure=no
+color_tests=no
+enable_hard_errors=yes
+while test $# -gt 0; do
+  case $1 in
+  --help) print_usage; exit $?;;
+  --version) echo "test-driver $scriptversion"; exit $?;;
+  --test-name) test_name=$2; shift;;
+  --log-file) log_file=$2; shift;;
+  --trs-file) trs_file=$2; shift;;
+  --color-tests) color_tests=$2; shift;;
+  --expect-failure) expect_failure=$2; shift;;
+  --enable-hard-errors) enable_hard_errors=$2; shift;;
+  --) shift; break;;
+  -*) usage_error "invalid option: '$1'";;
+  esac
+  shift
+done
+
+if test $color_tests = yes; then
+  # Keep this in sync with 'lib/am/check.am:$(am__tty_colors)'.
+  red='[0;31m' # Red.
+  grn='[0;32m' # Green.
+  lgn='[1;32m' # Light green.
+  blu='[1;34m' # Blue.
+  mgn='[0;35m' # Magenta.
+  std='[m'     # No color.
+else
+  red= grn= lgn= blu= mgn= std=
+fi
+
+do_exit='rm -f $log_file $trs_file; (exit $st); exit $st'
+trap "st=129; $do_exit" 1
+trap "st=130; $do_exit" 2
+trap "st=141; $do_exit" 13
+trap "st=143; $do_exit" 15
+
+# Test script is run here.
+"$@" >$log_file 2>&1
+estatus=$?
+if test $enable_hard_errors = no && test $estatus -eq 99; then
+  estatus=1
+fi
+
+case $estatus:$expect_failure in
+  0:yes) col=$red res=XPASS recheck=yes gcopy=yes;;
+  0:*)   col=$grn res=PASS  recheck=no  gcopy=no;;
+  77:*)  col=$blu res=SKIP  recheck=no  gcopy=yes;;
+  99:*)  col=$mgn res=ERROR recheck=yes gcopy=yes;;
+  *:yes) col=$lgn res=XFAIL recheck=no  gcopy=yes;;
+  *:*)   col=$red res=FAIL  recheck=yes gcopy=yes;;
+esac
+
+# Report outcome to console.
+echo "${col}${res}${std}: $test_name"
+
+# Register the test result, and other relevant metadata.
+echo ":test-result: $res" > $trs_file
+echo ":global-test-result: $res" >> $trs_file
+echo ":recheck: $recheck" >> $trs_file
+echo ":copy-in-global-log: $gcopy" >> $trs_file
+
+# Local Variables:
+# mode: shell-script
+# sh-indentation: 2
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC"
+# time-stamp-end: "; # UTC"
+# End:
diff --git a/lib/pixman/test/Makefile.am b/lib/pixman/test/Makefile.am
index 5d901d572..88dc36d2a 100644
--- a/lib/pixman/test/Makefile.am
+++ b/lib/pixman/test/Makefile.am
@@ -1,8 +1,8 @@
 include $(top_srcdir)/test/Makefile.sources
 
-AM_CFLAGS = $(OPENMP_CFLAGS)
-AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS)
-LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS)
+AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS) $(PTHREAD_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
 
 libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
diff --git a/lib/pixman/test/Makefile.in b/lib/pixman/test/Makefile.in
index 48974546f..d7fa0d297 100644
--- a/lib/pixman/test/Makefile.in
+++ b/lib/pixman/test/Makefile.in
@@ -79,162 +79,203 @@ am__EXEEXT_1 = prng-test$(EXEEXT) a1-trap-test$(EXEEXT) \
 	region-translate-test$(EXEEXT) combiner-test$(EXEEXT) \
 	pixel-test$(EXEEXT) fetch-test$(EXEEXT) rotate-test$(EXEEXT) \
 	oob-test$(EXEEXT) infinite-loop$(EXEEXT) trap-crasher$(EXEEXT) \
-	alpha-loop$(EXEEXT) scaling-crash-test$(EXEEXT) \
-	scaling-helpers-test$(EXEEXT) gradient-crash-test$(EXEEXT) \
-	region-contains-test$(EXEEXT) alphamap$(EXEEXT) \
-	matrix-test$(EXEEXT) stress-test$(EXEEXT) \
+	alpha-loop$(EXEEXT) thread-test$(EXEEXT) \
+	scaling-crash-test$(EXEEXT) scaling-helpers-test$(EXEEXT) \
+	gradient-crash-test$(EXEEXT) region-contains-test$(EXEEXT) \
+	alphamap$(EXEEXT) matrix-test$(EXEEXT) stress-test$(EXEEXT) \
 	composite-traps-test$(EXEEXT) blitters-test$(EXEEXT) \
 	glyph-test$(EXEEXT) scaling-test$(EXEEXT) affine-test$(EXEEXT) \
 	composite$(EXEEXT)
 am__EXEEXT_2 = lowlevel-blt-bench$(EXEEXT) radial-perf-test$(EXEEXT) \
-	check-formats$(EXEEXT)
+	check-formats$(EXEEXT) scaling-bench$(EXEEXT)
 PROGRAMS = $(noinst_PROGRAMS)
 a1_trap_test_SOURCES = a1-trap-test.c
 a1_trap_test_OBJECTS = a1-trap-test.$(OBJEXT)
 a1_trap_test_LDADD = $(LDADD)
 am__DEPENDENCIES_1 =
 a1_trap_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 affine_test_SOURCES = affine-test.c
 affine_test_OBJECTS = affine-test.$(OBJEXT)
 affine_test_LDADD = $(LDADD)
 affine_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 alpha_loop_SOURCES = alpha-loop.c
 alpha_loop_OBJECTS = alpha-loop.$(OBJEXT)
 alpha_loop_LDADD = $(LDADD)
 alpha_loop_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 alphamap_SOURCES = alphamap.c
 alphamap_OBJECTS = alphamap.$(OBJEXT)
 alphamap_LDADD = $(LDADD)
 alphamap_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 blitters_test_SOURCES = blitters-test.c
 blitters_test_OBJECTS = blitters-test.$(OBJEXT)
 blitters_test_LDADD = $(LDADD)
 blitters_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 check_formats_SOURCES = check-formats.c
 check_formats_OBJECTS = check-formats.$(OBJEXT)
 check_formats_LDADD = $(LDADD)
 check_formats_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 combiner_test_SOURCES = combiner-test.c
 combiner_test_OBJECTS = combiner-test.$(OBJEXT)
 combiner_test_LDADD = $(LDADD)
 combiner_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 composite_SOURCES = composite.c
 composite_OBJECTS = composite.$(OBJEXT)
 composite_LDADD = $(LDADD)
 composite_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 composite_traps_test_SOURCES = composite-traps-test.c
 composite_traps_test_OBJECTS = composite-traps-test.$(OBJEXT)
 composite_traps_test_LDADD = $(LDADD)
 composite_traps_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 fetch_test_SOURCES = fetch-test.c
 fetch_test_OBJECTS = fetch-test.$(OBJEXT)
 fetch_test_LDADD = $(LDADD)
 fetch_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 glyph_test_SOURCES = glyph-test.c
 glyph_test_OBJECTS = glyph-test.$(OBJEXT)
 glyph_test_LDADD = $(LDADD)
 glyph_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 gradient_crash_test_SOURCES = gradient-crash-test.c
 gradient_crash_test_OBJECTS = gradient-crash-test.$(OBJEXT)
 gradient_crash_test_LDADD = $(LDADD)
 gradient_crash_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 infinite_loop_SOURCES = infinite-loop.c
 infinite_loop_OBJECTS = infinite-loop.$(OBJEXT)
 infinite_loop_LDADD = $(LDADD)
 infinite_loop_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 lowlevel_blt_bench_SOURCES = lowlevel-blt-bench.c
 lowlevel_blt_bench_OBJECTS = lowlevel-blt-bench.$(OBJEXT)
 lowlevel_blt_bench_LDADD = $(LDADD)
 lowlevel_blt_bench_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 matrix_test_SOURCES = matrix-test.c
 matrix_test_OBJECTS = matrix-test.$(OBJEXT)
 matrix_test_LDADD = $(LDADD)
 matrix_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 oob_test_SOURCES = oob-test.c
 oob_test_OBJECTS = oob-test.$(OBJEXT)
 oob_test_LDADD = $(LDADD)
 oob_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 pdf_op_test_SOURCES = pdf-op-test.c
 pdf_op_test_OBJECTS = pdf-op-test.$(OBJEXT)
 pdf_op_test_LDADD = $(LDADD)
 pdf_op_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 pixel_test_SOURCES = pixel-test.c
 pixel_test_OBJECTS = pixel-test.$(OBJEXT)
 pixel_test_LDADD = $(LDADD)
 pixel_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 prng_test_SOURCES = prng-test.c
 prng_test_OBJECTS = prng-test.$(OBJEXT)
 prng_test_LDADD = $(LDADD)
 prng_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 radial_perf_test_SOURCES = radial-perf-test.c
 radial_perf_test_OBJECTS = radial-perf-test.$(OBJEXT)
 radial_perf_test_LDADD = $(LDADD)
 radial_perf_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 region_contains_test_SOURCES = region-contains-test.c
 region_contains_test_OBJECTS = region-contains-test.$(OBJEXT)
 region_contains_test_LDADD = $(LDADD)
 region_contains_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 region_test_SOURCES = region-test.c
 region_test_OBJECTS = region-test.$(OBJEXT)
 region_test_LDADD = $(LDADD)
 region_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 region_translate_test_SOURCES = region-translate-test.c
 region_translate_test_OBJECTS = region-translate-test.$(OBJEXT)
 region_translate_test_LDADD = $(LDADD)
 region_translate_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 rotate_test_SOURCES = rotate-test.c
 rotate_test_OBJECTS = rotate-test.$(OBJEXT)
 rotate_test_LDADD = $(LDADD)
 rotate_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
+scaling_bench_SOURCES = scaling-bench.c
+scaling_bench_OBJECTS = scaling-bench.$(OBJEXT)
+scaling_bench_LDADD = $(LDADD)
+scaling_bench_DEPENDENCIES = libutils.la \
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 scaling_crash_test_SOURCES = scaling-crash-test.c
 scaling_crash_test_OBJECTS = scaling-crash-test.$(OBJEXT)
 scaling_crash_test_LDADD = $(LDADD)
 scaling_crash_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 scaling_helpers_test_SOURCES = scaling-helpers-test.c
 scaling_helpers_test_OBJECTS = scaling-helpers-test.$(OBJEXT)
 scaling_helpers_test_LDADD = $(LDADD)
 scaling_helpers_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 scaling_test_SOURCES = scaling-test.c
 scaling_test_OBJECTS = scaling-test.$(OBJEXT)
 scaling_test_LDADD = $(LDADD)
 scaling_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 stress_test_SOURCES = stress-test.c
 stress_test_OBJECTS = stress-test.$(OBJEXT)
 stress_test_LDADD = $(LDADD)
 stress_test_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
+thread_test_SOURCES = thread-test.c
+thread_test_OBJECTS = thread-test.$(OBJEXT)
+thread_test_LDADD = $(LDADD)
+thread_test_DEPENDENCIES = libutils.la \
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 trap_crasher_SOURCES = trap-crasher.c
 trap_crasher_OBJECTS = trap-crasher.$(OBJEXT)
 trap_crasher_LDADD = $(LDADD)
 trap_crasher_DEPENDENCIES = libutils.la \
-	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1)
+	$(top_builddir)/pixman/libpixman-1.la $(am__DEPENDENCIES_1) \
+	$(am__DEPENDENCIES_1)
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
 am__v_P_0 = false
@@ -276,8 +317,9 @@ SOURCES = $(libutils_la_SOURCES) a1-trap-test.c affine-test.c \
 	infinite-loop.c lowlevel-blt-bench.c matrix-test.c oob-test.c \
 	pdf-op-test.c pixel-test.c prng-test.c radial-perf-test.c \
 	region-contains-test.c region-test.c region-translate-test.c \
-	rotate-test.c scaling-crash-test.c scaling-helpers-test.c \
-	scaling-test.c stress-test.c trap-crasher.c
+	rotate-test.c scaling-bench.c scaling-crash-test.c \
+	scaling-helpers-test.c scaling-test.c stress-test.c \
+	thread-test.c trap-crasher.c
 DIST_SOURCES = $(libutils_la_SOURCES) a1-trap-test.c affine-test.c \
 	alpha-loop.c alphamap.c blitters-test.c check-formats.c \
 	combiner-test.c composite.c composite-traps-test.c \
@@ -285,8 +327,9 @@ DIST_SOURCES = $(libutils_la_SOURCES) a1-trap-test.c affine-test.c \
 	infinite-loop.c lowlevel-blt-bench.c matrix-test.c oob-test.c \
 	pdf-op-test.c pixel-test.c prng-test.c radial-perf-test.c \
 	region-contains-test.c region-test.c region-translate-test.c \
-	rotate-test.c scaling-crash-test.c scaling-helpers-test.c \
-	scaling-test.c stress-test.c trap-crasher.c
+	rotate-test.c scaling-bench.c scaling-crash-test.c \
+	scaling-helpers-test.c scaling-test.c stress-test.c \
+	thread-test.c trap-crasher.c
 am__can_run_installinfo = \
   case $$AM_UPDATE_INFO_DIR in \
     n|no|NO) false;; \
@@ -331,7 +374,7 @@ GREP = @GREP@
 GTK_CFLAGS = @GTK_CFLAGS@
 GTK_LIBS = @GTK_LIBS@
 HAVE_LIBPNG = @HAVE_LIBPNG@
-HAVE_PTHREAD_SETSPECIFIC = @HAVE_PTHREAD_SETSPECIFIC@
+HAVE_PTHREADS = @HAVE_PTHREADS@
 INSTALL = @INSTALL@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
@@ -377,6 +420,7 @@ PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@
 PKG_CONFIG_PATH = @PKG_CONFIG_PATH@
 PNG_CFLAGS = @PNG_CFLAGS@
 PNG_LIBS = @PNG_LIBS@
+PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
 PTHREAD_LDFLAGS = @PTHREAD_LDFLAGS@
 PTHREAD_LIBS = @PTHREAD_LIBS@
 RANLIB = @RANLIB@
@@ -385,6 +429,7 @@ SET_MAKE = @SET_MAKE@
 SHELL = @SHELL@
 SSE2_CFLAGS = @SSE2_CFLAGS@
 SSE2_LDFLAGS = @SSE2_LDFLAGS@
+SSSE3_CFLAGS = @SSSE3_CFLAGS@
 STRIP = @STRIP@
 TESTPROGS_EXTRA_LDFLAGS = @TESTPROGS_EXTRA_LDFLAGS@
 TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR = @TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR@
@@ -459,6 +504,7 @@ TESTPROGRAMS = \
 	infinite-loop		\
 	trap-crasher		\
 	alpha-loop		\
+	thread-test		\
 	scaling-crash-test	\
 	scaling-helpers-test	\
 	gradient-crash-test	\
@@ -480,6 +526,7 @@ OTHERPROGRAMS = \
 	lowlevel-blt-bench	\
 	radial-perf-test	\
         check-formats           \
+	scaling-bench		\
 	$(NULL)
 
 
@@ -494,9 +541,9 @@ libutils_headers = \
 	utils-prng.h		\
 	$(NULL)
 
-AM_CFLAGS = $(OPENMP_CFLAGS)
-AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS)
-LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS)
+AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS) $(PTHREAD_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
 libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
 noinst_LTLIBRARIES = libutils.la
@@ -629,6 +676,9 @@ region-translate-test$(EXEEXT): $(region_translate_test_OBJECTS) $(region_transl
 rotate-test$(EXEEXT): $(rotate_test_OBJECTS) $(rotate_test_DEPENDENCIES) $(EXTRA_rotate_test_DEPENDENCIES) 
 	@rm -f rotate-test$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(rotate_test_OBJECTS) $(rotate_test_LDADD) $(LIBS)
+scaling-bench$(EXEEXT): $(scaling_bench_OBJECTS) $(scaling_bench_DEPENDENCIES) $(EXTRA_scaling_bench_DEPENDENCIES) 
+	@rm -f scaling-bench$(EXEEXT)
+	$(AM_V_CCLD)$(LINK) $(scaling_bench_OBJECTS) $(scaling_bench_LDADD) $(LIBS)
 scaling-crash-test$(EXEEXT): $(scaling_crash_test_OBJECTS) $(scaling_crash_test_DEPENDENCIES) $(EXTRA_scaling_crash_test_DEPENDENCIES) 
 	@rm -f scaling-crash-test$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(scaling_crash_test_OBJECTS) $(scaling_crash_test_LDADD) $(LIBS)
@@ -641,6 +691,9 @@ scaling-test$(EXEEXT): $(scaling_test_OBJECTS) $(scaling_test_DEPENDENCIES) $(EX
 stress-test$(EXEEXT): $(stress_test_OBJECTS) $(stress_test_DEPENDENCIES) $(EXTRA_stress_test_DEPENDENCIES) 
 	@rm -f stress-test$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(stress_test_OBJECTS) $(stress_test_LDADD) $(LIBS)
+thread-test$(EXEEXT): $(thread_test_OBJECTS) $(thread_test_DEPENDENCIES) $(EXTRA_thread_test_DEPENDENCIES) 
+	@rm -f thread-test$(EXEEXT)
+	$(AM_V_CCLD)$(LINK) $(thread_test_OBJECTS) $(thread_test_LDADD) $(LIBS)
 trap-crasher$(EXEEXT): $(trap_crasher_OBJECTS) $(trap_crasher_DEPENDENCIES) $(EXTRA_trap_crasher_DEPENDENCIES) 
 	@rm -f trap-crasher$(EXEEXT)
 	$(AM_V_CCLD)$(LINK) $(trap_crasher_OBJECTS) $(trap_crasher_LDADD) $(LIBS)
@@ -675,10 +728,12 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/region-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/region-translate-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/rotate-test.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scaling-bench.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scaling-crash-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scaling-helpers-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scaling-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/stress-test.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/thread-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/trap-crasher.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utils-prng.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utils.Plo@am__quote@
diff --git a/lib/pixman/test/Makefile.sources b/lib/pixman/test/Makefile.sources
index b5fc740f3..2ae5d9f8d 100644
--- a/lib/pixman/test/Makefile.sources
+++ b/lib/pixman/test/Makefile.sources
@@ -13,6 +13,7 @@ TESTPROGRAMS =			\
 	infinite-loop		\
 	trap-crasher		\
 	alpha-loop		\
+	thread-test		\
 	scaling-crash-test	\
 	scaling-helpers-test	\
 	gradient-crash-test	\
@@ -33,6 +34,7 @@ OTHERPROGRAMS =                 \
 	lowlevel-blt-bench	\
 	radial-perf-test	\
         check-formats           \
+	scaling-bench		\
 	$(NULL)
 
 # Utility functions
diff --git a/lib/pixman/test/affine-test.c b/lib/pixman/test/affine-test.c
index 2506250db..8e19023a3 100644
--- a/lib/pixman/test/affine-test.c
+++ b/lib/pixman/test/affine-test.c
@@ -80,6 +80,18 @@ test_composite (int      testnum,
     prng_randmemset (srcbuf, src_stride * src_height, 0);
     prng_randmemset (dstbuf, dst_stride * dst_height, 0);
 
+    if (prng_rand_n (2) == 0)
+    {
+	srcbuf += (src_stride / 4) * (src_height - 1);
+	src_stride = - src_stride;
+    }
+
+    if (prng_rand_n (2) == 0)
+    {
+	dstbuf += (dst_stride / 4) * (dst_height - 1);
+	dst_stride = - dst_stride;
+    }
+    
     src_fmt = src_bpp == 4 ? (prng_rand_n (2) == 0 ?
                               PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
 
@@ -273,32 +285,20 @@ test_composite (int      testnum,
     pixman_image_composite (op, src_img, NULL, dst_img,
                             src_x, src_y, 0, 0, dst_x, dst_y, w, h);
 
-    if (dst_fmt == PIXMAN_x8r8g8b8)
-    {
-	/* ignore unused part */
-	for (i = 0; i < dst_stride * dst_height / 4; i++)
-	    dstbuf[i] &= 0xFFFFFF;
-    }
-
-    image_endian_swap (dst_img);
-
+    crc32 = compute_crc32_for_image (0, dst_img);
+    
     if (verbose)
-    {
-	int j;
-
-	for (i = 0; i < dst_height; i++)
-	{
-	    for (j = 0; j < dst_stride; j++)
-		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
-
-	    printf ("\n");
-	}
-    }
+	print_image (dst_img);
 
     pixman_image_unref (src_img);
     pixman_image_unref (dst_img);
 
-    crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
+    if (src_stride < 0)
+	srcbuf += (src_stride / 4) * (src_height - 1);
+
+    if (dst_stride < 0)
+	dstbuf += (dst_stride / 4) * (dst_height - 1);
+    
     free (srcbuf);
     free (dstbuf);
 
@@ -306,12 +306,10 @@ test_composite (int      testnum,
     return crc32;
 }
 
-#if BILINEAR_INTERPOLATION_BITS == 8
-#define CHECKSUM 0x2CDF1F07
-#elif BILINEAR_INTERPOLATION_BITS == 7
-#define CHECKSUM 0xBC00B1DF
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0xBE724CFE
 #elif BILINEAR_INTERPOLATION_BITS == 4
-#define CHECKSUM 0xA227306B
+#define CHECKSUM 0x79BBE501
 #else
 #define CHECKSUM 0x00000000
 #endif
diff --git a/lib/pixman/test/blitters-test.c b/lib/pixman/test/blitters-test.c
index a2c6ff4d8..ea03f475d 100644
--- a/lib/pixman/test/blitters-test.c
+++ b/lib/pixman/test/blitters-test.c
@@ -57,6 +57,13 @@ create_random_image (pixman_format_code_t *allowed_formats,
 	prng_randmemset (buf, stride * height, RANDMEMSET_MORE_00_AND_FF);
     }
 
+    /* test negative stride */
+    if (prng_rand_n (4) == 0)
+    {
+	buf += (stride / 4) * (height - 1);
+	stride = - stride;
+    }
+    
     img = pixman_image_create_bits (fmt, width, height, buf, stride);
 
     if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
@@ -89,6 +96,9 @@ free_random_image (uint32_t initcrc,
     if (fmt != PIXMAN_null)
 	crc32 = compute_crc32_for_image (initcrc, img);
 
+    if (img->bits.rowstride < 0)
+	data += img->bits.rowstride * (img->bits.height - 1);
+
     pixman_image_unref (img);
     free (data);
 
@@ -222,7 +232,6 @@ static pixman_format_code_t mask_fmt_list[] = {
 uint32_t
 test_composite (int testnum, int verbose)
 {
-    int i;
     pixman_image_t *src_img = NULL;
     pixman_image_t *dst_img = NULL;
     pixman_image_t *mask_img = NULL;
@@ -235,7 +244,7 @@ test_composite (int testnum, int verbose)
     int w, h;
     pixman_op_t op;
     pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
-    uint32_t *dstbuf, *srcbuf, *maskbuf;
+    uint32_t *srcbuf, *maskbuf;
     uint32_t crc32;
     int max_width, max_height, max_extra_stride;
     FLOAT_REGS_CORRUPTION_DETECTOR_START ();
@@ -282,7 +291,6 @@ test_composite (int testnum, int verbose)
     dst_height = pixman_image_get_height (dst_img);
     dst_stride = pixman_image_get_stride (dst_img);
 
-    dstbuf = pixman_image_get_data (dst_img);
     srcbuf = pixman_image_get_data (src_img);
 
     src_x = prng_rand_n (src_width);
@@ -355,23 +363,7 @@ test_composite (int testnum, int verbose)
 			    src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
 
     if (verbose)
-    {
-	int j;
-
-	printf ("---\n");
-	for (i = 0; i < dst_height; i++)
-	{
-	    for (j = 0; j < dst_stride; j++)
-	    {
-		if (j == (dst_width * PIXMAN_FORMAT_BPP (dst_fmt) + 7) / 8)
-		    printf ("| ");
-
-		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
-	    }
-	    printf ("\n");
-	}
-	printf ("---\n");
-    }
+	print_image (dst_img);
 
     free_random_image (0, src_img, PIXMAN_null);
     crc32 = free_random_image (0, dst_img, dst_fmt);
@@ -402,6 +394,6 @@ main (int argc, const char *argv[])
     }
 
     return fuzzer_test_main("blitters", 2000000,
-			    0x0CF3283B,
+			    0xE0A07495,
 			    test_composite, argc, argv);
 }
diff --git a/lib/pixman/test/composite-traps-test.c b/lib/pixman/test/composite-traps-test.c
index 2983eae83..86a035564 100644
--- a/lib/pixman/test/composite-traps-test.c
+++ b/lib/pixman/test/composite-traps-test.c
@@ -97,19 +97,25 @@ test_composite (int      testnum,
 	int src_width = prng_rand_n (MAX_SRC_WIDTH) + 1;
 	int src_height = prng_rand_n (MAX_SRC_HEIGHT) + 1;
 	int src_stride = src_width * src_bpp + prng_rand_n (MAX_STRIDE) * src_bpp;
-	uint32_t *bits;
+	uint32_t *bits, *orig;
 
 	src_x = -(src_width / 4) + prng_rand_n (src_width * 3 / 2);
 	src_y = -(src_height / 4) + prng_rand_n (src_height * 3 / 2);
 
 	src_stride = (src_stride + 3) & ~3;
 	
-	bits = (uint32_t *)make_random_bytes (src_stride * src_height);
+	orig = bits = (uint32_t *)make_random_bytes (src_stride * src_height);
 
+	if (prng_rand_n (2) == 0)
+	{
+	    bits += (src_stride / 4) * (src_height - 1);
+	    src_stride = - src_stride;
+	}
+	
 	src_img = pixman_image_create_bits (
 	    src_format, src_width, src_height, bits, src_stride);
 
-	pixman_image_set_destroy_function (src_img, destroy_bits, bits);
+	pixman_image_set_destroy_function (src_img, destroy_bits, orig);
 
 	if (prng_rand_n (8) == 0)
 	{
@@ -153,6 +159,12 @@ test_composite (int      testnum,
 	
 	dst_bits = (uint32_t *)make_random_bytes (dst_stride * dst_height);
 
+	if (prng_rand_n (2) == 0)
+	{
+	    dst_bits += (dst_stride / 4) * (dst_height - 1);
+	    dst_stride = - dst_stride;
+	}
+	
 	dst_x = -(dst_width / 4) + prng_rand_n (dst_width * 3 / 2);
 	dst_y = -(dst_height / 4) + prng_rand_n (dst_height * 3 / 2);
 	
@@ -214,30 +226,14 @@ test_composite (int      testnum,
     pixman_composite_trapezoids (op, src_img, dst_img, mask_format,
 				 src_x, src_y, dst_x, dst_y, n_traps, traps);
 
-    if (dst_format == PIXMAN_x8r8g8b8)
-    {
-	/* ignore unused part */
-	for (i = 0; i < dst_stride * dst_height / 4; i++)
-	    dst_bits[i] &= 0xFFFFFF;
-    }
-
-    image_endian_swap (dst_img);
+    crc32 = compute_crc32_for_image (0, dst_img);
 
     if (verbose)
-    {
-	int j;
-	
-	for (i = 0; i < dst_height; i++)
-	{
-	    for (j = 0; j < dst_stride; j++)
-		printf ("%02X ", *((uint8_t *)dst_bits + i * dst_stride + j));
-
-	    printf ("\n");
-	}
-    }
-
-    crc32 = compute_crc32 (0, dst_bits, dst_stride * dst_height);
+	print_image (dst_img);
 
+    if (dst_stride < 0)
+	dst_bits += (dst_stride / 4) * (dst_height - 1);
+    
     fence_free (dst_bits);
     
     pixman_image_unref (src_img);
@@ -251,6 +247,6 @@ test_composite (int      testnum,
 int
 main (int argc, const char *argv[])
 {
-    return fuzzer_test_main("composite traps", 40000, 0x749BCC57,
+    return fuzzer_test_main("composite traps", 40000, 0xAF41D210,
 			    test_composite, argc, argv);
 }
diff --git a/lib/pixman/test/rotate-test.c b/lib/pixman/test/rotate-test.c
index 9d2a620cb..18ca60d9b 100644
--- a/lib/pixman/test/rotate-test.c
+++ b/lib/pixman/test/rotate-test.c
@@ -61,16 +61,25 @@ static pixman_image_t *
 make_image (void)
 {
     pixman_format_code_t format = RANDOM_FORMAT();
-    uint32_t *bytes = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *bytes, *orig;
     pixman_image_t *image;
+    int stride;
 
+    orig = bytes = malloc (WIDTH * HEIGHT * 4);
     prng_randmemset (bytes, WIDTH * HEIGHT * 4, 0);
 
+    stride = WIDTH * 4;
+    if (prng_rand_n (2) == 0)
+    {
+	bytes += (stride / 4) * (HEIGHT - 1);
+	stride = - stride;
+    }
+
     image = pixman_image_create_bits (
-	format, WIDTH, HEIGHT, bytes, WIDTH * 4);
+	format, WIDTH, HEIGHT, bytes, stride);
 
     pixman_image_set_transform (image, RANDOM_TRANSFORM());
-    pixman_image_set_destroy_function (image, on_destroy, bytes);
+    pixman_image_set_destroy_function (image, on_destroy, orig);
     pixman_image_set_repeat (image, PIXMAN_REPEAT_NORMAL);
 
     image_endian_swap (image);
@@ -106,6 +115,6 @@ int
 main (int argc, const char *argv[])
 {
     return fuzzer_test_main ("rotate", 15000,
-			     0xECF5E426,
+			     0x81E9EC2F,
 			     test_transform, argc, argv);
 }
diff --git a/lib/pixman/test/scaling-bench.c b/lib/pixman/test/scaling-bench.c
new file mode 100644
index 000000000..365e79850
--- /dev/null
+++ b/lib/pixman/test/scaling-bench.c
@@ -0,0 +1,80 @@
+#include <stdlib.h>
+#include "utils.h"
+
+#define SOURCE_WIDTH 320
+#define SOURCE_HEIGHT 240
+#define TEST_REPEATS 3
+
+static pixman_image_t *
+make_source (void)
+{
+    size_t n_bytes = (SOURCE_WIDTH + 2) * (SOURCE_HEIGHT + 2) * 4;
+    uint32_t *data = malloc (n_bytes);
+    pixman_image_t *source;
+
+    prng_randmemset (data, n_bytes, 0);
+    
+    source = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, SOURCE_WIDTH + 2, SOURCE_HEIGHT + 2,
+	data,
+	(SOURCE_WIDTH + 2) * 4);
+
+    pixman_image_set_filter (source, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    return source;
+}
+
+int
+main ()
+{
+    double scale;
+    pixman_image_t *src;
+
+    prng_srand (23874);
+    
+    src = make_source ();
+    printf ("# %-6s %-22s   %-14s %-12s\n",
+	    "ratio",
+	    "resolutions",
+	    "time / ms",
+	    "time per pixel / ns");
+    for (scale = 0.1; scale < 10.005; scale += 0.01)
+    {
+	int i;
+	int dest_width = SOURCE_WIDTH * scale + 0.5;
+	int dest_height = SOURCE_HEIGHT * scale + 0.5;
+	int dest_byte_stride = (dest_width * 4 + 15) & ~15;
+	pixman_fixed_t s = (1 / scale) * 65536.0 + 0.5;
+	pixman_transform_t transform;
+	pixman_image_t *dest;
+	double t1, t2, t = -1;
+	uint32_t *dest_buf = aligned_malloc (16, dest_byte_stride * dest_height);
+	memset (dest_buf, 0, dest_byte_stride * dest_height);
+
+	pixman_transform_init_scale (&transform, s, s);
+	pixman_image_set_transform (src, &transform);
+	
+	dest = pixman_image_create_bits (
+	    PIXMAN_a8r8g8b8, dest_width, dest_height, dest_buf, dest_byte_stride);
+
+	for (i = 0; i < TEST_REPEATS; i++)
+	{
+	    t1 = gettime();
+	    pixman_image_composite (
+		PIXMAN_OP_OVER, src, NULL, dest,
+		scale, scale, 0, 0, 0, 0, dest_width, dest_height);
+	    t2 = gettime();
+	    if (t < 0 || t2 - t1 < t)
+		t = t2 - t1;
+	}
+
+	printf ("%6.2f : %4dx%-4d => %4dx%-4d : %12.4f : %12.4f\n",
+		scale, SOURCE_WIDTH, SOURCE_HEIGHT, dest_width, dest_height,
+		t * 1000, (t / (dest_width * dest_height)) * 1000000000);
+
+	pixman_image_unref (dest);
+	free (dest_buf);
+    }
+
+    return 0;
+}
diff --git a/lib/pixman/test/scaling-test.c b/lib/pixman/test/scaling-test.c
index a8cb4c47b..e2f7fa9f4 100644
--- a/lib/pixman/test/scaling-test.c
+++ b/lib/pixman/test/scaling-test.c
@@ -147,6 +147,24 @@ test_composite (int      testnum,
     src_fmt = get_format (src_bpp);
     dst_fmt = get_format (dst_bpp);
 
+    if (prng_rand_n (2))
+    {
+	srcbuf += (src_stride / 4) * (src_height - 1);
+	src_stride = - src_stride;
+    }
+
+    if (prng_rand_n (2))
+    {
+	maskbuf += (mask_stride / 4) * (mask_height - 1);
+	mask_stride = - mask_stride;
+    }
+
+    if (prng_rand_n (2))
+    {
+	dstbuf += (dst_stride / 4) * (dst_height - 1);
+	dst_stride = - dst_stride;
+    }
+
     src_img = pixman_image_create_bits (
         src_fmt, src_width, src_height, srcbuf, src_stride);
 
@@ -340,33 +358,24 @@ test_composite (int      testnum,
 	pixman_image_composite (op, src_img, mask_img, dst_img,
                             src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
 
-    if (dst_fmt == PIXMAN_x8r8g8b8 || dst_fmt == PIXMAN_x8b8g8r8)
-    {
-	/* ignore unused part */
-	for (i = 0; i < dst_stride * dst_height / 4; i++)
-	    dstbuf[i] &= 0xFFFFFF;
-    }
-
-    image_endian_swap (dst_img);
-
+    crc32 = compute_crc32_for_image (0, dst_img);
+    
     if (verbose)
-    {
-	int j;
-	
-	for (i = 0; i < dst_height; i++)
-	{
-	    for (j = 0; j < dst_stride; j++)
-		printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j));
-
-	    printf ("\n");
-	}
-    }
+	print_image (dst_img);
 
     pixman_image_unref (src_img);
     pixman_image_unref (mask_img);
     pixman_image_unref (dst_img);
 
-    crc32 = compute_crc32 (0, dstbuf, dst_stride * dst_height);
+    if (src_stride < 0)
+	srcbuf += (src_stride / 4) * (src_height - 1);
+
+    if (mask_stride < 0)
+	maskbuf += (mask_stride / 4) * (mask_height - 1);
+
+    if (dst_stride < 0)
+	dstbuf += (dst_stride / 4) * (dst_height - 1);
+
     free (srcbuf);
     free (maskbuf);
     free (dstbuf);
@@ -375,12 +384,10 @@ test_composite (int      testnum,
     return crc32;
 }
 
-#if BILINEAR_INTERPOLATION_BITS == 8
-#define CHECKSUM 0x9096E6B6
-#elif BILINEAR_INTERPOLATION_BITS == 7
-#define CHECKSUM 0xCE8EC6BA
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x92E0F068
 #elif BILINEAR_INTERPOLATION_BITS == 4
-#define CHECKSUM 0xAB1D39BE
+#define CHECKSUM 0x8EFFA1E5
 #else
 #define CHECKSUM 0x00000000
 #endif
diff --git a/lib/pixman/test/thread-test.c b/lib/pixman/test/thread-test.c
new file mode 100644
index 000000000..0b07b269d
--- /dev/null
+++ b/lib/pixman/test/thread-test.c
@@ -0,0 +1,199 @@
+#include "utils.h"
+
+#ifndef HAVE_PTHREADS
+
+int main ()
+{
+    printf ("Skipped thread-test - pthreads not supported\n");
+    return 0;
+}
+
+#else
+
+#include <stdlib.h>
+#include <pthread.h>
+
+typedef struct
+{
+    int       thread_no;
+    uint32_t *dst_buf;
+    prng_t    prng_state;
+} info_t;
+
+static const pixman_op_t operators[] = 
+{
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+};
+
+static const pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    PIXMAN_b5g6r5,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_a4r4g4b4
+};
+
+#define N_ROUNDS 8192
+
+#define RAND_ELT(arr)							\
+    arr[prng_rand_r(&info->prng_state) % ARRAY_LENGTH (arr)]
+
+#define DEST_WIDTH (7)
+
+static void *
+thread (void *data)
+{
+    info_t *info = data;
+    uint32_t crc32 = 0x0;
+    uint32_t src_buf[64];
+    pixman_image_t *dst_img, *src_img;
+    int i;
+
+    prng_srand_r (&info->prng_state, info->thread_no);
+
+    for (i = 0; i < N_ROUNDS; ++i)
+    {
+	pixman_op_t op;
+	int rand1, rand2;
+
+	prng_randmemset_r (&info->prng_state, info->dst_buf,
+			   DEST_WIDTH * sizeof (uint32_t), 0);
+	prng_randmemset_r (&info->prng_state, src_buf,
+			   sizeof (src_buf), 0);
+
+	src_img = pixman_image_create_bits (
+	    RAND_ELT (formats), 4, 4, src_buf, 16);
+	dst_img = pixman_image_create_bits (
+	    RAND_ELT (formats), DEST_WIDTH, 1, info->dst_buf,
+	    DEST_WIDTH * sizeof (uint32_t));
+
+	image_endian_swap (src_img);
+	image_endian_swap (dst_img);
+	
+	rand2 = prng_rand_r (&info->prng_state) % 4;
+	rand1 = prng_rand_r (&info->prng_state) % 4;
+	op = RAND_ELT (operators);
+
+	pixman_image_composite32 (
+	    op,
+	    src_img, NULL, dst_img,
+	    rand1, rand2, 0, 0, 0, 0, DEST_WIDTH, 1);
+
+	crc32 = compute_crc32_for_image (crc32, dst_img);
+
+	pixman_image_unref (src_img);
+	pixman_image_unref (dst_img);
+    }
+
+    return (void *)(uintptr_t)crc32;
+}
+
+static inline uint32_t
+byteswap32 (uint32_t x)
+{
+    return ((x & ((uint32_t)0xFF << 24)) >> 24) |
+           ((x & ((uint32_t)0xFF << 16)) >>  8) |
+           ((x & ((uint32_t)0xFF <<  8)) <<  8) |
+           ((x & ((uint32_t)0xFF <<  0)) << 24);
+}
+
+int
+main (void)
+{
+    uint32_t dest[16 * DEST_WIDTH];
+    info_t info[16] = { { 0 } };
+    pthread_t threads[16];
+    void *retvals[16];
+    uint32_t crc32s[16], crc32;
+    int i;
+
+    for (i = 0; i < 16; ++i)
+    {
+	info[i].thread_no = i;
+	info[i].dst_buf = &dest[i * DEST_WIDTH];
+    }
+
+    for (i = 0; i < 16; ++i)
+	pthread_create (&threads[i], NULL, thread, &info[i]);
+
+    for (i = 0; i < 16; ++i)
+	pthread_join (threads[i], &retvals[i]);
+
+    for (i = 0; i < 16; ++i)
+    {
+	crc32s[i] = (uintptr_t)retvals[i];
+
+	if (is_little_endian())
+	    crc32s[i] = byteswap32 (crc32s[i]);
+    }
+
+    crc32 = compute_crc32 (0, crc32s, sizeof crc32s);
+
+#define EXPECTED 0xE299B18E
+
+    if (crc32 != EXPECTED)
+    {
+	printf ("thread-test failed. Got checksum 0x%08X, expected 0x%08X\n",
+		crc32, EXPECTED);
+	return 1;
+    }
+
+    return 0;
+}
+
+#endif
+
diff --git a/lib/pixman/test/trap-crasher.c b/lib/pixman/test/trap-crasher.c
index 4e4cac297..77be1c98b 100644
--- a/lib/pixman/test/trap-crasher.c
+++ b/lib/pixman/test/trap-crasher.c
@@ -5,7 +5,7 @@ int
 main()
 {
     pixman_image_t *dst;
-    pixman_trapezoid_t traps[1] = {
+    pixman_trapezoid_t traps[] = {
 	{
 	    2147483646,
 	    2147483647,
@@ -18,6 +18,18 @@ main()
 		{ 0, 2147483647 }
 	    }
 	},
+	{
+	    32768,
+	    - 2147483647,
+	    {
+		{ 0, 0 },
+		{ 0, 2147483647 }
+	    },
+	    {
+		{ 65536, 0 },
+		{ 0, 2147483647 }
+	    }
+	},
     };
 
     dst = pixman_image_create_bits (PIXMAN_a8, 1, 1, NULL, -1);
diff --git a/lib/pixman/test/utils.c b/lib/pixman/test/utils.c
index 3d1ba22ae..ebe0ccc09 100644
--- a/lib/pixman/test/utils.c
+++ b/lib/pixman/test/utils.c
@@ -150,6 +150,12 @@ compute_crc32_for_image_internal (uint32_t        crc32,
     uint32_t mask = 0xffffffff;
     int i;
 
+    if (stride < 0)
+    {
+	data += (stride / 4) * (height - 1);
+	stride = - stride;
+    }
+
     /* mask unused 'x' part */
     if (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt) &&
 	PIXMAN_FORMAT_DEPTH (fmt) != 0)
@@ -238,6 +244,38 @@ compute_crc32_for_image (uint32_t        crc32,
     return crc32;
 }
 
+void
+print_image (pixman_image_t *image)
+{
+    int i, j;
+    int width, height, stride;
+    pixman_format_code_t format;
+    uint8_t *buffer;
+    int s;
+
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+    stride = pixman_image_get_stride (image);
+    format = pixman_image_get_format (image);
+    buffer = (uint8_t *)pixman_image_get_data (image);
+
+    s = (stride >= 0)? stride : - stride;
+    
+    printf ("---\n");
+    for (i = 0; i < height; i++)
+    {
+	for (j = 0; j < s; j++)
+	{
+	    if (j == (width * PIXMAN_FORMAT_BPP (format) + 7) / 8)
+		printf ("| ");
+
+	    printf ("%02X ", *((uint8_t *)buffer + i * stride + j));
+	}
+	printf ("\n");
+    }
+    printf ("---\n");
+}
+
 /* perform endian conversion of pixel data
  */
 void
@@ -259,11 +297,12 @@ image_endian_swap (pixman_image_t *img)
     for (i = 0; i < height; i++)
     {
 	uint8_t *line_data = (uint8_t *)data + stride * i;
-	
+	int s = (stride >= 0)? stride : - stride;
+    	
 	switch (bpp)
 	{
 	case 1:
-	    for (j = 0; j < stride; j++)
+	    for (j = 0; j < s; j++)
 	    {
 		line_data[j] =
 		    ((line_data[j] & 0x80) >> 7) |
@@ -277,13 +316,13 @@ image_endian_swap (pixman_image_t *img)
 	    }
 	    break;
 	case 4:
-	    for (j = 0; j < stride; j++)
+	    for (j = 0; j < s; j++)
 	    {
 		line_data[j] = (line_data[j] >> 4) | (line_data[j] << 4);
 	    }
 	    break;
 	case 16:
-	    for (j = 0; j + 2 <= stride; j += 2)
+	    for (j = 0; j + 2 <= s; j += 2)
 	    {
 		char t1 = line_data[j + 0];
 		char t2 = line_data[j + 1];
@@ -293,7 +332,7 @@ image_endian_swap (pixman_image_t *img)
 	    }
 	    break;
 	case 24:
-	    for (j = 0; j + 3 <= stride; j += 3)
+	    for (j = 0; j + 3 <= s; j += 3)
 	    {
 		char t1 = line_data[j + 0];
 		char t2 = line_data[j + 1];
@@ -305,7 +344,7 @@ image_endian_swap (pixman_image_t *img)
 	    }
 	    break;
 	case 32:
-	    for (j = 0; j + 4 <= stride; j += 4)
+	    for (j = 0; j + 4 <= s; j += 4)
 	    {
 		char t1 = line_data[j + 0];
 		char t2 = line_data[j + 1];
@@ -602,6 +641,32 @@ draw_checkerboard (pixman_image_t *image,
     }
 }
 
+static uint32_t
+call_test_function (uint32_t    (*test_function)(int testnum, int verbose),
+		    int		testnum,
+		    int		verbose)
+{
+    uint32_t retval;
+
+#if defined (__GNUC__) && defined (_WIN32) && (defined (__i386) || defined (__i386__))
+    __asm__ (
+	/* Deliberately avoid aligning the stack to 16 bytes */
+	"pushl	%1\n\t"
+	"pushl	%2\n\t"
+	"call	*%3\n\t"
+	"addl	$8, %%esp\n\t"
+	: "=a" (retval)
+	: "r" (verbose),
+	  "r" (testnum),
+	  "r" (test_function)
+	: "edx", "ecx"); /* caller save registers */
+#else
+    retval = test_function (testnum, verbose);
+#endif
+
+    return retval;
+}
+
 /*
  * A function, which can be used as a core part of the test programs,
  * intended to detect various problems with the help of fuzzing input
@@ -671,7 +736,9 @@ fuzzer_test_main (const char *test_name,
     else if (argc >= 2)
     {
 	n2 = atoi (argv[1]);
-	checksum = test_function (n2, 1);
+
+	checksum = call_test_function (test_function, n2, 1);
+
 	printf ("%d: checksum=%08X\n", n2, checksum);
 	return 0;
     }
@@ -687,7 +754,7 @@ fuzzer_test_main (const char *test_name,
 #endif
     for (i = n1; i <= n2; i++)
     {
-	uint32_t crc = test_function (i, 0);
+	uint32_t crc = call_test_function (test_function, i, 0);
 	if (verbose)
 	    printf ("%d: %08X\n", i, crc);
 	checksum += crc;
diff --git a/lib/pixman/test/utils.h b/lib/pixman/test/utils.h
index c2781516f..ebb14d9e4 100644
--- a/lib/pixman/test/utils.h
+++ b/lib/pixman/test/utils.h
@@ -6,6 +6,11 @@
 #include "pixman-private.h" /* For 'inline' definition */
 #include "utils-prng.h"
 
+#if defined(_MSC_VER)
+#define snprintf _snprintf
+#define strcasecmp _stricmp
+#endif
+
 #define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
 
 /* A primitive pseudorandom number generator,
@@ -63,6 +68,10 @@ uint32_t
 compute_crc32_for_image (uint32_t        in_crc32,
 			 pixman_image_t *image);
 
+/* Print the image in hexadecimal */
+void
+print_image (pixman_image_t *image);
+
 /* Returns TRUE if running on a little endian system
  */
 static force_inline pixman_bool_t
author	Matthieu Herrb <matthieu@cvs.openbsd.org>	2013-12-01 20:34:21 +0000
committer	Matthieu Herrb <matthieu@cvs.openbsd.org>	2013-12-01 20:34:21 +0000
commit	a5bbbb8d49b940f16b8fca367fa3e2cc5489f862 (patch)
tree	79a94b85f71cf67460335388866fdcdf78942987 /lib
parent	8a0bfc9a32ee1e84d7274040f2902b8e1b459d0f (diff)