Update to pixman 0.18.4.

Tweak build to use libpthread-stubs for TLS emulation instead of forcing every application using pixman to use -pthread. Tested by jasper@ and landry@ on a bulk ports build.
author: Matthieu Herrb <matthieu@cvs.openbsd.org> 2010-10-03 18:30:05 +0000
committer: Matthieu Herrb <matthieu@cvs.openbsd.org> 2010-10-03 18:30:05 +0000
commit: 519bd19882b18b3cfcccca5fe8e0e6ab6eb3b937 (patch)
tree: 1ed8f61276ba41eeaf1ffa509465cd2f767cc3aa /lib/pixman
parent: 9b631ded21a25e9a701bb5c1be5a29597ce2e3c9 (diff)
62 files changed, 10152 insertions, 5867 deletions
diff --git a/lib/pixman/COPYING b/lib/pixman/COPYING
index 286158f2e..b0571e6a6 100644
--- a/lib/pixman/COPYING
+++ b/lib/pixman/COPYING
@@ -18,6 +18,7 @@ Copyright 2008 Rodrigo Kumpera
 Copyright 2008 André Tupinambá
 Copyright 2008 Mozilla Corporation
 Copyright 2008 Frederic Plourde
+Copyright 2009 Sun Microsystems, Inc.
 
 Permission is hereby granted, free of charge, to any person obtaining a
 copy of this software and associated documentation files (the "Software"),
diff --git a/lib/pixman/Makefile.bsd-wrapper b/lib/pixman/Makefile.bsd-wrapper
index afa52b59d..3927b889a 100644
--- a/lib/pixman/Makefile.bsd-wrapper
+++ b/lib/pixman/Makefile.bsd-wrapper
@@ -1,8 +1,8 @@
-# $OpenBSD: Makefile.bsd-wrapper,v 1.11 2010/08/25 17:44:26 todd Exp $
+# $OpenBSD: Makefile.bsd-wrapper,v 1.12 2010/10/03 18:30:04 matthieu Exp $
 
 .include <bsd.own.mk>
 
-SHARED_LIBS=	pixman-1 16.6
+SHARED_LIBS=	pixman-1 18.4
 
 .if ${MACHINE} == amd64 && !${COMPILER_VERSION:L:Mgcc4*}
 CONFIGURE_ARGS += --disable-sse2
diff --git a/lib/pixman/Makefile.in b/lib/pixman/Makefile.in
index d51ee56ad..a49f36c9f 100644
--- a/lib/pixman/Makefile.in
+++ b/lib/pixman/Makefile.in
@@ -88,13 +88,13 @@ AMDEP_FALSE = @AMDEP_FALSE@
 AMDEP_TRUE = @AMDEP_TRUE@
 AMTAR = @AMTAR@
 AR = @AR@
-ARM_NEON_CFLAGS = @ARM_NEON_CFLAGS@
-ARM_SIMD_CFLAGS = @ARM_SIMD_CFLAGS@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
 CPP = @CPP@
@@ -122,6 +122,7 @@ GTK_CFLAGS = @GTK_CFLAGS@
 GTK_LIBS = @GTK_LIBS@
 HAVE_GTK_FALSE = @HAVE_GTK_FALSE@
 HAVE_GTK_TRUE = @HAVE_GTK_TRUE@
+HAVE_PTHREAD_SETSPECIFIC = @HAVE_PTHREAD_SETSPECIFIC@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
 INSTALL_SCRIPT = @INSTALL_SCRIPT@
@@ -151,6 +152,7 @@ PIXMAN_VERSION_MAJOR = @PIXMAN_VERSION_MAJOR@
 PIXMAN_VERSION_MICRO = @PIXMAN_VERSION_MICRO@
 PIXMAN_VERSION_MINOR = @PIXMAN_VERSION_MINOR@
 PKG_CONFIG = @PKG_CONFIG@
+PTHREAD_LDFLAGS = @PTHREAD_LDFLAGS@
 RANLIB = @RANLIB@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
@@ -158,6 +160,9 @@ SHELL = @SHELL@
 SSE2_CFLAGS = @SSE2_CFLAGS@
 SSE2_LDFLAGS = @SSE2_LDFLAGS@
 STRIP = @STRIP@
+STUBS_CFLAGS = @STUBS_CFLAGS@
+STUBS_LIBS = @STUBS_LIBS@
+TOOLCHAIN_SUPPORTS__THREAD = @TOOLCHAIN_SUPPORTS__THREAD@
 USE_ARM_NEON_FALSE = @USE_ARM_NEON_FALSE@
 USE_ARM_NEON_TRUE = @USE_ARM_NEON_TRUE@
 USE_ARM_SIMD_FALSE = @USE_ARM_SIMD_FALSE@
@@ -250,15 +255,15 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	@for dep in $?; do \
 	  case '$(am__configure_deps)' in \
 	    *$$dep*) \
-	      echo ' cd $(srcdir) && $(AUTOMAKE) --gnu '; \
-	      cd $(srcdir) && $(AUTOMAKE) --gnu  \
+	      echo ' cd $(srcdir) && $(AUTOMAKE) --foreign '; \
+	      cd $(srcdir) && $(AUTOMAKE) --foreign  \
 		&& exit 0; \
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu  Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  Makefile'; \
 	cd $(top_srcdir) && \
-	  $(AUTOMAKE) --gnu  Makefile
+	  $(AUTOMAKE) --foreign  Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
diff --git a/lib/pixman/README b/lib/pixman/README
index 843b06980..60dff4561 100644
--- a/lib/pixman/README
+++ b/lib/pixman/README
@@ -3,16 +3,12 @@ features such as image compositing and trapezoid rasterization.
 
 Please submit bugs & patches to the libpixman bugzilla:
 
-       https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
+        https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
 
-All questions regarding this software should be directed to either the 
-Xorg mailing list:
+All questions regarding this software should be directed to the pixman
+mailing list:
 
-       http://lists.freedesktop.org/mailman/listinfo/xorg
-
-or the cairo mailing list:
-
-       http://lists.freedesktop.org/mailman/listinfo/cairo
+        http://lists.freedesktop.org/mailman/listinfo/pixman
 
 The master development code repository can be found at:
 
diff --git a/lib/pixman/TODO b/lib/pixman/TODO
index 52d737706..4434ec7cb 100644
--- a/lib/pixman/TODO
+++ b/lib/pixman/TODO
@@ -1,3 +1,8 @@
+  - Testing
+    - Test implementations against each other
+    - Test both with and without the operator strength reduction.
+      They should be identical.
+
   - SSE 2 issues:
 
       - Use MM_HINT_NTA instead of MM_HINT_T0
diff --git a/lib/pixman/aclocal.m4 b/lib/pixman/aclocal.m4
index b7278b304..272fe9080 100644
--- a/lib/pixman/aclocal.m4
+++ b/lib/pixman/aclocal.m4
@@ -6824,6 +6824,27 @@ AC_DEFUN([AM_AUTOMAKE_VERSION], [am__api_version="1.9"])
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
 	 [AM_AUTOMAKE_VERSION([1.9.6])])
 
+# Figure out how to run the assembler.                      -*- Autoconf -*-
+
+# Copyright (C) 2001, 2003, 2004, 2005  Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# serial 4
+
+# AM_PROG_AS
+# ----------
+AC_DEFUN([AM_PROG_AS],
+[# By default we simply use the C compiler to build assembly code.
+AC_REQUIRE([AC_PROG_CC])
+test "${CCAS+set}" = set || CCAS=$CC
+test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS
+AC_ARG_VAR([CCAS],      [assembler compiler command (defaults to CC)])
+AC_ARG_VAR([CCASFLAGS], [assembler compiler flags (defaults to CFLAGS)])
+])
+
 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-
 
 # Copyright (C) 2001, 2003, 2005  Free Software Foundation, Inc.
diff --git a/lib/pixman/config.h.in b/lib/pixman/config.h.in
index 283eb1a1b..6277b106c 100644
--- a/lib/pixman/config.h.in
+++ b/lib/pixman/config.h.in
@@ -18,6 +18,9 @@
 /* Whether we have posix_memalign() */
 #undef HAVE_POSIX_MEMALIGN
 
+/* Whether pthread_setspecific() is supported */
+#undef HAVE_PTHREAD_SETSPECIFIC
+
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H
 
@@ -60,13 +63,19 @@
 /* enable TIMER_BEGIN/TIMER_END macros */
 #undef PIXMAN_TIMERS
 
+/* The size of `long', as computed by sizeof. */
+#undef SIZEOF_LONG
+
 /* Define to 1 if you have the ANSI C header files. */
 #undef STDC_HEADERS
 
-/* use ARM NEON compiler intrinsics */
+/* Whether the tool chain supports __thread */
+#undef TOOLCHAIN_SUPPORTS__THREAD
+
+/* use ARM NEON assembly optimizations */
 #undef USE_ARM_NEON
 
-/* use ARM SIMD compiler intrinsics */
+/* use ARM SIMD assembly optimizations */
 #undef USE_ARM_SIMD
 
 /* use GNU-style inline assembler */
diff --git a/lib/pixman/configure b/lib/pixman/configure
index 262b1f57d..1f9d9d8f3 100644
--- a/lib/pixman/configure
+++ b/lib/pixman/configure
@@ -1,8 +1,8 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.62 for pixman 0.16.6.
+# Generated by GNU Autoconf 2.62 for pixman 0.18.4.
 #
-# Report bugs to <"sandmann@daimi.au.dk">.
+# Report bugs to <"pixman@lists.freedesktop.org">.
 #
 # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
 # 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
@@ -750,9 +750,9 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 # Identity of this package.
 PACKAGE_NAME='pixman'
 PACKAGE_TARNAME='pixman'
-PACKAGE_VERSION='0.16.6'
-PACKAGE_STRING='pixman 0.16.6'
-PACKAGE_BUGREPORT='"sandmann@daimi.au.dk"'
+PACKAGE_VERSION='0.18.4'
+PACKAGE_STRING='pixman 0.18.4'
+PACKAGE_BUGREPORT='"pixman@lists.freedesktop.org"'
 
 # Factoring default headers for most tests.
 ac_includes_default="\
@@ -872,6 +872,8 @@ AMDEPBACKSLASH
 CCDEPMODE
 am__fastdepCC_TRUE
 am__fastdepCC_FALSE
+CCAS
+CCASFLAGS
 SED
 GREP
 EGREP
@@ -909,10 +911,8 @@ SSE2_LDFLAGS
 VMX_CFLAGS
 USE_VMX_TRUE
 USE_VMX_FALSE
-ARM_SIMD_CFLAGS
 USE_ARM_SIMD_TRUE
 USE_ARM_SIMD_FALSE
-ARM_NEON_CFLAGS
 USE_ARM_NEON_TRUE
 USE_ARM_NEON_FALSE
 USE_GCC_INLINE_ASM_TRUE
@@ -925,6 +925,11 @@ HAVE_GTK_TRUE
 HAVE_GTK_FALSE
 DEP_CFLAGS
 DEP_LIBS
+STUBS_CFLAGS
+STUBS_LIBS
+TOOLCHAIN_SUPPORTS__THREAD
+HAVE_PTHREAD_SETSPECIFIC
+PTHREAD_LDFLAGS
 LIBOBJS
 LTLIBOBJS'
 ac_subst_files=''
@@ -955,6 +960,8 @@ CFLAGS
 LDFLAGS
 LIBS
 CPPFLAGS
+CCAS
+CCASFLAGS
 CPP
 CXX
 CXXFLAGS
@@ -964,7 +971,9 @@ F77
 FFLAGS
 PKG_CONFIG
 GTK_CFLAGS
-GTK_LIBS'
+GTK_LIBS
+STUBS_CFLAGS
+STUBS_LIBS'
 
 
 # Initialize some variables set by options.
@@ -1517,7 +1526,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures pixman 0.16.6 to adapt to many kinds of systems.
+\`configure' configures pixman 0.18.4 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1587,7 +1596,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of pixman 0.16.6:";;
+     short | recursive ) echo "Configuration of pixman 0.18.4:";;
    esac
   cat <<\_ACEOF
 
@@ -1628,6 +1637,8 @@ Some influential environment variables:
   LIBS        libraries to pass to the linker, e.g. -l<library>
   CPPFLAGS    C/C++/Objective C preprocessor flags, e.g. -I<include dir> if
               you have headers in a nonstandard directory <include dir>
+  CCAS        assembler compiler command (defaults to CC)
+  CCASFLAGS   assembler compiler flags (defaults to CFLAGS)
   CPP         C preprocessor
   CXX         C++ compiler command
   CXXFLAGS    C++ compiler flags
@@ -1637,11 +1648,14 @@ Some influential environment variables:
   PKG_CONFIG  path to pkg-config utility
   GTK_CFLAGS  C compiler flags for GTK, overriding pkg-config
   GTK_LIBS    linker flags for GTK, overriding pkg-config
+  STUBS_CFLAGS
+              C compiler flags for STUBS, overriding pkg-config
+  STUBS_LIBS  linker flags for STUBS, overriding pkg-config
 
 Use these variables to override the choices made by `configure' or to help
 it to find libraries and programs with nonstandard names/locations.
 
-Report bugs to <"sandmann@daimi.au.dk">.
+Report bugs to <"pixman@lists.freedesktop.org">.
 _ACEOF
 ac_status=$?
 fi
@@ -1704,7 +1718,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-pixman configure 0.16.6
+pixman configure 0.18.4
 generated by GNU Autoconf 2.62
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1718,7 +1732,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by pixman $as_me 0.16.6, which was
+It was created by pixman $as_me 0.18.4, which was
 generated by GNU Autoconf 2.62.  Invocation command line was
 
   $ $0 $@
@@ -2367,7 +2381,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='pixman'
- VERSION='0.16.6'
+ VERSION='0.18.4'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -2514,6 +2528,9 @@ am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'
 
 
 
+# Suppress verbose compile lines
+
+
 ac_config_headers="$ac_config_headers config.h"
 
 
@@ -3708,6 +3725,13 @@ else
 fi
 
 
+# By default we simply use the C compiler to build assembly code.
+
+test "${CCAS+set}" = set || CCAS=$CC
+test "${CCASFLAGS+set}" = set || CCASFLAGS=$CFLAGS
+
+
+
 # Check whether --enable-shared was given.
 if test "${enable_shared+set}" = set; then
   enableval=$enable_shared; p=${PACKAGE-default}
@@ -4405,7 +4429,7 @@ ia64-*-hpux*)
   ;;
 *-*-irix6*)
   # Find out which ABI we are using.
-  echo '#line 4408 "configure"' > conftest.$ac_ext
+  echo '#line 4432 "configure"' > conftest.$ac_ext
   if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
   (eval $ac_compile) 2>&5
   ac_status=$?
@@ -5204,9 +5228,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
     { $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
 $as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
     ( cat <<\_ASBOX
-## ------------------------------------- ##
-## Report this to "sandmann@daimi.au.dk" ##
-## ------------------------------------- ##
+## --------------------------------------------- ##
+## Report this to "pixman@lists.freedesktop.org" ##
+## --------------------------------------------- ##
 _ASBOX
      ) | sed "s/^/$as_me: WARNING:     /" >&2
     ;;
@@ -7513,11 +7537,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7516: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7540: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:7520: \$? = $ac_status" >&5
+   echo "$as_me:7544: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -7803,11 +7827,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7806: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7830: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:7810: \$? = $ac_status" >&5
+   echo "$as_me:7834: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -7907,11 +7931,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7910: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7934: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:7914: \$? = $ac_status" >&5
+   echo "$as_me:7938: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -10307,7 +10331,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10310 "configure"
+#line 10334 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -10407,7 +10431,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10410 "configure"
+#line 10434 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -12816,11 +12840,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:12819: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:12843: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:12823: \$? = $ac_status" >&5
+   echo "$as_me:12847: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -12920,11 +12944,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:12923: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:12947: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:12927: \$? = $ac_status" >&5
+   echo "$as_me:12951: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -14503,11 +14527,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:14506: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:14530: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:14510: \$? = $ac_status" >&5
+   echo "$as_me:14534: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -14607,11 +14631,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:14610: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:14634: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:14614: \$? = $ac_status" >&5
+   echo "$as_me:14638: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -16822,11 +16846,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:16825: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:16849: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:16829: \$? = $ac_status" >&5
+   echo "$as_me:16853: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -17112,11 +17136,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:17115: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:17139: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>conftest.err)
    ac_status=$?
    cat conftest.err >&5
-   echo "$as_me:17119: \$? = $ac_status" >&5
+   echo "$as_me:17143: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s "$ac_outfile"; then
      # The compiler can only warn and ignore the option if not recognized
      # So say no if there are warnings other than the usual output.
@@ -17216,11 +17240,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:17219: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:17243: $lt_compile\"" >&5)
    (eval "$lt_compile" 2>out/conftest.err)
    ac_status=$?
    cat out/conftest.err >&5
-   echo "$as_me:17223: \$? = $ac_status" >&5
+   echo "$as_me:17247: \$? = $ac_status" >&5
    if (exit $ac_status) && test -s out/conftest2.$ac_objext
    then
      # The compiler can only warn and ignore the option if not recognized
@@ -20478,6 +20502,362 @@ _ACEOF
 esac
 
 
+# The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:$LINENO: checking size of long" >&5
+$as_echo_n "checking size of long... " >&6; }
+if test "${ac_cv_sizeof_long+set}" = set; then
+  $as_echo_n "(cached) " >&6
+else
+  if test "$cross_compiling" = yes; then
+  # Depending upon the size, compute the lo and hi bounds.
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (long))) >= 0)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_lo=0 ac_mid=0
+  while :; do
+    cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (long))) <= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_hi=$ac_mid; break
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_lo=`expr $ac_mid + 1`
+			if test $ac_lo -le $ac_mid; then
+			  ac_lo= ac_hi=
+			  break
+			fi
+			ac_mid=`expr 2 '*' $ac_mid + 1`
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (long))) < 0)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_hi=-1 ac_mid=-1
+  while :; do
+    cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (long))) >= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_lo=$ac_mid; break
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_hi=`expr '(' $ac_mid ')' - 1`
+			if test $ac_mid -le $ac_hi; then
+			  ac_lo= ac_hi=
+			  break
+			fi
+			ac_mid=`expr 2 '*' $ac_mid`
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+  done
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_lo= ac_hi=
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+# Binary search between lo and hi bounds.
+while test "x$ac_lo" != "x$ac_hi"; do
+  ac_mid=`expr '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo`
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+int
+main ()
+{
+static int test_array [1 - 2 * !(((long int) (sizeof (long))) <= $ac_mid)];
+test_array [0] = 0
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_hi=$ac_mid
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_lo=`expr '(' $ac_mid ')' + 1`
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+done
+case $ac_lo in
+?*) ac_cv_sizeof_long=$ac_lo;;
+'') if test "$ac_cv_type_long" = yes; then
+     { { $as_echo "$as_me:$LINENO: error: cannot compute sizeof (long)
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compute sizeof (long)
+See \`config.log' for more details." >&2;}
+   { (exit 77); exit 77; }; }
+   else
+     ac_cv_sizeof_long=0
+   fi ;;
+esac
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+static long int longval () { return (long int) (sizeof (long)); }
+static unsigned long int ulongval () { return (long int) (sizeof (long)); }
+#include <stdio.h>
+#include <stdlib.h>
+int
+main ()
+{
+
+  FILE *f = fopen ("conftest.val", "w");
+  if (! f)
+    return 1;
+  if (((long int) (sizeof (long))) < 0)
+    {
+      long int i = longval ();
+      if (i != ((long int) (sizeof (long))))
+	return 1;
+      fprintf (f, "%ld", i);
+    }
+  else
+    {
+      unsigned long int i = ulongval ();
+      if (i != ((long int) (sizeof (long))))
+	return 1;
+      fprintf (f, "%lu", i);
+    }
+  /* Do not output a trailing newline, as this causes \r\n confusion
+     on some platforms.  */
+  return ferror (f) || fclose (f) != 0;
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  ac_cv_sizeof_long=`cat conftest.val`
+else
+  $as_echo "$as_me: program exited with status $ac_status" >&5
+$as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+( exit $ac_status )
+if test "$ac_cv_type_long" = yes; then
+     { { $as_echo "$as_me:$LINENO: error: cannot compute sizeof (long)
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compute sizeof (long)
+See \`config.log' for more details." >&2;}
+   { (exit 77); exit 77; }; }
+   else
+     ac_cv_sizeof_long=0
+   fi
+fi
+rm -rf conftest.dSYM
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f conftest.val
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_sizeof_long" >&5
+$as_echo "$ac_cv_sizeof_long" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_LONG $ac_cv_sizeof_long
+_ACEOF
+
+
+
 # Checks for Sun Studio compilers
 { $as_echo "$as_me:$LINENO: checking whether __SUNPRO_C is declared" >&5
 $as_echo_n "checking whether __SUNPRO_C is declared... " >&6; }
@@ -20618,13 +20998,13 @@ fi
 
 
 
-LT_VERSION_INFO="16:6:16"
+LT_VERSION_INFO="18:4:18"
 
 PIXMAN_VERSION_MAJOR=0
 
-PIXMAN_VERSION_MINOR=16
+PIXMAN_VERSION_MINOR=18
 
-PIXMAN_VERSION_MICRO=6
+PIXMAN_VERSION_MICRO=4
 
 
 
@@ -20807,8 +21187,8 @@ xserver_save_CFLAGS=$CFLAGS
 CFLAGS="$MMX_CFLAGS $CFLAGS"
 cat >conftest.$ac_ext <<_ACEOF
 
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3))
-error "Need GCC >= 3.3 for MMX intrinsics"
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 3.4 for MMX intrinsics"
 #endif
 #include <mmintrin.h>
 int main () {
@@ -20915,7 +21295,7 @@ cat >conftest.$ac_ext <<_ACEOF
 #include <xmmintrin.h>
 #include <emmintrin.h>
 int main () {
-    __m128i a, b, c;
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
 	c = _mm_xor_si128 (a, b);
     return 0;
 }
@@ -21149,19 +21529,23 @@ else
 fi
 
 
-ARM_SIMD_CFLAGS="-mcpu=arm1136j-s"
-
 have_arm_simd=no
 { $as_echo "$as_me:$LINENO: checking whether to use ARM SIMD assembler" >&5
 $as_echo_n "checking whether to use ARM SIMD assembler... " >&6; }
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$ARM_SIMD_CFLAGS $CFLAGS"
+CFLAGS="-x assembler-with-cpp $CFLAGS"
 cat >conftest.$ac_ext <<_ACEOF
 
-int main () {
-    asm("uqadd8 r1, r1, r2");
-    return 0;
-}
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0
 _ACEOF
 rm -f conftest.$ac_objext
 if { (ac_try="$ac_compile"
@@ -21210,20 +21594,8 @@ cat >>confdefs.h <<\_ACEOF
 #define USE_ARM_SIMD 1
 _ACEOF
 
-else
-   ARM_SIMD_CFLAGS=
 fi
 
-{ $as_echo "$as_me:$LINENO: result: $have_arm_simd" >&5
-$as_echo "$have_arm_simd" >&6; }
-if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
-   { { $as_echo "$as_me:$LINENO: error: ARM SIMD intrinsics not detected" >&5
-$as_echo "$as_me: error: ARM SIMD intrinsics not detected" >&2;}
-   { (exit 1); exit 1; }; }
-fi
-
-
-
 
 
 if test $have_arm_simd = yes; then
@@ -21235,20 +21607,33 @@ else
 fi
 
 
-ARM_NEON_CFLAGS="-mfpu=neon -mcpu=cortex-a8"
+{ $as_echo "$as_me:$LINENO: result: $have_arm_simd" >&5
+$as_echo "$have_arm_simd" >&6; }
+if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
+   { { $as_echo "$as_me:$LINENO: error: ARM SIMD intrinsics not detected" >&5
+$as_echo "$as_me: error: ARM SIMD intrinsics not detected" >&2;}
+   { (exit 1); exit 1; }; }
+fi
 
 have_arm_neon=no
-{ $as_echo "$as_me:$LINENO: checking whether to use ARM NEON" >&5
-$as_echo_n "checking whether to use ARM NEON... " >&6; }
+{ $as_echo "$as_me:$LINENO: checking whether to use ARM NEON assembler" >&5
+$as_echo_n "checking whether to use ARM NEON assembler... " >&6; }
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$ARM_NEON_CFLAGS $CFLAGS"
+CFLAGS="-x assembler-with-cpp $CFLAGS"
 cat >conftest.$ac_ext <<_ACEOF
 
-#include <arm_neon.h>
-int main () {
-    uint8x8_t neon_test=vmov_n_u8(0);
-    return 0;
-}
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0
 _ACEOF
 rm -f conftest.$ac_objext
 if { (ac_try="$ac_compile"
@@ -21297,14 +21682,10 @@ cat >>confdefs.h <<\_ACEOF
 #define USE_ARM_NEON 1
 _ACEOF
 
-else
-   ARM_NEON_CFLAGS=
 fi
 
 
 
-
-
 if test $have_arm_neon = yes; then
   USE_ARM_NEON_TRUE=
   USE_ARM_NEON_FALSE='#'
@@ -21790,6 +22171,372 @@ _ACEOF
 
 fi
 
+
+support_for__thread=no
+
+{ $as_echo "$as_me:$LINENO: checking for __thread" >&5
+$as_echo_n "checking for __thread... " >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+
+#ifdef __MINGW32__
+#error MinGW has broken __thread support
+#endif
+__thread int x ;
+int main () { return 0; }
+
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  support_for__thread=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+if test $support_for__thread = yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define TOOLCHAIN_SUPPORTS__THREAD /**/
+_ACEOF
+
+fi
+
+{ $as_echo "$as_me:$LINENO: result: $support_for__thread" >&5
+$as_echo "$support_for__thread" >&6; }
+
+
+if test $support_for__thread = no; then
+
+support_for_pthread_stubs_setspecific=no
+
+{ $as_echo "$as_me:$LINENO: checking for pthread_setspecific in libpthread_stubs" >&5
+$as_echo_n "checking for pthread_setspecific in libpthread_stubs... " >&6; }
+
+save_LDFLAGS=$LDFLAGS
+save_CFLAGS=$CFLAGS
+
+
+pkg_failed=no
+{ $as_echo "$as_me:$LINENO: checking for STUBS" >&5
+$as_echo_n "checking for STUBS... " >&6; }
+
+if test -n "$PKG_CONFIG"; then
+    if test -n "$STUBS_CFLAGS"; then
+        pkg_cv_STUBS_CFLAGS="$STUBS_CFLAGS"
+    else
+        if test -n "$PKG_CONFIG" && \
+    { ($as_echo "$as_me:$LINENO: \$PKG_CONFIG --exists --print-errors \"pthread-stubs\"") >&5
+  ($PKG_CONFIG --exists --print-errors "pthread-stubs") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; then
+  pkg_cv_STUBS_CFLAGS=`$PKG_CONFIG --cflags "pthread-stubs" 2>/dev/null`
+else
+  pkg_failed=yes
+fi
+    fi
+else
+	pkg_failed=untried
+fi
+if test -n "$PKG_CONFIG"; then
+    if test -n "$STUBS_LIBS"; then
+        pkg_cv_STUBS_LIBS="$STUBS_LIBS"
+    else
+        if test -n "$PKG_CONFIG" && \
+    { ($as_echo "$as_me:$LINENO: \$PKG_CONFIG --exists --print-errors \"pthread-stubs\"") >&5
+  ($PKG_CONFIG --exists --print-errors "pthread-stubs") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; then
+  pkg_cv_STUBS_LIBS=`$PKG_CONFIG --libs "pthread-stubs" 2>/dev/null`
+else
+  pkg_failed=yes
+fi
+    fi
+else
+	pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+        _pkg_short_errors_supported=yes
+else
+        _pkg_short_errors_supported=no
+fi
+        if test $_pkg_short_errors_supported = yes; then
+	        STUBS_PKG_ERRORS=`$PKG_CONFIG --short-errors --errors-to-stdout --print-errors "pthread-stubs"`
+        else
+	        STUBS_PKG_ERRORS=`$PKG_CONFIG --errors-to-stdout --print-errors "pthread-stubs"`
+        fi
+	# Put the nasty error message in config.log where it belongs
+	echo "$STUBS_PKG_ERRORS" >&5
+
+	{ { $as_echo "$as_me:$LINENO: error: Package requirements (pthread-stubs) were not met:
+
+$STUBS_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables STUBS_CFLAGS
+and STUBS_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+" >&5
+$as_echo "$as_me: error: Package requirements (pthread-stubs) were not met:
+
+$STUBS_PKG_ERRORS
+
+Consider adjusting the PKG_CONFIG_PATH environment variable if you
+installed software in a non-standard prefix.
+
+Alternatively, you may set the environment variables STUBS_CFLAGS
+and STUBS_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+" >&2;}
+   { (exit 1); exit 1; }; }
+elif test $pkg_failed = untried; then
+	{ { $as_echo "$as_me:$LINENO: error: The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables STUBS_CFLAGS
+and STUBS_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://www.freedesktop.org/software/pkgconfig>.
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: The pkg-config script could not be found or is too old.  Make sure it
+is in your PATH or set the PKG_CONFIG environment variable to the full
+path to pkg-config.
+
+Alternatively, you may set the environment variables STUBS_CFLAGS
+and STUBS_LIBS to avoid the need to call pkg-config.
+See the pkg-config man page for more details.
+
+To get pkg-config, see <http://www.freedesktop.org/software/pkgconfig>.
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }
+else
+	STUBS_CFLAGS=$pkg_cv_STUBS_CFLAGS
+	STUBS_LIBS=$pkg_cv_STUBS_LIBS
+        { $as_echo "$as_me:$LINENO: result: yes" >&5
+$as_echo "yes" >&6; }
+	:
+fi
+
+CFLAGS="${STUBS_CFLAGS}"
+LDFLAGS="${STUBS_LIBS}"
+
+cat >conftest.$ac_ext <<_ACEOF
+
+#include <pthread.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+}
+
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_test_x conftest$ac_exeext
+       }; then
+  support_for_pthread_stubs_setspecific=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext;
+
+CFLAGS=$save_CFLAGS
+LDFLAGS=$save_LDFLAGS
+
+if test $support_for_pthread_stubs_setspecific = yes; then
+   PTHREAD_LDFLAGS="${STUBS_LIBS}"
+   PTHREAD_CFLAGS="${STUBS_CFLAGS}"
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_PTHREAD_SETSPECIFIC /**/
+_ACEOF
+
+fi
+
+{ $as_echo "$as_me:$LINENO: result: $support_for_pthread_stubs_setspecific" >&5
+$as_echo "$support_for_pthread_stubs_setspecific" >&6; };
+
+fi
+
+if test "x$support_for_pthread_stubs_setspecific" = xno; then
+# Fallback to -pthread; the variable above is unset when __thread is supported, so quote it.
+{ $as_echo "$as_me:$LINENO: checking for pthread_setspecific" >&5
+$as_echo_n "checking for pthread_setspecific... " >&6; }
+support_for_pthread_setspecific=no
+save_LDFLAGS=$LDFLAGS
+
+LDFLAGS="-pthread"
+
+cat >conftest.$ac_ext <<_ACEOF
+
+#include <pthread.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+}
+
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_test_x conftest$ac_exeext
+       }; then
+  support_for_pthread_setspecific=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
+fi
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext;
+
+LDFLAGS=$save_LDFLAGS
+
+if test $support_for_pthread_setspecific = yes; then
+   PTHREAD_LDFLAGS="-pthread"
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_PTHREAD_SETSPECIFIC /**/
+_ACEOF
+
+fi
+
+{ $as_echo "$as_me:$LINENO: result: $support_for_pthread_setspecific" >&5
+$as_echo "$support_for_pthread_setspecific" >&6; };
+
+fi
+
+
+
+
+
 ac_config_files="$ac_config_files pixman-1.pc pixman-1-uninstalled.pc Makefile pixman/Makefile pixman/pixman-version.h test/Makefile"
 
 cat >confcache <<\_ACEOF
@@ -22282,7 +23029,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by pixman $as_me 0.16.6, which was
+This file was extended by pixman $as_me 0.18.4, which was
 generated by GNU Autoconf 2.62.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22335,7 +23082,7 @@ Report bugs to <bug-autoconf@gnu.org>."
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_version="\\
-pixman config.status 0.16.6
+pixman config.status 0.18.4
 configured by $0, generated by GNU Autoconf 2.62,
   with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
diff --git a/lib/pixman/configure.ac b/lib/pixman/configure.ac
index 8fa959ae4..8cfaca439 100644
--- a/lib/pixman/configure.ac
+++ b/lib/pixman/configure.ac
@@ -53,13 +53,16 @@ AC_PREREQ([2.57])
 #
 
 m4_define([pixman_major], 0)
-m4_define([pixman_minor], 16)
-m4_define([pixman_micro], 6)
+m4_define([pixman_minor], 18)
+m4_define([pixman_micro], 4)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 
-AC_INIT(pixman, pixman_version, "sandmann@daimi.au.dk", pixman)
-AM_INIT_AUTOMAKE([dist-bzip2])
+AC_INIT(pixman, pixman_version, "pixman@lists.freedesktop.org", pixman)
+AM_INIT_AUTOMAKE([foreign dist-bzip2])
+
+# Suppress verbose compile lines
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
 
 AM_CONFIG_HEADER(config.h)
 
@@ -68,11 +71,14 @@ AC_CANONICAL_HOST
 test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
 
 AC_PROG_CC
+AM_PROG_AS
 AC_PROG_LIBTOOL
 AC_CHECK_FUNCS([getisax])
 AC_C_BIGENDIAN
 AC_C_INLINE
 
+AC_CHECK_SIZEOF(long)
+
 # Checks for Sun Studio compilers
 AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
 AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
@@ -186,8 +192,8 @@ AC_MSG_CHECKING(whether to use MMX intrinsics)
 xserver_save_CFLAGS=$CFLAGS
 CFLAGS="$MMX_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([
-#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 3))
-error "Need GCC >= 3.3 for MMX intrinsics"
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+error "Need GCC >= 3.4 for MMX intrinsics"
 #endif
 #include <mmintrin.h>
 int main () {
@@ -247,7 +253,7 @@ AC_COMPILE_IFELSE([
 #include <xmmintrin.h>
 #include <emmintrin.h>
 int main () {
-    __m128i a, b, c;
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
 	c = _mm_xor_si128 (a, b);
     return 0;
 }], have_sse2_intrinsics=yes)
@@ -355,19 +361,23 @@ AC_SUBST(VMX_CFLAGS)
 
 AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
 
-dnl ===========================================================================
-dnl Check for ARM SIMD instructions
-ARM_SIMD_CFLAGS="-mcpu=arm1136j-s"
-
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports ARM SIMD instructions
 have_arm_simd=no
 AC_MSG_CHECKING(whether to use ARM SIMD assembler)
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$ARM_SIMD_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([
-int main () {
-    asm("uqadd8 r1, r1, r2");
-    return 0;
-}], have_arm_simd=yes)
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([[
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0]], have_arm_simd=yes)
 CFLAGS=$xserver_save_CFLAGS
 
 AC_ARG_ENABLE(arm-simd,
@@ -380,34 +390,35 @@ if test $enable_arm_simd = no ; then
 fi
 
 if test $have_arm_simd = yes ; then
-   AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD compiler intrinsics])
-else
-   ARM_SIMD_CFLAGS=
+   AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
 fi
 
+AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
+
 AC_MSG_RESULT($have_arm_simd)
 if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
    AC_MSG_ERROR([ARM SIMD intrinsics not detected])
 fi
 
-AC_SUBST(ARM_SIMD_CFLAGS)
-
-AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
-
 dnl ==========================================================================
-dnl Check for ARM NEON instructions
-ARM_NEON_CFLAGS="-mfpu=neon -mcpu=cortex-a8"
-
+dnl Check if assembler is gas compatible and supports NEON instructions
 have_arm_neon=no
-AC_MSG_CHECKING(whether to use ARM NEON)
+AC_MSG_CHECKING(whether to use ARM NEON assembler)
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$ARM_NEON_CFLAGS $CFLAGS"
-AC_COMPILE_IFELSE([
-#include <arm_neon.h>
-int main () {
-    uint8x8_t neon_test=vmov_n_u8(0);
-    return 0;
-}], have_arm_neon=yes)
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([[
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0]], have_arm_neon=yes)
 CFLAGS=$xserver_save_CFLAGS
 
 AC_ARG_ENABLE(arm-neon,
@@ -420,13 +431,9 @@ if test $enable_arm_neon = no ; then
 fi
 
 if test $have_arm_neon = yes ; then
-   AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON compiler intrinsics])
-else
-   ARM_NEON_CFLAGS=
+   AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
 fi
 
-AC_SUBST(ARM_NEON_CFLAGS)
-
 AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
 
 AC_MSG_RESULT($have_arm_neon)
@@ -510,6 +517,150 @@ if test x$have_posix_memalign = xyes; then
    AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
 fi
 
+dnl =====================================
+dnl Thread local storage
+
+support_for__thread=no
+
+AC_MSG_CHECKING(for __thread)
+AC_COMPILE_IFELSE([
+#ifdef __MINGW32__
+#error MinGW has broken __thread support
+#endif
+__thread int x ;
+int main () { return 0; }
+], support_for__thread=yes)
+
+if test $support_for__thread = yes; then 
+   AC_DEFINE([TOOLCHAIN_SUPPORTS__THREAD],[],[Whether the tool chain supports __thread])
+fi
+
+AC_MSG_RESULT($support_for__thread)
+
+dnl posix tls
+
+if test $support_for__thread = no; then
+
+support_for_pthread_stubs_setspecific=no
+   
+AC_MSG_CHECKING(for pthread_setspecific in libpthread_stubs)
+
+save_LDFLAGS=$LDFLAGS
+save_CFLAGS=$CFLAGS
+
+PKG_CHECK_MODULES(STUBS, pthread-stubs)
+
+CFLAGS="${STUBS_CFLAGS}"
+LDFLAGS="${STUBS_LIBS}"
+
+AC_LINK_IFELSE([
+#include <pthread.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+    
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+}
+], support_for_pthread_stubs_setspecific=yes);
+
+CFLAGS=$save_CFLAGS
+LDFLAGS=$save_LDFLAGS
+
+if test $support_for_pthread_stubs_setspecific = yes; then
+   PTHREAD_LDFLAGS="${STUBS_LIBS}"
+   PTHREAD_CFLAGS="${STUBS_CFLAGS}"
+   AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+fi
+
+AC_MSG_RESULT($support_for_pthread_stubs_setspecific);
+
+fi
+
+if test "x$support_for_pthread_stubs_setspecific" = xno; then
+dnl Fallback to -pthread. The guard variable is unset when __thread is supported, hence the quoting.
+AC_MSG_CHECKING(for pthread_setspecific)
+support_for_pthread_setspecific=no
+save_LDFLAGS=$LDFLAGS
+
+LDFLAGS="-pthread"
+
+AC_LINK_IFELSE([
+#include <pthread.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+    
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+}
+], support_for_pthread_setspecific=yes);
+
+LDFLAGS=$save_LDFLAGS
+
+if test $support_for_pthread_setspecific = yes; then
+   PTHREAD_LDFLAGS="-pthread"
+   AC_DEFINE([HAVE_PTHREAD_SETSPECIFIC], [], [Whether pthread_setspecific() is supported])
+fi
+
+AC_MSG_RESULT($support_for_pthread_setspecific);
+
+fi
+
+AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
+AC_SUBST(HAVE_PTHREAD_SETSPECIFIC)
+AC_SUBST(PTHREAD_LDFLAGS)
+
 AC_OUTPUT([pixman-1.pc
            pixman-1-uninstalled.pc
            Makefile
diff --git a/lib/pixman/pixman-1.pc.in b/lib/pixman/pixman-1.pc.in
index 936d95db0..14bfe1d38 100644
--- a/lib/pixman/pixman-1.pc.in
+++ b/lib/pixman/pixman-1.pc.in
@@ -6,6 +6,7 @@ includedir=@includedir@
 Name: Pixman
 Description: The pixman library (version 1)
 Version: @PACKAGE_VERSION@
+Requires: pthread-stubs >= 0.3
 Cflags: -I${includedir}/pixman-1 @DEP_CFLAGS@
 Libs: -L${libdir} -lpixman-1 @DEP_LIBS@
 
diff --git a/lib/pixman/pixman/Makefile.am b/lib/pixman/pixman/Makefile.am
index e19fa6e7f..66ad7f005 100644
--- a/lib/pixman/pixman/Makefile.am
+++ b/lib/pixman/pixman/Makefile.am
@@ -1,7 +1,6 @@
 lib_LTLIBRARIES = libpixman-1.la
-libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined
+libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@ 
 libpixman_1_la_LIBADD = @DEP_LIBS@ -lm
-libpixman_1_la_CFLAGS = -DPIXMAN_DISABLE_DEPRECATED
 libpixman_1_la_SOURCES =			\
 	pixman.h				\
 	pixman-accessor.h			\
@@ -64,6 +63,8 @@ libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS)
 libpixman_mmx_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS)
 libpixman_1_la_LIBADD += libpixman-mmx.la
+
+ASM_CFLAGS_mmx=$(MMX_CFLAGS)
 endif
 
 # vmx code
@@ -75,6 +76,8 @@ libpixman_vmx_la_SOURCES = \
 libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
 libpixman_vmx_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LIBADD += libpixman-vmx.la
+
+ASM_CFLAGS_vmx=$(VMX_CFLAGS)
 endif
 
 # sse2 code
@@ -86,26 +89,38 @@ libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS)
 libpixman_sse2_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
 libpixman_1_la_LIBADD += libpixman-sse2.la
+
+ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
 endif
 
 # arm simd code
 if USE_ARM_SIMD
 noinst_LTLIBRARIES += libpixman-arm-simd.la
 libpixman_arm_simd_la_SOURCES = \
-	pixman-arm-simd.c
-libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS) $(ARM_SIMD_CFLAGS)
+	pixman-arm-simd.c	\
+	pixman-arm-common.h	\
+	pixman-arm-simd-asm.S
+libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS)
 libpixman_arm_simd_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LIBADD += libpixman-arm-simd.la
+
+ASM_CFLAGS_arm_simd=
 endif
 
 # arm neon code
 if USE_ARM_NEON
 noinst_LTLIBRARIES += libpixman-arm-neon.la
 libpixman_arm_neon_la_SOURCES = \
-        pixman-arm-neon.c
-libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS) $(ARM_NEON_CFLAGS)
+        pixman-arm-neon.c	\
+        pixman-arm-common.h	\
+        pixman-arm-neon-asm.S	\
+        pixman-arm-neon-asm.h
+libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS)
 libpixman_arm_neon_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LIBADD += libpixman-arm-neon.la
-endif
 
+ASM_CFLAGS_arm_neon=
+endif
 
+.c.s :	# no prerequisites: POSIX make only honours a suffix rule that has none
+	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/lib/pixman/pixman/Makefile.in b/lib/pixman/pixman/Makefile.in
index 51c282071..dbd77f5f0 100644
--- a/lib/pixman/pixman/Makefile.in
+++ b/lib/pixman/pixman/Makefile.in
@@ -87,41 +87,35 @@ LTLIBRARIES = $(lib_LTLIBRARIES) $(noinst_LTLIBRARIES)
 libpixman_1_la_DEPENDENCIES = $(am__DEPENDENCIES_1) \
 	$(am__DEPENDENCIES_2) $(am__DEPENDENCIES_3) \
 	$(am__DEPENDENCIES_4) $(am__DEPENDENCIES_5)
-am_libpixman_1_la_OBJECTS = libpixman_1_la-pixman-access.lo \
-	libpixman_1_la-pixman-access-accessors.lo \
-	libpixman_1_la-pixman-cpu.lo \
-	libpixman_1_la-pixman-gradient-walker.lo \
-	libpixman_1_la-pixman-region16.lo \
-	libpixman_1_la-pixman-region32.lo \
-	libpixman_1_la-pixman-image.lo \
-	libpixman_1_la-pixman-implementation.lo \
-	libpixman_1_la-pixman-combine32.lo \
-	libpixman_1_la-pixman-combine64.lo \
-	libpixman_1_la-pixman-general.lo libpixman_1_la-pixman.lo \
-	libpixman_1_la-pixman-fast-path.lo \
-	libpixman_1_la-pixman-solid-fill.lo \
-	libpixman_1_la-pixman-conical-gradient.lo \
-	libpixman_1_la-pixman-linear-gradient.lo \
-	libpixman_1_la-pixman-radial-gradient.lo \
-	libpixman_1_la-pixman-bits-image.lo \
-	libpixman_1_la-pixman-utils.lo libpixman_1_la-pixman-edge.lo \
-	libpixman_1_la-pixman-edge-accessors.lo \
-	libpixman_1_la-pixman-trap.lo libpixman_1_la-pixman-timer.lo \
-	libpixman_1_la-pixman-matrix.lo
+am_libpixman_1_la_OBJECTS = pixman-access.lo \
+	pixman-access-accessors.lo pixman-cpu.lo \
+	pixman-gradient-walker.lo pixman-region16.lo \
+	pixman-region32.lo pixman-image.lo pixman-implementation.lo \
+	pixman-combine32.lo pixman-combine64.lo pixman-general.lo \
+	pixman.lo pixman-fast-path.lo pixman-solid-fill.lo \
+	pixman-conical-gradient.lo pixman-linear-gradient.lo \
+	pixman-radial-gradient.lo pixman-bits-image.lo pixman-utils.lo \
+	pixman-edge.lo pixman-edge-accessors.lo pixman-trap.lo \
+	pixman-timer.lo pixman-matrix.lo
 libpixman_1_la_OBJECTS = $(am_libpixman_1_la_OBJECTS)
 am__DEPENDENCIES_6 =
 @USE_ARM_NEON_TRUE@libpixman_arm_neon_la_DEPENDENCIES =  \
 @USE_ARM_NEON_TRUE@	$(am__DEPENDENCIES_6)
-am__libpixman_arm_neon_la_SOURCES_DIST = pixman-arm-neon.c
+am__libpixman_arm_neon_la_SOURCES_DIST = pixman-arm-neon.c \
+	pixman-arm-common.h pixman-arm-neon-asm.S \
+	pixman-arm-neon-asm.h
 @USE_ARM_NEON_TRUE@am_libpixman_arm_neon_la_OBJECTS =  \
-@USE_ARM_NEON_TRUE@	libpixman_arm_neon_la-pixman-arm-neon.lo
+@USE_ARM_NEON_TRUE@	libpixman_arm_neon_la-pixman-arm-neon.lo \
+@USE_ARM_NEON_TRUE@	pixman-arm-neon-asm.lo
 libpixman_arm_neon_la_OBJECTS = $(am_libpixman_arm_neon_la_OBJECTS)
 @USE_ARM_NEON_TRUE@am_libpixman_arm_neon_la_rpath =
 @USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_DEPENDENCIES =  \
 @USE_ARM_SIMD_TRUE@	$(am__DEPENDENCIES_6)
-am__libpixman_arm_simd_la_SOURCES_DIST = pixman-arm-simd.c
+am__libpixman_arm_simd_la_SOURCES_DIST = pixman-arm-simd.c \
+	pixman-arm-common.h pixman-arm-simd-asm.S
 @USE_ARM_SIMD_TRUE@am_libpixman_arm_simd_la_OBJECTS =  \
-@USE_ARM_SIMD_TRUE@	libpixman_arm_simd_la-pixman-arm-simd.lo
+@USE_ARM_SIMD_TRUE@	libpixman_arm_simd_la-pixman-arm-simd.lo \
+@USE_ARM_SIMD_TRUE@	pixman-arm-simd-asm.lo
 libpixman_arm_simd_la_OBJECTS = $(am_libpixman_arm_simd_la_OBJECTS)
 @USE_ARM_SIMD_TRUE@am_libpixman_arm_simd_la_rpath =
 @USE_MMX_TRUE@libpixman_mmx_la_DEPENDENCIES = $(am__DEPENDENCIES_6)
@@ -145,6 +139,9 @@ libpixman_vmx_la_OBJECTS = $(am_libpixman_vmx_la_OBJECTS)
 DEFAULT_INCLUDES = -I. -I$(srcdir) -I$(top_builddir)
 depcomp = $(SHELL) $(top_srcdir)/depcomp
 am__depfiles_maybe = depfiles
+CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
+LTCCASCOMPILE = $(LIBTOOL) --mode=compile $(CCAS) $(AM_CCASFLAGS) \
+	$(CCASFLAGS)
 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
 	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
 LTCOMPILE = $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) \
@@ -172,13 +169,13 @@ AMDEP_FALSE = @AMDEP_FALSE@
 AMDEP_TRUE = @AMDEP_TRUE@
 AMTAR = @AMTAR@
 AR = @AR@
-ARM_NEON_CFLAGS = @ARM_NEON_CFLAGS@
-ARM_SIMD_CFLAGS = @ARM_SIMD_CFLAGS@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
 CPP = @CPP@
@@ -206,6 +203,7 @@ GTK_CFLAGS = @GTK_CFLAGS@
 GTK_LIBS = @GTK_LIBS@
 HAVE_GTK_FALSE = @HAVE_GTK_FALSE@
 HAVE_GTK_TRUE = @HAVE_GTK_TRUE@
+HAVE_PTHREAD_SETSPECIFIC = @HAVE_PTHREAD_SETSPECIFIC@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
 INSTALL_SCRIPT = @INSTALL_SCRIPT@
@@ -235,6 +233,7 @@ PIXMAN_VERSION_MAJOR = @PIXMAN_VERSION_MAJOR@
 PIXMAN_VERSION_MICRO = @PIXMAN_VERSION_MICRO@
 PIXMAN_VERSION_MINOR = @PIXMAN_VERSION_MINOR@
 PKG_CONFIG = @PKG_CONFIG@
+PTHREAD_LDFLAGS = @PTHREAD_LDFLAGS@
 RANLIB = @RANLIB@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
@@ -242,6 +241,9 @@ SHELL = @SHELL@
 SSE2_CFLAGS = @SSE2_CFLAGS@
 SSE2_LDFLAGS = @SSE2_LDFLAGS@
 STRIP = @STRIP@
+STUBS_CFLAGS = @STUBS_CFLAGS@
+STUBS_LIBS = @STUBS_LIBS@
+TOOLCHAIN_SUPPORTS__THREAD = @TOOLCHAIN_SUPPORTS__THREAD@
 USE_ARM_NEON_FALSE = @USE_ARM_NEON_FALSE@
 USE_ARM_NEON_TRUE = @USE_ARM_NEON_TRUE@
 USE_ARM_SIMD_FALSE = @USE_ARM_SIMD_FALSE@
@@ -305,10 +307,10 @@ sysconfdir = @sysconfdir@
 target_alias = @target_alias@
 lib_LTLIBRARIES = libpixman-1.la
 libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) \
-	-no-undefined $(am__append_2) $(am__append_7)
+	-no-undefined @PTHREAD_LDFLAGS@ $(am__append_2) \
+	$(am__append_7)
 libpixman_1_la_LIBADD = @DEP_LIBS@ -lm $(am__append_3) $(am__append_5) \
 	$(am__append_8) $(am__append_10) $(am__append_12)
-libpixman_1_la_CFLAGS = -DPIXMAN_DISABLE_DEPRECATED
 libpixman_1_la_SOURCES = \
 	pixman.h				\
 	pixman-accessor.h			\
@@ -356,32 +358,42 @@ CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-com
 
 @USE_MMX_TRUE@libpixman_mmx_la_CFLAGS = $(DEP_CFLAGS) $(MMX_CFLAGS)
 @USE_MMX_TRUE@libpixman_mmx_la_LIBADD = $(DEP_LIBS)
+@USE_MMX_TRUE@ASM_CFLAGS_mmx = $(MMX_CFLAGS)
 @USE_VMX_TRUE@libpixman_vmx_la_SOURCES = \
 @USE_VMX_TRUE@	pixman-vmx.c \
 @USE_VMX_TRUE@	pixman-combine32.h
 
 @USE_VMX_TRUE@libpixman_vmx_la_CFLAGS = $(DEP_CFLAGS) $(VMX_CFLAGS)
 @USE_VMX_TRUE@libpixman_vmx_la_LIBADD = $(DEP_LIBS)
+@USE_VMX_TRUE@ASM_CFLAGS_vmx = $(VMX_CFLAGS)
 @USE_SSE2_TRUE@libpixman_sse2_la_SOURCES = \
 @USE_SSE2_TRUE@	pixman-sse2.c
 
 @USE_SSE2_TRUE@libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS)
 @USE_SSE2_TRUE@libpixman_sse2_la_LIBADD = $(DEP_LIBS)
+@USE_SSE2_TRUE@ASM_CFLAGS_sse2 = $(SSE2_CFLAGS)
 @USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_SOURCES = \
-@USE_ARM_SIMD_TRUE@	pixman-arm-simd.c
+@USE_ARM_SIMD_TRUE@	pixman-arm-simd.c	\
+@USE_ARM_SIMD_TRUE@	pixman-arm-common.h	\
+@USE_ARM_SIMD_TRUE@	pixman-arm-simd-asm.S
 
-@USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS) $(ARM_SIMD_CFLAGS)
+@USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS)
 @USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_LIBADD = $(DEP_LIBS)
+@USE_ARM_SIMD_TRUE@ASM_CFLAGS_arm_simd = 
 @USE_ARM_NEON_TRUE@libpixman_arm_neon_la_SOURCES = \
-@USE_ARM_NEON_TRUE@        pixman-arm-neon.c
+@USE_ARM_NEON_TRUE@        pixman-arm-neon.c	\
+@USE_ARM_NEON_TRUE@        pixman-arm-common.h	\
+@USE_ARM_NEON_TRUE@        pixman-arm-neon-asm.S	\
+@USE_ARM_NEON_TRUE@        pixman-arm-neon-asm.h
 
-@USE_ARM_NEON_TRUE@libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS) $(ARM_NEON_CFLAGS)
+@USE_ARM_NEON_TRUE@libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS)
 @USE_ARM_NEON_TRUE@libpixman_arm_neon_la_LIBADD = $(DEP_LIBS)
+@USE_ARM_NEON_TRUE@ASM_CFLAGS_arm_neon = 
 all: $(BUILT_SOURCES)
 	$(MAKE) $(AM_MAKEFLAGS) all-am
 
 .SUFFIXES:
-.SUFFIXES: .c .lo .o .obj
+.SUFFIXES: .S .c .lo .o .obj .s
 $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	@for dep in $?; do \
 	  case '$(am__configure_deps)' in \
@@ -391,9 +403,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu  pixman/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  pixman/Makefile'; \
 	cd $(top_srcdir) && \
-	  $(AUTOMAKE) --gnu  pixman/Makefile
+	  $(AUTOMAKE) --foreign  pixman/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
@@ -468,35 +480,44 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-access-accessors.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-access.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-bits-image.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-combine32.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-combine64.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-conical-gradient.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-cpu.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-edge-accessors.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-edge.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-fast-path.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-general.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-gradient-walker.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-image.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-implementation.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-linear-gradient.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-matrix.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-radial-gradient.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-region16.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-region32.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-solid-fill.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-timer.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-trap.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-utils.Plo@am__quote@
-@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_arm_neon_la-pixman-arm-neon.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_arm_simd_la-pixman-arm-simd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_mmx_la-pixman-mmx.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_sse2_la-pixman-sse2.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_vmx_la-pixman-vmx.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-access-accessors.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-access.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-bits-image.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-combine32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-combine64.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-conical-gradient.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-cpu.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-edge-accessors.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-edge.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-fast-path.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-general.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-gradient-walker.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-image.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-implementation.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-linear-gradient.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-matrix.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-radial-gradient.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-region16.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-region32.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-solid-fill.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-timer.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-trap.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman-utils.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/pixman.Plo@am__quote@
+
+.S.o:
+	$(CCASCOMPILE) -c $<
+
+.S.obj:
+	$(CCASCOMPILE) -c `$(CYGPATH_W) '$<'`
+
+.S.lo:
+	$(LTCCASCOMPILE) -c -o $@ $<
 
 .c.o:
 @am__fastdepCC_TRUE@	if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \
@@ -519,174 +540,6 @@ distclean-compile:
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
 
-libpixman_1_la-pixman-access.lo: pixman-access.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-access.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-access.Tpo" -c -o libpixman_1_la-pixman-access.lo `test -f 'pixman-access.c' || echo '$(srcdir)/'`pixman-access.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-access.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-access.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-access.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-access.c' object='libpixman_1_la-pixman-access.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-access.lo `test -f 'pixman-access.c' || echo '$(srcdir)/'`pixman-access.c
-
-libpixman_1_la-pixman-access-accessors.lo: pixman-access-accessors.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-access-accessors.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-access-accessors.Tpo" -c -o libpixman_1_la-pixman-access-accessors.lo `test -f 'pixman-access-accessors.c' || echo '$(srcdir)/'`pixman-access-accessors.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-access-accessors.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-access-accessors.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-access-accessors.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-access-accessors.c' object='libpixman_1_la-pixman-access-accessors.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-access-accessors.lo `test -f 'pixman-access-accessors.c' || echo '$(srcdir)/'`pixman-access-accessors.c
-
-libpixman_1_la-pixman-cpu.lo: pixman-cpu.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-cpu.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-cpu.Tpo" -c -o libpixman_1_la-pixman-cpu.lo `test -f 'pixman-cpu.c' || echo '$(srcdir)/'`pixman-cpu.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-cpu.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-cpu.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-cpu.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-cpu.c' object='libpixman_1_la-pixman-cpu.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-cpu.lo `test -f 'pixman-cpu.c' || echo '$(srcdir)/'`pixman-cpu.c
-
-libpixman_1_la-pixman-gradient-walker.lo: pixman-gradient-walker.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-gradient-walker.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-gradient-walker.Tpo" -c -o libpixman_1_la-pixman-gradient-walker.lo `test -f 'pixman-gradient-walker.c' || echo '$(srcdir)/'`pixman-gradient-walker.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-gradient-walker.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-gradient-walker.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-gradient-walker.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-gradient-walker.c' object='libpixman_1_la-pixman-gradient-walker.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-gradient-walker.lo `test -f 'pixman-gradient-walker.c' || echo '$(srcdir)/'`pixman-gradient-walker.c
-
-libpixman_1_la-pixman-region16.lo: pixman-region16.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-region16.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-region16.Tpo" -c -o libpixman_1_la-pixman-region16.lo `test -f 'pixman-region16.c' || echo '$(srcdir)/'`pixman-region16.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-region16.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-region16.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-region16.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-region16.c' object='libpixman_1_la-pixman-region16.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-region16.lo `test -f 'pixman-region16.c' || echo '$(srcdir)/'`pixman-region16.c
-
-libpixman_1_la-pixman-region32.lo: pixman-region32.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-region32.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-region32.Tpo" -c -o libpixman_1_la-pixman-region32.lo `test -f 'pixman-region32.c' || echo '$(srcdir)/'`pixman-region32.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-region32.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-region32.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-region32.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-region32.c' object='libpixman_1_la-pixman-region32.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-region32.lo `test -f 'pixman-region32.c' || echo '$(srcdir)/'`pixman-region32.c
-
-libpixman_1_la-pixman-image.lo: pixman-image.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-image.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-image.Tpo" -c -o libpixman_1_la-pixman-image.lo `test -f 'pixman-image.c' || echo '$(srcdir)/'`pixman-image.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-image.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-image.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-image.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-image.c' object='libpixman_1_la-pixman-image.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-image.lo `test -f 'pixman-image.c' || echo '$(srcdir)/'`pixman-image.c
-
-libpixman_1_la-pixman-implementation.lo: pixman-implementation.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-implementation.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-implementation.Tpo" -c -o libpixman_1_la-pixman-implementation.lo `test -f 'pixman-implementation.c' || echo '$(srcdir)/'`pixman-implementation.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-implementation.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-implementation.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-implementation.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-implementation.c' object='libpixman_1_la-pixman-implementation.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-implementation.lo `test -f 'pixman-implementation.c' || echo '$(srcdir)/'`pixman-implementation.c
-
-libpixman_1_la-pixman-combine32.lo: pixman-combine32.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-combine32.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-combine32.Tpo" -c -o libpixman_1_la-pixman-combine32.lo `test -f 'pixman-combine32.c' || echo '$(srcdir)/'`pixman-combine32.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-combine32.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-combine32.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-combine32.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-combine32.c' object='libpixman_1_la-pixman-combine32.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-combine32.lo `test -f 'pixman-combine32.c' || echo '$(srcdir)/'`pixman-combine32.c
-
-libpixman_1_la-pixman-combine64.lo: pixman-combine64.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-combine64.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-combine64.Tpo" -c -o libpixman_1_la-pixman-combine64.lo `test -f 'pixman-combine64.c' || echo '$(srcdir)/'`pixman-combine64.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-combine64.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-combine64.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-combine64.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-combine64.c' object='libpixman_1_la-pixman-combine64.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-combine64.lo `test -f 'pixman-combine64.c' || echo '$(srcdir)/'`pixman-combine64.c
-
-libpixman_1_la-pixman-general.lo: pixman-general.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-general.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-general.Tpo" -c -o libpixman_1_la-pixman-general.lo `test -f 'pixman-general.c' || echo '$(srcdir)/'`pixman-general.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-general.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-general.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-general.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-general.c' object='libpixman_1_la-pixman-general.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-general.lo `test -f 'pixman-general.c' || echo '$(srcdir)/'`pixman-general.c
-
-libpixman_1_la-pixman.lo: pixman.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman.Tpo" -c -o libpixman_1_la-pixman.lo `test -f 'pixman.c' || echo '$(srcdir)/'`pixman.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman.Tpo" "$(DEPDIR)/libpixman_1_la-pixman.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman.c' object='libpixman_1_la-pixman.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman.lo `test -f 'pixman.c' || echo '$(srcdir)/'`pixman.c
-
-libpixman_1_la-pixman-fast-path.lo: pixman-fast-path.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-fast-path.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-fast-path.Tpo" -c -o libpixman_1_la-pixman-fast-path.lo `test -f 'pixman-fast-path.c' || echo '$(srcdir)/'`pixman-fast-path.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-fast-path.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-fast-path.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-fast-path.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-fast-path.c' object='libpixman_1_la-pixman-fast-path.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-fast-path.lo `test -f 'pixman-fast-path.c' || echo '$(srcdir)/'`pixman-fast-path.c
-
-libpixman_1_la-pixman-solid-fill.lo: pixman-solid-fill.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-solid-fill.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-solid-fill.Tpo" -c -o libpixman_1_la-pixman-solid-fill.lo `test -f 'pixman-solid-fill.c' || echo '$(srcdir)/'`pixman-solid-fill.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-solid-fill.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-solid-fill.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-solid-fill.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-solid-fill.c' object='libpixman_1_la-pixman-solid-fill.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-solid-fill.lo `test -f 'pixman-solid-fill.c' || echo '$(srcdir)/'`pixman-solid-fill.c
-
-libpixman_1_la-pixman-conical-gradient.lo: pixman-conical-gradient.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-conical-gradient.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-conical-gradient.Tpo" -c -o libpixman_1_la-pixman-conical-gradient.lo `test -f 'pixman-conical-gradient.c' || echo '$(srcdir)/'`pixman-conical-gradient.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-conical-gradient.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-conical-gradient.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-conical-gradient.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-conical-gradient.c' object='libpixman_1_la-pixman-conical-gradient.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-conical-gradient.lo `test -f 'pixman-conical-gradient.c' || echo '$(srcdir)/'`pixman-conical-gradient.c
-
-libpixman_1_la-pixman-linear-gradient.lo: pixman-linear-gradient.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-linear-gradient.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-linear-gradient.Tpo" -c -o libpixman_1_la-pixman-linear-gradient.lo `test -f 'pixman-linear-gradient.c' || echo '$(srcdir)/'`pixman-linear-gradient.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-linear-gradient.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-linear-gradient.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-linear-gradient.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-linear-gradient.c' object='libpixman_1_la-pixman-linear-gradient.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-linear-gradient.lo `test -f 'pixman-linear-gradient.c' || echo '$(srcdir)/'`pixman-linear-gradient.c
-
-libpixman_1_la-pixman-radial-gradient.lo: pixman-radial-gradient.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-radial-gradient.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-radial-gradient.Tpo" -c -o libpixman_1_la-pixman-radial-gradient.lo `test -f 'pixman-radial-gradient.c' || echo '$(srcdir)/'`pixman-radial-gradient.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-radial-gradient.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-radial-gradient.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-radial-gradient.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-radial-gradient.c' object='libpixman_1_la-pixman-radial-gradient.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-radial-gradient.lo `test -f 'pixman-radial-gradient.c' || echo '$(srcdir)/'`pixman-radial-gradient.c
-
-libpixman_1_la-pixman-bits-image.lo: pixman-bits-image.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-bits-image.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-bits-image.Tpo" -c -o libpixman_1_la-pixman-bits-image.lo `test -f 'pixman-bits-image.c' || echo '$(srcdir)/'`pixman-bits-image.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-bits-image.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-bits-image.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-bits-image.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-bits-image.c' object='libpixman_1_la-pixman-bits-image.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-bits-image.lo `test -f 'pixman-bits-image.c' || echo '$(srcdir)/'`pixman-bits-image.c
-
-libpixman_1_la-pixman-utils.lo: pixman-utils.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-utils.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-utils.Tpo" -c -o libpixman_1_la-pixman-utils.lo `test -f 'pixman-utils.c' || echo '$(srcdir)/'`pixman-utils.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-utils.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-utils.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-utils.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-utils.c' object='libpixman_1_la-pixman-utils.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-utils.lo `test -f 'pixman-utils.c' || echo '$(srcdir)/'`pixman-utils.c
-
-libpixman_1_la-pixman-edge.lo: pixman-edge.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-edge.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-edge.Tpo" -c -o libpixman_1_la-pixman-edge.lo `test -f 'pixman-edge.c' || echo '$(srcdir)/'`pixman-edge.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-edge.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-edge.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-edge.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-edge.c' object='libpixman_1_la-pixman-edge.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-edge.lo `test -f 'pixman-edge.c' || echo '$(srcdir)/'`pixman-edge.c
-
-libpixman_1_la-pixman-edge-accessors.lo: pixman-edge-accessors.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-edge-accessors.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-edge-accessors.Tpo" -c -o libpixman_1_la-pixman-edge-accessors.lo `test -f 'pixman-edge-accessors.c' || echo '$(srcdir)/'`pixman-edge-accessors.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-edge-accessors.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-edge-accessors.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-edge-accessors.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-edge-accessors.c' object='libpixman_1_la-pixman-edge-accessors.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-edge-accessors.lo `test -f 'pixman-edge-accessors.c' || echo '$(srcdir)/'`pixman-edge-accessors.c
-
-libpixman_1_la-pixman-trap.lo: pixman-trap.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-trap.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-trap.Tpo" -c -o libpixman_1_la-pixman-trap.lo `test -f 'pixman-trap.c' || echo '$(srcdir)/'`pixman-trap.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-trap.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-trap.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-trap.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-trap.c' object='libpixman_1_la-pixman-trap.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-trap.lo `test -f 'pixman-trap.c' || echo '$(srcdir)/'`pixman-trap.c
-
-libpixman_1_la-pixman-timer.lo: pixman-timer.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-timer.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-timer.Tpo" -c -o libpixman_1_la-pixman-timer.lo `test -f 'pixman-timer.c' || echo '$(srcdir)/'`pixman-timer.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-timer.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-timer.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-timer.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-timer.c' object='libpixman_1_la-pixman-timer.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-timer.lo `test -f 'pixman-timer.c' || echo '$(srcdir)/'`pixman-timer.c
-
-libpixman_1_la-pixman-matrix.lo: pixman-matrix.c
-@am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-matrix.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-matrix.Tpo" -c -o libpixman_1_la-pixman-matrix.lo `test -f 'pixman-matrix.c' || echo '$(srcdir)/'`pixman-matrix.c; \
-@am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_1_la-pixman-matrix.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-matrix.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-matrix.Tpo"; exit 1; fi
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='pixman-matrix.c' object='libpixman_1_la-pixman-matrix.lo' libtool=yes @AMDEPBACKSLASH@
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-matrix.lo `test -f 'pixman-matrix.c' || echo '$(srcdir)/'`pixman-matrix.c
-
 libpixman_arm_neon_la-pixman-arm-neon.lo: pixman-arm-neon.c
 @am__fastdepCC_TRUE@	if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_arm_neon_la_CFLAGS) $(CFLAGS) -MT libpixman_arm_neon_la-pixman-arm-neon.lo -MD -MP -MF "$(DEPDIR)/libpixman_arm_neon_la-pixman-arm-neon.Tpo" -c -o libpixman_arm_neon_la-pixman-arm-neon.lo `test -f 'pixman-arm-neon.c' || echo '$(srcdir)/'`pixman-arm-neon.c; \
 @am__fastdepCC_TRUE@	then mv -f "$(DEPDIR)/libpixman_arm_neon_la-pixman-arm-neon.Tpo" "$(DEPDIR)/libpixman_arm_neon_la-pixman-arm-neon.Plo"; else rm -f "$(DEPDIR)/libpixman_arm_neon_la-pixman-arm-neon.Tpo"; exit 1; fi
@@ -935,6 +788,9 @@ pixman-combine64.c : pixman-combine.c.template pixman-combine64.h make-combine.p
 	$(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1)
 pixman-combine64.h : pixman-combine.h.template make-combine.pl
 	$(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1)
+
+.c.s : $(libpixmaninclude_HEADERS) $(BUILT_SOURCES)
+	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
 .NOEXPORT:
diff --git a/lib/pixman/pixman/pixman-access.c b/lib/pixman/pixman/pixman-access.c
index d9fd38c15..b65ef661d 100644
--- a/lib/pixman/pixman/pixman-access.c
+++ b/lib/pixman/pixman/pixman-access.c
@@ -180,11 +180,11 @@ fetch_scanline_b8g8r8a8 (pixman_image_t *image,
     const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
     const uint32_t *pixel = (uint32_t *)bits + x;
     const uint32_t *end = pixel + width;
-    
+
     while (pixel < end)
     {
 	uint32_t p = READ (image, pixel++);
-	
+
 	*buffer++ = (((p & 0xff000000) >> 24)	|
 	             ((p & 0x00ff0000) >> 8)	|
 	             ((p & 0x0000ff00) << 8)	|
@@ -731,23 +731,27 @@ fetch_scanline_b2g3r3 (pixman_image_t *image,
     const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
     const uint8_t *pixel = (const uint8_t *)bits + x;
     const uint8_t *end = pixel + width;
-    
+
     while (pixel < end)
     {
 	uint32_t p = READ (image, pixel++);
 	uint32_t r, g, b;
-	
-	b = (((p & 0xc0)     ) |
-	     ((p & 0xc0) >> 2) |
-	     ((p & 0xc0) >> 4) |
-	     ((p & 0xc0) >> 6));
-	
-	g = ((p & 0x38) | ((p & 0x38) >> 3) | ((p & 0x30) << 2)) << 8;
-	
-	r = (((p & 0x07)     ) |
-	     ((p & 0x07) << 3) |
-	     ((p & 0x06) << 6)) << 16;
-	
+
+	b  = p & 0xc0;
+	b |= b >> 2;
+	b |= b >> 4;
+	b &= 0xff;
+
+	g  = (p & 0x38) << 10;
+	g |= g >> 3;
+	g |= g >> 6;
+	g &= 0xff00;
+
+	r  = (p & 0x7) << 21;
+	r |= r >> 3;
+	r |= r >> 6;
+	r &= 0xff0000;
+
 	*buffer++ = 0xff000000 | r | g | b;
     }
 }
@@ -798,7 +802,7 @@ fetch_scanline_a2b2g2r2 (pixman_image_t *image,
 	uint32_t a, r, g, b;
 	
 	a = ((p & 0xc0) * 0x55) << 18;
-	b = ((p & 0x30) * 0x55) >> 6;
+	b = ((p & 0x30) * 0x55) >> 4;
 	g = ((p & 0x0c) * 0x55) << 6;
 	r = ((p & 0x03) * 0x55) << 16;
 	
@@ -840,20 +844,22 @@ fetch_scanline_x4a4 (pixman_image_t *image,
     const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
     const uint8_t *pixel = (const uint8_t *)bits + x;
     const uint8_t *end = pixel + width;
-    
+   
     while (pixel < end)
     {
 	uint8_t p = READ (image, pixel++) & 0xf;
-	
+
 	*buffer++ = (p | (p << 4)) << 24;
     }
 }
 
-#define FETCH_8(img,l,o)    (READ (img, (uint8_t *)(l) + ((o) >> 2)))
+#define FETCH_8(img,l,o)    (READ (img, (((uint8_t *)(l)) + ((o) >> 3))))
 #ifdef WORDS_BIGENDIAN
-#define FETCH_4(img,l,o)    ((o) & 2 ? FETCH_8 (img,l,o) & 0xf : FETCH_8 (img,l,o) >> 4)
+#define FETCH_4(img,l,o)						\
+    (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4))
 #else
-#define FETCH_4(img,l,o)    ((o) & 2 ? FETCH_8 (img,l,o) >> 4 : FETCH_8 (img,l,o) & 0xf)
+#define FETCH_4(img,l,o)						\
+    (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf))
 #endif
 
 static void
@@ -867,13 +873,13 @@ fetch_scanline_a4 (pixman_image_t *image,
 {
     const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
     int i;
-    
+
     for (i = 0; i < width; ++i)
     {
 	uint32_t p = FETCH_4 (image, bits, i + x);
-	
+
 	p |= p << 4;
-	
+
 	*buffer++ = p << 24;
     }
 }
@@ -923,7 +929,7 @@ fetch_scanline_b1g2r1 (pixman_image_t *image,
 	b = ((p & 0x8) * 0xff) >> 3;
 	g = ((p & 0x6) * 0x55) << 7;
 	r = ((p & 0x1) * 0xff) << 16;
-	
+
 	*buffer++ = 0xff000000 | r | g | b;
     }
 }
@@ -940,16 +946,16 @@ fetch_scanline_a1r1g1b1 (pixman_image_t *image,
     uint32_t a, r, g, b;
     const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
     int i;
-    
+
     for (i = 0; i < width; ++i)
     {
 	uint32_t p = FETCH_4 (image, bits, i + x);
-	
+
 	a = ((p & 0x8) * 0xff) << 21;
 	r = ((p & 0x4) * 0xff) << 14;
 	g = ((p & 0x2) * 0xff) << 7;
 	b = ((p & 0x1) * 0xff);
-	
+
 	*buffer++ = a | r | g | b;
     }
 }
@@ -965,17 +971,17 @@ fetch_scanline_a1b1g1r1 (pixman_image_t *image,
 {
     const uint32_t *bits = image->bits.bits + y * image->bits.rowstride;
     int i;
-    
+
     for (i = 0; i < width; ++i)
     {
 	uint32_t p = FETCH_4 (image, bits, i + x);
 	uint32_t a, r, g, b;
-	
+
 	a = ((p & 0x8) * 0xff) << 21;
-	r = ((p & 0x4) * 0xff) >> 3;
+	b = ((p & 0x4) * 0xff) >> 2;
 	g = ((p & 0x2) * 0xff) << 7;
-	b = ((p & 0x1) * 0xff) << 16;
-	
+	r = ((p & 0x1) * 0xff) << 16;
+
 	*buffer++ = a | r | g | b;
     }
 }
@@ -1546,23 +1552,25 @@ fetch_pixel_b2g3r3 (bits_image_t *image,
 		    int           line)
 {
     uint32_t *bits = image->bits + line * image->rowstride;
-    uint32_t pixel = READ (image, (uint8_t *) bits + offset);
+    uint32_t p = READ (image, (uint8_t *) bits + offset);
     uint32_t r, g, b;
-    
-    b = ((pixel & 0xc0)         |
-	 ((pixel & 0xc0) >> 2)  |
-	 ((pixel & 0xc0) >> 4)  |
-	 ((pixel & 0xc0) >> 6));
-    
-    g = ((pixel & 0x38)         |
-	 ((pixel & 0x38) >> 3)  |
-	 ((pixel & 0x30) << 2)) << 8;
-    
-    r = ((pixel & 0x07)         |
-	 ((pixel & 0x07) << 3)  |
-	 ((pixel & 0x06) << 6)) << 16;
-    
-    return (0xff000000 | r | g | b);
+
+    b  = p & 0xc0;
+    b |= b >> 2;
+    b |= b >> 4;
+    b &= 0xff;
+
+    g  = (p & 0x38) << 10;
+    g |= g >> 3;
+    g |= g >> 6;
+    g &= 0xff00;
+
+    r  = (p & 0x7) << 21;
+    r |= r >> 3;
+    r |= r >> 6;
+    r &= 0xff0000;
+
+    return 0xff000000 | r | g | b;
 }
 
 static uint32_t
@@ -1592,7 +1600,7 @@ fetch_pixel_a2b2g2r2 (bits_image_t *image,
     uint32_t a, r, g, b;
     
     a = ((pixel & 0xc0) * 0x55) << 18;
-    b = ((pixel & 0x30) * 0x55) >> 6;
+    b = ((pixel & 0x30) * 0x55) >> 4;
     g = ((pixel & 0x0c) * 0x55) << 6;
     r = ((pixel & 0x03) * 0x55) << 16;
     
@@ -1674,12 +1682,12 @@ fetch_pixel_a1r1g1b1 (bits_image_t *image,
     uint32_t *bits = image->bits + line * image->rowstride;
     uint32_t pixel = FETCH_4 (image, bits, offset);
     uint32_t a, r, g, b;
-    
+
     a = ((pixel & 0x8) * 0xff) << 21;
     r = ((pixel & 0x4) * 0xff) << 14;
     g = ((pixel & 0x2) * 0xff) << 7;
     b = ((pixel & 0x1) * 0xff);
-    
+
     return a | r | g | b;
 }
 
@@ -1691,12 +1699,12 @@ fetch_pixel_a1b1g1r1 (bits_image_t *image,
     uint32_t *bits = image->bits + line * image->rowstride;
     uint32_t pixel = FETCH_4 (image, bits, offset);
     uint32_t a, r, g, b;
-    
+
     a = ((pixel & 0x8) * 0xff) << 21;
-    r = ((pixel & 0x4) * 0xff) >> 3;
+    b = ((pixel & 0x4) * 0xff) >> 2;
     g = ((pixel & 0x2) * 0xff) << 7;
-    b = ((pixel & 0x1) * 0xff) << 16;
-    
+    r = ((pixel & 0x1) * 0xff) << 16;
+
     return a | r | g | b;
 }
 
@@ -1708,7 +1716,7 @@ fetch_pixel_c4 (bits_image_t *image,
     uint32_t *bits = image->bits + line * image->rowstride;
     uint32_t pixel = FETCH_4 (image, bits, offset);
     const pixman_indexed_t * indexed = image->indexed;
-    
+
     return indexed->rgba[pixel];
 }
 
@@ -2425,22 +2433,38 @@ store_scanline_x4a4 (bits_image_t *  image,
     uint32_t *bits = image->bits + image->rowstride * y;
     uint8_t   *pixel = ((uint8_t *) bits) + x;
     int i;
-    
+
     for (i = 0; i < width; ++i)
 	WRITE (image, pixel++, values[i] >> 28);
 }
 
 #define STORE_8(img,l,o,v)  (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v)))
 #ifdef WORDS_BIGENDIAN
-#define STORE_4(img,l,o,v)					    \
-    STORE_8 (img,l,o,((o) & 4 ?					    \
-                      (FETCH_8 (img,l,o) & 0xf0) | (v) :            \
-                      (FETCH_8 (img,l,o) & 0x0f) | ((v) << 4)))
+
+#define STORE_4(img,l,o,v)						\
+    do									\
+    {									\
+	int bo = 4 * (o);						\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4) :		\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4)));	\
+    } while (0)
 #else
-#define STORE_4(img,l,o,v)					\
-    STORE_8 (img,l,o,((o) & 4 ?					\
-                      (FETCH_8 (img,l,o) & 0x0f) | ((v) << 4) : \
-                      (FETCH_8 (img,l,o) & 0xf0) | (v)))
+
+#define STORE_4(img,l,o,v)						\
+    do									\
+    {									\
+	int bo = 4 * (o);						\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) :	\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4)));		\
+    } while (0)
 #endif
 
 static void
@@ -2452,7 +2476,7 @@ store_scanline_a4 (bits_image_t *  image,
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     int i;
-    
+
     for (i = 0; i < width; ++i)
 	STORE_4 (image, bits, i + x, values[i] >> 28);
 }
@@ -2466,11 +2490,11 @@ store_scanline_r1g2b1 (bits_image_t *  image,
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     int i;
-    
+
     for (i = 0; i < width; ++i)
     {
 	uint32_t pixel;
-	
+
 	SPLIT (values[i]);
 	pixel = (((r >> 4) & 0x8) |
 	         ((g >> 5) & 0x6) |
@@ -2488,11 +2512,11 @@ store_scanline_b1g2r1 (bits_image_t *  image,
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     int i;
-    
+
     for (i = 0; i < width; ++i)
     {
 	uint32_t pixel;
-	
+
 	SPLIT (values[i]);
 	pixel = (((b >> 4) & 0x8) |
 	         ((g >> 5) & 0x6) |
@@ -2510,16 +2534,17 @@ store_scanline_a1r1g1b1 (bits_image_t *  image,
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     int i;
-    
+
     for (i = 0; i < width; ++i)
     {
 	uint32_t pixel;
-	
+
 	SPLIT_A (values[i]);
 	pixel = (((a >> 4) & 0x8) |
 	         ((r >> 5) & 0x4) |
 	         ((g >> 6) & 0x2) |
 	         ((b >> 7)      ));
+
 	STORE_4 (image, bits, i + x, pixel);
     }
 }
@@ -2533,16 +2558,17 @@ store_scanline_a1b1g1r1 (bits_image_t *  image,
 {
     uint32_t *bits = image->bits + image->rowstride * y;
     int i;
-    
+
     for (i = 0; i < width; ++i)
     {
 	uint32_t pixel;
-	
+
 	SPLIT_A (values[i]);
 	pixel = (((a >> 4) & 0x8) |
 	         ((b >> 5) & 0x4) |
 	         ((g >> 6) & 0x2) |
 	         ((r >> 7)      ));
+
 	STORE_4 (image, bits, i + x, pixel);
     }
 }
@@ -2614,7 +2640,7 @@ store_scanline_g1 (bits_image_t *  image,
 #else
 	mask = 1 << ((i + x) & 0x1f);
 #endif
-	v = RGB24_TO_ENTRY_Y (indexed, values[i]) ? mask : 0;
+	v = RGB24_TO_ENTRY_Y (indexed, values[i]) & 0x1 ? mask : 0;
 	
 	WRITE (image, pixel, (READ (image, pixel) & ~mask) | v);
     }
diff --git a/lib/pixman/pixman/pixman-arm-common.h b/lib/pixman/pixman/pixman-arm-common.h
new file mode 100644
index 000000000..58ee4e1c4
--- /dev/null
+++ b/lib/pixman/pixman/pixman-arm-common.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+#ifndef PIXMAN_ARM_COMMON_H
+#define PIXMAN_ARM_COMMON_H
+
+/* Define some macros which can expand into proxy functions between
+ * ARM assembly optimized functions and the rest of pixman fast path API.
+ *
+ * All the low level ARM assembly functions have to use ARM EABI
+ * calling convention and take up to 8 arguments:
+ *    width, height, dst, dst_stride, src, src_stride, mask, mask_stride
+ *
+ * The arguments are ordered with the most important coming first (the
+ * first 4 arguments are passed to the function in registers, the rest
+ * are on the stack). The last arguments are optional, for example if the
+ * function is not using mask, then 'mask' and 'mask_stride' can be
+ * omitted when doing a function call.
+ *
+ * Arguments 'src' and 'mask' contain either a pointer to the top left
+ * pixel of the composited rectangle or a pixel color value depending
+ * on the function type. In the case of just a color value (solid source
+ * or mask), the corresponding stride argument is unused.
+ */
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name,                \
+                                          src_type, src_cnt,            \
+                                          dst_type, dst_cnt)            \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t   w,                   \
+                                         int32_t   h,                   \
+                                         dst_type *dst,                 \
+                                         int32_t   dst_stride,          \
+                                         src_type *src,                 \
+                                         int32_t   src_stride);         \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_op_t              op,                \
+                            pixman_image_t *         src_image,         \
+                            pixman_image_t *         mask_image,        \
+                            pixman_image_t *         dst_image,         \
+                            int32_t                  src_x,             \
+                            int32_t                  src_y,             \
+                            int32_t                  mask_x,            \
+                            int32_t                  mask_y,            \
+                            int32_t                  dest_x,            \
+                            int32_t                  dest_y,            \
+                            int32_t                  width,             \
+                            int32_t                  height)            \
+{                                                                       \
+    dst_type *dst_line;                                                 \
+    src_type *src_line;                                                 \
+    int32_t dst_stride, src_stride;                                     \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type,         \
+                           dst_stride, dst_line, dst_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride);     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(cputype, name,                  \
+                                        dst_type, dst_cnt)              \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src);               \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_op_t              op,                \
+                            pixman_image_t *         src_image,         \
+                            pixman_image_t *         mask_image,        \
+                            pixman_image_t *         dst_image,         \
+                            int32_t                  src_x,             \
+                            int32_t                  src_y,             \
+                            int32_t                  mask_x,            \
+                            int32_t                  mask_y,            \
+                            int32_t                  dest_x,            \
+                            int32_t                  dest_y,            \
+                            int32_t                  width,             \
+                            int32_t                  height)            \
+{                                                                       \
+    dst_type  *dst_line;                                                \
+    int32_t    dst_stride;                                              \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);  \
+                                                                        \
+    if (src == 0)                                                       \
+	return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type,         \
+                           dst_stride, dst_line, dst_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src);                      \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(cputype, name,             \
+                                             mask_type, mask_cnt,       \
+                                             dst_type, dst_cnt)         \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src,                \
+                                         int32_t    unused,             \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_op_t              op,                \
+                            pixman_image_t *         src_image,         \
+                            pixman_image_t *         mask_image,        \
+                            pixman_image_t *         dst_image,         \
+                            int32_t                  src_x,             \
+                            int32_t                  src_y,             \
+                            int32_t                  mask_x,            \
+                            int32_t                  mask_y,            \
+                            int32_t                  dest_x,            \
+                            int32_t                  dest_y,            \
+                            int32_t                  width,             \
+                            int32_t                  height)            \
+{                                                                       \
+    dst_type  *dst_line;                                                \
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, mask_stride;                                 \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);  \
+                                                                        \
+    if (src == 0)                                                       \
+	return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type,         \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src, 0,                    \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(cputype, name,              \
+                                            src_type, src_cnt,          \
+                                            dst_type, dst_cnt)          \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         uint32_t   mask);              \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_op_t              op,                \
+                            pixman_image_t *         src_image,         \
+                            pixman_image_t *         mask_image,        \
+                            pixman_image_t *         dst_image,         \
+                            int32_t                  src_x,             \
+                            int32_t                  src_y,             \
+                            int32_t                  mask_x,            \
+                            int32_t                  mask_y,            \
+                            int32_t                  dest_x,            \
+                            int32_t                  dest_y,            \
+                            int32_t                  width,             \
+                            int32_t                  height)            \
+{                                                                       \
+    dst_type  *dst_line;                                                \
+    src_type  *src_line;                                                \
+    int32_t    dst_stride, src_stride;                                  \
+    uint32_t   mask;                                                    \
+                                                                        \
+    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);\
+                                                                        \
+    if (mask == 0)                                                      \
+	return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type,         \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask);                     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name,           \
+                                               src_type, src_cnt,       \
+                                               mask_type, mask_cnt,     \
+                                               dst_type, dst_cnt)       \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_op_t              op,                \
+                            pixman_image_t *         src_image,         \
+                            pixman_image_t *         mask_image,        \
+                            pixman_image_t *         dst_image,         \
+                            int32_t                  src_x,             \
+                            int32_t                  src_y,             \
+                            int32_t                  mask_x,            \
+                            int32_t                  mask_y,            \
+                            int32_t                  dest_x,            \
+                            int32_t                  dest_y,            \
+                            int32_t                  width,             \
+                            int32_t                  height)            \
+{                                                                       \
+    dst_type  *dst_line;                                                \
+    src_type  *src_line;                                                \
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, src_stride, mask_stride;                     \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, dst_type,         \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask_line, mask_stride);   \
+}
+
+#endif
diff --git a/lib/pixman/pixman/pixman-arm-neon-asm.S b/lib/pixman/pixman/pixman-arm-neon-asm.S
new file mode 100644
index 000000000..9ee3ab308
--- /dev/null
+++ b/lib/pixman/pixman/pixman-arm-neon-asm.S
@@ -0,0 +1,1713 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ *  - pixman_composite_over_8888_0565_asm_neon
+ *  - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .fpu neon
+    .arch armv7a
+    .object_arch armv4
+    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+    .arm
+    .altmacro
+
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
+ * example in linux if unaligned memory accesses are not configured to
+ * generate exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch introduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of function can't support advanced prefetch and fallback
+ *       to simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * Template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * NEON registers allocation in general is recommended to be the following:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loaded mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following registers allocation:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5            - contain loaded destination pixels (they are needed)
+ * d28, d29           - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetics on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolithic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. Common template macro can optionally
+ * make our life a bit easier by doing R, G, B, A color components
+ * deinterleaving for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ *  vuzp.8 d0, d1
+ *  vuzp.8 d2, d3
+ *  vuzp.8 d1, d3
+ *  vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ *  vzip.8 d0, d2
+ *  vzip.8 d1, d3
+ *  vzip.8 d2, d3
+ *  vzip.8 d0, d1
+ *
+ * But pixels can be loaded directly in planar format using VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable, that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8      /* d6  = rrrrrggg (top byte of each pixel) */
+    vshrn.u16   d7, q2, #3      /* d7  = ggggggbb */
+    vsli.u16    q2, q2, #5      /* q2  = q2 << 5, with the low 5 (blue) bits kept */
+    vsri.u8     d6, d6, #5      /* red: top 3 bits replicated into the low 3 */
+    vmvn.8      d3, d3      /* invert source alpha */
+    vsri.u8     d7, d7, #6      /* green: top 2 bits replicated into the low 2 */
+    vshrn.u16   d30, q2, #2     /* d30 = blue, top 3 bits replicated below */
+    /* now scale the destination by the inverted source alpha, leaving the
+       8-bit planar products in d20 - red, d23 - green, d22 - blue */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8    /* t = (x + 128) >> 8 ... */
+    vrshr.u16   q3, q11, #8     /* (q3 = d6:d7 is free again: reused as scratch) */
+    vrshr.u16   q15, q12, #8    /* (q15 = d30:d31 likewise) */
+    vraddhn.u16 d20, q10, q13   /* ... (x + t + 128) >> 8, i.e. rounded x / 255 */
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20    /* red:  d16 = d2 + d20, saturating */
+    vqadd.u8    q9, q0, q11     /* d18 = d0 + d22 (blue), d19 = d1 + d23 (green) */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8    /* red in the top byte of each 16-bit lane */
+    vshll.u8    q8, d19, #8     /* green << 8 (q8 overwrites d16, already consumed) */
+    vshll.u8    q9, d18, #8     /* blue << 8 */
+    vsri.u16    q14, q8, #5     /* insert green below the 5 red bits */
+    vsri.u16    q14, q9, #11    /* insert blue into the low 5 bits */
+.endm
+
+/*
+ * OK, now we got almost everything that we need. Using the above two
+ * macros, the work can be done right. But now we want to optimize
+ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ *   head
+ *   while (...) {
+ *     tail
+ *     head
+ *   }
+ *   tail
+ *
+ * It may look a bit weird, but this setup hides instruction latencies
+ * better and also utilizes the dual-issue capability more efficiently
+ * (making pairs of load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ *   pixman_composite_over_8888_0565_process_pixblock_tail
+ *   vst1.16     {d28, d29}, [DST_W, :128]!
+ *   vld1.16     {d4, d5}, [DST_R, :128]!
+ *   vld4.8      {d0, d1, d2, d3}, [SRC]!
+ *   pixman_composite_over_8888_0565_process_pixblock_head
+ *   cache_preload 8, 8
+ *
+ * Now it also got some VLD/VST instructions. We simply can't move from
+ * processing one block of pixels to the other one with just arithmetics.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in a bulk. Additionally, destination buffer is already
+ * 16 bytes aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is 'cache_preload' macro. It is used for prefetching
+ * data into CPU L2 cache and improve performance when dealing with large
+ * images which are far larger than cache size. It uses one argument
+ * (actually two, but they need to be the same here) - number of pixels
+ * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
+ * details about this macro. Moreover, if good performance is needed
+ * the code from this macro needs to be copied into '*_tail_head' macro
+ * and mixed with the rest of code for optimal instructions scheduling.
+ * We are actually doing it below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originating from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+        vqadd.u8    d16, d2, d20            /* '*_tail' stream (deepest NEON indent) starts here */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* fetch the next 8 destination pixels (r5g6b5) */
+        vqadd.u8    q9, q0, q11             /* last read of the old d0/d1 before the reload below */
+    vshrn.u16   d6, q2, #8
+    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* fetch the next 8 source pixels, deinterleaved */
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+        vshll.u8    q14, d16, #8
+                                    PF add PF_X, PF_X, #8   /* PF lines: inlined 'cache_preload' stream */
+        vshll.u8    q8, d19, #8
+                                    PF tst PF_CTL, #0xF
+    vsri.u8     d6, d6, #5
+                                    PF addne PF_X, PF_X, #8
+    vmvn.8      d3, d3                      /* invert source alpha */
+                                    PF subne PF_CTL, PF_CTL, #1
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    vmull.u8    q10, d3, d6
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vsri.u16    q14, q8, #5
+                                    PF cmp PF_X, ORIG_W
+        vshll.u8    q9, d18, #8
+    vrshr.u16   q13, q10, #8
+                                    PF subge PF_X, PF_X, ORIG_W
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+                                    PF subges PF_CTL, PF_CTL, #0x10
+        vsri.u16    q14, q9, #11
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vraddhn.u16 d22, q12, q15
+        vst1.16     {d28, d29}, [DST_W, :128]!  /* store the 8 finished r5g6b5 pixels */
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_0565_process_pixblock_tail
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* store the block that '*_tail' just finished */
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* next 8 destination pixels for '*_head' */
+    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* byte-wise deinterleave: d0=B, d1=G, d2=R, d3=A, as '*_head' expects (vld4.32 would not split channels) */
+    pixman_composite_over_8888_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
+ *                             and written, for write-only buffer we would use
+ *                             FLAG_DST_WRITEONLY flag instead
+ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ *                             and separate color channels for 32bpp format.
+ * The next things are:
+ *  - the number of pixels processed per iteration (8 in this case, because
+ *    that's the maximum what can fit into four 64-bit NEON registers).
+ *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
+ *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
+ *    prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros, these are 'default_init',
+ * 'default_cleanup' here which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else) followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8      /* d6  = rrrrrggg */
+    vshrn.u16   d7, q2, #3      /* d7  = ggggggbb */
+    vsli.u16    q2, q2, #5      /* q2  = q2 << 5, with the low 5 (blue) bits kept */
+    vsri.u8     d6, d6, #5      /* red: top 3 bits replicated into the low 3 */
+    vsri.u8     d7, d7, #6      /* green: top 2 bits replicated into the low 2 */
+    vshrn.u16   d30, q2, #2     /* d30 = blue, top 3 bits replicated below */
+    /* now scale the destination by d3 (alpha, already inverted in '*_init'),
+       leaving the products in d20 - red, d23 - green, d22 - blue */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8    /* t = (x + 128) >> 8 ... */
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13   /* ... (x + t + 128) >> 8, i.e. rounded x / 255 */
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20    /* red:  d16 = d2 + d20, saturating */
+    vqadd.u8    q9, q0, q11     /* d18 = d0 + d22 (blue), d19 = d1 + d23 (green) */
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8    /* red in the top byte of each 16-bit lane */
+    vshll.u8    q8, d19, #8     /* green << 8 */
+    vshll.u8    q9, d18, #8     /* blue << 8 */
+    vsri.u16    q14, q8, #5     /* insert green below the 5 red bits */
+    vsri.u16    q14, q9, #11    /* insert blue into the low 5 bits */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail
+    vld1.16     {d4, d5}, [DST_R, :128]!    /* fetch the next 8 destination pixels */
+    vst1.16     {d28, d29}, [DST_W, :128]!  /* write out the block '*_tail' just finished */
+    pixman_composite_over_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET   /* DUMMY -> 32-bit argument on the stack (presumably the solid source pixel) */
+    vld1.32     {d3[0]}, [DUMMY]                /* load it into lane 0 of d3 */
+    vdup.8      d0, d3[0]                       /* replicate byte 0 (added to blue in '*_tail') */
+    vdup.8      d1, d3[1]                       /* byte 1 (green) */
+    vdup.8      d2, d3[2]                       /* byte 2 (red) */
+    vdup.8      d3, d3[3]                       /* byte 3 (alpha) */
+    vmvn.8      d3, d3      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+    vshll.u8    q8, d1, #8      /* green << 8 */
+    vshll.u8    q14, d2, #8     /* red << 8: top 5 bits are already in their r5g6b5 position */
+    vshll.u8    q9, d0, #8      /* blue << 8 */
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+    vsri.u16    q14, q8, #5     /* insert green below the 5 red bits */
+    vsri.u16    q14, q9, #11    /* insert blue into the low 5 bits; q14 = {d28, d29} is the result */
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+        vsri.u16    q14, q8, #5             /* '*_tail' stream: insert green */
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    vld4.8      {d0, d1, d2, d3}, [SRC]!    /* fetch the next 8 source pixels, deinterleaved */
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vsri.u16    q14, q9, #11            /* '*_tail' stream: insert blue */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vshll.u8    q8, d1, #8
+        vst1.16     {d28, d29}, [DST_W, :128]!  /* store before q14 is overwritten below */
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vshll.u8    q14, d2, #8
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vshll.u8    q9, d0, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+    vshrn.u16   d30, q0, #8     /* d30 = rrrrrggg */
+    vshrn.u16   d29, q0, #3     /* d29 = ggggggbb */
+    vsli.u16    q0, q0, #5      /* q0  = q0 << 5, with the low 5 (blue) bits kept */
+    vmov.u8     d31, #255       /* alpha plane: fully opaque */
+    vsri.u8     d30, d30, #5    /* red: top 3 bits replicated into the low 3 */
+    vsri.u8     d29, d29, #6    /* green: top 2 bits replicated into the low 2 */
+    vshrn.u16   d28, q0, #2     /* d28 = blue, top 3 bits replicated below */
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail   /* empty: '*_head' already leaves the result in d28-d31 */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+    pixman_composite_src_0565_8888_process_pixblock_tail
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!    /* interleave the B, G, R, A planes into 8 packed pixels */
+    vld1.16    {d0, d1}, [SRC]!                        /* fetch the next 8 r5g6b5 source pixels */
+    pixman_composite_src_0565_8888_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8000_8000_process_pixblock_head
+    vqadd.u8    q14, q0, q2     /* {d28, d29} = {d0, d1} + {d4, d5}, per-byte saturating */
+    vqadd.u8    q15, q1, q3     /* {d30, d31} = {d2, d3} + {d6, d7}, per-byte saturating */
+.endm
+
+.macro pixman_composite_add_8000_8000_process_pixblock_tail   /* empty: '*_head' already leaves the result in d28-d31 */
+.endm
+
+.macro pixman_composite_add_8000_8000_process_pixblock_tail_head
+    vld1.8      {d0, d1, d2, d3}, [SRC]!            /* fetch the next 32 source bytes */
+                                    PF add PF_X, PF_X, #32
+                                    PF tst PF_CTL, #0xF
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* fetch the next 32 destination bytes */
+                                    PF addne PF_X, PF_X, #32
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block before q14/q15 are overwritten */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8000_8000_asm_neon, 8, 0, 8, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8000_8000_process_pixblock_head, \
+    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8000_8000_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+    vld1.8      {d0, d1, d2, d3}, [SRC]!            /* fetch the next 8 source pixels (32 bytes, packed) */
+                                    PF add PF_X, PF_X, #8   /* 8 pixels per block here, vs. 32 in the 8000_8000 variant */
+                                    PF tst PF_CTL, #0xF
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* fetch the next 8 destination pixels */
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the previous block before q14/q15 are overwritten */
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8000_8000_process_pixblock_head, /* byte-wise saturating add is reused as-is for 32bpp */ \
+    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_asm_neon, 32, 0, 32, /* presumably bpp: src, mask, dst, as for generate_composite_function */ \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8000_8000_process_pixblock_head, \
+    pixman_composite_add_8000_8000_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* do alpha blending */
+    vmull.u8    q8, d24, d4     /* 16-bit products of inverted alpha with each of d4..d7 */
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8     /* t = (x + 128) >> 8 for each product x from '*_head' ... */
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8    /* ... (x + t + 128) >> 8, i.e. rounded x / 255 */
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0, q14    /* {d28, d29} += {d0, d1}, saturating */
+    vqadd.u8    q15, q1, q15    /* {d30, d31} += {d2, d3}, saturating */
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* fetch the next 8 destination pixels, deinterleaved */
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15                    /* last use of the old source block in d0-d3 */
+    vld4.8      {d0, d1, d2, d3}, [SRC]!            /* fetch the next 8 source pixels, deinterleaved */
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3                             /* inverted alpha kept in d22 here ('*_head' uses d24) */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the finished block */
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7                        /* q11 = d22:d23, so this must stay the last multiply reading d22 */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_asm_neon, 32, 0, 32, /* presumably bpp: src, mask, dst, as for generate_composite_function */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+    pixman_composite_over_8888_8888_process_pixblock_tail
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!    /* fetch the next 8 destination pixels, deinterleaved */
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* write out the block '*_tail' just finished */
+    pixman_composite_over_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET   /* DUMMY -> 32-bit argument on the stack (presumably the solid source pixel) */
+    vld1.32     {d3[0]}, [DUMMY]                /* load it into lane 0 of d3 */
+    vdup.8      d0, d3[0]                       /* replicate each of its 4 bytes across one register of d0-d3 */
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]                       /* left uninverted: '*_head' does the vmvn into d24 */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15                    /* last use of the old destination block in d0-d3 */
+    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!    /* roles are swapped here: d0-d3 receive destination pixels */
+    vmvn.8      d22, d3                             /* inverted destination alpha */
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!    /* store the finished block */
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4                         /* d4-d7 hold the replicated value set up by '*_init' */
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET   /* DUMMY -> 32-bit argument on the stack (presumably the solid source pixel) */
+    vld1.32     {d7[0]}, [DUMMY]                /* load it into lane 0 of d7 */
+    vdup.8      d4, d7[0]                       /* replicate each of its 4 bytes across one register of d4-d7 */
+    vdup.8      d5, d7[1]
+    vdup.8      d6, d7[2]
+    vdup.8      d7, d7[3]
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, /* bpp: src, mask, dst (0 = unused) */ \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_reverse_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg: destination is read into d0-d3 ... */ \
+    4,  /* src_basereg: ... and d4-d7 take the source role (swapped vs. plain OVER) */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_0565_process_pixblock_head
+    /* in: multiply the solid source components (d8-d11) by the mask in d24 */
+    vmull.u8    q0, d24, d8
+    vmull.u8    q1, d24, d9
+    vmull.u8    q6, d24, d10
+    vmull.u8    q7, d24, d11
+    vrshr.u16   q10, q0, #8     /* t = (x + 128) >> 8 ... */
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8     /* (q12 = d24:d25: the mask is no longer needed) */
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10     /* ... (x + t + 128) >> 8, i.e. rounded x / 255 */
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13     /* d0-d3 now hold the masked source: B, G, R, A */
+
+    vshrn.u16   d6, q2, #8      /* unpack r5g6b5 destination: d6 = rrrrrggg */
+    vshrn.u16   d7, q2, #3      /* d7  = ggggggbb */
+    vsli.u16    q2, q2, #5
+    vsri.u8     d6, d6, #5      /* red: top 3 bits replicated into the low 3 */
+    vmvn.8      d3, d3          /* invert the masked source alpha */
+    vsri.u8     d7, d7, #6      /* green: top 2 bits replicated into the low 2 */
+    vshrn.u16   d30, q2, #2     /* d30 = blue */
+    /* now do alpha blending */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13   /* scaled destination: d20 - red, d23 - green, d22 - blue */
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_8_0565_process_pixblock_tail
+    vqadd.u8    d16, d2, d20
+    vqadd.u8    q9, q0, q11
+    /* convert to r5g6b5 */
+    vshll.u8    q14, d16, #8
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_0565_process_pixblock_tail_head
+    pixman_composite_over_n_8_0565_process_pixblock_tail
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    vld1.8      {d24}, [MASK]!
+    cache_preload 8, 8
+    pixman_composite_over_n_8_0565_process_pixblock_head
+.endm
+
+/*
+ * This function needs a special initialization of solid mask.
+ * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
+ * offset, split into color components and replicated in d8-d11
+ * registers. Additionally, this function needs all the NEON registers,
+ * so it has to save d8-d15 registers which are callee saved according
+ * to ABI. These registers are restored from 'cleanup' macro. All the
+ * other NEON registers are caller saved, so can be clobbered freely
+ * without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_0565_init, \
+    pixman_composite_over_n_8_0565_cleanup, \
+    pixman_composite_over_n_8_0565_process_pixblock_head, \
+    pixman_composite_over_n_8_0565_process_pixblock_tail, \
+    pixman_composite_over_n_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/*
+ * Plain copy of 16bpp pixels: there is nothing to compute, so the head
+ * and tail macros are empty and all the work is the store/load pair in
+ * the tail_head macro.
+ */
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+/*
+ * Store the 16 pixels (32 bytes) held in d0-d3, then load the next 16
+ * from the source into the same registers.
+ */
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+    vld1.16 {d0, d1, d2, d3}, [SRC]!
+    cache_preload 16, 16
+.endm
+
+/* All base registers are 0: pixels pass through d0-d3 unmodified. */
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_0565_process_pixblock_head, \
+    pixman_composite_src_0565_0565_process_pixblock_tail, \
+    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Solid fill of an 8bpp destination. The fill pattern is prepared once by
+ * the init macro, so the head and tail macros have nothing to do.
+ */
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+/* Store 32 bytes of the fill pattern held in d0-d3. */
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+/*
+ * Load the solid value from the stack at ARGS_STACK_OFFSET and replicate
+ * its lowest byte across d0-d3. Each 'vsli' shifts d0 left while keeping
+ * the low bits of the old value, doubling the replicated width:
+ * 8 -> 16 -> 32 -> 64 bits. The two 'vorr' instructions are register
+ * copies (d1 = d0, then q1 = q0).
+ */
+.macro pixman_composite_src_n_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #8
+    vsli.u64    d0, d0, #16
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+/* Nothing to restore: the init macro only touches d0-d3. */
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+/* Prefetch distance is 0 and tail_head issues no cache_preload. */
+generate_composite_function \
+    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_8_init, \
+    pixman_composite_src_n_8_cleanup, \
+    pixman_composite_src_n_8_process_pixblock_head, \
+    pixman_composite_src_n_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Solid fill of a 16bpp destination. The fill pattern is prepared once by
+ * the init macro, so the head and tail macros have nothing to do.
+ */
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+/* Store 16 pixels (32 bytes) of the fill pattern held in d0-d3. */
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+/*
+ * Load the solid value from the stack at ARGS_STACK_OFFSET and replicate
+ * its lowest 16 bits across d0-d3: the 'vsli' pair widens the pattern
+ * 16 -> 32 -> 64 bits, the 'vorr' pair copies d0 to d1 and q0 to q1.
+ * NOTE(review): the low 16 bits are used as the pixel as-is, so the
+ * caller presumably passes the color already in the destination format -
+ * confirm against the C wrapper.
+ */
+.macro pixman_composite_src_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #16
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+/* Nothing to restore: the init macro only touches d0-d3. */
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+/* Prefetch distance is 0 and tail_head issues no cache_preload. */
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_0565_init, \
+    pixman_composite_src_n_0565_cleanup, \
+    pixman_composite_src_n_0565_process_pixblock_head, \
+    pixman_composite_src_n_0565_process_pixblock_tail, \
+    pixman_composite_src_n_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Solid fill of a 32bpp destination. The fill pattern is prepared once by
+ * the init macro, so the head and tail macros have nothing to do.
+ */
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+/* Store 8 pixels (32 bytes) of the fill pattern held in d0-d3. */
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+/*
+ * Load the 32-bit solid value from the stack at ARGS_STACK_OFFSET into
+ * the low half of d0, duplicate it into the high half with 'vsli', then
+ * copy d0 to d1 and q0 to q1 so that d0-d3 hold eight identical pixels.
+ */
+.macro pixman_composite_src_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+/* Nothing to restore: the init macro only touches d0-d3. */
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+/* Prefetch distance is 0 and tail_head issues no cache_preload. */
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    0, /* prefetch distance */ \
+    pixman_composite_src_n_8888_init, \
+    pixman_composite_src_n_8888_cleanup, \
+    pixman_composite_src_n_8888_process_pixblock_head, \
+    pixman_composite_src_n_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Plain copy of 32bpp pixels: there is nothing to compute, so the head
+ * and tail macros are empty and all the work is the store/load pair in
+ * the tail_head macro.
+ */
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+/*
+ * Store the 8 pixels (32 bytes) held in d0-d3, then load the next 8 from
+ * the source into the same registers.
+ */
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+    vld1.32 {d0, d1, d2, d3}, [SRC]!
+    cache_preload 8, 8
+.endm
+
+/* All base registers are 0: pixels pass through d0-d3 unmodified. */
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_8888_process_pixblock_head, \
+    pixman_composite_src_8888_8888_process_pixblock_tail, \
+    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Copy of 32bpp pixels that forces the top byte of every pixel to 0xFF.
+ * q2 holds the constant 0xFF000000 in each 32-bit lane (see the init
+ * macro); OR-ing it into the 8 pixels in q0/q1 sets that byte.
+ */
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+    vorr     q0, q0, q2
+    vorr     q1, q1, q2
+.endm
+
+/* All the work is done in the head macro. */
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+/*
+ * Store the finished block, load the next 8 source pixels and apply the
+ * same OR as the head macro.
+ */
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+    vld1.32 {d0, d1, d2, d3}, [SRC]!
+    vorr     q0, q0, q2
+    vorr     q1, q1, q2
+    cache_preload 8, 8
+.endm
+
+/*
+ * Build the 0xFF000000 per-pixel constant in q2: fill every byte with
+ * 0xFF, then shift each 32-bit lane left by 24 bits.
+ */
+.macro pixman_composite_src_x888_8888_init
+    vmov.u8  q2, #0xFF
+    vshl.u32 q2, q2, #24
+.endm
+
+/* default_cleanup is used: the init macro only writes q2. */
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_x888_8888_init, \
+    default_cleanup, \
+    pixman_composite_src_x888_8888_process_pixblock_head, \
+    pixman_composite_src_x888_8888_process_pixblock_tail, \
+    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * First half of the per-block work for OVER with a solid source, an 8-bit
+ * mask and a 32bpp destination (8 pixels per block). It multiplies the
+ * solid source by the mask and starts the destination * (255 - alpha)
+ * products, leaving them as 16-bit values in q8-q11 for the tail macro.
+ *
+ * Every "vmull.u8 / vrshr.u16 #8 / vraddhn.u16" triple computes a rounded
+ * x * y / 255 for eight byte lanes.
+ */
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q0, d24, d8
+    vmull.u8    q1, d24, d9
+    vmull.u8    q6, d24, d10
+    vmull.u8    q7, d24, d11
+    vrshr.u16   q10, q0, #8
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
+    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+/*
+ * Second half: finish the rounded division by 255 of the products in
+ * q8-q11, then add the masked source (q0, q1) with saturation. The result
+ * is left in {d28, d29, d30, d31}.
+ */
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+/*
+ * Steady-state loop body: finish the previous block, store it, load the
+ * next destination pixels (deinterleaved) and mask bytes, then start the
+ * next block.
+ */
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+    pixman_composite_over_n_8_8888_process_pixblock_tail
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    vld1.8      {d24}, [MASK]!
+    cache_preload 8, 8
+    pixman_composite_over_n_8_8888_process_pixblock_head
+.endm
+
+/*
+ * Fetch the solid source from the stack at ARGS_STACK_OFFSET and replicate
+ * each of its four bytes across d8-d11. d8-d15 are saved first because the
+ * pixel code also clobbers q6 and q7; the argument address is computed
+ * before 'vpush' changes sp. d11 is written last because it also holds the
+ * freshly loaded color.
+ */
+.macro pixman_composite_over_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+/* Restore the registers pushed by the init macro. */
+.macro pixman_composite_over_n_8_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+/*
+ * No base register arguments are passed, so whatever defaults the
+ * template macro provides are used (its definition is not in this file).
+ */
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8888_init, \
+    pixman_composite_over_n_8_8888_cleanup, \
+    pixman_composite_over_n_8_8888_process_pixblock_head, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/*
+ * First half of the per-block work for component-alpha OVER with a solid
+ * source, a 32bpp mask and a 32bpp destination (8 pixels per block).
+ * Unlike the unified-alpha variants, every color channel gets its own
+ * alpha value (source alpha * that channel of the mask).
+ *
+ * Every "vmull.u8 / vrshr.u16 #8 / vraddhn.u16" triple computes a rounded
+ * x * y / 255 for eight byte lanes.
+ */
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}
+     *         dest in          {d4,  d5,  d6,  d7 }
+     *         mask in          {d24, d25, d26, d27}
+     * output: updated src in   {d0,  d1,  d2,  d3 }
+     *         updated mask in  {d24, d25, d26, d3 }
+     */
+    /* q0, q1, q6, q7: source channel * matching mask channel */
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q7,  d27, d11
+    /* q12, q9, q13: source alpha (d11) * mask channels d24, d25, d26 */
+    vmull.u8    q9,  d11, d25
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vrshr.u16   q10, q7,  #8
+    vraddhn.u16 d24, q12, q11
+    vraddhn.u16 d25, q9,  q8
+    vraddhn.u16 d26, q13, q6
+    vraddhn.u16 d3,  q7,  q10
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in {d28, d29, d30, d31}
+     */
+    /* invert the per-channel alphas and multiply them with the dest */
+    vmvn.8      d24, d24
+    vmvn.8      d25, d25
+    vmull.u8    q8,  d24, d4
+    vmull.u8    q9,  d25, d5
+    vmvn.8      d26, d26
+    vmvn.8      d27, d3
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+/*
+ * Second half: finish the rounded division by 255 of the products in
+ * q8-q11 and add the updated source (q0, q1) with saturation.
+ */
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q6,  q10, #8
+    vrshr.u16   q7,  q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6,  q10
+    vraddhn.u16 d31, q7,  q11
+    vqadd.u8    q14, q0,  q14
+    vqadd.u8    q15, q1,  q15
+.endm
+
+/*
+ * Steady-state loop body. The deeper-indented instructions are the tail
+ * of the previous block, written out by hand so that the loads of the
+ * next destination and mask pixels can be interleaved with them. The
+ * head macro does not write q14/q15, so the finished block can be stored
+ * after it.
+ */
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+        vrshr.u16   q15, q9, #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q6, q10, #8
+        vrshr.u16   q7, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q6, q10
+        vraddhn.u16 d31, q7, q11
+    vld4.8      {d24, d25, d26, d27}, [MASK]!
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    cache_preload 8, 8
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+/*
+ * Fetch the solid source from the stack at ARGS_STACK_OFFSET and replicate
+ * each of its four bytes across d8-d11. d8-d15 are saved first because the
+ * pixel code also clobbers q6 and q7; the argument address is computed
+ * before 'vpush' changes sp.
+ */
+.macro pixman_composite_over_n_8888_8888_ca_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+/* Restore the registers pushed by the init macro. */
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+    vpop        {d8-d15}
+.endm
+
+/*
+ * No base register arguments are passed, so whatever defaults the
+ * template macro provides are used (its definition is not in this file).
+ */
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_8888_ca_init, \
+    pixman_composite_over_n_8888_8888_ca_cleanup, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/*
+ * Whole per-block computation for ADD with a solid source, an 8-bit mask
+ * and an 8-bit destination (32 pixels per block):
+ *     result = saturate(dest + mask * source_alpha / 255)
+ * The result is left in {d28, d29, d30, d31}.
+ */
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+    /* solid source alpha is replicated in d11 (see the init macro); */
+    /* d8, d9 and d10 are saved by init but are not used by this code */
+    /* destination data is in {d4, d5, d6, d7} */
+    /* mask is in d24, d25, d26, d27 */
+    vmull.u8    q0, d24, d11
+    vmull.u8    q1, d25, d11
+    vmull.u8    q6, d26, d11
+    vmull.u8    q7, d27, d11
+    vrshr.u16   q10, q0, #8
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+/* All the work is done in the head macro. */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/*
+ * Steady-state loop body: store the finished block, load the next 32
+ * destination and mask bytes, then process them.
+ */
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+    pixman_composite_add_n_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    vld1.8      {d24, d25, d26, d27}, [MASK]!
+    cache_preload 32, 32
+    pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+/*
+ * Fetch the solid source from the stack at ARGS_STACK_OFFSET and replicate
+ * its byte 3 (the alpha component) across d11. d8-d15 are saved first
+ * because d11 and q6/q7 are clobbered; the argument address is computed
+ * before 'vpush' changes sp.
+ */
+.macro pixman_composite_add_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d11, d11[3]
+.endm
+
+/* Restore the registers pushed by the init macro. */
+.macro pixman_composite_add_n_8_8_cleanup
+    vpop        {d8-d15}
+.endm
+
+/*
+ * No base register arguments are passed, so whatever defaults the
+ * template macro provides are used (its definition is not in this file).
+ */
+generate_composite_function \
+    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8_init, \
+    pixman_composite_add_n_8_8_cleanup, \
+    pixman_composite_add_n_8_8_process_pixblock_head, \
+    pixman_composite_add_n_8_8_process_pixblock_tail, \
+    pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/*
+ * Whole per-block computation for ADD with an 8-bit source, an 8-bit mask
+ * and an 8-bit destination (32 pixels per block):
+ *     result = saturate(dest + mask * source / 255)
+ * The result is left in {d28, d29, d30, d31}.
+ */
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8, d24, d0
+    vmull.u8    q9, d25, d1
+    vmull.u8    q10, d26, d2
+    vmull.u8    q11, d27, d3
+    vrshr.u16   q0, q8, #8
+    vrshr.u16   q1, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d0, q0, q8
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q12, q10
+    vraddhn.u16 d3, q13, q11
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+/* All the work is done in the head macro. */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/*
+ * Steady-state loop body: store the finished block, load the next 32
+ * destination, mask and source bytes, then process them.
+ */
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+    pixman_composite_add_8_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    vld1.8      {d24, d25, d26, d27}, [MASK]!
+    vld1.8      {d0, d1, d2, d3}, [SRC]!
+    cache_preload 32, 32
+    pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+/* No setup is needed: the pixel code uses no callee saved registers. */
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+/*
+ * No base register arguments are passed, so whatever defaults the
+ * template macro provides are used (its definition is not in this file).
+ */
+generate_composite_function \
+    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8_8_8_init, \
+    pixman_composite_add_8_8_8_cleanup, \
+    pixman_composite_add_8_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/*
+ * Whole per-block computation for ADD with 32bpp source, mask and
+ * destination (8 pixels per block, deinterleaved). Only d27 of the mask
+ * is used, as a single alpha applied to all four source channels:
+ *     result = saturate(dest + mask_alpha * source / 255)
+ * The result is left in {d28, d29, d30, d31}.
+ */
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8, d27, d0
+    vmull.u8    q9, d27, d1
+    vmull.u8    q10, d27, d2
+    vmull.u8    q11, d27, d3
+    vrshr.u16   q0, q8, #8
+    vrshr.u16   q1, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d0, q0, q8
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q12, q10
+    vraddhn.u16 d3, q13, q11
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+/* All the work is done in the head macro. */
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+.endm
+
+/*
+ * Steady-state loop body: store the finished block, load the next 8
+ * destination, mask and source pixels (deinterleaved), then process them.
+ */
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    vld4.8      {d24, d25, d26, d27}, [MASK]!
+    vld4.8      {d0, d1, d2, d3}, [SRC]!
+    cache_preload 8, 8
+    pixman_composite_add_8888_8888_8888_process_pixblock_head
+.endm
+
+/*
+ * Full 2D function. No base register arguments are passed, so whatever
+ * defaults the template macro provides are used.
+ */
+generate_composite_function \
+    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/*
+ * Single scanline variant built from the same pixel macros. This template
+ * takes no prefetch distance argument.
+ */
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/*
+ * First half of the per-block work for OVER with a 32bpp source, a solid
+ * mask and a 32bpp destination (8 pixels per block). It multiplies all
+ * source channels by the mask alpha in d15 and starts the
+ * destination * (255 - alpha) products, leaving them as 16-bit values in
+ * q8-q11 for the tail macro. q4-q6 are used as temporaries.
+ *
+ * These head/tail macros are also used by the 'over_8888_8888_8888' and
+ * 'over_8888_8_8888' functions, which load a per-pixel mask alpha into
+ * d15 instead.
+ */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* solid mask is in d15 */
+
+    /* 'in' */
+    vmull.u8    q8, d15, d3
+    vmull.u8    q6, d15, d2
+    vmull.u8    q5, d15, d1
+    vmull.u8    q4, d15, d0
+    vrshr.u16   q13, q8, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q11, q5, #8
+    vrshr.u16   q10, q4, #8
+    vraddhn.u16 d3, q8, q13
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d1, q5, q11
+    vraddhn.u16 d0, q4, q10
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+/*
+ * Second half: finish the rounded division by 255 of the products in
+ * q8-q11, then add the masked source (q0, q1) with saturation. The result
+ * is left in {d28, d29, d30, d31}.
+ */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+/*
+ * Steady-state loop body: the next destination pixels are loaded before
+ * the tail runs (the tail no longer reads d4-d7), and the finished block
+ * is stored after the next head (the head does not write q14/q15).
+ */
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    vld4.8     {d0, d1, d2, d3}, [SRC]!
+    cache_preload 8, 8
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+/*
+ * Load the solid mask from the stack and replicate its byte 3 (alpha)
+ * across d15. d8-d15 are saved first because d15 and q4-q6 are clobbered;
+ * the argument address is computed before 'vpush' changes sp.
+ * NOTE(review): the offset is a hard-coded 48 where the other init macros
+ * use ARGS_STACK_OFFSET; presumably this is the stack slot of the mask
+ * argument rather than of the source - confirm against the C prototype.
+ */
+.macro pixman_composite_over_8888_n_8888_init
+    add         DUMMY, sp, #48
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY]
+    vdup.8      d15, d15[3]
+.endm
+
+/* Restore the registers pushed by the init macro. */
+.macro pixman_composite_over_8888_n_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+/*
+ * No base register arguments are passed, so whatever defaults the
+ * template macro provides are used (its definition is not in this file).
+ */
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_8888_init, \
+    pixman_composite_over_8888_n_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/*
+ * Steady-state loop body for OVER with 32bpp source, mask and destination.
+ * It reuses the 'over_8888_n_8888' head and tail macros; the only
+ * difference is that the mask is loaded per block, deinterleaved into
+ * d12-d15, so that its alpha lands in d15 where the shared head macro
+ * expects the mask.
+ */
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    vld4.8     {d0, d1, d2, d3}, [SRC]!
+    cache_preload 8, 8
+    vld4.8     {d12, d13, d14, d15}, [MASK]!
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+/* Save d8-d15: the mask load and the shared head macro clobber them. */
+.macro pixman_composite_over_8888_8888_8888_init
+    vpush       {d8-d15}
+.endm
+
+/* Restore the registers pushed by the init macro. */
+.macro pixman_composite_over_8888_8888_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+/*
+ * OVER with 32bpp source, mask and destination. The pixel code is shared
+ * with 'over_8888_n_8888'; mask_basereg 12 places the deinterleaved mask
+ * in d12-d15 so that its alpha is in d15.
+ *
+ * Every macro argument is separated by a comma, as in the other
+ * invocations in this file, instead of relying on the assembler also
+ * accepting whitespace as an argument separator.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_8888_8888_init, \
+    pixman_composite_over_8888_8888_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/*
+ * Single scanline variant of OVER with 32bpp source, mask and
+ * destination, built from the same macros as the full 2D function. This
+ * template takes no prefetch distance argument.
+ *
+ * Every macro argument is separated by a comma, as in the other
+ * invocations in this file, instead of relying on the assembler also
+ * accepting whitespace as an argument separator.
+ */
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    pixman_composite_over_8888_8888_8888_init, \
+    pixman_composite_over_8888_8888_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Steady-state loop body for OVER with a 32bpp source, an 8-bit mask and
+ * a 32bpp destination. It reuses the 'over_8888_n_8888' head and tail
+ * macros; the only difference is that 8 mask bytes are loaded per block
+ * straight into d15, where the shared head macro expects the mask.
+ */
+/* TODO: expand macros and do better instruction scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    vld4.8     {d0, d1, d2, d3}, [SRC]!
+    cache_preload 8, 8
+    vld1.8     {d15}, [MASK]!
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+/* Save d8-d15: the mask load and the shared head macro clobber them. */
+.macro pixman_composite_over_8888_8_8888_init
+    vpush       {d8-d15}
+.endm
+
+/* Restore the registers pushed by the init macro. */
+.macro pixman_composite_over_8888_8_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+/*
+ * OVER with a 32bpp source, an 8-bit mask and a 32bpp destination. The
+ * pixel code is shared with 'over_8888_n_8888'; mask_basereg 15 places
+ * the mask bytes in d15.
+ *
+ * Every macro argument is separated by a comma, as in the other
+ * invocations in this file, instead of relying on the assembler also
+ * accepting whitespace as an argument separator.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_8_8888_init, \
+    pixman_composite_over_8888_8_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Plain copy of 24bpp pixels: there is nothing to compute, so the head
+ * and tail macros are empty and all the work is the store/load pair in
+ * the tail_head macro.
+ */
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+/*
+ * Store the 8 pixels (24 bytes) held in d0-d2, then load the next 8 from
+ * the source. Unlike the other copy functions, the store carries no :128
+ * alignment hint.
+ */
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+    vst3.8 {d0, d1, d2}, [DST_W]!
+    vld3.8 {d0, d1, d2}, [SRC]!
+    cache_preload 8, 8
+.endm
+
+/* All base registers are 0: pixels pass through d0-d2 unmodified. */
+generate_composite_function \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Conversion from 24bpp to 32bpp with the first and third color bytes
+ * exchanged. The source is loaded deinterleaved into d0-d2; swapping d0
+ * and d2 reverses the component order.
+ */
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+    vswp   d0, d2
+.endm
+
+/* All the work is done in the head macro. */
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+/*
+ * Store the finished block as four interleaved bytes per pixel (d3
+ * supplies the fourth byte), load the next 8 source pixels and swap
+ * their components.
+ */
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+    vst4.8 {d0, d1, d2, d3}, [DST_W]!
+    vld3.8 {d0, d1, d2}, [SRC]!
+    vswp   d0, d2
+    cache_preload 8, 8
+.endm
+
+/*
+ * Clear d3 once: it is never reloaded, so the fourth byte of every
+ * written pixel is 0.
+ */
+.macro pixman_composite_src_0888_8888_rev_init
+    veor   d3, d3, d3
+.endm
+
+/* default_cleanup is used: the init macro only writes d3. */
+generate_composite_function \
+    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_0888_8888_rev_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Conversion from 24bpp to 16bpp (5-6-5 bit fields). The source is
+ * loaded deinterleaved into d0-d2. The head macro widens d1 and d2 into
+ * the top byte of 16-bit lanes (q8, q9) ready for bit insertion.
+ */
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+    vshll.u8    q8, d1, #8
+    vshll.u8    q9, d2, #8
+.endm
+
+/*
+ * Pack the pixel in q14: d0 supplies the top 5 bits, d1 (via q8) the
+ * middle 6 bits and d2 (via q9) the low 5 bits.
+ */
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+    vshll.u8    q14, d0, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+/*
+ * Steady-state loop body. The deeper-indented instructions are the tail
+ * of the previous block (plus its store), interleaved with the load and
+ * head of the next block. d0 is consumed by the first instruction before
+ * the load overwrites it.
+ * NOTE(review): there is no cache_preload here although the invocation
+ * below requests a prefetch distance of 10 - confirm that this is
+ * intentional.
+ */
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+        vshll.u8    q14, d0, #8
+    vld3.8 {d0, d1, d2}, [SRC]!
+        vsri.u16    q14, q8, #5
+        vsri.u16    q14, q9, #11
+    vshll.u8    q8, d1, #8
+        vst1.16 {d28, d29}, [DST_W, :128]!
+    vshll.u8    q9, d2, #8
+.endm
+
+/* dst_w_basereg 28: the packed pixels are written from q14 = {d28, d29}. */
+generate_composite_function \
+    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * Conversion of 32bpp pixels that multiplies the three color components
+ * by the pixel's own alpha and exchanges the first and third components.
+ * The source is loaded deinterleaved into d0-d3, with alpha in d3. The
+ * head macro starts the color * alpha products.
+ */
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+/*
+ * Finish the rounded division by 255 and place the results in the output
+ * registers: d30 = d0 * alpha, d29 = d1 * alpha, d28 = d2 * alpha, while
+ * 'vswp' moves the unmodified alpha from d3 into d31.
+ */
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d30, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d28, q13, q10
+.endm
+
+/*
+ * Steady-state loop body. The deeper-indented NEON instructions are the
+ * tail of the previous block, interleaved with the load and head of the
+ * next block.
+ * NOTE(review): the PF-prefixed lines appear to be the prefetch
+ * bookkeeping that other functions get from 'cache_preload 8, 8', written
+ * out by hand so that it can be spread between the NEON instructions -
+ * confirm against the PF and cache_preload macro definitions.
+ */
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    vld4.8 {d0, d1, d2, d3}, [SRC]!
+        vraddhn.u16 d30, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d28, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+/* dst_w_basereg 28: the converted pixels are written from d28-d31. */
+generate_composite_function \
+    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
diff --git a/lib/pixman/pixman/pixman-arm-neon-asm.h b/lib/pixman/pixman/pixman-arm-neon-asm.h
new file mode 100644
index 000000000..583b96567
--- /dev/null
+++ b/lib/pixman/pixman/pixman-arm-neon-asm.h
@@ -0,0 +1,906 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combinations of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp, 32bpp color formats are supported.
+ *
+ * This macro takes care of:
+ *  - handling of leading and trailing unaligned pixels
+ *  - doing most of the work related to L2 cache preload
+ *  - encouraging the use of software pipelining for better instruction
+ *    scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
+ * examples.
+ *
+ * TODO:
+ *  - try overlapped pixel method (from Ian Rickards) when processing
+ *    exactly two blocks of pixels
+ *  - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY,       0 /* no bits set: destination is never loaded (dst_r_bpp = 0) */
+.set FLAG_DST_READWRITE,       1 /* destination is also loaded (dst_r_bpp = dst_w_bpp) */
+.set FLAG_DEINTERLEAVE_32BPP,  2 /* sets DEINTERLEAVE_32BPP_ENABLED for 32bpp loads/stores */
+
+/*
+ * Offset in stack where mask and source pointer/stride can be accessed
+ * from 'init' macro. This is useful for doing special handling for solid mask.
+ */
+.set ARGS_STACK_OFFSET,        40 /* generate_composite_function pushes r4-r12 and lr on entry: 10 * 4 bytes */
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE,     1 /* Fixed-distance-ahead pld (see cache_preload_simple) */
+.set PREFETCH_TYPE_ADVANCED,   2 /* Fine-grained prefetch (enables PF-prefixed lines) */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits /* op.elem_size on one D register (number reg1), post-incrementing mem_operand */
+.if abits > 0 /* a positive abits is emitted as an address alignment qualifier */
+    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits /* op.elem_size on two D registers, post-incrementing mem_operand */
+.if abits > 0 /* a positive abits is emitted as an address alignment qualifier */
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits /* op.elem_size on four D registers, post-incrementing mem_operand */
+.if abits > 0 /* a positive abits is emitted as an address alignment qualifier */
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits /* single lane idx of one D register; abits is accepted but not used */
+    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand /* three whole D registers, no alignment qualifier */
+    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand /* the same lane idx of three D registers, no alignment qualifier */
+    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits /* a chunk of N bytes is placed at byte offset N within the D registers starting at basereg */
+.if numbytes == 32
+    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif numbytes == 16
+    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+.elseif numbytes == 8
+    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+.elseif numbytes == 4
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32) /* RESPECT_STRICT_ALIGNMENT is not set in this header; presumably set by the including file */
+        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+    .elseif elem_size == 16 /* strict alignment: split into elem_size-sized lane accesses covering the same bytes */
+        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+    .endif
+.elseif numbytes == 2
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+    .endif
+.elseif numbytes == 1
+    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0 /* load numpix pixels of bpp bits each; expands to nothing when bpp is 0 */
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* full 32bpp block: vld4 deinterleaves while loading */
+    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8) /* 24bpp always goes through vld3 (abits is not passed on) */
+    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4) /* partial 24bpp chunks: one pixel per lane of basereg+0..2 */
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else /* generic path: plain vld1 of numpix * bpp / 8 bytes */
+    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0 /* store numpix pixels of bpp bits each; expands to nothing when bpp is 0 */
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* full 32bpp block: vst4 re-interleaves while storing */
+    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8) /* 24bpp always goes through vst3 (abits is not passed on) */
+    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4) /* partial 24bpp chunks: one pixel per lane of basereg+0..2 */
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else /* generic path: plain vst1 of numpix * bpp / 8 bytes */
+    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand /* pixld with an alignment hint of bpp * numpix bits, capped at 128 */
+.if (bpp * numpix) <= 128
+    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand /* pixst with an alignment hint of bpp * numpix bits, capped at 128 */
+.if (bpp * numpix) <= 128
+    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2 /* vuzp.8 taking D register numbers, so callers can pass computed %(expr) values */
+    vuzp.8 d&reg1, d&reg2
+.endm
+
+.macro vzip8 reg1, reg2 /* vzip.8 taking D register numbers, so callers can pass computed %(expr) values */
+    vzip.8 d&reg1, d&reg2
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg /* operates on D registers basereg+0 .. basereg+3 */
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* expands to nothing for other depths or when the flag is off */
+    vuzp8 %(basereg+0), %(basereg+1)
+    vuzp8 %(basereg+2), %(basereg+3)
+    vuzp8 %(basereg+1), %(basereg+3)
+    vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg /* operates on D registers basereg+0 .. basereg+3 */
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0) /* expands to nothing for other depths or when the flag is off */
+    vzip8 %(basereg+0), %(basereg+2)
+    vzip8 %(basereg+1), %(basereg+3)
+    vzip8 %(basereg+2), %(basereg+3)
+    vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * cache preload logic is mostly independent from the rest of pixels
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (but it still can prefetch a bit
+ * more data on the right side of the image - not a big issue and may
+ * be actually helpful when rendering text glyphs). Additional trick is
+ * the use of LDR instruction for prefetch instead of PLD when moving to
+ * the next line, the point is that we have a high chance of getting TLB
+ * miss in this case, and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, due to having a separate pipeline and
+ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
+ * execute simultaneously with NEON and be completely shadowed by it. Thus
+ * we get no performance overhead at all (*). This looks like a very nice
+ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
+ * but still can implement some rather advanced prefetch logic in software
+ * for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixels processing like simple copy. Anyway, having prefetch is a must
+ * when working with the graphics data.
+ */
+.macro PF a, x:vararg /* instruction prefix: a is the mnemonic, x the rest of the line */
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED) /* any other prefetch type drops the instruction */
+    a x
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment /* one step of the advanced prefetcher; every PF line vanishes unless that type is active */
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0) /* skipped when no image is read at all */
+.if regs_shortage
+    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+.endif
+.if std_increment != 0
+    PF add PF_X, PF_X, #std_increment
+.endif
+    PF tst PF_CTL, #0xF /* low 4 bits of PF_CTL count the remaining prefetch distance ramp-up steps */
+    PF addne PF_X, PF_X, #boost_increment
+    PF subne PF_CTL, PF_CTL, #1
+    PF cmp PF_X, ORIG_W /* result is consumed by the subge below, after the pld lines */
+.if src_bpp_shift >= 0
+    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+.endif
+    PF subge PF_X, PF_X, ORIG_W /* ran past the end of the line: wrap PF_X */
+    PF subges PF_CTL, PF_CTL, #0x10 /* and count down the scanline counter held above the low 4 bits */
+.if src_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]! /* lines remain: real load from the next scanline, writeback advances PF_SRC */
+.endif
+.if dst_r_bpp != 0
+    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+.endif
+.if mask_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+.endif
+.endif
+.endm
+
+.macro cache_preload_simple /* pld a fixed number of pixels ahead of each pointer that is actually read */
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE) /* expands to nothing for the other prefetch types */
+.if src_bpp > 0
+    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)] /* PREFETCH_DISTANCE_SIMPLE is not set in this header; presumably set by the including file */
+.endif
+.if dst_r_bpp > 0
+    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+/*
+ * Macro which is used to process leading pixels until destination
+ * pointer is properly aligned (at 16 bytes boundary). When destination
+ * buffer uses 24bpp format, this is skipped (see 'dst_w_bpp != 24' below).
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+                                        process_pixblock_tail, \
+                                        process_pixblock_tail_head
+.if dst_w_bpp != 24 /* nothing but the 2: label is emitted for 24bpp destinations */
+    tst         DST_R, #0xF /* low 4 address bits clear: already 16-byte aligned */
+    beq         2f
+
+.irp lowbit, 1, 2, 4, 8, 16 /* lowbit is a chunk size in bytes; one chunk is loaded per set address bit */
+local skip1 /* NOTE(review): skip1 is not referenced anywhere in this macro */
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp)) /* chunk holds at least one pixel and less than a full block */
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_R, #lowbit
+    beq         1f
+.endif
+    pixld       (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+    add         DST_R, DST_R, #lowbit /* destination is not read: only advance the read pointer */
+.endif
+    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+    sub         W, W, #(lowbit * 8 / dst_w_bpp)
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head /* one head + tail pass over whatever chunks were loaded */
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    process_pixblock_tail
+
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp lowbit, 1, 2, 4, 8, 16 /* store loop: same chunk sizes, now tested against DST_W */
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_W, #lowbit
+    beq         1f
+.endif
+    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+1:
+.endif
+.endr
+.endif
+2:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing performs operation on
+ * pixblock_size pixels, anything smaller than this has to be loaded
+ * and stored in a special way. Loading and storing of pixel data is
+ * performed in such a way that we fill some 'slots' in the NEON
+ * registers (some slots naturally are unused), then perform compositing
+ * operation as usual. In the end, the data is taken from these 'slots'
+ * and saved to memory.
+ *
+ * cache_preload_flag - allows to suppress prefetch if
+ *                      set to 0
+ * dst_aligned_flag   - selects whether destination buffer
+ *                      is aligned
+ */
+.macro process_trailing_pixels cache_preload_flag, \
+                               dst_aligned_flag, \
+                               process_pixblock_head, \
+                               process_pixblock_tail, \
+                               process_pixblock_tail_head
+    tst         W, #(pixblock_size - 1) /* only the low bits of W matter: pixels left below one full block */
+    beq         2f
+.irp chunk_size, 16, 8, 4, 2, 1 /* one chunk of pixels per set bit of W, largest first */
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+    pixld       chunk_size, src_bpp, src_basereg, SRC
+    pixld       chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0 /* aligned destination: use the variant with alignment hints */
+    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+    PF add      PF_X, PF_X, #chunk_size
+.endif
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head /* one head + tail pass over whatever chunks were loaded */
+.if cache_preload_flag != 0
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+.endif
+    process_pixblock_tail
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1 /* store loop: W is unchanged, so the same chunks are selected */
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+.if dst_aligned_flag != 0
+    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+1:
+.endif
+.endr
+2:
+.endm
+
+/*
+ * Macro, which performs all the needed operations to switch to the next
+ * scanline and start the next loop iteration unless all the scanlines
+ * are already processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label /* start_of_loop_label: where to branch while scanlines remain */
+.if regs_shortage
+    ldrd        W, [sp] /* load W and H (width and height) from stack */
+.else
+    mov         W, ORIG_W
+.endif
+    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift /* strides are scaled by the bytes-per-pixel shift */
+.if src_bpp != 0
+    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24) /* rewind by the row just processed; 24bpp strides were pre-adjusted in generate_composite_function instead */
+    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+    sub         SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+    sub         MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+    subs        H, H, #1 /* sets the flags tested by the bge at the end */
+    mov         DST_R, DST_W
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.endif
+    bge         start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3     - reserved for loading source pixel data
+ * d4, d5, d6, d7     - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags, \
+                                   pixblock_size_, \
+                                   prefetch_distance, \
+                                   init, \
+                                   cleanup, \
+                                   process_pixblock_head, \
+                                   process_pixblock_tail, \
+                                   process_pixblock_tail_head, \
+                                   dst_w_basereg_ = 28, \
+                                   dst_r_basereg_ = 4, \
+                                   src_basereg_   = 0, \
+                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    push        {r4-r12, lr}        /* save r4-r12 and lr: 10 words, 40 bytes */
+
+/*
+ * Select prefetch type for this function. If prefetch distance is
+ * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
+ * has to be used instead of ADVANCED.
+ */
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT /* PREFETCH_TYPE_DEFAULT is not set in this header; the including file must set it */
+.if prefetch_distance == 0
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req        r0      /* width (is updated during processing) */
+    H           .req        r1      /* height (is updated during processing) */
+    DST_W       .req        r2      /* destination buffer pointer for writes */
+    DST_STRIDE  .req        r3      /* destination image stride */
+    SRC         .req        r4      /* source buffer pointer */
+    SRC_STRIDE  .req        r5      /* source image stride */
+    DST_R       .req        r6      /* destination buffer pointer for reads */
+
+    MASK        .req        r7      /* mask pointer */
+    MASK_STRIDE .req        r8      /* mask stride */
+
+    PF_CTL      .req        r9      /* combined lines counter and prefetch */
+                                    /* distance increment counter */
+    PF_X        .req        r10     /* pixel index in a scanline for current */
+                                    /* prefetch position */
+    PF_SRC      .req        r11     /* pointer to source scanline start */
+                                    /* for prefetch purposes */
+    PF_DST      .req        r12     /* pointer to destination scanline start */
+                                    /* for prefetch purposes */
+    PF_MASK     .req        r14     /* pointer to mask scanline start */
+                                    /* for prefetch purposes */
+/*
+ * Check whether we have enough registers for all the local variables.
+ * If we don't have enough registers, original width and height are
+ * kept on top of stack (and 'regs_shortage' variable is set to indicate
+ * this for the rest of code). Even if there are enough registers, the
+ * allocation scheme may be a bit different depending on whether source
+ * or mask is not used.
+ */
+.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
+    ORIG_W      .req        r10     /* saved original width */
+    DUMMY       .req        r12     /* temporary register */
+    .set        regs_shortage, 0
+.elseif mask_bpp == 0
+    ORIG_W      .req        r7      /* saved original width */
+    DUMMY       .req        r8      /* temporary register */
+    .set        regs_shortage, 0
+.elseif src_bpp == 0
+    ORIG_W      .req        r4      /* saved original width */
+    DUMMY       .req        r5      /* temporary register */
+    .set        regs_shortage, 0
+.else
+    ORIG_W      .req        r1      /* original width, reloaded from stack on use (r1 is also H) */
+    DUMMY       .req        r1      /* temporary register (same r1) */
+    .set        regs_shortage, 1
+.endif
+
+    .set mask_bpp_shift, -1
+.if src_bpp == 32
+    .set src_bpp_shift, 2
+.elseif src_bpp == 24
+    .set src_bpp_shift, 0
+.elseif src_bpp == 16
+    .set src_bpp_shift, 1
+.elseif src_bpp == 8
+    .set src_bpp_shift, 0
+.elseif src_bpp == 0
+    .set src_bpp_shift, -1
+.else
+    .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+    .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+    .set mask_bpp_shift, -1
+.else
+    .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+    .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+    .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+    .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+    .set dst_bpp_shift, 0
+.else
+    .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15 /* must fit into the low 4 bits of PF_CTL */
+    .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+.if src_bpp > 0
+    ldr         SRC, [sp, #40] /* stack arguments start right above the 40 bytes pushed on entry */
+.endif
+.if mask_bpp > 0
+    ldr         MASK, [sp, #48]
+.endif
+    PF mov      PF_X, #0
+.if src_bpp > 0
+    ldr         SRC_STRIDE, [sp, #44]
+.endif
+.if mask_bpp > 0
+    ldr         MASK_STRIDE, [sp, #52]
+.endif
+    mov         DST_R, DST_W
+
+.if src_bpp == 24 /* 24bpp: pointers are not rewound per scanline, so W * 3 is taken off the stride here */
+    sub         SRC_STRIDE, SRC_STRIDE, W
+    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+    sub         MASK_STRIDE, MASK_STRIDE, W
+    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+    sub         DST_STRIDE, DST_STRIDE, W
+    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+    PF mov      PF_SRC, SRC
+    PF mov      PF_DST, DST_R
+    PF mov      PF_MASK, MASK
+    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+    PF mov      PF_CTL, H, lsl #4
+    PF add      PF_CTL, #(prefetch_distance - 0x10)
+
+    init
+.if regs_shortage
+    push        {r0, r1} /* keep W at [sp] and H at [sp, #4] */
+.endif
+    subs        H, H, #1
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.else
+    mov         ORIG_W, W
+.endif
+    blt         9f /* flags still come from the subs above: height was not positive */
+    cmp         W, #(pixblock_size * 2)
+    blt         8f
+/*
+ * This is the start of the pipelined loop, which is optimized for
+ * long scanlines
+ */
+0:
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    pixld       pixblock_size, src_bpp, \
+                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    PF add      PF_X, PF_X, #pixblock_size
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    subs        W, W, #(pixblock_size * 2)
+    blt         2f
+1:
+    process_pixblock_tail_head
+    cache_preload_simple
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 1, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 0b
+
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+/*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch are used.
+ */
+8:
+    /* Process exactly pixblock_size pixels if needed */
+    tst         W, #pixblock_size
+    beq         1f
+    pixld       pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    pixld       pixblock_size, src_bpp, \
+                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    process_pixblock_tail
+    pixst       pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+1:
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 8b
+9:
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      ORIG_W
+    .unreq      W
+    .unreq      H
+    .unreq      SRC_STRIDE
+    .unreq      DST_STRIDE
+    .unreq      MASK_STRIDE
+    .unreq      PF_CTL
+    .unreq      PF_X
+    .unreq      PF_SRC
+    .unreq      PF_DST
+    .unreq      PF_MASK
+    .unreq      DUMMY
+    .endfunc
+.endm
+
+/*
+ * A simplified variant of function generation template for a single
+ * scanline processing (for implementing pixman combine functions)
+ */
+.macro generate_composite_function_single_scanline fname, \
+                                                   src_bpp_, \
+                                                   mask_bpp_, \
+                                                   dst_w_bpp_, \
+                                                   flags, \
+                                                   pixblock_size_, \
+                                                   init, \
+                                                   cleanup, \
+                                                   process_pixblock_head, \
+                                                   process_pixblock_tail, \
+                                                   process_pixblock_tail_head, \
+                                                   dst_w_basereg_ = 28, \
+                                                   dst_r_basereg_ = 4, \
+                                                   src_basereg_   = 0, \
+                                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE /* no prefetch here; NOTE(review): src_bpp_shift, mask_bpp_shift and regs_shortage, tested by cache_preload, are not set in this macro - presumably left from an earlier generate_composite_function */
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req        r0      /* width (is updated during processing) */
+    DST_W       .req        r1      /* destination buffer pointer for writes */
+    SRC         .req        r2      /* source buffer pointer */
+    DST_R       .req        ip      /* destination buffer pointer for reads */
+    MASK        .req        r3      /* mask pointer */
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+    init
+    mov         DST_R, DST_W
+
+    cmp         W, #pixblock_size
+    blt         8f /* less than one full block: unaligned trailing-pixel path only */
+
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    subs        W, W, #pixblock_size
+    blt         7f /* alignment used up pixels and no full block remains */
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    pixld       pixblock_size, src_bpp, \
+                (src_basereg - pixblock_size * src_bpp / 64), SRC
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    subs        W, W, #pixblock_size
+    blt         2f
+1:
+    process_pixblock_tail_head
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+7:
+    /* Process the remaining trailing pixels in the scanline (dst aligned) */
+    process_trailing_pixels 0, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+    bx         lr  /* exit */
+8:
+    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+    bx          lr  /* exit */
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      W
+    .endfunc
+.endm
+
+.macro default_init /* empty init hook for functions that need no extra setup */
+.endm
+
+.macro default_cleanup /* empty cleanup hook, the counterpart of default_init */
+.endm
diff --git a/lib/pixman/pixman/pixman-arm-neon.c b/lib/pixman/pixman/pixman-arm-neon.c
index 8a2d72ea3..6808b3658 100644
--- a/lib/pixman/pixman/pixman-arm-neon.c
+++ b/lib/pixman/pixman/pixman-arm-neon.c
@@ -30,1670 +30,85 @@
 #include <config.h>
 #endif
 
-#include <arm_neon.h>
 #include <string.h>
 #include "pixman-private.h"
-
-/* Deal with an intrinsic that is defined differently in GCC */
-#if !defined(__ARMCC_VERSION) && !defined(__pld)
-#define __pld(_x) __builtin_prefetch (_x)
-#endif
-
-static force_inline uint8x8x4_t
-unpack0565 (uint16x8_t rgb)
-{
-    uint16x8_t gb, b;
-    uint8x8x4_t res;
-
-    res.val[3] = vdup_n_u8 (0);
-    gb = vshrq_n_u16 (rgb, 5);
-    b = vshrq_n_u16 (rgb, 5 + 6);
-
-    res.val[0] = vmovn_u16 (rgb);  /* get low 5 bits */
-    res.val[1] = vmovn_u16 (gb);   /* get mid 6 bits */
-    res.val[2] = vmovn_u16 (b);    /* get top 5 bits */
-
-    res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */
-    res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */
-    res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */
-
-    res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5);
-    res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6);
-    res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5);
-
-    return res;
-}
-
-#ifdef USE_GCC_INLINE_ASM
-/* Some versions of gcc have problems with vshll_n_u8 intrinsic (Bug 23576) */
-#define vshll_n_u8(a, n) ({ uint16x8_t r; \
-    asm ("vshll.u8 %q0, %P1, %2\n" : "=w" (r) : "w" (a), "i" (n)); r; })
-#endif
-
-static force_inline uint16x8_t
-pack0565 (uint8x8x4_t s)
-{
-    uint16x8_t rgb, val_g, val_r;
-
-    rgb = vshll_n_u8 (s.val[2], 8);
-    val_g = vshll_n_u8 (s.val[1], 8);
-    val_r = vshll_n_u8 (s.val[0], 8);
-    rgb = vsriq_n_u16 (rgb, val_g, 5);
-    rgb = vsriq_n_u16 (rgb, val_r, 5 + 6);
-
-    return rgb;
-}
-
-static force_inline uint8x8_t
-neon2mul (uint8x8_t x,
-          uint8x8_t alpha)
-{
-    uint16x8_t tmp, tmp2;
-    uint8x8_t res;
-
-    tmp = vmull_u8 (x, alpha);
-    tmp2 = vrshrq_n_u16 (tmp, 8);
-    res = vraddhn_u16 (tmp, tmp2);
-
-    return res;
-}
-
-static force_inline uint8x8x4_t
-neon8mul (uint8x8x4_t x,
-          uint8x8_t   alpha)
-{
-    uint16x8x4_t tmp;
-    uint8x8x4_t res;
-    uint16x8_t qtmp1, qtmp2;
-
-    tmp.val[0] = vmull_u8 (x.val[0], alpha);
-    tmp.val[1] = vmull_u8 (x.val[1], alpha);
-    tmp.val[2] = vmull_u8 (x.val[2], alpha);
-    tmp.val[3] = vmull_u8 (x.val[3], alpha);
-
-    qtmp1 = vrshrq_n_u16 (tmp.val[0], 8);
-    qtmp2 = vrshrq_n_u16 (tmp.val[1], 8);
-    res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1);
-    qtmp1 = vrshrq_n_u16 (tmp.val[2], 8);
-    res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2);
-    qtmp2 = vrshrq_n_u16 (tmp.val[3], 8);
-    res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1);
-    res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2);
-
-    return res;
-}
-
-static force_inline uint8x8x4_t
-neon8qadd (uint8x8x4_t x,
-           uint8x8x4_t y)
-{
-    uint8x8x4_t res;
-
-    res.val[0] = vqadd_u8 (x.val[0], y.val[0]);
-    res.val[1] = vqadd_u8 (x.val[1], y.val[1]);
-    res.val[2] = vqadd_u8 (x.val[2], y.val[2]);
-    res.val[3] = vqadd_u8 (x.val[3], y.val[3]);
-
-    return res;
-}
-
-static void
-neon_composite_add_8000_8000 (pixman_implementation_t * impl,
-                              pixman_op_t               op,
-                              pixman_image_t *          src_image,
-                              pixman_image_t *          mask_image,
-                              pixman_image_t *          dst_image,
-                              int32_t                   src_x,
-                              int32_t                   src_y,
-                              int32_t                   mask_x,
-                              int32_t                   mask_y,
-                              int32_t                   dest_x,
-                              int32_t                   dest_y,
-                              int32_t                   width,
-                              int32_t                   height)
-{
-    uint8_t     *dst_line, *dst;
-    uint8_t     *src_line, *src;
-    int dst_stride, src_stride;
-    uint16_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
-    if (width >= 8)
-    {
-	/* Use overlapping 8-pixel method */
-	while (height--)
-	{
-	    uint8_t *keep_dst = 0;
-	    uint8x8_t sval, dval, temp;
-
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    src = src_line;
-	    src_line += src_stride;
-	    w = width;
-
-#ifndef USE_GCC_INLINE_ASM
-	    sval = vld1_u8 ((void *)src);
-	    dval = vld1_u8 ((void *)dst);
-	    keep_dst = dst;
-
-	    temp = vqadd_u8 (dval, sval);
-
-	    src += (w & 7);
-	    dst += (w & 7);
-	    w -= (w & 7);
-
-	    while (w)
-	    {
-		sval = vld1_u8 ((void *)src);
-		dval = vld1_u8 ((void *)dst);
-
-		vst1_u8 ((void *)keep_dst, temp);
-		keep_dst = dst;
-
-		temp = vqadd_u8 (dval, sval);
-
-		src += 8;
-		dst += 8;
-		w -= 8;
-	    }
-
-	    vst1_u8 ((void *)keep_dst, temp);
-#else
-	    asm volatile (
-/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
-	        "vld1.8  {d0}, [%[src]]\n\t"
-	        "vld1.8  {d4}, [%[dst]]\n\t"
-	        "mov     %[keep_dst], %[dst]\n\t"
-
-	        "and ip, %[w], #7\n\t"
-	        "add %[src], %[src], ip\n\t"
-	        "add %[dst], %[dst], ip\n\t"
-	        "subs %[w], %[w], ip\n\t"
-	        "b 9f\n\t"
-/* LOOP */
-	        "2:\n\t"
-	        "vld1.8  {d0}, [%[src]]!\n\t"
-	        "vld1.8  {d4}, [%[dst]]!\n\t"
-	        "vst1.8  {d20}, [%[keep_dst]]\n\t"
-	        "sub     %[keep_dst], %[dst], #8\n\t"
-	        "subs %[w], %[w], #8\n\t"
-	        "9:\n\t"
-	        "vqadd.u8 d20, d0, d4\n\t"
-
-	        "bne 2b\n\t"
-
-	        "1:\n\t"
-	        "vst1.8  {d20}, [%[keep_dst]]\n\t"
-
-		: [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
-		:
-		: "ip", "cc", "memory", "d0", "d4",
-	        "d20"
-	        );
-#endif
-	}
-    }
-    else
-    {
-	const uint8_t nil = 0;
-	const uint8x8_t vnil = vld1_dup_u8 (&nil);
-
-	while (height--)
-	{
-	    uint8x8_t sval = vnil, dval = vnil;
-	    uint8_t *dst4 = 0, *dst2 = 0;
-
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    src = src_line;
-	    src_line += src_stride;
-	    w = width;
-
-	    if (w & 4)
-	    {
-		sval = vreinterpret_u8_u32 (
-		    vld1_lane_u32 ((void *)src, vreinterpret_u32_u8 (sval), 1));
-		dval = vreinterpret_u8_u32 (
-		    vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
-
-		dst4 = dst;
-		src += 4;
-		dst += 4;
-	    }
-
-	    if (w & 2)
-	    {
-		sval = vreinterpret_u8_u16 (
-		    vld1_lane_u16 ((void *)src, vreinterpret_u16_u8 (sval), 1));
-		dval = vreinterpret_u8_u16 (
-		    vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
-
-		dst2 = dst;
-		src += 2;
-		dst += 2;
-	    }
-
-	    if (w & 1)
-	    {
-		sval = vld1_lane_u8 (src, sval, 1);
-		dval = vld1_lane_u8 (dst, dval, 1);
-	    }
-
-	    dval = vqadd_u8 (dval, sval);
-
-	    if (w & 1)
-		vst1_lane_u8 (dst, dval, 1);
-
-	    if (w & 2)
-		vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (dval), 1);
-
-	    if (w & 4)
-		vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (dval), 1);
-	}
-    }
-}
-
-static void
-neon_composite_over_8888_8888 (pixman_implementation_t * impl,
-                               pixman_op_t               op,
-                               pixman_image_t *          src_image,
-                               pixman_image_t *          mask_image,
-                               pixman_image_t *          dst_image,
-                               int32_t                   src_x,
-                               int32_t                   src_y,
-                               int32_t                   mask_x,
-                               int32_t                   mask_y,
-                               int32_t                   dest_x,
-                               int32_t                   dest_y,
-                               int32_t                   width,
-                               int32_t                   height)
-{
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    int dst_stride, src_stride;
-    uint32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    if (width >= 8)
-    {
-	/* Use overlapping 8-pixel method */
-	while (height--)
-	{
-	    uint32_t *keep_dst = 0;
-	    uint8x8x4_t sval, dval, temp;
-
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    src = src_line;
-	    src_line += src_stride;
-	    w = width;
-
-#ifndef USE_GCC_INLINE_ASM
-	    sval = vld4_u8 ((void *)src);
-	    dval = vld4_u8 ((void *)dst);
-	    keep_dst = dst;
-
-	    temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
-	    temp = neon8qadd (sval, temp);
-
-	    src += (w & 7);
-	    dst += (w & 7);
-	    w -= (w & 7);
-
-	    while (w)
-	    {
-		sval = vld4_u8 ((void *)src);
-		dval = vld4_u8 ((void *)dst);
-
-		vst4_u8 ((void *)keep_dst, temp);
-		keep_dst = dst;
-
-		temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
-		temp = neon8qadd (sval, temp);
-
-		src += 8;
-		dst += 8;
-		w -= 8;
-	    }
-
-	    vst4_u8 ((void *)keep_dst, temp);
-#else
-	    asm volatile (
-/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
-	        "vld4.8  {d0-d3}, [%[src]]\n\t"
-	        "vld4.8  {d4-d7}, [%[dst]]\n\t"
-	        "mov     %[keep_dst], %[dst]\n\t"
-
-	        "and ip, %[w], #7\n\t"
-	        "add %[src], %[src], ip, LSL#2\n\t"
-	        "add %[dst], %[dst], ip, LSL#2\n\t"
-	        "subs %[w], %[w], ip\n\t"
-	        "b 9f\n\t"
-/* LOOP */
-	        "2:\n\t"
-	        "vld4.8  {d0-d3}, [%[src]]!\n\t"
-	        "vld4.8  {d4-d7}, [%[dst]]!\n\t"
-	        "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
-	        "sub     %[keep_dst], %[dst], #8*4\n\t"
-	        "subs %[w], %[w], #8\n\t"
-	        "9:\n\t"
-	        "vmvn.8  d31, d3\n\t"
-	        "vmull.u8 q10, d31, d4\n\t"
-	        "vmull.u8 q11, d31, d5\n\t"
-	        "vmull.u8 q12, d31, d6\n\t"
-	        "vmull.u8 q13, d31, d7\n\t"
-	        "vrshr.u16 q8, q10, #8\n\t"
-	        "vrshr.u16 q9, q11, #8\n\t"
-	        "vraddhn.u16 d20, q10, q8\n\t"
-	        "vraddhn.u16 d21, q11, q9\n\t"
-	        "vrshr.u16 q8, q12, #8\n\t"
-	        "vrshr.u16 q9, q13, #8\n\t"
-	        "vraddhn.u16 d22, q12, q8\n\t"
-	        "vraddhn.u16 d23, q13, q9\n\t"
-/* result in d20-d23 */
-	        "vqadd.u8 d20, d0, d20\n\t"
-	        "vqadd.u8 d21, d1, d21\n\t"
-	        "vqadd.u8 d22, d2, d22\n\t"
-	        "vqadd.u8 d23, d3, d23\n\t"
-
-	        "bne 2b\n\t"
-
-	        "1:\n\t"
-	        "vst4.8  {d20-d23}, [%[keep_dst]]\n\t"
-
-		: [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
-		:
-		: "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
-	        "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
-	        );
-#endif
-	}
-    }
-    else
-    {
-	uint8x8_t alpha_selector = vreinterpret_u8_u64 (
-	    vcreate_u64 (0x0707070703030303ULL));
-
-	/* Handle width < 8 */
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    src = src_line;
-	    src_line += src_stride;
-	    w = width;
-
-	    while (w >= 2)
-	    {
-		uint8x8_t sval, dval;
-
-		/* two 32-bit pixels packed into D-reg; ad-hoc vectorization */
-		sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
-		dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
-		dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
-		vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
-
-		src += 2;
-		dst += 2;
-		w -= 2;
-	    }
-
-	    if (w)
-	    {
-		uint8x8_t sval, dval;
-
-		/* single 32-bit pixel in lane 0 */
-		sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));  /* only interested in lane 0 */
-		dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));  /* only interested in lane 0 */
-		dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
-		vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
-	    }
-	}
-    }
-}
-
-static void
-neon_composite_over_8888_n_8888 (pixman_implementation_t * impl,
-                                 pixman_op_t               op,
-                                 pixman_image_t *          src_image,
-                                 pixman_image_t *          mask_image,
-                                 pixman_image_t *          dst_image,
-                                 int32_t                   src_x,
-                                 int32_t                   src_y,
-                                 int32_t                   mask_x,
-                                 int32_t                   mask_y,
-                                 int32_t                   dest_x,
-                                 int32_t                   dest_y,
-                                 int32_t                   width,
-                                 int32_t                   height)
-{
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    uint32_t mask;
-    int dst_stride, src_stride;
-    uint32_t w;
-    uint8x8_t mask_alpha;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
-    mask_alpha = vdup_n_u8 ((mask) >> 24);
-
-    if (width >= 8)
-    {
-	/* Use overlapping 8-pixel method */
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    src = src_line;
-	    src_line += src_stride;
-	    w = width;
-
-	    uint32_t *keep_dst = 0;
-
-#ifndef USE_GCC_INLINE_ASM
-	    uint8x8x4_t sval, dval, temp;
-
-	    sval = vld4_u8 ((void *)src);
-	    dval = vld4_u8 ((void *)dst);
-	    keep_dst = dst;
-
-	    sval = neon8mul (sval, mask_alpha);
-	    temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
-	    temp = neon8qadd (sval, temp);
-
-	    src += (w & 7);
-	    dst += (w & 7);
-	    w -= (w & 7);
-
-	    while (w)
-	    {
-		sval = vld4_u8 ((void *)src);
-		dval = vld4_u8 ((void *)dst);
-
-		vst4_u8 ((void *)keep_dst, temp);
-		keep_dst = dst;
-
-		sval = neon8mul (sval, mask_alpha);
-		temp = neon8mul (dval, vmvn_u8 (sval.val[3]));
-		temp = neon8qadd (sval, temp);
-
-		src += 8;
-		dst += 8;
-		w -= 8;
-	    }
-	    vst4_u8 ((void *)keep_dst, temp);
-#else
-	    asm volatile (
-/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */
-	        "vdup.32      d30, %[mask]\n\t"
-	        "vdup.8       d30, d30[3]\n\t"
-
-	        "vld4.8       {d0-d3}, [%[src]]\n\t"
-	        "vld4.8       {d4-d7}, [%[dst]]\n\t"
-	        "mov  %[keep_dst], %[dst]\n\t"
-
-	        "and  ip, %[w], #7\n\t"
-	        "add  %[src], %[src], ip, LSL#2\n\t"
-	        "add  %[dst], %[dst], ip, LSL#2\n\t"
-	        "subs  %[w], %[w], ip\n\t"
-	        "b 9f\n\t"
-/* LOOP */
-	        "2:\n\t"
-	        "vld4.8       {d0-d3}, [%[src]]!\n\t"
-	        "vld4.8       {d4-d7}, [%[dst]]!\n\t"
-	        "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
-	        "sub  %[keep_dst], %[dst], #8*4\n\t"
-	        "subs  %[w], %[w], #8\n\t"
-
-	        "9:\n\t"
-	        "vmull.u8     q10, d30, d0\n\t"
-	        "vmull.u8     q11, d30, d1\n\t"
-	        "vmull.u8     q12, d30, d2\n\t"
-	        "vmull.u8     q13, d30, d3\n\t"
-	        "vrshr.u16    q8, q10, #8\n\t"
-	        "vrshr.u16    q9, q11, #8\n\t"
-	        "vraddhn.u16  d0, q10, q8\n\t"
-	        "vraddhn.u16  d1, q11, q9\n\t"
-	        "vrshr.u16    q9, q13, #8\n\t"
-	        "vrshr.u16    q8, q12, #8\n\t"
-	        "vraddhn.u16  d3, q13, q9\n\t"
-	        "vraddhn.u16  d2, q12, q8\n\t"
-
-	        "vmvn.8       d31, d3\n\t"
-	        "vmull.u8     q10, d31, d4\n\t"
-	        "vmull.u8     q11, d31, d5\n\t"
-	        "vmull.u8     q12, d31, d6\n\t"
-	        "vmull.u8     q13, d31, d7\n\t"
-	        "vrshr.u16    q8, q10, #8\n\t"
-	        "vrshr.u16    q9, q11, #8\n\t"
-	        "vraddhn.u16  d20, q10, q8\n\t"
-	        "vrshr.u16    q8, q12, #8\n\t"
-	        "vraddhn.u16  d21, q11, q9\n\t"
-	        "vrshr.u16    q9, q13, #8\n\t"
-	        "vraddhn.u16  d22, q12, q8\n\t"
-	        "vraddhn.u16  d23, q13, q9\n\t"
-
-/* result in d20-d23 */
-	        "vqadd.u8     d20, d0, d20\n\t"
-	        "vqadd.u8     d21, d1, d21\n\t"
-	        "vqadd.u8     d22, d2, d22\n\t"
-	        "vqadd.u8     d23, d3, d23\n\t"
-
-	        "bne  2b\n\t"
-
-	        "1:\n\t"
-	        "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
-
-		: [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst)
-		: [mask] "r" (mask)
-		: "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
-	        "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27",
-	        "d30", "d31"
-	        );
-#endif
-	}
-    }
-    else
-    {
-	uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
-
-	/* Handle width < 8 */
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    src = src_line;
-	    src_line += src_stride;
-	    w = width;
-
-	    while (w >= 2)
-	    {
-		uint8x8_t sval, dval;
-
-		sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src));
-		dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst));
-
-		/* sval * const alpha_mul */
-		sval = neon2mul (sval, mask_alpha);
-
-		/* dval * 255-(src alpha) */
-		dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
-
-		vst1_u8 ((void *)dst, vqadd_u8 (sval, dval));
-
-		src += 2;
-		dst += 2;
-		w -= 2;
-	    }
-
-	    if (w)
-	    {
-		uint8x8_t sval, dval;
-
-		sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src));
-		dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
-
-		/* sval * const alpha_mul */
-		sval = neon2mul (sval, mask_alpha);
-
-		/* dval * 255-(src alpha) */
-		dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector));
-
-		vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0);
-	    }
-	}
-    }
-}
-
-static void
-neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
-			      pixman_op_t               op,
-			      pixman_image_t *          src_image,
-			      pixman_image_t *          mask_image,
-			      pixman_image_t *          dst_image,
-			      int32_t                   src_x,
-			      int32_t                   src_y,
-			      int32_t                   mask_x,
-			      int32_t                   mask_y,
-			      int32_t                   dest_x,
-			      int32_t                   dest_y,
-			      int32_t                   width,
-			      int32_t                   height)
-{
-    uint32_t     src, srca;
-    uint16_t    *dst_line, *dst;
-    uint8_t     *mask_line, *mask;
-    int          dst_stride, mask_stride;
-    uint32_t     w;
-    uint8x8_t    sval2;
-    uint8x8x4_t  sval8;
-
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    sval2=vreinterpret_u8_u32 (vdup_n_u32 (src));
-    sval8.val[0]=vdup_lane_u8 (sval2,0);
-    sval8.val[1]=vdup_lane_u8 (sval2,1);
-    sval8.val[2]=vdup_lane_u8 (sval2,2);
-    sval8.val[3]=vdup_lane_u8 (sval2,3);
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    if (width>=8)
-    {
-	/* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
-	while (height--)
-	{
-	    uint16_t *keep_dst=0;
-
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-#ifndef USE_GCC_INLINE_ASM
-	    uint8x8_t alpha;
-	    uint16x8_t dval, temp;
-	    uint8x8x4_t sval8temp;
-
-	    alpha = vld1_u8 ((void *)mask);
-	    dval = vld1q_u16 ((void *)dst);
-	    keep_dst = dst;
-
-	    sval8temp = neon8mul (sval8, alpha);
-	    temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
-
-	    mask += (w & 7);
-	    dst += (w & 7);
-	    w -= (w & 7);
-
-	    while (w)
-	    {
-		dval = vld1q_u16 ((void *)dst);
-		alpha = vld1_u8 ((void *)mask);
-
-		vst1q_u16 ((void *)keep_dst, temp);
-		keep_dst = dst;
-
-		sval8temp = neon8mul (sval8, alpha);
-		temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
-
-		mask+=8;
-		dst+=8;
-		w-=8;
-	    }
-	    vst1q_u16 ((void *)keep_dst, temp);
-#else
-	    asm volatile (
-		"vdup.32      d0, %[src]\n\t"
-		"vdup.8       d1, d0[1]\n\t"
-		"vdup.8       d2, d0[2]\n\t"
-		"vdup.8       d3, d0[3]\n\t"
-		"vdup.8       d0, d0[0]\n\t"
-
-		"vld1.8       {q12}, [%[dst]]\n\t"
-		"vld1.8       {d31}, [%[mask]]\n\t"
-		"mov  %[keep_dst], %[dst]\n\t"
-
-		"and  ip, %[w], #7\n\t"
-		"add  %[mask], %[mask], ip\n\t"
-		"add  %[dst], %[dst], ip, LSL#1\n\t"
-		"subs  %[w], %[w], ip\n\t"
-		"b  9f\n\t"
-/* LOOP */
-		"2:\n\t"
-
-		"vld1.16      {q12}, [%[dst]]!\n\t"
-		"vld1.8       {d31}, [%[mask]]!\n\t"
-		"vst1.16      {q10}, [%[keep_dst]]\n\t"
-		"sub  %[keep_dst], %[dst], #8*2\n\t"
-		"subs  %[w], %[w], #8\n\t"
-		"9:\n\t"
-/* expand 0565 q12 to 8888 {d4-d7} */
-		"vmovn.u16    d4, q12\t\n"
-		"vshr.u16     q11, q12, #5\t\n"
-		"vshr.u16     q10, q12, #6+5\t\n"
-		"vmovn.u16    d5, q11\t\n"
-		"vmovn.u16    d6, q10\t\n"
-		"vshl.u8      d4, d4, #3\t\n"
-		"vshl.u8      d5, d5, #2\t\n"
-		"vshl.u8      d6, d6, #3\t\n"
-		"vsri.u8      d4, d4, #5\t\n"
-		"vsri.u8      d5, d5, #6\t\n"
-		"vsri.u8      d6, d6, #5\t\n"
-
-		"vmull.u8     q10, d31, d0\n\t"
-		"vmull.u8     q11, d31, d1\n\t"
-		"vmull.u8     q12, d31, d2\n\t"
-		"vmull.u8     q13, d31, d3\n\t"
-		"vrshr.u16    q8, q10, #8\n\t"
-		"vrshr.u16    q9, q11, #8\n\t"
-		"vraddhn.u16  d20, q10, q8\n\t"
-		"vraddhn.u16  d21, q11, q9\n\t"
-		"vrshr.u16    q9, q13, #8\n\t"
-		"vrshr.u16    q8, q12, #8\n\t"
-		"vraddhn.u16  d23, q13, q9\n\t"
-		"vraddhn.u16  d22, q12, q8\n\t"
-
-/* duplicate in 4/2/1 & 8pix vsns */
-		"vmvn.8       d30, d23\n\t"
-		"vmull.u8     q14, d30, d6\n\t"
-		"vmull.u8     q13, d30, d5\n\t"
-		"vmull.u8     q12, d30, d4\n\t"
-		"vrshr.u16    q8, q14, #8\n\t"
-		"vrshr.u16    q9, q13, #8\n\t"
-		"vraddhn.u16  d6, q14, q8\n\t"
-		"vrshr.u16    q8, q12, #8\n\t"
-		"vraddhn.u16  d5, q13, q9\n\t"
-		"vqadd.u8     d6, d6, d22\n\t"  /* moved up */
-		"vraddhn.u16  d4, q12, q8\n\t"
-/* intentionally don't calculate alpha */
-/* result in d4-d6 */
-
-/*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
-		"vqadd.u8     d5, d5, d21\n\t"
-		"vqadd.u8     d4, d4, d20\n\t"
-
-/* pack 8888 {d20-d23} to 0565 q10 */
-		"vshll.u8     q10, d6, #8\n\t"
-		"vshll.u8     q3, d5, #8\n\t"
-		"vshll.u8     q2, d4, #8\n\t"
-		"vsri.u16     q10, q3, #5\t\n"
-		"vsri.u16     q10, q2, #11\t\n"
-
-		"bne 2b\n\t"
-
-		"1:\n\t"
-		"vst1.16      {q10}, [%[keep_dst]]\n\t"
-
-		: [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
-		: [src] "r" (src)
-		: "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
-		  "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
-		  "d30","d31"
-		);
-#endif
-	}
-    }
-    else
-    {
-	while (height--)
-	{
-	    void *dst4=0, *dst2=0;
-
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-
-#if 1 /* #ifndef USE_GCC_INLINE_ASM */
-	    uint8x8_t alpha;
-	    uint16x8_t dval, temp;
-	    uint8x8x4_t sval8temp;
-
-	    if (w&4)
-	    {
-		alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (alpha),1));
-		dval = vreinterpretq_u16_u64 (vld1q_lane_u64 ((void *)dst, vreinterpretq_u64_u16 (dval),1));
-		dst4=dst;
-		mask+=4;
-		dst+=4;
-	    }
-	    if (w&2)
-	    {
-		alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (alpha),1));
-		dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void *)dst, vreinterpretq_u32_u16 (dval),1));
-		dst2=dst;
-		mask+=2;
-		dst+=2;
-	    }
-	    if (w&1)
-	    {
-		alpha = vld1_lane_u8 ((void *)mask, alpha,1);
-		dval = vld1q_lane_u16 ((void *)dst, dval,1);
-	    }
-
-	    sval8temp = neon8mul (sval8, alpha);
-	    temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3]))));
-
-	    if (w&1)
-		vst1q_lane_u16 ((void *)dst, temp,1);
-	    if (w&2)
-		vst1q_lane_u32 ((void *)dst2, vreinterpretq_u32_u16 (temp),1);
-	    if (w&4)
-		vst1q_lane_u64 ((void *)dst4, vreinterpretq_u64_u16 (temp),1);
-#else
-	    /* this code has some bug (does not pass blitters-test) */
-	    asm volatile (
-		"vdup.32      d0, %[src]\n\t"
-		"vdup.8       d1, d0[1]\n\t"
-		"vdup.8       d2, d0[2]\n\t"
-		"vdup.8       d3, d0[3]\n\t"
-		"vdup.8       d0, d0[0]\n\t"
-
-		"tst  %[w], #4\t\n"
-		"beq  skip_load4\t\n"
-
-		"vld1.64      {d25}, [%[dst]]\n\t"
-		"vld1.32      {d31[1]}, [%[mask]]\n\t"
-		"mov  %[dst4], %[dst]\t\n"
-		"add  %[mask], %[mask], #4\t\n"
-		"add  %[dst], %[dst], #4*2\t\n"
-
-		"skip_load4:\t\n"
-		"tst  %[w], #2\t\n"
-		"beq  skip_load2\t\n"
-		"vld1.32      {d24[1]}, [%[dst]]\n\t"
-		"vld1.16      {d31[1]}, [%[mask]]\n\t"
-		"mov  %[dst2], %[dst]\t\n"
-		"add  %[mask], %[mask], #2\t\n"
-		"add  %[dst], %[dst], #2*2\t\n"
-
-		"skip_load2:\t\n"
-		"tst  %[w], #1\t\n"
-		"beq  skip_load1\t\n"
-		"vld1.16      {d24[1]}, [%[dst]]\n\t"
-		"vld1.8       {d31[1]}, [%[mask]]\n\t"
-
-		"skip_load1:\t\n"
-/* expand 0565 q12 to 8888 {d4-d7} */
-		"vmovn.u16    d4, q12\t\n"
-		"vshr.u16     q11, q12, #5\t\n"
-		"vshr.u16     q10, q12, #6+5\t\n"
-		"vmovn.u16    d5, q11\t\n"
-		"vmovn.u16    d6, q10\t\n"
-		"vshl.u8      d4, d4, #3\t\n"
-		"vshl.u8      d5, d5, #2\t\n"
-		"vshl.u8      d6, d6, #3\t\n"
-		"vsri.u8      d4, d4, #5\t\n"
-		"vsri.u8      d5, d5, #6\t\n"
-		"vsri.u8      d6, d6, #5\t\n"
-
-		"vmull.u8     q10, d31, d0\n\t"
-		"vmull.u8     q11, d31, d1\n\t"
-		"vmull.u8     q12, d31, d2\n\t"
-		"vmull.u8     q13, d31, d3\n\t"
-		"vrshr.u16    q8, q10, #8\n\t"
-		"vrshr.u16    q9, q11, #8\n\t"
-		"vraddhn.u16  d20, q10, q8\n\t"
-		"vraddhn.u16  d21, q11, q9\n\t"
-		"vrshr.u16    q9, q13, #8\n\t"
-		"vrshr.u16    q8, q12, #8\n\t"
-		"vraddhn.u16  d23, q13, q9\n\t"
-		"vraddhn.u16  d22, q12, q8\n\t"
-
-/* duplicate in 4/2/1 & 8pix vsns */
-		"vmvn.8       d30, d23\n\t"
-		"vmull.u8     q14, d30, d6\n\t"
-		"vmull.u8     q13, d30, d5\n\t"
-		"vmull.u8     q12, d30, d4\n\t"
-		"vrshr.u16    q8, q14, #8\n\t"
-		"vrshr.u16    q9, q13, #8\n\t"
-		"vraddhn.u16  d6, q14, q8\n\t"
-		"vrshr.u16    q8, q12, #8\n\t"
-		"vraddhn.u16  d5, q13, q9\n\t"
-		"vqadd.u8     d6, d6, d22\n\t"  /* moved up */
-		"vraddhn.u16  d4, q12, q8\n\t"
-/* intentionally don't calculate alpha */
-/* result in d4-d6 */
-
-/*              "vqadd.u8     d6, d6, d22\n\t"  ** moved up */
-		"vqadd.u8     d5, d5, d21\n\t"
-		"vqadd.u8     d4, d4, d20\n\t"
-
-/* pack 8888 {d20-d23} to 0565 q10 */
-		"vshll.u8     q10, d6, #8\n\t"
-		"vshll.u8     q3, d5, #8\n\t"
-		"vshll.u8     q2, d4, #8\n\t"
-		"vsri.u16     q10, q3, #5\t\n"
-		"vsri.u16     q10, q2, #11\t\n"
-
-		"tst  %[w], #1\n\t"
-		"beq skip_store1\t\n"
-		"vst1.16      {d20[1]}, [%[dst]]\t\n"
-		"skip_store1:\t\n"
-		"tst  %[w], #2\n\t"
-		"beq  skip_store2\t\n"
-		"vst1.32      {d20[1]}, [%[dst2]]\t\n"
-		"skip_store2:\t\n"
-		"tst  %[w], #4\n\t"
-		"beq skip_store4\t\n"
-		"vst1.16      {d21}, [%[dst4]]\t\n"
-		"skip_store4:\t\n"
-
-		: [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2)
-		: [src] "r" (src)
-		: "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
-		  "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
-		  "d30","d31"
-		);
-#endif
-	}
-    }
-}
-
-static void
-neon_composite_over_n_8_8888 (pixman_implementation_t * impl,
-                              pixman_op_t               op,
-                              pixman_image_t *          src_image,
-                              pixman_image_t *          mask_image,
-                              pixman_image_t *          dst_image,
-                              int32_t                   src_x,
-                              int32_t                   src_y,
-                              int32_t                   mask_x,
-                              int32_t                   mask_y,
-                              int32_t                   dest_x,
-                              int32_t                   dest_y,
-                              int32_t                   width,
-                              int32_t                   height)
-{
-    uint32_t src, srca;
-    uint32_t    *dst_line, *dst;
-    uint8_t     *mask_line, *mask;
-    int dst_stride, mask_stride;
-    uint32_t w;
-    uint8x8_t sval2;
-    uint8x8x4_t sval8;
-    uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL));
-    uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL));
-
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-    
-    /* bail out if fully transparent */
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src));
-    sval8.val[0] = vdup_lane_u8 (sval2, 0);
-    sval8.val[1] = vdup_lane_u8 (sval2, 1);
-    sval8.val[2] = vdup_lane_u8 (sval2, 2);
-    sval8.val[3] = vdup_lane_u8 (sval2, 3);
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    if (width >= 8)
-    {
-	/* Use overlapping 8-pixel method, modified to avoid
-	 * rewritten dest being reused
-	 */
-	while (height--)
-	{
-	    uint32_t *keep_dst = 0;
-
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-#ifndef USE_GCC_INLINE_ASM
-	    uint8x8_t alpha;
-	    uint8x8x4_t dval, temp;
-
-	    alpha = vld1_u8 ((void *)mask);
-	    dval = vld4_u8 ((void *)dst);
-	    keep_dst = dst;
-
-	    temp = neon8mul (sval8, alpha);
-	    dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
-	    temp = neon8qadd (temp, dval);
-
-	    mask += (w & 7);
-	    dst += (w & 7);
-	    w -= (w & 7);
-
-	    while (w)
-	    {
-		alpha = vld1_u8 ((void *)mask);
-		dval = vld4_u8 ((void *)dst);
-
-		vst4_u8 ((void *)keep_dst, temp);
-		keep_dst = dst;
-
-		temp = neon8mul (sval8, alpha);
-		dval = neon8mul (dval, vmvn_u8 (temp.val[3]));
-		temp = neon8qadd (temp, dval);
-
-		mask += 8;
-		dst += 8;
-		w -= 8;
-	    }
-	    vst4_u8 ((void *)keep_dst, temp);
-#else
-	    asm volatile (
-	        "vdup.32      d0, %[src]\n\t"
-	        "vdup.8       d1, d0[1]\n\t"
-	        "vdup.8       d2, d0[2]\n\t"
-	        "vdup.8       d3, d0[3]\n\t"
-	        "vdup.8       d0, d0[0]\n\t"
-
-	        "vld4.8       {d4-d7}, [%[dst]]\n\t"
-	        "vld1.8       {d31}, [%[mask]]\n\t"
-	        "mov  %[keep_dst], %[dst]\n\t"
-
-	        "and  ip, %[w], #7\n\t"
-	        "add  %[mask], %[mask], ip\n\t"
-	        "add  %[dst], %[dst], ip, LSL#2\n\t"
-	        "subs  %[w], %[w], ip\n\t"
-	        "b 9f\n\t"
-/* LOOP */
-	        "2:\n\t"
-	        "vld4.8       {d4-d7}, [%[dst]]!\n\t"
-	        "vld1.8       {d31}, [%[mask]]!\n\t"
-	        "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
-	        "sub  %[keep_dst], %[dst], #8*4\n\t"
-	        "subs  %[w], %[w], #8\n\t"
-	        "9:\n\t"
-
-	        "vmull.u8     q10, d31, d0\n\t"
-	        "vmull.u8     q11, d31, d1\n\t"
-	        "vmull.u8     q12, d31, d2\n\t"
-	        "vmull.u8     q13, d31, d3\n\t"
-	        "vrshr.u16    q8, q10, #8\n\t"
-	        "vrshr.u16    q9, q11, #8\n\t"
-	        "vraddhn.u16  d20, q10, q8\n\t"
-	        "vraddhn.u16  d21, q11, q9\n\t"
-	        "vrshr.u16    q9, q13, #8\n\t"
-	        "vrshr.u16    q8, q12, #8\n\t"
-	        "vraddhn.u16  d23, q13, q9\n\t"
-	        "vraddhn.u16  d22, q12, q8\n\t"
-
-	        "vmvn.8       d30, d23\n\t"
-	        "vmull.u8     q12, d30, d4\n\t"
-	        "vmull.u8     q13, d30, d5\n\t"
-	        "vmull.u8     q14, d30, d6\n\t"
-	        "vmull.u8     q15, d30, d7\n\t"
-
-	        "vrshr.u16    q8, q12, #8\n\t"
-	        "vrshr.u16    q9, q13, #8\n\t"
-	        "vraddhn.u16  d4, q12, q8\n\t"
-	        "vrshr.u16    q8, q14, #8\n\t"
-	        "vraddhn.u16  d5, q13, q9\n\t"
-	        "vrshr.u16    q9, q15, #8\n\t"
-	        "vraddhn.u16  d6, q14, q8\n\t"
-	        "vraddhn.u16  d7, q15, q9\n\t"
-/* result in d4-d7 */
-
-	        "vqadd.u8     d20, d4, d20\n\t"
-	        "vqadd.u8     d21, d5, d21\n\t"
-	        "vqadd.u8     d22, d6, d22\n\t"
-	        "vqadd.u8     d23, d7, d23\n\t"
-
-	        "bne 2b\n\t"
-
-	        "1:\n\t"
-	        "vst4.8       {d20-d23}, [%[keep_dst]]\n\t"
-
-		: [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst)
-		: [src] "r" (src)
-		: "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
-	        "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
-	        "d30", "d31"
-	        );
-#endif
-	}
-    }
-    else
-    {
-	while (height--)
-	{
-	    uint8x8_t alpha;
-
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    while (w >= 2)
-	    {
-		uint8x8_t dval, temp, res;
-
-		alpha = vtbl1_u8 (
-		    vreinterpret_u8_u16 (vld1_dup_u16 ((void *)mask)), mask_selector);
-		dval = vld1_u8 ((void *)dst);
-
-		temp = neon2mul (sval2, alpha);
-		res = vqadd_u8 (
-		    temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
-
-		vst1_u8 ((void *)dst, res);
-
-		mask += 2;
-		dst += 2;
-		w -= 2;
-	    }
-
-	    if (w)
-	    {
-		uint8x8_t dval, temp, res;
-
-		alpha = vtbl1_u8 (vld1_dup_u8 ((void *)mask), mask_selector);
-		dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst));
-
-		temp = neon2mul (sval2, alpha);
-		res = vqadd_u8 (
-		    temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector)));
-
-		vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (res), 0);
-	    }
-	}
-    }
-}
-
-static void
-neon_composite_add_8888_8_8 (pixman_implementation_t * impl,
-                             pixman_op_t               op,
-                             pixman_image_t *          src_image,
-                             pixman_image_t *          mask_image,
-                             pixman_image_t *          dst_image,
-                             int32_t                   src_x,
-                             int32_t                   src_y,
-                             int32_t                   mask_x,
-                             int32_t                   mask_y,
-                             int32_t                   dest_x,
-                             int32_t                   dest_y,
-                             int32_t                   width,
-                             int32_t                   height)
-{
-    uint8_t     *dst_line, *dst;
-    uint8_t     *mask_line, *mask;
-    int dst_stride, mask_stride;
-    uint32_t w;
-    uint32_t src;
-    uint8x8_t sa;
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-    sa = vdup_n_u8 ((src) >> 24);
-
-    if (width >= 8)
-    {
-	/* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    uint8x8_t mval, dval, res;
-	    uint8_t     *keep_dst;
-
-	    mval = vld1_u8 ((void *)mask);
-	    dval = vld1_u8 ((void *)dst);
-	    keep_dst = dst;
-
-	    res = vqadd_u8 (neon2mul (mval, sa), dval);
-
-	    mask += (w & 7);
-	    dst += (w & 7);
-	    w -= w & 7;
-
-	    while (w)
-	    {
-		mval = vld1_u8 ((void *)mask);
-		dval = vld1_u8 ((void *)dst);
-		vst1_u8 ((void *)keep_dst, res);
-		keep_dst = dst;
-
-		res = vqadd_u8 (neon2mul (mval, sa), dval);
-
-		mask += 8;
-		dst += 8;
-		w -= 8;
-	    }
-	    vst1_u8 ((void *)keep_dst, res);
-	}
-    }
-    else
-    {
-	/* Use 4/2/1 load/store method to handle 1-7 pixels */
-	while (height--)
-	{
-	    dst = dst_line;
-	    dst_line += dst_stride;
-	    mask = mask_line;
-	    mask_line += mask_stride;
-	    w = width;
-
-	    uint8x8_t mval = sa, dval = sa, res;
-	    uint8_t *dst4 = 0, *dst2 = 0;
-
-	    if (w & 4)
-	    {
-		mval = vreinterpret_u8_u32 (
-		    vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1));
-		dval = vreinterpret_u8_u32 (
-		    vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1));
-
-		dst4 = dst;
-		mask += 4;
-		dst += 4;
-	    }
-
-	    if (w & 2)
-	    {
-		mval = vreinterpret_u8_u16 (
-		    vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1));
-		dval = vreinterpret_u8_u16 (
-		    vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1));
-		dst2 = dst;
-		mask += 2;
-		dst += 2;
-	    }
-
-	    if (w & 1)
-	    {
-		mval = vld1_lane_u8 (mask, mval, 1);
-		dval = vld1_lane_u8 (dst, dval, 1);
-	    }
-
-	    res = vqadd_u8 (neon2mul (mval, sa), dval);
-
-	    if (w & 1)
-		vst1_lane_u8 (dst, res, 1);
-	    if (w & 2)
-		vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1);
-	    if (w & 4)
-		vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1);
-	}
-    }
-}
-
-#ifdef USE_GCC_INLINE_ASM
-
-static void
-neon_composite_src_16_16 (pixman_implementation_t * impl,
-                          pixman_op_t               op,
-                          pixman_image_t *          src_image,
-                          pixman_image_t *          mask_image,
-                          pixman_image_t *          dst_image,
-                          int32_t                   src_x,
-                          int32_t                   src_y,
-                          int32_t                   mask_x,
-                          int32_t                   mask_y,
-                          int32_t                   dest_x,
-                          int32_t                   dest_y,
-                          int32_t                   width,
-                          int32_t                   height)
-{
-    uint16_t    *dst_line, *src_line;
-    uint32_t dst_stride, src_stride;
-
-    if (!height || !width)
-	return;
-
-    /* We simply copy 16-bit-aligned pixels from one place to another. */
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    /* Preload the first input scanline */
-    {
-	uint16_t *src_ptr = src_line;
-	uint32_t count = width;
-
-	asm volatile (
-	    "0: @ loop							\n"
-	    "	subs    %[count], %[count], #32				\n"
-	    "	pld     [%[src]]					\n"
-	    "	add     %[src], %[src], #64				\n"
-	    "	bgt 0b							\n"
-
-	    /* Clobbered input registers marked as input/outputs */
-	    : [src] "+r" (src_ptr), [count] "+r" (count)
-	    :     /* no unclobbered inputs */
-	    : "cc"
-	    );
-    }
-
-    while (height--)
-    {
-	uint16_t *dst_ptr = dst_line;
-	uint16_t *src_ptr = src_line;
-	uint32_t count = width;
-	uint32_t tmp = 0;
-
-	/* Uses multi-register access and preloading to maximise bandwidth.
-	 * Each pixel is one halfword, so a quadword contains 8px.
-	 * Preload frequency assumed a 64-byte cacheline.
-	 */
-	asm volatile (
-	    "	cmp       %[count], #64				\n"
-	    "	blt 1f    @ skip oversized fragments		\n"
-	    "0: @ start with eight quadwords at a time		\n"
-	    /* preload from next scanline */
-	    "	pld       [%[src], %[src_stride], LSL #1]	\n"
-	    "	sub       %[count], %[count], #64		\n"
-	    "	vld1.16   {d16, d17, d18, d19}, [%[src]]!		\n"
-	    "	vld1.16   {d20, d21, d22, d23}, [%[src]]!		\n"
-	    /* preload from next scanline */
-	    "	pld       [%[src], %[src_stride], LSL #1]	\n"
-	    "	vld1.16   {d24, d25, d26, d27}, [%[src]]!		\n"
-	    "	vld1.16   {d28, d29, d30, d31}, [%[src]]!		\n"
-	    "	cmp       %[count], #64				\n"
-	    "	vst1.16   {d16, d17, d18, d19}, [%[dst]]!		\n"
-	    "	vst1.16   {d20, d21, d22, d23}, [%[dst]]!		\n"
-	    "	vst1.16   {d24, d25, d26, d27}, [%[dst]]!		\n"
-	    "	vst1.16   {d28, d29, d30, d31}, [%[dst]]!		\n"
-	    "	bge 0b						\n"
-	    "	cmp       %[count], #0				\n"
-	    "	beq 7f    @ aligned fastpath			\n"
-	    "1: @ four quadwords				\n"
-	    "	tst       %[count], #32				\n"
-	    "	beq 2f    @ skip oversized fragment		\n"
-	    /* preload from next scanline */
-	    "	pld       [%[src], %[src_stride], LSL #1]	\n"
-	    "	vld1.16   {d16, d17, d18, d19}, [%[src]]!		\n"
-	    "	vld1.16   {d20, d21, d22, d23}, [%[src]]!		\n"
-	    "	vst1.16   {d16, d17, d18, d19}, [%[dst]]!		\n"
-	    "	vst1.16   {d20, d21, d22, d23}, [%[dst]]!		\n"
-	    "2: @ two quadwords					\n"
-	    "	tst       %[count], #16				\n"
-	    "	beq 3f    @ skip oversized fragment		\n"
-	    /* preload from next scanline */
-	    "	pld       [%[src], %[src_stride], LSL #1]	\n"
-	    "	vld1.16   {d16, d17, d18, d19}, [%[src]]!		\n"
-	    "	vst1.16   {d16, d17, d18, d19}, [%[dst]]!		\n"
-	    "3: @ one quadword					\n"
-	    "	tst       %[count], #8				\n"
-	    "	beq 4f    @ skip oversized fragment		\n"
-	    "	vld1.16   {d16, d17}, [%[src]]!			\n"
-	    "	vst1.16   {d16, d17}, [%[dst]]!			\n"
-	    "4: @ one doubleword				\n"
-	    "	tst       %[count], #4				\n"
-	    "	beq 5f    @ skip oversized fragment		\n"
-	    "	vld1.16   {d16}, [%[src]]!			\n"
-	    "	vst1.16   {d16}, [%[dst]]!			\n"
-	    "5: @ one word					\n"
-	    "	tst       %[count], #2				\n"
-	    "	beq 6f    @ skip oversized fragment		\n"
-	    "	ldr       %[tmp], [%[src]], #4			\n"
-	    "	str       %[tmp], [%[dst]], #4			\n"
-	    "6: @ one halfword					\n"
-	    "	tst       %[count], #1				\n"
-	    "	beq 7f    @ skip oversized fragment		\n"
-	    "	ldrh      %[tmp], [%[src]]			\n"
-	    "	strh      %[tmp], [%[dst]]			\n"
-	    "7: @ end						\n"
-
-	    /* Clobbered input registers marked as input/outputs */
-	    : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr),
-	      [count] "+r" (count), [tmp] "+r" (tmp)
-
-	      /* Unclobbered input */
-	    : [src_stride] "r" (src_stride)
-
-	      /* Clobbered vector registers */
-	    : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
-	      "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"
-	    );
-
-	src_line += src_stride;
-	dst_line += dst_stride;
-    }
-}
-
-#endif /* USE_GCC_INLINE_ASM */
-
-static void
-neon_composite_src_24_16 (pixman_implementation_t * impl,
-                          pixman_op_t               op,
-                          pixman_image_t *          src_image,
-                          pixman_image_t *          mask_image,
-                          pixman_image_t *          dst_image,
-                          int32_t                   src_x,
-                          int32_t                   src_y,
-                          int32_t                   mask_x,
-                          int32_t                   mask_y,
-                          int32_t                   dest_x,
-                          int32_t                   dest_y,
-                          int32_t                   width,
-                          int32_t                   height)
-{
-    uint16_t    *dst_line;
-    uint32_t    *src_line;
-    uint32_t dst_stride, src_stride;
-
-    if (!width || !height)
-	return;
-
-    /* We simply copy pixels from one place to another,
-     * assuming that the source's alpha is opaque.
-     */
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    /* Preload the first input scanline */
-    {
-	uint8_t *src_ptr = (uint8_t*) src_line;
-	uint32_t count = (width + 15) / 16;
-
-#ifdef USE_GCC_INLINE_ASM
-	asm volatile (
-	    "0: @ loop						\n"
-	    "	subs    %[count], %[count], #1			\n"
-	    "	pld     [%[src]]				\n"
-	    "	add     %[src], %[src], #64			\n"
-	    "	bgt 0b						\n"
-
-	    /* Clobbered input registers marked as input/outputs */
-	    : [src] "+r" (src_ptr), [count] "+r" (count)
-	    :     /* no unclobbered inputs */
-	    : "cc"
-	    );
-#else
-	do
-	{
-	    __pld (src_ptr);
-	    src_ptr += 64;
-	}
-	while (--count);
-#endif
-    }
-
-    while (height--)
-    {
-	uint16_t *dst_ptr = dst_line;
-	uint32_t *src_ptr = src_line;
-	uint32_t count = width;
-	const uint32_t rb_mask = 0x1F;
-	const uint32_t g_mask = 0x3F;
-
-	/* If you're going to complain about a goto, take a long hard look
-	 * at the massive blocks of assembler this skips over.  ;-)
-	 */
-	if (count < 8)
-	    goto small_stuff;
-
-#ifdef USE_GCC_INLINE_ASM
-
-	/* This is not as aggressive as the RGB565-source case.
-	 * Generally the source is in cached RAM when the formats are
-	 * different, so we use preload.
-	 * 
-	 * We don't need to blend, so we are not reading from the
-	 * uncached framebuffer.
-	 */
-	asm volatile (
-	    "	cmp       %[count], #16				\n"
-	    "	blt 1f    @ skip oversized fragments		\n"
-	    "0: @ start with sixteen pixels at a time		\n"
-	    "	sub       %[count], %[count], #16		\n"
-	    "	pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline			\n"
-	    "	vld4.8    {d0, d1, d2, d3}, [%[src]]!		@ d3 is alpha and ignored, d2-0 are rgb.	\n"
-	    "	vld4.8    {d4, d5, d6, d7}, [%[src]]!		@ d7 is alpha and ignored, d6-4 are rgb.	\n"
-	    "	vshll.u8  q8, d2, #8				@ expand first red for repacking		\n"
-	    "	vshll.u8  q10, d1, #8				@ expand first green for repacking		\n"
-	    "	vshll.u8  q11, d0, #8				@ expand first blue for repacking		\n"
-	    "	vshll.u8  q9, d6, #8				@ expand second red for repacking		\n"
-	    "	vsri.u16  q8, q10, #5				@ insert first green after red			\n"
-	    "	vshll.u8  q10, d5, #8				@ expand second green for repacking		\n"
-	    "	vsri.u16  q8, q11, #11				@ insert first blue after green			\n"
-	    "	vshll.u8  q11, d4, #8				@ expand second blue for repacking		\n"
-	    "	vsri.u16  q9, q10, #5				@ insert second green after red			\n"
-	    "	vsri.u16  q9, q11, #11				@ insert second blue after green		\n"
-	    "	cmp       %[count], #16				\n"
-	    "	vst1.16   {d16, d17, d18, d19}, [%[dst]]!          @ store 16 pixels				\n"
-	    "	bge 0b						\n"
-	    "1: @ end of main loop				\n"
-	    "	cmp       %[count], #8				@ can we still do an 8-pixel block?		\n"
-	    "	blt 2f						\n"
-	    "	sub       %[count], %[count], #8		\n"
-	    "	pld      [%[src], %[src_stride], lsl #2]        @ preload from next scanline			\n"
-	    "	vld4.8    {d0, d1, d2, d3}, [%[src]]!		@ d3 is alpha and ignored, d2-0 are rgb.	\n"
-	    "	vshll.u8  q8, d2, #8				@ expand first red for repacking		\n"
-	    "	vshll.u8  q10, d1, #8				@ expand first green for repacking		\n"
-	    "	vshll.u8  q11, d0, #8				@ expand first blue for repacking		\n"
-	    "	vsri.u16  q8, q10, #5				@ insert first green after red			\n"
-	    "	vsri.u16  q8, q11, #11				@ insert first blue after green			\n"
-	    "	vst1.16   {d16, d17}, [%[dst]]!          @ store 8 pixels				\n"
-	    "2: @ end						\n"
-
-	    /* Clobbered input and working registers marked as input/outputs */
-	    : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count)
-
-	      /* Unclobbered input */
-	    : [src_stride] "r" (src_stride)
-
-	      /* Clobbered vector registers */
-
-	      /* NB: these are the quad aliases of the
-	       * double registers used in the asm
-	       */
-	    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17",
-	      "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory"
-	    );
-#else
-	/* A copy of the above code, in intrinsics-form. */
-	while (count >= 16)
-	{
-	    uint8x8x4_t pixel_set_a, pixel_set_b;
-	    uint16x8_t red_a, green_a, blue_a;
-	    uint16x8_t red_b, green_b, blue_b;
-	    uint16x8_t dest_pixels_a, dest_pixels_b;
-
-	    count -= 16;
-	    __pld (src_ptr + src_stride);
-	    pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
-	    pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8));
-	    src_ptr += 16;
-
-	    red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
-	    green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
-	    blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
-	    
-	    red_b   = vshll_n_u8 (pixel_set_b.val[2], 8);
-	    green_b = vshll_n_u8 (pixel_set_b.val[1], 8);
-	    blue_b  = vshll_n_u8 (pixel_set_b.val[0], 8);
-	    
-	    dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
-	    dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5);
-	    
-	    dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
-	    dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11);
-
-	    /* There doesn't seem to be an intrinsic for the
-	     * double-quadword variant
-	     */
-	    vst1q_u16 (dst_ptr, dest_pixels_a);
-	    vst1q_u16 (dst_ptr + 8, dest_pixels_b);
-	    dst_ptr += 16;
-	}
-
-	/* 8-pixel loop */
-	if (count >= 8)
-	{
-	    uint8x8x4_t pixel_set_a;
-	    uint16x8_t red_a, green_a, blue_a;
-	    uint16x8_t dest_pixels_a;
-
-	    __pld (src_ptr + src_stride);
-	    count -= 8;
-	    pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr));
-	    src_ptr += 8;
-
-	    red_a   = vshll_n_u8 (pixel_set_a.val[2], 8);
-	    green_a = vshll_n_u8 (pixel_set_a.val[1], 8);
-	    blue_a  = vshll_n_u8 (pixel_set_a.val[0], 8);
-
-	    dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5);
-	    dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11);
-
-	    vst1q_u16 (dst_ptr, dest_pixels_a);
-	    dst_ptr += 8;
-	}
-
-#endif  /* USE_GCC_INLINE_ASM */
-
-    small_stuff:
-	if (count)
-	    __pld (src_ptr + src_stride);
-
-	while (count >= 2)
-	{
-	    uint32_t src_pixel_a = *src_ptr++;
-	    uint32_t src_pixel_b = *src_ptr++;
-
-	    /* ARM is really good at shift-then-ALU ops. */
-	    /* This should be a total of six shift-ANDs and five shift-ORs. */
-	    uint32_t dst_pixels_a;
-	    uint32_t dst_pixels_b;
-
-	    dst_pixels_a  = ((src_pixel_a >>  3) & rb_mask);
-	    dst_pixels_a |= ((src_pixel_a >> 10) &  g_mask) << 5;
-	    dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11;
-
-	    dst_pixels_b  = ((src_pixel_b >>  3) & rb_mask);
-	    dst_pixels_b |= ((src_pixel_b >> 10) &  g_mask) << 5;
-	    dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11;
-
-	    /* little-endian mode only */
-	    *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16);
-	    dst_ptr += 2;
-	    count -= 2;
-	}
-
-	if (count)
-	{
-	    uint32_t src_pixel = *src_ptr++;
-
-	    /* ARM is really good at shift-then-ALU ops.
-	     * This block should end up as three shift-ANDs
-	     * and two shift-ORs.
-	     */
-	    uint32_t tmp_blue  = (src_pixel >>  3) & rb_mask;
-	    uint32_t tmp_green = (src_pixel >> 10) & g_mask;
-	    uint32_t tmp_red   = (src_pixel >> 19) & rb_mask;
-	    uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue;
-
-	    *dst_ptr++ = dst_pixel;
-	    count--;
-	}
-
-	src_line += src_stride;
-	dst_line += dst_stride;
-    }
-}
+#include "pixman-arm-common.h"
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565,
+                                   uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888,
+                                   uint8_t, 3, uint8_t, 3)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888,
+                                   uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev,
+                                   uint8_t, 3, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
+                                   uint8_t, 3, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8000_8000,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_n_0565,
+                                 uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (neon, over_reverse_n_8888,
+                                 uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_0565,
+                                      uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, over_n_8888_8888_ca,
+                                      uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (neon, add_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (neon, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
+                                        uint8_t, 1, uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+
+void
+pixman_composite_src_n_8_asm_neon (int32_t   w,
+                                   int32_t   h,
+                                   uint8_t  *dst,
+                                   int32_t   dst_stride,
+                                   uint8_t   src);
+
+void
+pixman_composite_src_n_0565_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint16_t *dst,
+                                      int32_t   dst_stride,
+                                      uint16_t  src);
+
+void
+pixman_composite_src_n_8888_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint32_t *dst,
+                                      int32_t   dst_stride,
+                                      uint32_t  src);
 
 static pixman_bool_t
 pixman_fill_neon (uint32_t *bits,
@@ -1705,1019 +120,144 @@ pixman_fill_neon (uint32_t *bits,
                   int       height,
                   uint32_t  _xor)
 {
-    uint32_t byte_stride, color;
-    char *dst;
-
     /* stride is always multiple of 32bit units in pixman */
-    byte_stride = stride * sizeof(uint32_t);
+    uint32_t byte_stride = stride * sizeof(uint32_t);
 
     switch (bpp)
     {
     case 8:
-	dst = ((char *) bits) + y * byte_stride + x;
-	_xor &= 0xff;
-	color = _xor << 24 | _xor << 16 | _xor << 8 | _xor;
-	break;
-
+	pixman_composite_src_n_8_asm_neon (
+		width,
+		height,
+		(uint8_t *)(((char *) bits) + y * byte_stride + x),
+		byte_stride,
+		_xor & 0xff);
+	return TRUE;
     case 16:
-	dst = ((char *) bits) + y * byte_stride + x * 2;
-	_xor &= 0xffff;
-	color = _xor << 16 | _xor;
-	width *= 2;         /* width to bytes */
-	break;
-
+	pixman_composite_src_n_0565_asm_neon (
+		width,
+		height,
+		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+		byte_stride / 2,
+		_xor & 0xffff);
+	return TRUE;
     case 32:
-	dst = ((char *) bits) + y * byte_stride + x * 4;
-	color = _xor;
-	width *= 4;         /* width to bytes */
-	break;
-
+	pixman_composite_src_n_8888_asm_neon (
+		width,
+		height,
+		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+		byte_stride / 4,
+		_xor);
+	return TRUE;
     default:
 	return FALSE;
     }
-
-#ifdef USE_GCC_INLINE_ASM
-    if (width < 16)
-    {
-	/* We have a special case for such small widths that don't allow
-	 * us to use wide 128-bit stores anyway. We don't waste time
-	 * trying to align writes, since there are only very few of them anyway
-	 */
-	asm volatile (
-	    "cmp		%[height], #0\n"/* Check if empty fill */
-	    "beq		3f\n"
-	    "vdup.32	d0, %[color]\n"/* Fill the color to neon req */
-
-	    /* Check if we have a such width that can easily be handled by single
-	     * operation for each scanline. This significantly reduces the number
-	     * of test/branch instructions for each scanline
-	     */
-	    "cmp		%[width], #8\n"
-	    "beq		4f\n"
-	    "cmp		%[width], #4\n"
-	    "beq		5f\n"
-	    "cmp		%[width], #2\n"
-	    "beq		6f\n"
-
-	    /* Loop starts here for each scanline */
-	    "1:\n"
-	    "mov		r4, %[dst]\n" /* Starting address of the current line */
-	    "tst		%[width], #8\n"
-	    "beq		2f\n"
-	    "vst1.8		{d0}, [r4]!\n"
-	    "2:\n"
-	    "tst		%[width], #4\n"
-	    "beq		2f\n"
-	    "str		%[color], [r4], #4\n"
-	    "2:\n"
-	    "tst		%[width], #2\n"
-	    "beq		2f\n"
-	    "strh		%[color], [r4], #2\n"
-	    "2:\n"
-	    "tst		%[width], #1\n"
-	    "beq		2f\n"
-	    "strb		%[color], [r4], #1\n"
-	    "2:\n"
-
-	    "subs		%[height], %[height], #1\n"
-	    "add		%[dst], %[dst], %[byte_stride]\n"
-	    "bne		1b\n"
-	    "b		3f\n"
-
-	    /* Special fillers for those widths that we can do with single operation */
-	    "4:\n"
-	    "subs		%[height], %[height], #1\n"
-	    "vst1.8		{d0}, [%[dst]]\n"
-	    "add		%[dst], %[dst], %[byte_stride]\n"
-	    "bne		4b\n"
-	    "b		3f\n"
-
-	    "5:\n"
-	    "subs		%[height], %[height], #1\n"
-	    "str		%[color], [%[dst]]\n"
-	    "add		%[dst], %[dst], %[byte_stride]\n"
-	    "bne		5b\n"
-	    "b		3f\n"
-
-	    "6:\n"
-	    "subs		%[height], %[height], #1\n"
-	    "strh		%[color], [%[dst]]\n"
-	    "add		%[dst], %[dst], %[byte_stride]\n"
-	    "bne		6b\n"
-
-	    "3:\n"
-	    : [height] "+r" (height), [dst] "+r" (dst)
-	    : [color] "r" (color), [width] "r" (width),
-	      [byte_stride] "r" (byte_stride)
-	    : "memory", "cc", "d0", "r4");
-    }
-    else
-    {
-	asm volatile (
-	    "cmp		%[height], #0\n"/* Check if empty fill */
-	    "beq		5f\n"
-	    "vdup.32	q0, %[color]\n"/* Fill the color to neon req */
-
-	    /* Loop starts here for each scanline */
-	    "1:\n"
-	    "mov		r4, %[dst]\n"/* Starting address of the current line */
-	    "mov		r5, %[width]\n"/* We're going to write this many bytes */
-	    "ands		r6, r4, #15\n"/* Are we at the 128-bit aligned address? */
-	    "beq		2f\n"/* Jump to the best case */
-
-	    /* We're not 128-bit aligned: However, we know that we can get to the
-	       next aligned location, since the fill is at least 16 bytes wide */
-	    "rsb                r6, r6, #16\n" /* We would need to go forward this much */
-	    "sub		r5, r5, r6\n"/* Update bytes left */
-	    "tst		r6, #1\n"
-	    "beq		6f\n"
-	    "vst1.8		{d0[0]}, [r4]!\n"/* Store byte, now we are word aligned */
-	    "6:\n"
-	    "tst		r6, #2\n"
-	    "beq		6f\n"
-	    "vst1.16	{d0[0]}, [r4, :16]!\n"/* Store half word, now we are 16-bit aligned */
-	    "6:\n"
-	    "tst		r6, #4\n"
-	    "beq		6f\n"
-	    "vst1.32	{d0[0]}, [r4, :32]!\n"/* Store word, now we're 32-bit aligned */
-	    "6:\n"
-	    "tst		r6, #8\n"
-	    "beq		2f\n"
-	    "vst1.64	{d0}, [r4, :64]!\n"/* Store qword now we're 64-bit aligned */
-
-	    /* The good case: We're 128-bit aligned for this scanline */
-	    "2:\n"
-	    "and		r6, r5, #15\n"/* Number of tailing bytes */
-	    "cmp		r5, r6\n"/* Do we have at least one qword to write? */
-	    "beq		6f\n"/* No, we just write the tail */
-	    "lsr		r5, r5, #4\n"/* This many full qwords to write */
-
-	    /* The main block: Do 128-bit aligned writes */
-	    "3:\n"
-	    "subs		r5, r5, #1\n"
-	    "vst1.64	{d0, d1}, [r4, :128]!\n"
-	    "bne		3b\n"
-
-	    /* Handle the tailing bytes: Do 64, 32, 16 and 8-bit aligned writes as needed.
-	       We know that we're currently at 128-bit aligned address, so we can just
-	       pick the biggest operations that the remaining write width allows */
-	    "6:\n"
-	    "cmp		r6, #0\n"
-	    "beq		4f\n"
-	    "tst		r6, #8\n"
-	    "beq		6f\n"
-	    "vst1.64	{d0}, [r4, :64]!\n"
-	    "6:\n"
-	    "tst		r6, #4\n"
-	    "beq		6f\n"
-	    "vst1.32	{d0[0]}, [r4, :32]!\n"
-	    "6:\n"
-	    "tst		r6, #2\n"
-	    "beq		6f\n"
-	    "vst1.16	{d0[0]}, [r4, :16]!\n"
-	    "6:\n"
-	    "tst		r6, #1\n"
-	    "beq		4f\n"
-	    "vst1.8		{d0[0]}, [r4]!\n"
-	    "4:\n"
-
-	    /* Handle the next scanline */
-	    "subs		%[height], %[height], #1\n"
-	    "add		%[dst], %[dst], %[byte_stride]\n"
-	    "bne		1b\n"
-	    "5:\n"
-	    : [height] "+r" (height), [dst] "+r" (dst)
-	    : [color] "r" (color), [width] "r" (width),
-	      [byte_stride] "r" (byte_stride)
-	    : "memory", "cc", "d0", "d1", "r4", "r5", "r6");
-    }
-    return TRUE;
-
-#else
-
-    /* TODO: intrinsic version for armcc */
-    return FALSE;
-
-#endif
-}
-
-/* TODO: is there a more generic way of doing this being introduced? */
-#define NEON_SCANLINE_BUFFER_PIXELS (1024)
-
-static inline void
-neon_quadword_copy (void *   dst,
-		    void *   src,
-		    uint32_t count,         /* of quadwords */
-		    uint32_t trailer_count  /* of bytes */)
-{
-    uint8_t *t_dst = dst, *t_src = src;
-
-    /* Uses aligned multi-register loads to maximise read bandwidth
-     * on uncached memory such as framebuffers
-     * The accesses do not have the aligned qualifiers, so that the copy
-     * may convert between aligned-uncached and unaligned-cached memory.
-     * It is assumed that the CPU can infer alignedness from the address.
-     */
-
-#ifdef USE_GCC_INLINE_ASM
-
-    asm volatile (
-        "	cmp       %[count], #8				\n"
-        "	blt 1f    @ skip oversized fragments		\n"
-        "0: @ start with eight quadwords at a time		\n"
-        "	sub       %[count], %[count], #8		\n"
-        "	vld1.8    {d16, d17, d18, d19}, [%[src]]!		\n"
-        "	vld1.8    {d20, d21, d22, d23}, [%[src]]!		\n"
-        "	vld1.8    {d24, d25, d26, d27}, [%[src]]!		\n"
-        "	vld1.8    {d28, d29, d30, d31}, [%[src]]!		\n"
-        "	cmp       %[count], #8				\n"
-        "	vst1.8    {d16, d17, d18, d19}, [%[dst]]!		\n"
-        "	vst1.8    {d20, d21, d22, d23}, [%[dst]]!		\n"
-        "	vst1.8    {d24, d25, d26, d27}, [%[dst]]!		\n"
-        "	vst1.8    {d28, d29, d30, d31}, [%[dst]]!		\n"
-        "	bge 0b						\n"
-        "1: @ four quadwords					\n"
-        "	tst       %[count], #4				\n"
-        "	beq 2f    @ skip oversized fragment		\n"
-        "	vld1.8    {d16, d17, d18, d19}, [%[src]]!		\n"
-        "	vld1.8    {d20, d21, d22, d23}, [%[src]]!		\n"
-        "	vst1.8    {d16, d17, d18, d19}, [%[dst]]!		\n"
-        "	vst1.8    {d20, d21, d22, d23}, [%[dst]]!		\n"
-        "2: @ two quadwords					\n"
-        "	tst       %[count], #2				\n"
-        "	beq 3f    @ skip oversized fragment		\n"
-        "	vld1.8    {d16, d17, d18, d19}, [%[src]]!		\n"
-        "	vst1.8    {d16, d17, d18, d19}, [%[dst]]!		\n"
-        "3: @ one quadword					\n"
-        "	tst       %[count], #1				\n"
-        "	beq 4f    @ skip oversized fragment		\n"
-        "	vld1.8    {d16, d17}, [%[src]]!			\n"
-        "	vst1.8    {d16, d17}, [%[dst]]!			\n"
-        "4: @ end						\n"
-
-        /* Clobbered input registers marked as input/outputs */
-	: [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count)
-
-	  /* No unclobbered inputs */
-	:
-
-        /* Clobbered vector registers */
-	: "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25",
-	  "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory");
-
-#else
-
-    while (count >= 8)
-    {
-	uint8x16x4_t t1 = vld4q_u8 (t_src);
-	uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t));
-	
-	t_src += sizeof(uint8x16x4_t) * 2;
-	vst4q_u8 (t_dst, t1);
-	vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2);
-	t_dst += sizeof(uint8x16x4_t) * 2;
-	count -= 8;
-    }
-
-    if (count & 4)
-    {
-	uint8x16x4_t t1 = vld4q_u8 (t_src);
-	
-	t_src += sizeof(uint8x16x4_t);
-	vst4q_u8 (t_dst, t1);
-	t_dst += sizeof(uint8x16x4_t);
-    }
-
-    if (count & 2)
-    {
-	uint8x8x4_t t1 = vld4_u8 (t_src);
-	
-	t_src += sizeof(uint8x8x4_t);
-	vst4_u8 (t_dst, t1);
-	t_dst += sizeof(uint8x8x4_t);
-    }
-
-    if (count & 1)
-    {
-	uint8x16_t t1 = vld1q_u8 (t_src);
-	
-	t_src += sizeof(uint8x16_t);
-	vst1q_u8 (t_dst, t1);
-	t_dst += sizeof(uint8x16_t);
-    }
-
-#endif  /* !USE_GCC_INLINE_ASM */
-
-    if (trailer_count)
-    {
-	if (trailer_count & 8)
-	{
-	    uint8x8_t t1 = vld1_u8 (t_src);
-	    
-	    t_src += sizeof(uint8x8_t);
-	    vst1_u8 (t_dst, t1);
-	    t_dst += sizeof(uint8x8_t);
-	}
-
-	if (trailer_count & 4)
-	{
-	    *((uint32_t*) t_dst) = *((uint32_t*) t_src);
-	    
-	    t_dst += 4;
-	    t_src += 4;
-	}
-
-	if (trailer_count & 2)
-	{
-	    *((uint16_t*) t_dst) = *((uint16_t*) t_src);
-	    
-	    t_dst += 2;
-	    t_src += 2;
-	}
-
-	if (trailer_count & 1)
-	{
-	    *t_dst++ = *t_src++;
-	}
-    }
-}
-
-static inline void
-solid_over_565_8_pix_neon (uint32_t  glyph_colour,
-                           uint16_t *dest,
-                           uint8_t * in_mask,
-                           uint32_t  dest_stride,    /* bytes, not elements */
-                           uint32_t  mask_stride,
-                           uint32_t  count           /* 8-pixel groups */)
-{
-    /* Inner loop of glyph blitter (solid colour, alpha mask) */
-
-#ifdef USE_GCC_INLINE_ASM
-
-    asm volatile (
-        "	vld4.8 {d20[], d21[], d22[], d23[]}, [%[glyph_colour]]  @ splat solid colour components	\n"
-        "0:	@ loop																				\n"
-        "	vld1.16   {d0, d1}, [%[dest]]         @ load first pixels from framebuffer			\n"
-        "	vld1.8    {d17}, [%[in_mask]]         @ load alpha mask of glyph						\n"
-        "	vmull.u8  q9, d17, d23               @ apply glyph colour alpha to mask				\n"
-        "	vshrn.u16 d17, q9, #8                @ reformat it to match original mask			\n"
-        "	vmvn      d18, d17                   @ we need the inverse mask for the background	\n"
-        "	vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits				\n"
-        "	vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels			\n"
-        "	vshrn.u16 d4, q0, #3                 @ unpack green									\n"
-        "	vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)			\n"
-        "	vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)		\n"
-        "	vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)			\n"
-        "	vmull.u8  q1, d2, d18                @ apply inverse mask to background red...		\n"
-        "	vmull.u8  q2, d4, d18                @ ...green...									\n"
-        "	vmull.u8  q3, d6, d18                @ ...blue										\n"
-        "	subs      %[count], %[count], #1     @ decrement/test loop counter					\n"
-        "	vmlal.u8  q1, d17, d22               @ add masked foreground red...					\n"
-        "	vmlal.u8  q2, d17, d21               @ ...green...									\n"
-        "	vmlal.u8  q3, d17, d20               @ ...blue										\n"
-        "	add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait		\n"
-        "	vsri.16   q1, q2, #5                 @ pack green behind red						\n"
-        "	vsri.16   q1, q3, #11                @ pack blue into pixels						\n"
-        "	vst1.16   {d2, d3}, [%[dest]]         @ store composited pixels						\n"
-        "	add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer					\n"
-        "	bne 0b                               @ next please									\n"
-
-	/* Clobbered registers marked as input/outputs */
-	: [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count)
-	  
-	  /* Inputs */
-	: [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour)
-
-	  /* Clobbers, including the inputs we modify, and potentially lots of memory */
-	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19",
-	  "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory"
-        );
-
-#else
-
-    uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour);
-
-    while (count--)
-    {
-	uint16x8_t pixels = vld1q_u16 (dest);
-	uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8);
-	uint8x8_t mask_image = vmvn_u8 (mask);
-
-	uint8x8_t t_red   = vshrn_n_u16 (pixels, 8);
-	uint8x8_t t_green = vshrn_n_u16 (pixels, 3);
-	uint8x8_t t_blue  = vshrn_n_u16 (vsli_n_u8 (pixels, pixels, 5), 2);
-
-	uint16x8_t s_red   = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image);
-	uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image);
-	uint16x8_t s_blue  = vmull_u8 (t_blue, mask_image);
-
-	s_red   = vmlal (s_red, mask, solid_colour.val[2]);
-	s_green = vmlal (s_green, mask, solid_colour.val[1]);
-	s_blue  = vmlal (s_blue, mask, solid_colour.val[0]);
-
-	pixels = vsri_n_u16 (s_red, s_green, 5);
-	pixels = vsri_n_u16 (pixels, s_blue, 11);
-	vst1q_u16 (dest, pixels);
-
-	dest += dest_stride;
-	mask += mask_stride;
-    }
-
-#endif
 }
 
-#if 0 /* this is broken currently */
-static void
-neon_composite_over_n_8_0565 (pixman_implementation_t * impl,
-                              pixman_op_t               op,
-                              pixman_image_t *          src_image,
-                              pixman_image_t *          mask_image,
-                              pixman_image_t *          dst_image,
-                              int32_t                   src_x,
-                              int32_t                   src_y,
-                              int32_t                   mask_x,
-                              int32_t                   mask_y,
-                              int32_t                   dest_x,
-                              int32_t                   dest_y,
-                              int32_t                   width,
-                              int32_t                   height)
-{
-    uint32_t  src, srca;
-    uint16_t *dst_line, *aligned_line;
-    uint8_t  *mask_line;
-    uint32_t  dst_stride, mask_stride;
-    uint32_t  kernel_count, copy_count, copy_tail;
-    uint8_t   kernel_offset, copy_offset;
-
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
-    /* bail out if fully transparent or degenerate */
-    srca = src >> 24;
-    if (src == 0)
-	return;
-
-    if (width == 0 || height == 0)
-	return;
-
-    if (width > NEON_SCANLINE_BUFFER_PIXELS)
-    {
-	/* split the blit, so we can use a fixed-size scanline buffer
-	 * TODO: there must be a more elegant way of doing this.
-	 */
-	int x;
-	for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
-	{
-	    neon_composite_over_n_8_0565 (
-		impl, op,
-		src_image, mask_image, dst_image,
-		src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
-		(x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
-	}
-
-	return;
-    }
-    
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
-    /* keep within minimum number of aligned quadwords on width
-     * while also keeping the minimum number of columns to process
-     */
-    {
-	unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
-	unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
-	unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
-
-	/* the fast copy should be quadword aligned */
-	copy_offset = dst_line - ((uint16_t*) aligned_left);
-	aligned_line = dst_line - copy_offset;
-	copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
-	copy_tail = 0;
-
-	if (aligned_right - aligned_left > ceiling_length)
-	{
-	    /* unaligned routine is tightest */
-	    kernel_count = (uint32_t) (ceiling_length >> 4);
-	    kernel_offset = copy_offset;
-	}
-	else
-	{
-	    /* aligned routine is equally tight, so it is safer to align */
-	    kernel_count = copy_count;
-	    kernel_offset = 0;
-	}
-
-	/* We should avoid reading beyond scanline ends for safety */
-	if (aligned_line < (dst_line - dest_x) ||
-	    (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
-	{
-	    /* switch to precise read */
-	    copy_offset = kernel_offset = 0;
-	    aligned_line = dst_line;
-	    kernel_count = (uint32_t) (ceiling_length >> 4);
-	    copy_count = (width * sizeof(*dst_line)) >> 4;
-	    copy_tail = (width * sizeof(*dst_line)) & 0xF;
-	}
-    }
-
-    {
-	uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];         /* deliberately not initialised */
-	uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8];
-	int y = height;
-
-	/* row-major order */
-	/* left edge, middle block, right edge */
-	for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride)
-	{
-	    /* We don't want to overrun the edges of the glyph,
-	     * so realign the edge data into known buffers
-	     */
-	    neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF);
-
-	    /* Uncached framebuffer access is really, really slow
-	     * if we do it piecemeal. It should be much faster if we
-	     * grab it all at once. One scanline should easily fit in
-	     * L1 cache, so this should not waste RAM bandwidth.
-	     */
-	    neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
-
-	    /* Apply the actual filter */
-	    solid_over_565_8_pix_neon (
-		src, scan_line + kernel_offset,
-		glyph_line + kernel_offset, 8 * sizeof(*dst_line),
-		8, kernel_count);
-
-	    /* Copy the modified scanline back */
-	    neon_quadword_copy (dst_line, scan_line + copy_offset,
-				width >> 3, (width & 7) * 2);
-	}
-    }
-}
-#endif
-
-#ifdef USE_GCC_INLINE_ASM
-
-static inline void
-plain_over_565_8_pix_neon (uint32_t  colour,
-			   uint16_t *dest,
-			   uint32_t  dest_stride,     /* bytes, not elements */
-			   uint32_t  count            /* 8-pixel groups */)
-{
-    /* Inner loop for plain translucent rects
-     * (solid colour without alpha mask)
-     */
-    asm volatile (
-        "	vld4.8   {d20[], d21[], d22[], d23[]}, [%[colour]]  @ solid colour load/splat \n"
-        "	vmull.u8  q12, d23, d22              @ premultiply alpha red   \n"
-        "	vmull.u8  q13, d23, d21              @ premultiply alpha green \n"
-        "	vmull.u8  q14, d23, d20              @ premultiply alpha blue  \n"
-        "	vmvn      d18, d23                   @ inverse alpha for background \n"
-        "0:	@ loop\n"
-        "	vld1.16   {d0, d1}, [%[dest]]         @ load first pixels from framebuffer	\n"
-        "	vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels	\n"
-        "	vshrn.u16 d4, q0, #3                 @ unpack green				\n"
-        "	vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits		\n"
-        "	vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)	\n"
-        "	vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)	\n"
-        "	vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)	\n"
-        "	vmov      q0, q12                    @ retrieve foreground red   \n"
-        "	vmlal.u8  q0, d2, d18                @ blend red - my kingdom for a four-operand MLA \n"
-        "	vmov      q1, q13                    @ retrieve foreground green \n"
-        "	vmlal.u8  q1, d4, d18                @ blend green               \n"
-        "	vmov      q2, q14                    @ retrieve foreground blue  \n"
-        "	vmlal.u8  q2, d6, d18                @ blend blue                \n"
-        "	subs      %[count], %[count], #1     @ decrement/test loop counter		\n"
-        "	vsri.16   q0, q1, #5                 @ pack green behind red			\n"
-        "	vsri.16   q0, q2, #11                @ pack blue into pixels			\n"
-        "	vst1.16   {d0, d1}, [%[dest]]         @ store composited pixels			\n"
-        "	add %[dest], %[dest], %[dest_stride]  @ advance framebuffer pointer		\n"
-        "	bne 0b                               @ next please				\n"
-
-        /* Clobbered registers marked as input/outputs */
-	: [dest] "+r" (dest), [count] "+r" (count)
-
-	  /* Inputs */
-	: [dest_stride] "r" (dest_stride), [colour] "r" (&colour)
-
-	  /* Clobbers, including the inputs we modify, and
-	   * potentially lots of memory
-	   */
-	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19",
-	  "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29",
-	  "cc", "memory"
-        );
-}
-
-static void
-neon_composite_over_n_0565 (pixman_implementation_t * impl,
-                            pixman_op_t               op,
-                            pixman_image_t *          src_image,
-                            pixman_image_t *          mask_image,
-                            pixman_image_t *          dst_image,
-                            int32_t                   src_x,
-                            int32_t                   src_y,
-                            int32_t                   mask_x,
-                            int32_t                   mask_y,
-                            int32_t                   dest_x,
-                            int32_t                   dest_y,
-                            int32_t                   width,
-                            int32_t                   height)
+static pixman_bool_t
+pixman_blt_neon (uint32_t *src_bits,
+                 uint32_t *dst_bits,
+                 int       src_stride,
+                 int       dst_stride,
+                 int       src_bpp,
+                 int       dst_bpp,
+                 int       src_x,
+                 int       src_y,
+                 int       dst_x,
+                 int       dst_y,
+                 int       width,
+                 int       height)
 {
-    uint32_t src, srca;
-    uint16_t    *dst_line, *aligned_line;
-    uint32_t dst_stride;
-    uint32_t kernel_count, copy_count, copy_tail;
-    uint8_t kernel_offset, copy_offset;
-
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
-
-    /* bail out if fully transparent */
-    srca = src >> 24;
-    if (src == 0)
-	return;
-    
-    if (width == 0 || height == 0)
-	return;
-
-    if (width > NEON_SCANLINE_BUFFER_PIXELS)
-    {
-	/* split the blit, so we can use a fixed-size scanline buffer *
-	 * TODO: there must be a more elegant way of doing this.
-	 */
-	int x;
-	
-	for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
-	{
-	    neon_composite_over_n_0565 (
-		impl, op,
-		src_image, mask_image, dst_image,
-		src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
-		(x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
-	}
-	return;
-    }
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    /* keep within minimum number of aligned quadwords on width
-     * while also keeping the minimum number of columns to process
-     */
-    {
-	unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
-	unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
-	unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
-
-	/* the fast copy should be quadword aligned */
-	copy_offset = dst_line - ((uint16_t*) aligned_left);
-	aligned_line = dst_line - copy_offset;
-	copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
-	copy_tail = 0;
-
-	if (aligned_right - aligned_left > ceiling_length)
-	{
-	    /* unaligned routine is tightest */
-	    kernel_count = (uint32_t) (ceiling_length >> 4);
-	    kernel_offset = copy_offset;
-	}
-	else
-	{
-	    /* aligned routine is equally tight, so it is safer to align */
-	    kernel_count = copy_count;
-	    kernel_offset = 0;
-	}
-
-	/* We should avoid reading beyond scanline ends for safety */
-	if (aligned_line < (dst_line - dest_x) ||
-	    (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
-	{
-	    /* switch to precise read */
-	    copy_offset = kernel_offset = 0;
-	    aligned_line = dst_line;
-	    kernel_count = (uint32_t) (ceiling_length >> 4);
-	    copy_count = (width * sizeof(*dst_line)) >> 4;
-	    copy_tail = (width * sizeof(*dst_line)) & 0xF;
-	}
-    }
+    if (src_bpp != dst_bpp)
+	return FALSE;
 
+    switch (src_bpp)
     {
-	uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8];  /* deliberately not initialised */
-
-	/* row-major order */
-	/* left edge, middle block, right edge */
-	for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride)
-	{
-	    /* Uncached framebuffer access is really, really slow if we do it piecemeal.
-	     * It should be much faster if we grab it all at once.
-	     * One scanline should easily fit in L1 cache, so this should
-	     * not waste RAM bandwidth.
-	     */
-	    neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
-
-	    /* Apply the actual filter */
-	    plain_over_565_8_pix_neon (
-		src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count);
-
-	    /* Copy the modified scanline back */
-	    neon_quadword_copy (
-		dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2);
-	}
+    case 16:
+	pixman_composite_src_0565_0565_asm_neon (
+		width, height,
+		(uint16_t *)(((char *) dst_bits) +
+		dst_y * dst_stride * 4 + dst_x * 2), dst_stride * 2,
+		(uint16_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+	return TRUE;
+    case 32:
+	pixman_composite_src_8888_8888_asm_neon (
+		width, height,
+		(uint32_t *)(((char *) dst_bits) +
+		dst_y * dst_stride * 4 + dst_x * 4), dst_stride,
+		(uint32_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 4), src_stride);
+	return TRUE;
+    default:
+	return FALSE;
     }
 }
 
-static inline void
-ARGB8_over_565_8_pix_neon (uint32_t *src,
-                           uint16_t *dest,
-                           uint32_t  src_stride,     /* bytes, not elements */
-                           uint32_t  count           /* 8-pixel groups */)
-{
-    asm volatile (
-        "0:	@ loop\n"
-        "	pld   [%[src], %[src_stride]]         @ preload from next scanline	\n"
-        "	vld1.16   {d0, d1}, [%[dest]]         @ load pixels from framebuffer	\n"
-        "	vld4.8   {d20, d21, d22, d23},[%[src]]! @ load source image pixels		\n"
-        "	vsli.u16  q3, q0, #5                 @ duplicate framebuffer blue bits		\n"
-        "	vshrn.u16 d2, q0, #8                 @ unpack red from framebuffer pixels	\n"
-        "	vshrn.u16 d4, q0, #3                 @ unpack green				\n"
-        "	vmvn      d18, d23                   @ we need the inverse alpha for the background	\n"
-        "	vsri.u8   d2, d2, #5                 @ duplicate red bits (extend 5 to 8)	\n"
-        "	vshrn.u16 d6, q3, #2                 @ unpack extended blue (truncate 10 to 8)	\n"
-        "	vsri.u8   d4, d4, #6                 @ duplicate green bits (extend 6 to 8)	\n"
-        "	vmull.u8  q1, d2, d18                @ apply inverse alpha to background red...	\n"
-        "	vmull.u8  q2, d4, d18                @ ...green...				\n"
-        "	vmull.u8  q3, d6, d18                @ ...blue					\n"
-        "	subs      %[count], %[count], #1     @ decrement/test loop counter		\n"
-        "	vmlal.u8  q1, d23, d22               @ add blended foreground red...		\n"
-        "	vmlal.u8  q2, d23, d21               @ ...green...				\n"
-        "	vmlal.u8  q3, d23, d20               @ ...blue					\n"
-        "	vsri.16   q1, q2, #5                 @ pack green behind red			\n"
-        "	vsri.16   q1, q3, #11                @ pack blue into pixels			\n"
-        "	vst1.16   {d2, d3}, [%[dest]]!        @ store composited pixels			\n"
-        "	bne 0b                               @ next please				\n"
-
-        /* Clobbered registers marked as input/outputs */
-	: [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count)
-
-	  /* Inputs */
-	: [src_stride] "r" (src_stride)
-
-	  /* Clobbers, including the inputs we modify, and potentially lots of memory */
-	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20",
-	  "d21", "d22", "d23", "cc", "memory"
-        );
-}
-
-static void
-neon_composite_over_8888_0565 (pixman_implementation_t * impl,
-                               pixman_op_t               op,
-                               pixman_image_t *          src_image,
-                               pixman_image_t *          mask_image,
-                               pixman_image_t *          dst_image,
-                               int32_t                   src_x,
-                               int32_t                   src_y,
-                               int32_t                   mask_x,
-                               int32_t                   mask_y,
-                               int32_t                   dest_x,
-                               int32_t                   dest_y,
-                               int32_t                   width,
-                               int32_t                   height)
+static const pixman_fast_path_t arm_neon_fast_paths[] =
 {
-    uint32_t    *src_line;
-    uint16_t    *dst_line, *aligned_line;
-    uint32_t dst_stride, src_stride;
-    uint32_t kernel_count, copy_count, copy_tail;
-    uint8_t kernel_offset, copy_offset;
-
-    /* we assume mask is opaque 
-     * so the only alpha to deal with is embedded in src
-     */
-    if (width > NEON_SCANLINE_BUFFER_PIXELS)
-    {
-	/* split the blit, so we can use a fixed-size scanline buffer */
-	int x;
-	for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS)
-	{
-	    neon_composite_over_8888_0565 (
-		impl, op,
-		src_image, mask_image, dst_image,
-		src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y,
-		(x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height);
-	}
-	return;
-    }
-
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    /* keep within minimum number of aligned quadwords on width
-     * while also keeping the minimum number of columns to process
-     */
-    {
-	unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF;
-	unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF;
-	unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF;
-
-	/* the fast copy should be quadword aligned */
-	copy_offset = dst_line - ((uint16_t*) aligned_left);
-	aligned_line = dst_line - copy_offset;
-	copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
-	copy_tail = 0;
-
-	if (aligned_right - aligned_left > ceiling_length)
-	{
-	    /* unaligned routine is tightest */
-	    kernel_count = (uint32_t) (ceiling_length >> 4);
-	    kernel_offset = copy_offset;
-	}
-	else
-	{
-	    /* aligned routine is equally tight, so it is safer to align */
-	    kernel_count = copy_count;
-	    kernel_offset = 0;
-	}
-
-	/* We should avoid reading beyond scanline ends for safety */
-	if (aligned_line < (dst_line - dest_x) ||
-	    (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width))
-	{
-	    /* switch to precise read */
-	    copy_offset = kernel_offset = 0;
-	    aligned_line = dst_line;
-	    kernel_count = (uint32_t) (ceiling_length >> 4);
-	    copy_count = (width * sizeof(*dst_line)) >> 4;
-	    copy_tail = (width * sizeof(*dst_line)) & 0xF;
-	}
-    }
-
-    /* Preload the first input scanline */
-    {
-	uint8_t *src_ptr = (uint8_t*) src_line;
-	uint32_t count = (width + 15) / 16;
-
-#ifdef USE_GCC_INLINE_ASM
-	asm volatile (
-	    "0: @ loop						\n"
-	    "	subs    %[count], %[count], #1			\n"
-	    "	pld     [%[src]]				\n"
-	    "	add     %[src], %[src], #64			\n"
-	    "	bgt 0b						\n"
-
-	    /* Clobbered input registers marked as input/outputs */
-	    : [src] "+r" (src_ptr), [count] "+r" (count)
-	    :     /* no unclobbered inputs */
-	    : "cc"
-	    );
-#else
-	do
-	{
-	    __pld (src_ptr);
-	    src_ptr += 64;
-	}
-	while (--count);
-#endif
-    }
-
-    {
-	uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */
-
-	/* row-major order */
-	/* left edge, middle block, right edge */
-	for ( ; height--; src_line += src_stride, aligned_line += dst_stride)
-	{
-	    /* Uncached framebuffer access is really, really slow if we do
-	     * it piecemeal. It should be much faster if we grab it all at
-	     * once. One scanline should easily fit in L1 cache, so this
-	     * should not waste RAM bandwidth.
-	     */
-	    neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail);
-
-	    /* Apply the actual filter */
-	    ARGB8_over_565_8_pix_neon (
-		src_line, scan_line + kernel_offset,
-		src_stride * sizeof(*src_line), kernel_count);
-
-	    /* Copy the modified scanline back */
-	    neon_quadword_copy (dst_line,
-				scan_line + copy_offset,
-				width >> 3, (width & 7) * 2);
-	}
-    }
-}
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     a8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     x8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     a8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     x8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     a8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     a8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r8g8b8,   null,     r8g8b8,   neon_composite_src_0888_0888),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     x8r8g8b8, neon_composite_src_0888_8888_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     r5g6b5,   neon_composite_src_0888_0565_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8r8g8b8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   neon_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
 
-#endif  /* USE_GCC_INLINE_ASM */
-
-static const pixman_fast_path_t arm_neon_fast_path_array[] =
-{
-    { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       neon_composite_add_8888_8_8,     0 },
-    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       neon_composite_add_8000_8000,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   neon_composite_over_n_8_0565,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   neon_composite_over_n_8_0565,    0 },
-    { PIXMAN_OP_SRC,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
-    { PIXMAN_OP_SRC,  PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_24_16,        0 },
-    { PIXMAN_OP_SRC,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
-    { PIXMAN_OP_SRC,  PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_24_16,        0 },
-#ifdef USE_GCC_INLINE_ASM
-    { PIXMAN_OP_SRC,  PIXMAN_r5g6b5,   PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_src_16_16,        0 },
-    { PIXMAN_OP_SRC,  PIXMAN_b5g6r5,   PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_src_16_16,        0 },
-#if 0 /* this code has some bugs */
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_n_0565,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_n_0565,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   neon_composite_over_8888_0565,   0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   neon_composite_over_8888_0565,   0 },
-#endif
-#endif
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, neon_composite_over_8888_8888,   0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, neon_composite_over_8888_8888,   0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, neon_composite_over_8888_8888,   0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, neon_composite_over_8888_8888,   0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888,    0 },
     { PIXMAN_OP_NONE },
 };
 
-const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array;
-
-static void
-arm_neon_composite (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    pixman_image_t *         src,
-                    pixman_image_t *         mask,
-                    pixman_image_t *         dest,
-                    int32_t                  src_x,
-                    int32_t                  src_y,
-                    int32_t                  mask_x,
-                    int32_t                  mask_y,
-                    int32_t                  dest_x,
-                    int32_t                  dest_y,
-                    int32_t                  width,
-                    int32_t                  height)
-{
-    if (_pixman_run_fast_path (arm_neon_fast_paths, imp,
-                               op, src, mask, dest,
-                               src_x, src_y,
-                               mask_x, mask_y,
-                               dest_x, dest_y,
-                               width, height))
-    {
-	return;
-    }
-
-    _pixman_implementation_composite (imp->delegate, op,
-                                      src, mask, dest,
-                                      src_x, src_y,
-                                      mask_x, mask_y,
-                                      dest_x, dest_y,
-                                      width, height);
-}
-
-static pixman_bool_t
-pixman_blt_neon (void *src_bits,
-                 void *dst_bits,
-                 int   src_stride,
-                 int   dst_stride,
-                 int   src_bpp,
-                 int   dst_bpp,
-                 int   src_x,
-                 int   src_y,
-                 int   dst_x,
-                 int   dst_y,
-                 int   width,
-                 int   height)
-{
-    if (!width || !height)
-	return TRUE;
-
-    /* accelerate only straight copies involving complete bytes */
-    if (src_bpp != dst_bpp || (src_bpp & 7))
-	return FALSE;
-
-    {
-	uint32_t bytes_per_pixel = src_bpp >> 3;
-	uint32_t byte_width = width * bytes_per_pixel;
-	/* parameter is in words for some reason */
-	int32_t src_stride_bytes = src_stride * 4;
-	int32_t dst_stride_bytes = dst_stride * 4;
-	uint8_t *src_bytes = ((uint8_t*) src_bits) +
-	    src_y * src_stride_bytes + src_x * bytes_per_pixel;
-	uint8_t *dst_bytes = ((uint8_t*) dst_bits) +
-	    dst_y * dst_stride_bytes + dst_x * bytes_per_pixel;
-	uint32_t quadword_count = byte_width / 16;
-	uint32_t offset         = byte_width % 16;
-
-	while (height--)
-	{
-	    neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset);
-	    src_bytes += src_stride_bytes;
-	    dst_bytes += dst_stride_bytes;
-	}
-    }
-
-    return TRUE;
-}
-
 static pixman_bool_t
 arm_neon_blt (pixman_implementation_t *imp,
               uint32_t *               src_bits,
@@ -2733,17 +273,18 @@ arm_neon_blt (pixman_implementation_t *imp,
               int                      width,
               int                      height)
 {
-    if (pixman_blt_neon (
+    if (!pixman_blt_neon (
             src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
             src_x, src_y, dst_x, dst_y, width, height))
+
     {
-	return TRUE;
+	return _pixman_implementation_blt (
+	    imp->delegate,
+	    src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+	    src_x, src_y, dst_x, dst_y, width, height);
     }
 
-    return _pixman_implementation_blt (
-               imp->delegate,
-               src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-               src_x, src_y, dst_x, dst_y, width, height);
+    return TRUE;
 }
 
 static pixman_bool_t
@@ -2764,18 +305,48 @@ arm_neon_fill (pixman_implementation_t *imp,
 	imp->delegate, bits, stride, bpp, x, y, width, height, xor);
 }
 
+#define BIND_COMBINE_U(name)                                             \
+void                                                                     \
+pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
+                                                  const uint32_t *dst,   \
+                                                  const uint32_t *src,   \
+                                                  const uint32_t *mask); \
+                                                                         \
+void                                                                     \
+pixman_composite_scanline_##name##_asm_neon (int32_t         w,          \
+                                             const uint32_t *dst,        \
+                                             const uint32_t *src);       \
+                                                                         \
+static void                                                              \
+neon_combine_##name##_u (pixman_implementation_t *imp,                   \
+                         pixman_op_t              op,                    \
+                         uint32_t *               dest,                  \
+                         const uint32_t *         src,                   \
+                         const uint32_t *         mask,                  \
+                         int                      width)                 \
+{                                                                        \
+    if (mask)                                                            \
+	pixman_composite_scanline_##name##_mask_asm_neon (width, dest,   \
+	                                                  src, mask);    \
+    else                                                                 \
+	pixman_composite_scanline_##name##_asm_neon (width, dest, src);  \
+}
+
+BIND_COMBINE_U (over)
+BIND_COMBINE_U (add)
+
 pixman_implementation_t *
 _pixman_implementation_create_arm_neon (void)
 {
     pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
-    pixman_implementation_t *imp = _pixman_implementation_create (general);
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (general, arm_neon_fast_paths);
+
+    imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
+    imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
 
-    imp->composite = arm_neon_composite;
-#if 0 /* this code has some bugs */
     imp->blt = arm_neon_blt;
-#endif
     imp->fill = arm_neon_fill;
 
     return imp;
 }
-
diff --git a/lib/pixman/pixman/pixman-arm-simd-asm.S b/lib/pixman/pixman/pixman-arm-simd-asm.S
new file mode 100644
index 000000000..a82e05de2
--- /dev/null
+++ b/lib/pixman/pixman/pixman-arm-simd-asm.S
@@ -0,0 +1,330 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+	.text
+	.arch armv6
+	.object_arch armv4
+	.arm
+	.altmacro
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+	.func fname
+	.global fname
+#ifdef __ELF__
+	.hidden fname
+	.type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * The code below was generated by gcc 4.3.4 from the commented out
+ * functions in 'pixman-arm-simd.c' file with the following optimization
+ * options: "-O3 -mcpu=arm1136jf-s -fomit-frame-pointer"
+ *
+ * TODO: replace gcc generated code with hand tuned versions because
+ * the code quality is not very good, introduce symbolic register
+ * aliases for better readability and maintainability.
+ */
+
+pixman_asm_function pixman_composite_add_8000_8000_asm_armv6
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}
+	mov	r10, r1
+	sub	sp, sp, #4
+	subs	r10, r10, #1
+	mov	r11, r0
+	mov	r8, r2
+	str	r3, [sp]
+	ldr	r7, [sp, #36]
+	bcc	0f
+6:	cmp	r11, #0
+	beq	1f
+	orr	r3, r8, r7
+	tst	r3, #3
+	beq	2f
+	mov	r1, r8
+	mov	r0, r7
+	mov	r12, r11
+	b	3f
+5:	tst	r3, #3
+	beq	4f
+3:	ldrb	r2, [r0], #1
+	subs	r12, r12, #1
+	ldrb	r3, [r1]
+	uqadd8	r3, r2, r3
+	strb	r3, [r1], #1
+	orr	r3, r1, r0
+	bne	5b
+1:	ldr	r3, [sp]
+	add	r8, r8, r3
+	ldr	r3, [sp, #40]
+	add	r7, r7, r3
+10:	subs	r10, r10, #1
+	bcs	6b
+0:	add	sp, sp, #4
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
+	bx	lr
+2:	mov	r12, r11
+	mov	r1, r8
+	mov	r0, r7
+4:	cmp	r12, #3
+	subgt	r6, r12, #4
+	movgt	r9, r12
+	lsrgt	r5, r6, #2
+	addgt	r3, r5, #1
+	movgt	r12, #0
+	lslgt	r4, r3, #2
+	ble	7f
+8:	ldr	r3, [r0, r12]
+	ldr	r2, [r1, r12]
+	uqadd8	r3, r3, r2
+	str	r3, [r1, r12]
+	add	r12, r12, #4
+	cmp	r12, r4
+	bne	8b
+	sub	r3, r9, #4
+	bic	r3, r3, #3
+	add	r3, r3, #4
+	subs	r12, r6, r5, lsl #2
+	add	r1, r1, r3
+	add	r0, r0, r3
+	beq	1b
+7:	mov	r4, #0
+9:	ldrb	r3, [r1, r4]
+	ldrb	r2, [r0, r4]
+	uqadd8	r3, r2, r3
+	strb	r3, [r1, r4]
+	add	r4, r4, #1
+	cmp	r4, r12
+	bne	9b
+	ldr	r3, [sp]
+	add	r8, r8, r3
+	ldr	r3, [sp, #40]
+	add	r7, r7, r3
+	b	10b
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_8888_asm_armv6
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}
+	sub	sp, sp, #20
+	cmp	r1, #0
+	mov	r12, r2
+	str	r1, [sp, #12]
+	str	r0, [sp, #16]
+	ldr	r2, [sp, #52]
+	beq	0f
+	lsl	r3, r3, #2
+	str	r3, [sp]
+	ldr	r3, [sp, #56]
+	mov	r10, #0
+	lsl	r3, r3, #2
+	str	r3, [sp, #8]
+	mov	r11, r3
+	b	1f
+6:	ldr	r11, [sp, #8]
+1:	ldr	r9, [sp]
+	mov	r0, r12
+	add	r12, r12, r9
+	mov	r1, r2
+	str	r12, [sp, #4]
+	add	r2, r2, r11
+	ldr	r12, [sp, #16]
+	ldr	r3, =0x00800080
+	ldr	r9, =0xff00ff00
+	mov	r11, #255
+	cmp	r12, #0
+	beq	4f
+5:	ldr	r5, [r1], #4
+	ldr	r4, [r0]
+	sub	r8, r11, r5, lsr #24
+	uxtb16	r6, r4
+	uxtb16	r7, r4, ror #8
+	mla	r6, r6, r8, r3
+	mla	r7, r7, r8, r3
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	and	r7, r7, r9
+	uxtab16	r6, r7, r6, ror #8
+	uqadd8	r5, r6, r5
+	str	r5, [r0], #4
+	subs	r12, r12, #1
+	bne	5b
+4:	ldr	r3, [sp, #12]
+	add	r10, r10, #1
+	cmp	r10, r3
+	ldr	r12, [sp, #4]
+	bne	6b
+0:	add	sp, sp, #20
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
+	bx	lr
+.endfunc
+
+pixman_asm_function pixman_composite_over_8888_n_8888_asm_armv6
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}
+	sub	sp, sp, #28
+	cmp	r1, #0
+	str	r1, [sp, #12]
+	ldrb	r1, [sp, #71]
+	mov	r12, r2
+	str	r0, [sp, #16]
+	ldr	r2, [sp, #60]
+	str	r1, [sp, #24]
+	beq	0f
+	lsl	r3, r3, #2
+	str	r3, [sp, #20]
+	ldr	r3, [sp, #64]
+	mov	r10, #0
+	lsl	r3, r3, #2
+	str	r3, [sp, #8]
+	mov	r11, r3
+	b	1f
+5:	ldr	r11, [sp, #8]
+1:	ldr	r4, [sp, #20]
+	mov	r0, r12
+	mov	r1, r2
+	add	r12, r12, r4
+	add	r2, r2, r11
+	str	r12, [sp]
+	str	r2, [sp, #4]
+	ldr	r12, [sp, #16]
+	ldr	r2, =0x00800080
+	ldr	r3, [sp, #24]
+	mov	r11, #255
+	cmp	r12, #0
+	beq	3f
+4:	ldr	r5, [r1], #4
+	ldr	r4, [r0]
+	uxtb16	r6, r5
+	uxtb16	r7, r5, ror #8
+	mla	r6, r6, r3, r2
+	mla	r7, r7, r3, r2
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8
+	uxtb16	r7, r7, ror #8
+	orr	r5, r6, r7, lsl #8
+	uxtb16	r6, r4
+	uxtb16	r7, r4, ror #8
+	sub	r8, r11, r5, lsr #24
+	mla	r6, r6, r8, r2
+	mla	r7, r7, r8, r2
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8
+	uxtb16	r7, r7, ror #8
+	orr	r6, r6, r7, lsl #8
+	uqadd8	r5, r6, r5
+	str	r5, [r0], #4
+	subs	r12, r12, #1
+	bne	4b
+3:	ldr	r1, [sp, #12]
+	add	r10, r10, #1
+	cmp	r10, r1
+	ldr	r12, [sp]
+	ldr	r2, [sp, #4]
+	bne	5b
+0:	add	sp, sp, #28
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
+	bx	lr
+.endfunc
+
+pixman_asm_function pixman_composite_over_n_8_8888_asm_armv6
+	push	{r4, r5, r6, r7, r8, r9, r10, r11}
+	sub	sp, sp, #28
+	cmp	r1, #0
+	ldr	r9, [sp, #60]
+	str	r1, [sp, #12]
+	bic	r1, r9, #-16777216
+	str	r1, [sp, #20]
+	mov	r12, r2
+	lsr	r1, r9, #8
+	ldr	r2, [sp, #20]
+	bic	r1, r1, #-16777216
+	bic	r2, r2, #65280
+	bic	r1, r1, #65280
+	str	r2, [sp, #20]
+	str	r0, [sp, #16]
+	str	r1, [sp, #4]
+	ldr	r2, [sp, #68]
+	beq	0f
+	lsl	r3, r3, #2
+	str	r3, [sp, #24]
+	mov	r0, #0
+	b	1f
+5:	ldr	r3, [sp, #24]
+1:	ldr	r4, [sp, #72]
+	mov	r10, r12
+	mov	r1, r2
+	add	r12, r12, r3
+	add	r2, r2, r4
+	str	r12, [sp, #8]
+	str	r2, [sp]
+	ldr	r12, [sp, #16]
+	ldr	r11, =0x00800080
+	ldr	r2, [sp, #4]
+	ldr	r3, [sp, #20]
+	cmp	r12, #0
+	beq	3f
+4:	ldrb	r5, [r1], #1
+	ldr	r4, [r10]
+	mla	r6, r3, r5, r11
+	mla	r7, r2, r5, r11
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8
+	uxtb16	r7, r7, ror #8
+	orr	r5, r6, r7, lsl #8
+	uxtb16	r6, r4
+	uxtb16	r7, r4, ror #8
+	mvn	r8, r5
+	lsr	r8, r8, #24
+	mla	r6, r6, r8, r11
+	mla	r7, r7, r8, r11
+	uxtab16	r6, r6, r6, ror #8
+	uxtab16	r7, r7, r7, ror #8
+	uxtb16	r6, r6, ror #8
+	uxtb16	r7, r7, ror #8
+	orr	r6, r6, r7, lsl #8
+	uqadd8	r5, r6, r5
+	str	r5, [r10], #4
+	subs	r12, r12, #1
+	bne	4b
+3:	ldr	r4, [sp, #12]
+	add	r0, r0, #1
+	cmp	r0, r4
+	ldr	r12, [sp, #8]
+	ldr	r2, [sp]
+	bne	5b
+0:	add	sp, sp, #28
+	pop	{r4, r5, r6, r7, r8, r9, r10, r11}
+	bx	lr
+.endfunc
diff --git a/lib/pixman/pixman/pixman-arm-simd.c b/lib/pixman/pixman/pixman-arm-simd.c
index fb7bf3da8..389c9e01a 100644
--- a/lib/pixman/pixman/pixman-arm-simd.c
+++ b/lib/pixman/pixman/pixman-arm-simd.c
@@ -28,31 +28,22 @@
 #endif
 
 #include "pixman-private.h"
+#include "pixman-arm-common.h"
 
-static void
-arm_composite_add_8000_8000 (pixman_implementation_t * impl,
-    pixman_op_t               op,
-    pixman_image_t *          src_image,
-    pixman_image_t *          mask_image,
-    pixman_image_t *          dst_image,
-    int32_t                   src_x,
-    int32_t                   src_y,
-    int32_t                   mask_x,
-    int32_t                   mask_y,
-    int32_t                   dest_x,
-    int32_t                   dest_y,
-    int32_t                   width,
-    int32_t                   height)
+#if 0 /* This code was moved to 'pixman-arm-simd-asm.S' */
+
+void
+pixman_composite_add_8000_8000_asm_armv6 (int32_t  width,
+                                          int32_t  height,
+                                          uint8_t *dst_line,
+                                          int32_t  dst_stride,
+                                          uint8_t *src_line,
+                                          int32_t  src_stride)
 {
-    uint8_t     *dst_line, *dst;
-    uint8_t     *src_line, *src;
-    int dst_stride, src_stride;
-    uint16_t w;
+    uint8_t *dst, *src;
+    int32_t w;
     uint8_t s, d;
 
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
-
     while (height--)
     {
 	dst = dst_line;
@@ -101,32 +92,21 @@ arm_composite_add_8000_8000 (pixman_implementation_t * impl,
 
 }
 
-static void
-arm_composite_over_8888_8888 (pixman_implementation_t * impl,
-    pixman_op_t               op,
-    pixman_image_t *          src_image,
-    pixman_image_t *          mask_image,
-    pixman_image_t *          dst_image,
-    int32_t                   src_x,
-    int32_t                   src_y,
-    int32_t                   mask_x,
-    int32_t                   mask_y,
-    int32_t                   dest_x,
-    int32_t                   dest_y,
-    int32_t                   width,
-    int32_t                   height)
+void
+pixman_composite_over_8888_8888_asm_armv6 (int32_t   width,
+                                           int32_t   height,
+                                           uint32_t *dst_line,
+                                           int32_t   dst_stride,
+                                           uint32_t *src_line,
+                                           int32_t   src_stride)
 {
-    uint32_t    *dst_line, *dst;
-    uint32_t    *src_line, *src;
-    int dst_stride, src_stride;
-    uint16_t w;
+    uint32_t    *dst;
+    uint32_t    *src;
+    int32_t w;
     uint32_t component_half = 0x800080;
     uint32_t upper_component_mask = 0xff00ff00;
     uint32_t alpha_mask = 0xff;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
     while (height--)
     {
 	dst = dst_line;
@@ -188,40 +168,27 @@ arm_composite_over_8888_8888 (pixman_implementation_t * impl,
 	    "2:\n\t"
 	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
 	    : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask),
-	    [alpha_mask] "r" (alpha_mask)
+	      [alpha_mask] "r" (alpha_mask)
 	    : "r4", "r5", "r6", "r7", "r8", "cc", "memory"
 	    );
     }
 }
 
-static void
-arm_composite_over_8888_n_8888 (
-    pixman_implementation_t * impl,
-    pixman_op_t               op,
-    pixman_image_t *          src_image,
-    pixman_image_t *          mask_image,
-    pixman_image_t *          dst_image,
-    int32_t                   src_x,
-    int32_t                   src_y,
-    int32_t                   mask_x,
-    int32_t                   mask_y,
-    int32_t                   dest_x,
-    int32_t                   dest_y,
-    int32_t                   width,
-    int32_t                   height)
+void
+pixman_composite_over_8888_n_8888_asm_armv6 (int32_t   width,
+                                             int32_t   height,
+                                             uint32_t *dst_line,
+                                             int32_t   dst_stride,
+                                             uint32_t *src_line,
+                                             int32_t   src_stride,
+                                             uint32_t  mask)
 {
-    uint32_t *dst_line, *dst;
-    uint32_t *src_line, *src;
-    uint32_t mask;
-    int dst_stride, src_stride;
-    uint16_t w;
+    uint32_t *dst;
+    uint32_t *src;
+    int32_t w;
     uint32_t component_half = 0x800080;
     uint32_t alpha_mask = 0xff;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-
-    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
     mask = (mask) >> 24;
 
     while (height--)
@@ -298,39 +265,28 @@ arm_composite_over_8888_n_8888 (
 	    "2:\n\t"
 	    : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src)
 	    : [component_half] "r" (component_half), [mask_alpha] "r" (mask),
-	    [alpha_mask] "r" (alpha_mask)
+	      [alpha_mask] "r" (alpha_mask)
 	    : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory"
 	    );
     }
 }
 
-static void
-arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
-			     pixman_op_t               op,
-			     pixman_image_t *          src_image,
-			     pixman_image_t *          mask_image,
-			     pixman_image_t *          dst_image,
-			     int32_t                   src_x,
-			     int32_t                   src_y,
-			     int32_t                   mask_x,
-			     int32_t                   mask_y,
-			     int32_t                   dest_x,
-			     int32_t                   dest_y,
-			     int32_t                   width,
-			     int32_t                   height)
+void
+pixman_composite_over_n_8_8888_asm_armv6 (int32_t   width,
+                                          int32_t   height,
+                                          uint32_t *dst_line,
+                                          int32_t   dst_stride,
+                                          uint32_t  src,
+                                          int32_t   unused,
+                                          uint8_t  *mask_line,
+                                          int32_t   mask_stride)
 {
-    uint32_t src, srca;
-    uint32_t *dst_line, *dst;
-    uint8_t  *mask_line, *mask;
-    int dst_stride, mask_stride;
-    uint16_t w;
-
-    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    uint32_t  srca;
+    uint32_t *dst;
+    uint8_t  *mask;
+    int32_t w;
 
-    /* bail out if fully transparent */
     srca = src >> 24;
-    if (src == 0)
-	return;
 
     uint32_t component_mask = 0xff00ff;
     uint32_t component_half = 0x800080;
@@ -338,9 +294,6 @@ arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
     uint32_t src_hi = (src >> 8) & component_mask;
     uint32_t src_lo = src & component_mask;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
-    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
-
     while (height--)
     {
 	dst = dst_line;
@@ -384,7 +337,8 @@ arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
 	    "uxtb16 r7, r4, ror #8\n\t"
 
 	    /* we could simplify this to use 'sub' if we were
-	    * willing to give up a register for alpha_mask */
+	     * willing to give up a register for alpha_mask
+	     */
 	    "mvn r8, r5\n\t"
 	    "mov r8, r8, lsr #24\n\t"
 
@@ -419,68 +373,45 @@ arm_composite_over_n_8_8888 (pixman_implementation_t * impl,
     }
 }
 
-static const pixman_fast_path_t arm_simd_fast_path_array[] =
-{
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, arm_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, arm_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, arm_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, arm_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, arm_composite_over_8888_n_8888,  NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, arm_composite_over_8888_n_8888,  NEED_SOLID_MASK },
+#endif
 
-    { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       arm_composite_add_8000_8000,     0 },
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8000_8000,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
 
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, arm_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, arm_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, arm_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, arm_composite_over_n_8_8888,     0 },
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (armv6, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
 
-    { PIXMAN_OP_NONE },
-};
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (armv6, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
 
-const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array;
-
-static void
-arm_simd_composite (pixman_implementation_t *imp,
-                    pixman_op_t              op,
-                    pixman_image_t *         src,
-                    pixman_image_t *         mask,
-                    pixman_image_t *         dest,
-                    int32_t                  src_x,
-                    int32_t                  src_y,
-                    int32_t                  mask_x,
-                    int32_t                  mask_y,
-                    int32_t                  dest_x,
-                    int32_t                  dest_y,
-                    int32_t                  width,
-                    int32_t                  height)
+static const pixman_fast_path_t arm_simd_fast_paths[] =
 {
-    if (_pixman_run_fast_path (arm_simd_fast_paths, imp,
-                               op, src, mask, dest,
-                               src_x, src_y,
-                               mask_x, mask_y,
-                               dest_x, dest_y,
-                               width, height))
-    {
-	return;
-    }
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8000_8000),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
 
-    _pixman_implementation_composite (imp->delegate, op,
-                                      src, mask, dest,
-                                      src_x, src_y,
-                                      mask_x, mask_y,
-                                      dest_x, dest_y,
-                                      width, height);
-}
+    { PIXMAN_OP_NONE },
+};
 
 pixman_implementation_t *
 _pixman_implementation_create_arm_simd (void)
 {
     pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
-    pixman_implementation_t *imp = _pixman_implementation_create (general);
-
-    imp->composite = arm_simd_composite;
+    pixman_implementation_t *imp = _pixman_implementation_create (general, arm_simd_fast_paths);
 
     return imp;
 }
-
diff --git a/lib/pixman/pixman/pixman-bits-image.c b/lib/pixman/pixman/pixman-bits-image.c
index 7a1910935..0225ae5aa 100644
--- a/lib/pixman/pixman/pixman-bits-image.c
+++ b/lib/pixman/pixman/pixman-bits-image.c
@@ -4,6 +4,7 @@
  *             2008 Aaron Plattner, NVIDIA Corporation
  * Copyright © 2000 SuSE, Inc.
  * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -28,6 +29,7 @@
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pixman-private.h"
@@ -158,6 +160,9 @@ repeat (pixman_repeat_t repeat, int size, int *coord)
 
     case PIXMAN_REPEAT_NONE:
 	break;
+
+    default:
+        break;
     }
 }
 
@@ -182,6 +187,97 @@ bits_image_fetch_pixel_nearest (bits_image_t   *image,
     }
 }
 
+#if SIZEOF_LONG > 4
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    uint64_t distxy, distxiy, distixy, distixiy;
+    uint64_t tl64, tr64, bl64, br64;
+    uint64_t f, r;
+
+    distxy = distx * disty;
+    distxiy = distx * (256 - disty);
+    distixy = (256 - distx) * disty;
+    distixiy = (256 - distx) * (256 - disty);
+
+    /* Alpha and Blue */
+    tl64 = tl & 0xff0000ff;
+    tr64 = tr & 0xff0000ff;
+    bl64 = bl & 0xff0000ff;
+    br64 = br & 0xff0000ff;
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r = f & 0x0000ff0000ff0000ull;
+
+    /* Red and Green */
+    tl64 = tl;
+    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+    tr64 = tr;
+    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+    bl64 = bl;
+    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+    br64 = br;
+    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+
+    return (uint32_t)(r >> 16);
+}
+
+#else
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t f, r;
+
+    distxy = distx * disty;
+    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
+    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
+    distixiy =
+	256 * 256 - (disty << 8) -
+	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
+
+    /* Blue */
+    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+
+    /* Green */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    tl >>= 16;
+    tr >>= 16;
+    bl >>= 16;
+    br >>= 16;
+    r >>= 16;
+
+    /* Red */
+    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+    r |= f & 0x00ff0000;
+
+    /* Alpha */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    return r;
+}
+
+#endif
+
 static force_inline uint32_t
 bits_image_fetch_pixel_bilinear (bits_image_t   *image,
 				 pixman_fixed_t  x,
@@ -191,9 +287,8 @@ bits_image_fetch_pixel_bilinear (bits_image_t   *image,
     int width = image->width;
     int height = image->height;
     int x1, y1, x2, y2;
-    uint32_t tl, tr, bl, br, r;
-    int32_t distx, disty, idistx, idisty;
-    uint32_t ft, fb;
+    uint32_t tl, tr, bl, br;
+    int32_t distx, disty;
 
     x1 = x - pixman_fixed_1 / 2;
     y1 = y - pixman_fixed_1 / 2;
@@ -212,7 +307,7 @@ bits_image_fetch_pixel_bilinear (bits_image_t   *image,
 	repeat (repeat_mode, height, &y1);
 	repeat (repeat_mode, width, &x2);
 	repeat (repeat_mode, height, &y2);
-	
+
 	tl = get_pixel (image, x1, y1, FALSE);
 	bl = get_pixel (image, x1, y2, FALSE);
 	tr = get_pixel (image, x2, y1, FALSE);
@@ -226,24 +321,218 @@ bits_image_fetch_pixel_bilinear (bits_image_t   *image,
 	br = get_pixel (image, x2, y2, TRUE);
     }
 
-    idistx = 256 - distx;
-    idisty = 256 - disty;
+    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+}
 
-#define GET8(v, i)   ((uint16_t) (uint8_t) ((v) >> i))
-    ft = GET8 (tl, 0) * idistx + GET8 (tr, 0) * distx;
-    fb = GET8 (bl, 0) * idistx + GET8 (br, 0) * distx;
-    r = (((ft * idisty + fb * disty) >> 16) & 0xff);
-    ft = GET8 (tl, 8) * idistx + GET8 (tr, 8) * distx;
-    fb = GET8 (bl, 8) * idistx + GET8 (br, 8) * distx;
-    r |= (((ft * idisty + fb * disty) >> 8) & 0xff00);
-    ft = GET8 (tl, 16) * idistx + GET8 (tr, 16) * distx;
-    fb = GET8 (bl, 16) * idistx + GET8 (br, 16) * distx;
-    r |= (((ft * idisty + fb * disty)) & 0xff0000);
-    ft = GET8 (tl, 24) * idistx + GET8 (tr, 24) * distx;
-    fb = GET8 (bl, 24) * idistx + GET8 (br, 24) * distx;
-    r |= (((ft * idisty + fb * disty) << 8) & 0xff000000);
+static void
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_image_t * ima,
+					  int              offset,
+					  int              line,
+					  int              width,
+					  uint32_t *       buffer,
+					  const uint32_t * mask,
+					  uint32_t         mask_bits)
+{
+    bits_image_t *bits = &ima->bits;
+    pixman_fixed_t x_top, x_bottom, x;
+    pixman_fixed_t ux_top, ux_bottom, ux;
+    pixman_vector_t v;
+    uint32_t top_mask, bottom_mask;
+    uint32_t *top_row;
+    uint32_t *bottom_row;
+    uint32_t *end;
+    uint32_t zero[2] = { 0, 0 };
+    int y, y1, y2;
+    int disty;
+    int mask_inc;
+    int w;
 
-    return r;
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (bits->common.transform, &v))
+	return;
+
+    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+    y = v.vector[1] - pixman_fixed_1/2;
+    disty = (y >> 8) & 0xff;
+
+    /* Load the pointers to the first and second lines from the source
+     * image that bilinear code must read.
+     *
+     * The main trick in this code is the check for whether either line
+     * lies outside of the image.
+     *
+     * When a line (either one) is outside, its pointer is redirected
+     * to a dummy area filled with zeros. Once that is done, the pointer
+     * must not advance, so the per-pixel increments applied to it inside
+     * the loop are set to zero.
+     */
+    y1 = pixman_fixed_to_int (y);
+    y2 = y1 + 1;
+
+    if (y1 < 0 || y1 >= bits->height)
+    {
+	top_row = zero;
+	x_top = 0;
+	ux_top = 0;
+    }
+    else
+    {
+	top_row = bits->bits + y1 * bits->rowstride;
+	x_top = x;
+	ux_top = ux;
+    }
+
+    if (y2 < 0 || y2 >= bits->height)
+    {
+	bottom_row = zero;
+	x_bottom = 0;
+	ux_bottom = 0;
+    }
+    else
+    {
+	bottom_row = bits->bits + y2 * bits->rowstride;
+	x_bottom = x;
+	ux_bottom = ux;
+    }
+
+    /* Instead of checking whether the operation uses the mask in
+     * each loop iteration, verify this only once and prepare the
+     * variables to make the code smaller inside the loop.
+     */
+    if (!mask)
+    {
+        mask_inc = 0;
+        mask_bits = 1;
+        mask = &mask_bits;
+    }
+    else
+    {
+        /* If there is a mask, prepare the variables to check it */
+        mask_inc = 1;
+    }
+
+    /* If both are zero, then the whole thing is zero */
+    if (top_row == zero && bottom_row == zero)
+    {
+	memset (buffer, 0, width * sizeof (uint32_t));
+	return;
+    }
+    else if (bits->format == PIXMAN_x8r8g8b8)
+    {
+	if (top_row == zero)
+	{
+	    top_mask = 0;
+	    bottom_mask = 0xff000000;
+	}
+	else if (bottom_row == zero)
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0;
+	}
+	else
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0xff000000;
+	}
+    }
+    else
+    {
+	top_mask = 0;
+	bottom_mask = 0;
+    }
+
+    end = buffer + width;
+
+    /* Zero fill to the left of the image */
+    while (buffer < end && x < pixman_fixed_minus_1)
+    {
+	*buffer++ = 0;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Left edge
+     */
+    while (buffer < end && x < 0)
+    {
+	uint32_t tr, br;
+	int32_t distx;
+
+	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	distx = (x >> 8) & 0xff;
+
+	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Main part */
+    w = pixman_int_to_fixed (bits->width - 1);
+
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, tr, bl, br;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	    distx = (x >> 8) & 0xff;
+
+	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Right Edge */
+    w = pixman_int_to_fixed (bits->width);
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, bl;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+	    distx = (x >> 8) & 0xff;
+
+	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Zero fill to the right of the image */
+    while (buffer < end)
+	*buffer++ = 0;
 }
 
 static force_inline uint32_t
@@ -340,6 +629,9 @@ bits_image_fetch_pixel_filtered (bits_image_t *image,
     case PIXMAN_FILTER_CONVOLUTION:
 	return bits_image_fetch_pixel_convolution (image, x, y);
 	break;
+
+    default:
+        break;
     }
 
     return 0;
@@ -583,55 +875,6 @@ bits_image_fetch_untransformed_64 (pixman_image_t * image,
     }
 }
 
-static pixman_bool_t out_of_bounds_workaround = TRUE;
-
-/* Old X servers rely on out-of-bounds accesses when they are asked
- * to composite with a window as the source. They create a pixman image
- * pointing to some bogus position in memory, but then they set a clip
- * region to the position where the actual bits are.
- *
- * Due to a bug in old versions of pixman, where it would not clip
- * against the image bounds when a clip region was set, this would
- * actually work. So by default we allow certain out-of-bound access
- * to happen unless explicitly disabled.
- *
- * Fixed X servers should call this function to disable the workaround.
- */
-PIXMAN_EXPORT void
-pixman_disable_out_of_bounds_workaround (void)
-{
-    out_of_bounds_workaround = FALSE;
-}
-
-static pixman_bool_t
-source_image_needs_out_of_bounds_workaround (bits_image_t *image)
-{
-    if (image->common.clip_sources                      &&
-        image->common.repeat == PIXMAN_REPEAT_NONE      &&
-	image->common.have_clip_region			&&
-        out_of_bounds_workaround)
-    {
-	if (!image->common.client_clip)
-	{
-	    /* There is no client clip, so if the clip region extends beyond the
-	     * drawable geometry, it must be because the X server generated the
-	     * bogus clip region.
-	     */
-	    const pixman_box32_t *extents = pixman_region32_extents (&image->common.clip_region);
-
-	    if (extents->x1 >= 0 && extents->x2 <= image->width &&
-		extents->y1 >= 0 && extents->y2 <= image->height)
-	    {
-		return FALSE;
-	    }
-	}
-
-	return TRUE;
-    }
-
-    return FALSE;
-}
-
 static void
 bits_image_property_changed (pixman_image_t *image)
 {
@@ -665,6 +908,25 @@ bits_image_property_changed (pixman_image_t *image)
 	image->common.get_scanline_64 = bits_image_fetch_untransformed_64;
 	image->common.get_scanline_32 = bits_image_fetch_untransformed_32;
     }
+    else if (bits->common.transform					&&
+	     bits->common.transform->matrix[2][0] == 0			&&
+	     bits->common.transform->matrix[2][1] == 0			&&
+	     bits->common.transform->matrix[2][2] == pixman_fixed_1	&&
+	     bits->common.transform->matrix[0][0] > 0			&&
+	     bits->common.transform->matrix[1][0] == 0			&&
+	     !bits->read_func						&&
+	     (bits->common.filter == PIXMAN_FILTER_BILINEAR ||
+	      bits->common.filter == PIXMAN_FILTER_GOOD	    ||
+	      bits->common.filter == PIXMAN_FILTER_BEST)		&&
+	     bits->common.repeat == PIXMAN_REPEAT_NONE			&&
+	     (bits->format == PIXMAN_a8r8g8b8	||
+	      bits->format == PIXMAN_x8r8g8b8))
+    {
+	image->common.get_scanline_64 =
+	    _pixman_image_get_scanline_generic_64;
+	image->common.get_scanline_32 =
+	    bits_image_fetch_bilinear_no_repeat_8888;
+    }
     else
     {
 	image->common.get_scanline_64 =
@@ -675,9 +937,6 @@ bits_image_property_changed (pixman_image_t *image)
 
     bits->store_scanline_64 = bits_image_store_scanline_64;
     bits->store_scanline_32 = bits_image_store_scanline_32;
-
-    bits->common.need_workaround =
-        source_image_needs_out_of_bounds_workaround (bits);
 }
 
 static uint32_t *
@@ -731,8 +990,10 @@ pixman_image_create_bits (pixman_format_code_t format,
 
     /* must be a whole number of uint32_t's
      */
-    return_val_if_fail (bits == NULL ||
-                        (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+    return_val_if_fail (
+	bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+
+    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
 
     if (!bits && width && height)
     {
diff --git a/lib/pixman/pixman/pixman-compiler.h b/lib/pixman/pixman/pixman-compiler.h
index 9647dbb48..26f7071c9 100644
--- a/lib/pixman/pixman/pixman-compiler.h
+++ b/lib/pixman/pixman/pixman-compiler.h
@@ -69,3 +69,135 @@
 #   define PIXMAN_EXPORT
 #endif
 
+/* TLS */
+#if defined(TOOLCHAIN_SUPPORTS__THREAD)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static __thread type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(__MINGW32__) && !defined(__WIN64)
+
+/* We can't include <windows.h> as it causes various clashes with
+ * identifiers in pixman, sigh. So just declare the functions we need
+ * here.
+ */
+extern __stdcall long InterlockedCompareExchange(long volatile *, long, long);
+#define InterlockedCompareExchangePointer(d,e,c)			\
+    (void *)InterlockedCompareExchange((long volatile *)(d),(long)(e),(long)(c))
+extern __stdcall int TlsAlloc (void);
+extern __stdcall void *TlsGetValue (unsigned);
+extern __stdcall int TlsSetValue (unsigned, void *);
+extern __stdcall void *CreateMutexA(void *, int, char *);
+extern __stdcall int CloseHandle(void *);
+extern __stdcall unsigned WaitForSingleObject (void *, unsigned);
+extern __stdcall int ReleaseMutex (void *);
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static volatile int tls_ ## name ## _initialized = 0;		\
+    static void *tls_ ## name ## _mutex = NULL;				\
+    static unsigned tls_ ## name ## _index;				\
+									\
+    static type *							\
+    tls_ ## name ## _alloc (void)					\
+    {									\
+        type *value = calloc (1, sizeof (type));			\
+        if (value)							\
+            TlsSetValue (tls_ ## name ## _index, value);		\
+        return value;							\
+    }									\
+									\
+    static force_inline type *						\
+    tls_ ## name ## _get (void)						\
+    {									\
+	type *value;							\
+	if (!tls_ ## name ## _initialized)				\
+	{								\
+	    if (!tls_ ## name ## _mutex)				\
+	    {								\
+		void *mutex = CreateMutexA (NULL, 0, NULL);		\
+		if (InterlockedCompareExchangePointer (			\
+			&tls_ ## name ## _mutex, mutex, NULL) != NULL)	\
+		{							\
+		    CloseHandle (mutex);				\
+		}							\
+	    }								\
+	    WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF);	\
+	    if (!tls_ ## name ## _initialized)				\
+	    {								\
+		tls_ ## name ## _index = TlsAlloc ();			\
+		tls_ ## name ## _initialized = 1;			\
+	    }								\
+	    ReleaseMutex (tls_ ## name ## _mutex);			\
+	}								\
+	if (tls_ ## name ## _index == 0xFFFFFFFF)			\
+	    return NULL;						\
+	value = TlsGetValue (tls_ ## name ## _index);			\
+	if (!value)							\
+	    value = tls_ ## name ## _alloc ();				\
+	return value;							\
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    tls_ ## name ## _get ()
+
+#elif defined(_MSC_VER)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static __declspec(thread) type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(HAVE_PTHREAD_SETSPECIFIC)
+
+#include <pthread.h>
+
+#  define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \
+    static pthread_key_t tls_ ## name ## _key;				\
+									\
+    static void								\
+    tls_ ## name ## _destroy_value (void *value)			\
+    {									\
+	free (value);							\
+    }									\
+									\
+    static void								\
+    tls_ ## name ## _make_key (void)					\
+    {									\
+	pthread_key_create (&tls_ ## name ## _key,			\
+			    tls_ ## name ## _destroy_value);		\
+    }									\
+									\
+    static type *							\
+    tls_ ## name ## _alloc (void)					\
+    {									\
+	type *value = calloc (1, sizeof (type));			\
+	if (value)							\
+	    pthread_setspecific (tls_ ## name ## _key, value);		\
+	return value;							\
+    }									\
+									\
+    static force_inline type *						\
+    tls_ ## name ## _get (void)						\
+    {									\
+	type *value = NULL;						\
+	if (pthread_once (&tls_ ## name ## _once_control,		\
+			  tls_ ## name ## _make_key) == 0)		\
+	{								\
+	    value = pthread_getspecific (tls_ ## name ## _key);		\
+	    if (!value)							\
+		value = tls_ ## name ## _alloc ();			\
+	}								\
+	return value;							\
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    tls_ ## name ## _get ()
+
+#else
+
+#    error "Unknown thread local support for this system"
+
+#endif
diff --git a/lib/pixman/pixman/pixman-conical-gradient.c b/lib/pixman/pixman/pixman-conical-gradient.c
index d720db3d4..0341a8ebf 100644
--- a/lib/pixman/pixman/pixman-conical-gradient.c
+++ b/lib/pixman/pixman/pixman-conical-gradient.c
@@ -23,7 +23,11 @@
  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  * SOFTWARE.
  */
+
+#ifdef HAVE_CONFIG_H
 #include <config.h>
+#endif
+
 #include <stdlib.h>
 #include <math.h>
 #include "pixman-private.h"
@@ -49,7 +53,7 @@ conical_gradient_get_scanline_32 (pixman_image_t *image,
     double rx = x + 0.5;
     double ry = y + 0.5;
     double rz = 1.;
-    double a = conical->angle / (180. * 65536);
+    double a = (conical->angle * M_PI) / (180. * 65536);
 
     _pixman_gradient_walker_init (&walker, gradient, source->common.repeat);
 
diff --git a/lib/pixman/pixman/pixman-cpu.c b/lib/pixman/pixman/pixman-cpu.c
index 5d5469bb8..e96b140bd 100644
--- a/lib/pixman/pixman/pixman-cpu.c
+++ b/lib/pixman/pixman/pixman-cpu.c
@@ -253,8 +253,6 @@ pixman_arm_read_auxv ()
 	    if (aux.a_type == AT_HWCAP)
 	    {
 		uint32_t hwcap = aux.a_un.a_val;
-		if (getenv ("ARM_FORCE_HWCAP"))
-		    hwcap = strtoul (getenv ("ARM_FORCE_HWCAP"), NULL, 0);
 		/* hardcode these values to avoid depending on specific
 		 * versions of the hwcap header, e.g. HWCAP_NEON
 		 */
@@ -266,8 +264,6 @@ pixman_arm_read_auxv ()
 	    else if (aux.a_type == AT_PLATFORM)
 	    {
 		const char *plat = (const char*) aux.a_un.a_val;
-		if (getenv ("ARM_FORCE_PLATFORM"))
-		    plat = getenv ("ARM_FORCE_PLATFORM");
 		if (strncmp (plat, "v7l", 3) == 0)
 		{
 		    arm_has_v7 = TRUE;
@@ -280,12 +276,6 @@ pixman_arm_read_auxv ()
 	    }
 	}
 	close (fd);
-
-	/* if we don't have 2.6.29, we have to do this hack; set
-	 * the env var to trust HWCAP.
-	 */
-	if (!getenv ("ARM_TRUST_HWCAP") && arm_has_v7)
-	    arm_has_neon = TRUE;
     }
 
     arm_tests_initialized = TRUE;
@@ -319,7 +309,7 @@ pixman_have_arm_neon (void)
 
 #endif /* USE_ARM_SIMD || USE_ARM_NEON */
 
-#ifdef USE_MMX
+#if defined(USE_MMX) || defined(USE_SSE2)
 /* The CPU detection code needs to be in a file not compiled with
  * "-mmmx -msse", as gcc would generate CMOV instructions otherwise
  * that would lead to SIGILL instructions on old CPUs that don't have
diff --git a/lib/pixman/pixman/pixman-edge-imp.h b/lib/pixman/pixman/pixman-edge-imp.h
index a30f82108..a4698eddb 100644
--- a/lib/pixman/pixman/pixman-edge-imp.h
+++ b/lib/pixman/pixman/pixman-edge-imp.h
@@ -49,10 +49,14 @@ RASTERIZE_EDGES (pixman_image_t  *image,
 	rx = r->x;
 #if N_BITS == 1
 	/* For the non-antialiased case, round the coordinates up, in effect
-	 * sampling the center of the pixel. (The AA case does a similar 
-	 * adjustment in RENDER_SAMPLES_X) */
-	lx += X_FRAC_FIRST(1);
-	rx += X_FRAC_FIRST(1);
+	 * sampling just slightly to the left of the pixel. This is so that
+	 * when the sample point lies exactly on the line, we round towards
+	 * north-west.
+	 *
+	 * (The AA case does a similar adjustment in RENDER_SAMPLES_X)
+	 */
+	lx += X_FRAC_FIRST(1) - pixman_fixed_e;
+	rx += X_FRAC_FIRST(1) - pixman_fixed_e;
 #endif
 	/* clip X */
 	if (lx < 0)
@@ -79,14 +83,6 @@ RASTERIZE_EDGES (pixman_image_t  *image,
 #if N_BITS == 1
 	    {
 
-#ifdef WORDS_BIGENDIAN
-#   define SCREEN_SHIFT_LEFT(x,n)	((x) << (n))
-#   define SCREEN_SHIFT_RIGHT(x,n)	((x) >> (n))
-#else
-#   define SCREEN_SHIFT_LEFT(x,n)	((x) >> (n))
-#   define SCREEN_SHIFT_RIGHT(x,n)	((x) << (n))
-#endif
-
 #define LEFT_MASK(x)							\
 		(((x) & 0x1f) ?						\
 		 SCREEN_SHIFT_RIGHT (0xffffffff, (x) & 0x1f) : 0)
diff --git a/lib/pixman/pixman/pixman-edge.c b/lib/pixman/pixman/pixman-edge.c
index 81a2e960a..8d498ab44 100644
--- a/lib/pixman/pixman/pixman-edge.c
+++ b/lib/pixman/pixman/pixman-edge.c
@@ -70,7 +70,7 @@
 #define N_BITS  4
 #define RASTERIZE_EDGES rasterize_edges_4
 
-#ifndef WORDS_BIG_ENDIAN
+#ifndef WORDS_BIGENDIAN
 #define SHIFT_4(o)      ((o) << 2)
 #else
 #define SHIFT_4(o)      ((1 - (o)) << 2)
@@ -358,6 +358,9 @@ PIXMAN_RASTERIZE_EDGES (pixman_image_t *image,
     case 8:
 	rasterize_edges_8 (image, l, r, t, b);
 	break;
+
+    default:
+        break;
     }
 }
 
diff --git a/lib/pixman/pixman/pixman-fast-path.c b/lib/pixman/pixman/pixman-fast-path.c
index 5ab8d8c99..bf5b298c8 100644
--- a/lib/pixman/pixman/pixman-fast-path.c
+++ b/lib/pixman/pixman/pixman-fast-path.c
@@ -27,6 +27,7 @@
 #include <config.h>
 #endif
 #include <string.h>
+#include <stdlib.h>
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 
@@ -125,7 +126,7 @@ fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
     int src_stride, mask_stride, dst_stride;
     uint8_t m;
     uint32_t s, d;
-    uint16_t w;
+    int32_t w;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
@@ -183,7 +184,7 @@ fast_composite_in_n_8_8 (pixman_implementation_t *imp,
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask, m;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint16_t t;
 
     src = _pixman_image_get_solid (src_image, dest_image->bits.format);
@@ -260,7 +261,7 @@ fast_composite_in_8_8 (pixman_implementation_t *imp,
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint8_t s;
     uint16_t t;
 
@@ -308,7 +309,7 @@ fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst, d;
     uint8_t     *mask_line, *mask, m;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
@@ -366,7 +367,7 @@ fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst, d;
     uint32_t    *mask_line, *mask, ma;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
@@ -423,7 +424,7 @@ fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst, d;
     uint32_t    *mask_line, *mask, ma;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
@@ -490,7 +491,7 @@ fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
     uint32_t d;
     uint8_t     *mask_line, *mask, m;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
@@ -555,7 +556,7 @@ fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
     uint32_t d;
     uint8_t     *mask_line, *mask, m;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
@@ -622,7 +623,7 @@ fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
     uint32_t  d;
     uint32_t *mask_line, *mask, ma;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
 
@@ -697,7 +698,7 @@ fast_composite_over_8888_8888 (pixman_implementation_t *imp,
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
     uint8_t a;
-    uint16_t w;
+    int32_t w;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
@@ -723,6 +724,7 @@ fast_composite_over_8888_8888 (pixman_implementation_t *imp,
     }
 }
 
+#if 0
 static void
 fast_composite_over_8888_0888 (pixman_implementation_t *imp,
 			       pixman_op_t              op,
@@ -743,7 +745,7 @@ fast_composite_over_8888_0888 (pixman_implementation_t *imp,
     uint32_t    *src_line, *src, s;
     uint8_t a;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
@@ -773,6 +775,7 @@ fast_composite_over_8888_0888 (pixman_implementation_t *imp,
 	}
     }
 }
+#endif
 
 static void
 fast_composite_over_8888_0565 (pixman_implementation_t *imp,
@@ -794,7 +797,7 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp,
     uint32_t    *src_line, *src, s;
     uint8_t a;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
@@ -847,7 +850,7 @@ fast_composite_src_x888_0565 (pixman_implementation_t *imp,
     uint16_t    *dst_line, *dst;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
@@ -887,7 +890,7 @@ fast_composite_add_8000_8000 (pixman_implementation_t *imp,
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint8_t s, d;
     uint16_t t;
 
@@ -938,7 +941,7 @@ fast_composite_add_8888_8888 (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t s, d;
 
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
@@ -971,24 +974,24 @@ fast_composite_add_8888_8888 (pixman_implementation_t *imp,
 }
 
 static void
-fast_composite_add_8888_8_8 (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             pixman_image_t *         src_image,
-                             pixman_image_t *         mask_image,
-                             pixman_image_t *         dst_image,
-                             int32_t                  src_x,
-                             int32_t                  src_y,
-                             int32_t                  mask_x,
-                             int32_t                  mask_y,
-                             int32_t                  dest_x,
-                             int32_t                  dest_y,
-                             int32_t                  width,
-                             int32_t                  height)
+fast_composite_add_n_8_8 (pixman_implementation_t *imp,
+			  pixman_op_t              op,
+			  pixman_image_t *         src_image,
+			  pixman_image_t *         mask_image,
+			  pixman_image_t *         dst_image,
+			  int32_t                  src_x,
+			  int32_t                  src_y,
+			  int32_t                  mask_x,
+			  int32_t                  mask_y,
+			  int32_t                  dest_x,
+			  int32_t                  dest_y,
+			  int32_t                  width,
+			  int32_t                  height)
 {
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t src;
     uint8_t sa;
 
@@ -1023,6 +1026,254 @@ fast_composite_add_8888_8_8 (pixman_implementation_t *imp,
     }
 }
 
+#ifdef WORDS_BIGENDIAN
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+#else
+#define CREATE_BITMASK(n) (1U << (n))	/* unsigned: n may be 31 */
+#define UPDATE_BITMASK(n) ((n) << 1)
+#endif
+
+#define TEST_BIT(p, n)					\
+    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n)							\
+    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
+
+static void
+fast_composite_add_1000_1000 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t     *dst_line, *dst;
+    uint32_t     *src_line, *src;
+    int           dst_stride, src_stride;
+    int32_t       w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+                           src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dst_image, 0, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    /*
+	     * TODO: improve performance by processing uint32_t data instead
+	     *       of individual bits
+	     */
+	    if (TEST_BIT (src, src_x + w))
+		SET_BIT (dst, dest_x + w);
+	}
+    }
+}
+
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t     src, srca;
+    uint32_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+
+    if (width <= 0)
+	return;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;
+
+    if (srca == 0xff)
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = src;
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = over (src, *dst);
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+}
+
+static void
+fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              pixman_image_t *         src_image,
+                              pixman_image_t *         mask_image,
+                              pixman_image_t *         dst_image,
+                              int32_t                  src_x,
+                              int32_t                  src_y,
+                              int32_t                  mask_x,
+                              int32_t                  mask_y,
+                              int32_t                  dest_x,
+                              int32_t                  dest_y,
+                              int32_t                  width,
+                              int32_t                  height)
+{
+    uint32_t     src, srca;
+    uint16_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+    uint32_t     d;
+    uint16_t     src565;
+
+    if (width <= 0)
+	return;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;
+
+    if (srca == 0xff)
+    {
+	src565 = CONVERT_8888_TO_0565 (src);
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = src565;
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		{
+		    d = over (src, CONVERT_0565_TO_0888 (*dst));
+		    *dst = CONVERT_8888_TO_0565 (d);
+		}
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+}
+
 /*
  * Simple bitblt
  */
@@ -1095,81 +1346,316 @@ fast_composite_src_8888_x888 (pixman_implementation_t *imp,
     }
 }
 
-static const pixman_fast_path_t c_fast_paths[] =
+static force_inline pixman_bool_t
+repeat (pixman_repeat_t repeat, int *c, int size)
 {
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   fast_composite_over_n_8_0565, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   fast_composite_over_n_8_0565, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r8g8b8,   fast_composite_over_n_8_0888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b8g8r8,   fast_composite_over_n_8_0888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, fast_composite_over_n_8_8888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, fast_composite_over_n_8_8888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, fast_composite_over_n_8_8888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, fast_composite_over_n_8_8888, 0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   fast_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   fast_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, fast_composite_over_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, fast_composite_over_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, fast_composite_over_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, fast_composite_over_x888_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, fast_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, fast_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   fast_composite_over_8888_0565,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, fast_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, fast_composite_over_8888_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   fast_composite_over_8888_0565,    0 },
-    { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, fast_composite_add_8888_8888,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, fast_composite_add_8888_8888,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       fast_composite_add_8000_8000,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_solid,     PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fast_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_ADD, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       fast_composite_add_8888_8_8,    0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_a8r8g8b8, fast_composite_solid_fill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_x8r8g8b8, fast_composite_solid_fill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_a8b8g8r8, fast_composite_solid_fill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_x8b8g8r8, fast_composite_solid_fill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_a8,       fast_composite_solid_fill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_null,     PIXMAN_r5g6b5,   fast_composite_solid_fill, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, fast_composite_src_8888_x888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, fast_composite_src_8888_x888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, fast_composite_src_8888_x888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, fast_composite_src_8888_x888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_r5g6b5,   fast_composite_src_x888_0565, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_r5g6b5,   fast_composite_src_x888_0565, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_b5g6r5,   fast_composite_src_x888_0565, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_b5g6r5,   fast_composite_src_x888_0565, 0 },
-    { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       fast_composite_in_8_8,   0 },
-    { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       fast_composite_in_n_8_8, 0 },
-    { PIXMAN_OP_NONE },
-};
+    if (repeat == PIXMAN_REPEAT_NONE)
+    {
+	if (*c < 0 || *c >= size)
+	    return FALSE;
+    }
+    else if (repeat == PIXMAN_REPEAT_NORMAL)
+    {
+	while (*c >= size)
+	    *c -= size;
+	while (*c < 0)
+	    *c += size;
+    }
+    else if (repeat == PIXMAN_REPEAT_PAD)
+    {
+	*c = CLIP (*c, 0, size - 1);
+    }
+    else /* REFLECT */
+    {
+	*c = MOD (*c, size * 2);
+	if (*c >= size)
+	    *c = size * 2 - *c - 1;
+    }
+    return TRUE;
+}
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instruction latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)	/* bits 24-31 of the pixel */
+ /* No OVER variant with a 565 source is instantiated, but FAST_NEAREST
+    still expands GET_<SRC_FORMAT>_ALPHA in every variant, so this must exist. */
+#define GET_0565_ALPHA(s) 0xff
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,					\
+		     src_type_t, dst_type_t, OP, do_repeat)					\
+static void											\
+fast_composite_scaled_nearest_ ## scale_func_name ## _ ## OP (pixman_implementation_t *imp,	\
+							      pixman_op_t              op,      \
+							      pixman_image_t *         src_image, \
+							      pixman_image_t *         mask_image, \
+							      pixman_image_t *         dst_image, \
+							      int32_t                  src_x,   \
+							      int32_t                  src_y,   \
+							      int32_t                  mask_x,  \
+							      int32_t                  mask_y,  \
+							      int32_t                  dst_x,   \
+							      int32_t                  dst_y,   \
+							      int32_t                  width,   \
+							      int32_t                  height)  \
+{												\
+    dst_type_t *dst_line;									\
+    src_type_t *src_first_line;									\
+    uint32_t   d;										\
+    src_type_t s1, s2;										\
+    uint8_t   a1, a2;										\
+    int       w;										\
+    int       x1, x2, y;									\
+    pixman_fixed_t orig_vx;									\
+    pixman_fixed_t max_vx, max_vy;								\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+												\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    int       src_stride, dst_stride;								\
+												\
+    if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	abort();	/* only SRC and OVER are implemented */					\
+												\
+    PIXMAN_IMAGE_GET_LINE (dst_image, dst_x, dst_y, dst_type_t, dst_stride, dst_line, 1);	\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];	/* x step per destination pixel */	\
+    unit_y = src_image->common.transform->matrix[1][1];	/* y step per destination row */	\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+												\
+    if (do_repeat)										\
+    {												\
+	/* Clamp repeating positions inside the actual samples */				\
+	max_vx = src_image->bits.width << 16;							\
+	max_vy = src_image->bits.height << 16;							\
+												\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, max_vx);						\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+												\
+    orig_vx = vx;	/* every scanline restarts from this source x */			\
+												\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+												\
+	y = vy >> 16;										\
+	vy += unit_y;										\
+	if (do_repeat)										\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+												\
+	src = src_first_line + src_stride * y;							\
+												\
+	w = width;										\
+	vx = orig_vx;										\
+	while ((w -= 2) >= 0)	/* two destination pixels per iteration */			\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s1 = src[x1];									\
+												\
+	    x2 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s2 = src[x2];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a1 ^= 0xff;	/* a1 becomes 255 - alpha */					\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+												\
+		if (a2 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+		}										\
+		else if (s2)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a2 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s2);			\
+	    }											\
+	}											\
+												\
+	if (w & 1)	/* loop left w at -1 (one pixel remains) or -2 (none) */		\
+	{											\
+	    x1 = vx >> 16;									\
+	    vx += unit_x;									\
+	    if (do_repeat)									\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= max_vx)								\
+		    vx -= max_vx;								\
+	    }											\
+	    s1 = src[x1];									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = CONVERT_## DST_FORMAT ## _TO_8888 (*dst);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = CONVERT_8888_TO_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		    *dst++ = CONVERT_ ## SRC_FORMAT ## _TO_ ## DST_FORMAT (s1);			\
+	    }											\
+	}											\
+    }												\
+}
+
+FAST_NEAREST(x888_x888_none, 8888, 8888, uint32_t, uint32_t, SRC, /*repeat: */ 0)
+FAST_NEAREST(x888_x888_normal, 8888, 8888, uint32_t, uint32_t, SRC, /*repeat: */ 1)
+FAST_NEAREST(x888_x888_none, 8888, 8888, uint32_t, uint32_t, OVER, /*repeat: */ 0)
+FAST_NEAREST(x888_x888_normal, 8888, 8888, uint32_t, uint32_t, OVER, /*repeat: */ 1)
+FAST_NEAREST(x888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, /*repeat: */ 0)
+FAST_NEAREST(x888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, /*repeat: */ 1)
+FAST_NEAREST(565_565_none, 0565, 0565, uint16_t, uint16_t, SRC, /*repeat: */ 0)
+FAST_NEAREST(565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, /*repeat: */ 1)
+FAST_NEAREST(8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, /*repeat: */ 0)
+FAST_NEAREST(8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, /*repeat: */ 1)
+
+/* Fetch the pixel at column x of scanline src after applying src_repeat;
+ * yields 0 when repeat() reports that x lies outside the image. */
+static force_inline uint32_t
+fetch_nearest (pixman_repeat_t src_repeat,
+	       pixman_format_code_t format,
+	       uint32_t *src, int x, int src_width)
+{
+    if (!repeat (src_repeat, &x, src_width))
+	return 0;
+
+    /* Neither x8 format stores alpha in its top byte, so force it opaque;
+     * x8b8g8r8 needs this just as x8r8g8b8 does. */
+    if (format == PIXMAN_x8r8g8b8 || format == PIXMAN_x8b8g8r8)
+	return *(src + x) | 0xff000000;
+
+    return *(src + x);
+}
+
+/* OVER: a zero source is a no-op and an opaque one simply replaces *dst. */
+static force_inline void
+combine_over (uint32_t s, uint32_t *dst)
+{
+    uint8_t inv_alpha = 0xff - (s >> 24);
+
+    if (s == 0)
+	return;
+    if (inv_alpha == 0)
+	*dst = s;
+    else	/* presumably *dst * inv_alpha / 255 + s, going by the macro name */
+	UN8x4_MUL_UN8_ADD_UN8x4 (*dst, inv_alpha, s);
+}
+
+static force_inline void
+combine_src (uint32_t s, uint32_t *dst)
+{
+    *dst = s;	/* SRC operator: the source pixel simply replaces *dst */
+}
 
 static void
-fast_composite_src_scale_nearest (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  pixman_image_t *         src_image,
-                                  pixman_image_t *         mask_image,
-                                  pixman_image_t *         dst_image,
-                                  int32_t                  src_x,
-                                  int32_t                  src_y,
-                                  int32_t                  mask_x,
-                                  int32_t                  mask_y,
-                                  int32_t                  dest_x,
-                                  int32_t                  dest_y,
-                                  int32_t                  width,
-                                  int32_t                  height)
+fast_composite_scaled_nearest (pixman_implementation_t *imp,
+			       pixman_op_t              op,
+			       pixman_image_t *         src_image,
+			       pixman_image_t *         mask_image,
+			       pixman_image_t *         dst_image,
+			       int32_t                  src_x,
+			       int32_t                  src_y,
+			       int32_t                  mask_x,
+			       int32_t                  mask_y,
+			       int32_t                  dest_x,
+			       int32_t                  dest_y,
+			       int32_t                  width,
+			       int32_t                  height)
 {
-    uint32_t       *dst;
-    uint32_t       *src;
-    int dst_stride, src_stride;
-    int i, j;
+    uint32_t       *dst_line;
+    uint32_t       *src_line;
+    int             dst_stride, src_stride;
+    int		    src_width, src_height;
+    pixman_repeat_t src_repeat;
+    pixman_fixed_t unit_x, unit_y;
+    pixman_format_code_t src_format;
     pixman_vector_t v;
+    pixman_fixed_t vy;
 
-    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst, 1);
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
-     * transformed from destination space to source space */
-    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src, 1);
+     * transformed from destination space to source space
+     */
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
 
     /* reference point is the center of the pixel */
     v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
@@ -1179,145 +1665,221 @@ fast_composite_src_scale_nearest (pixman_implementation_t *imp,
     if (!pixman_transform_point_3d (src_image->common.transform, &v))
 	return;
 
+    unit_x = src_image->common.transform->matrix[0][0];
+    unit_y = src_image->common.transform->matrix[1][1];
+
     /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
     v.vector[0] -= pixman_fixed_e;
     v.vector[1] -= pixman_fixed_e;
 
-    for (j = 0; j < height; j++)
+    src_height = src_image->bits.height;
+    src_width = src_image->bits.width;
+    src_repeat = src_image->common.repeat;
+    src_format = src_image->bits.format;
+
+    vy = v.vector[1];
+    while (height--)
     {
-	pixman_fixed_t vx = v.vector[0];
-	pixman_fixed_t vy = v.vector[1];
+        pixman_fixed_t vx = v.vector[0];
+	int y = pixman_fixed_to_int (vy);
+	uint32_t *dst = dst_line;
 
-	for (i = 0; i < width; ++i)
+	dst_line += dst_stride;
+
+        /* adjust the y location by a unit vector in the y direction
+         * this is equivalent to transforming y+1 of the destination point to source space */
+        vy += unit_y;
+
+	if (!repeat (src_repeat, &y, src_height))
 	{
-	    pixman_bool_t inside_bounds;
-	    uint32_t result;
-	    int x, y;
-	    x = vx >> 16;
-	    y = vy >> 16;
-
-	    /* apply the repeat function */
-	    switch (src_image->common.repeat)
-	    {
-	    case PIXMAN_REPEAT_NORMAL:
-		x = MOD (x, src_image->bits.width);
-		y = MOD (y, src_image->bits.height);
-		inside_bounds = TRUE;
-		break;
-
-	    case PIXMAN_REPEAT_PAD:
-		x = CLIP (x, 0, src_image->bits.width - 1);
-		y = CLIP (y, 0, src_image->bits.height - 1);
-		inside_bounds = TRUE;
-		break;
-
-	    case PIXMAN_REPEAT_REFLECT:
-		x = MOD (x, src_image->bits.width * 2);
-		if (x >= src_image->bits.width)
-		    x = src_image->bits.width * 2 - x - 1;
-		y = MOD (y, src_image->bits.height * 2);
-		if (y >= src_image->bits.height)
-		    y = src_image->bits.height * 2 - y - 1;
-		inside_bounds = TRUE;
-		break;
-
-	    case PIXMAN_REPEAT_NONE:
-	    default:
-		inside_bounds =
-		    (x >= 0				&&
-		     x < src_image->bits.width		&&
-		     y >= 0				&&
-		     y < src_image->bits.height);
-		break;
-	    }
+	    if (op == PIXMAN_OP_SRC)
+		memset (dst, 0, sizeof (*dst) * width);
+	}
+	else
+	{
+	    int w = width;
+
+	    uint32_t *src = src_line + y * src_stride;
 
-	    if (inside_bounds)
+	    while (w >= 2)
 	    {
-		/* XXX: we should move this multiplication out of the loop */
-		result = *(src + y * src_stride + x);
+		uint32_t s1, s2;
+		int x1, x2;
+
+		x1 = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		x2 = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		w -= 2;
+
+		s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
+		s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
+
+		if (op == PIXMAN_OP_OVER)
+		{
+		    combine_over (s1, dst++);
+		    combine_over (s2, dst++);
+		}
+		else
+		{
+		    combine_src (s1, dst++);
+		    combine_src (s2, dst++);
+		}
 	    }
-	    else
+
+	    while (w--)
 	    {
-		result = 0;
-	    }
-	    *(dst + i) = result;
+		uint32_t s;
+		int x;
 
-	    /* adjust the x location by a unit vector in the x direction:
-	     * this is equivalent to transforming x+1 of the destination
-	     * point to source space
-	     */
-	    vx += src_image->common.transform->matrix[0][0];
+		x = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		s = fetch_nearest (src_repeat, src_format, src, x, src_width);
+
+		if (op == PIXMAN_OP_OVER)
+		    combine_over (s, dst++);
+		else
+		    combine_src (s, dst++);
+	    }
 	}
-	/* adjust the y location by a unit vector in the y direction
-	 * this is equivalent to transforming y+1 of the destination point
-	 * to source space
-	 */
-	v.vector[1] += src_image->common.transform->matrix[1][1];
-	dst += dst_stride;
     }
 }
 
-static void
-fast_path_composite (pixman_implementation_t *imp,
-                     pixman_op_t              op,
-                     pixman_image_t *         src,
-                     pixman_image_t *         mask,
-                     pixman_image_t *         dest,
-                     int32_t                  src_x,
-                     int32_t                  src_y,
-                     int32_t                  mask_x,
-                     int32_t                  mask_y,
-                     int32_t                  dest_x,
-                     int32_t                  dest_y,
-                     int32_t                  width,
-                     int32_t                  height)
+static const pixman_fast_path_t c_fast_paths[] =
 {
-    if (src->type == BITS
-        && src->common.transform
-        && !mask
-        && op == PIXMAN_OP_SRC
-        && !src->common.alpha_map && !dest->common.alpha_map
-        && (src->common.filter == PIXMAN_FILTER_NEAREST)
-        && PIXMAN_FORMAT_BPP (dest->bits.format) == 32
-        && src->bits.format == dest->bits.format
-        && !src->bits.read_func && !src->bits.write_func
-        && !dest->bits.read_func && !dest->bits.write_func)
-    {
-	/* ensure that the transform matrix only has a scale */
-	if (src->common.transform->matrix[0][1] == 0 &&
-	    src->common.transform->matrix[1][0] == 0 &&
-	    src->common.transform->matrix[2][0] == 0 &&
-	    src->common.transform->matrix[2][1] == 0 &&
-	    src->common.transform->matrix[2][2] == pixman_fixed_1)
-	{
-	    _pixman_walk_composite_region (imp, op,
-	                                   src, mask, dest,
-	                                   src_x, src_y,
-	                                   mask_x, mask_y,
-	                                   dest_x, dest_y,
-	                                   width, height,
-	                                   fast_composite_src_scale_nearest);
-	    return;
-	}
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1000_1000),
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_8888_x888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_8888_x888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_8888_x888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_8888_x888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
+
+#define SCALED_NEAREST_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_NEAREST_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NO_WIDE_FORMAT)
+
+#define HAS_NORMAL_REPEAT_FLAGS						\
+    (FAST_PATH_NO_REFLECT_REPEAT |					\
+     FAST_PATH_NO_PAD_REPEAT     |					\
+     FAST_PATH_NO_NONE_REPEAT)
+
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | HAS_NORMAL_REPEAT_FLAGS | FAST_PATH_16BIT_SAFE | FAST_PATH_X_UNIT_POSITIVE, \
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    },									\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP,		\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
     }
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, x888_x888),
 
-    if (_pixman_run_fast_path (c_fast_paths, imp,
-                               op, src, mask, dest,
-                               src_x, src_y,
-                               mask_x, mask_y,
-                               dest_x, dest_y,
-                               width, height))
-    {
-	return;
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, x888_x888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, x888_565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, x888_565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, x888_x888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, x888_x888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+
+#define NEAREST_FAST_PATH(op,s,d)		\
+    {   PIXMAN_OP_ ## op,			\
+	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\
+	PIXMAN_null, 0,				\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,	\
+	fast_composite_scaled_nearest,		\
     }
 
-    _pixman_implementation_composite (imp->delegate, op,
-                                      src, mask, dest,
-                                      src_x, src_y,
-                                      mask_x, mask_y,
-                                      dest_x, dest_y,
-                                      width, height);
-}
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
+
+    {   PIXMAN_OP_NONE	},
+};
 
 static void
 pixman_fill8 (uint32_t *bits,
@@ -1430,11 +1992,9 @@ pixman_implementation_t *
 _pixman_implementation_create_fast_path (void)
 {
     pixman_implementation_t *general = _pixman_implementation_create_general ();
-    pixman_implementation_t *imp = _pixman_implementation_create (general);
+    pixman_implementation_t *imp = _pixman_implementation_create (general, c_fast_paths);
 
-    imp->composite = fast_path_composite;
     imp->fill = fast_path_fill;
 
     return imp;
 }
-
diff --git a/lib/pixman/pixman/pixman-general.c b/lib/pixman/pixman/pixman-general.c
index 3ead3dac7..bddf79aae 100644
--- a/lib/pixman/pixman/pixman-general.c
+++ b/lib/pixman/pixman/pixman-general.c
@@ -133,15 +133,27 @@ general_composite_rect  (pixman_implementation_t *imp,
     /* Skip the store step and composite directly into the
      * destination if the output format of the compose func matches
      * the destination format.
+     *
+     * If the destination format is a8r8g8b8 then we can always do
+     * this. If it is x8r8g8b8, then we can only do it if the
+     * operator doesn't make use of destination alpha.
      */
-    if (!wide &&
-        !dest->common.alpha_map &&
-        !dest->bits.write_func &&
-        (op == PIXMAN_OP_ADD || op == PIXMAN_OP_OVER) &&
-        (dest->bits.format == PIXMAN_a8r8g8b8 ||
-         dest->bits.format == PIXMAN_x8r8g8b8))
+    if ((dest->bits.format == PIXMAN_a8r8g8b8)	||
+	(dest->bits.format == PIXMAN_x8r8g8b8	&&
+	 (op == PIXMAN_OP_OVER		||
+	  op == PIXMAN_OP_ADD		||
+	  op == PIXMAN_OP_SRC		||
+	  op == PIXMAN_OP_CLEAR		||
+	  op == PIXMAN_OP_IN_REVERSE	||
+	  op == PIXMAN_OP_OUT_REVERSE	||
+	  op == PIXMAN_OP_DST)))
     {
-	store = NULL;
+	if (!wide &&
+	    !dest->common.alpha_map &&
+	    !dest->bits.write_func)
+	{
+	    store = NULL;
+	}
     }
 
     if (!store)
@@ -252,26 +264,11 @@ general_composite_rect  (pixman_implementation_t *imp,
 	free (scanline_buffer);
 }
 
-static void
-general_composite (pixman_implementation_t * imp,
-                   pixman_op_t               op,
-                   pixman_image_t *          src,
-                   pixman_image_t *          mask,
-                   pixman_image_t *          dest,
-                   int32_t                   src_x,
-                   int32_t                   src_y,
-                   int32_t                   mask_x,
-                   int32_t                   mask_y,
-                   int32_t                   dest_x,
-                   int32_t                   dest_y,
-                   int32_t                   width,
-                   int32_t                   height)
+static const pixman_fast_path_t general_fast_path[] =
 {
-    _pixman_walk_composite_region (imp, op, src, mask, dest, src_x, src_y,
-                                   mask_x, mask_y, dest_x, dest_y,
-				   width, height,
-                                   general_composite_rect);
-}
+    { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any,	0, PIXMAN_any, 0, general_composite_rect },
+    { PIXMAN_OP_NONE }
+};
 
 static pixman_bool_t
 general_blt (pixman_implementation_t *imp,
@@ -310,12 +307,11 @@ general_fill (pixman_implementation_t *imp,
 pixman_implementation_t *
 _pixman_implementation_create_general (void)
 {
-    pixman_implementation_t *imp = _pixman_implementation_create (NULL);
+    pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
 
     _pixman_setup_combiner_functions_32 (imp);
     _pixman_setup_combiner_functions_64 (imp);
 
-    imp->composite = general_composite;
     imp->blt = general_blt;
     imp->fill = general_fill;
 
diff --git a/lib/pixman/pixman/pixman-image.c b/lib/pixman/pixman/pixman-image.c
index fff0497f1..03a39db87 100644
--- a/lib/pixman/pixman/pixman-image.c
+++ b/lib/pixman/pixman/pixman-image.c
@@ -48,8 +48,6 @@ _pixman_init_gradient (gradient_t *                  gradient,
     gradient->n_stops = n_stops;
 
     gradient->stop_range = 0xffff;
-    gradient->color_table = NULL;
-    gradient->color_table_size = 0;
     gradient->common.class = SOURCE_IMAGE_CLASS_UNKNOWN;
 
     return TRUE;
@@ -119,7 +117,6 @@ _pixman_image_allocate (void)
 	common->client_clip = FALSE;
 	common->destroy_func = NULL;
 	common->destroy_data = NULL;
-	common->need_workaround = FALSE;
 	common->dirty = TRUE;
     }
 
@@ -233,23 +230,249 @@ pixman_image_set_destroy_function (pixman_image_t *            image,
     image->common.destroy_data = data;
 }
 
+PIXMAN_EXPORT void *
+pixman_image_get_destroy_data (pixman_image_t *image)
+{
+  return image->common.destroy_data;
+}
+
 void
 _pixman_image_reset_clip_region (pixman_image_t *image)
 {
     image->common.have_clip_region = FALSE;
 }
 
+static pixman_bool_t out_of_bounds_workaround = TRUE;
+
+/* Old X servers rely on out-of-bounds accesses when they are asked
+ * to composite with a window as the source. They create a pixman image
+ * pointing to some bogus position in memory, but then they set a clip
+ * region to the position where the actual bits are.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So by default we allow certain out-of-bounds accesses
+ * to happen unless explicitly disabled.
+ *
+ * Fixed X servers should call this function to disable the workaround.
+ */
+PIXMAN_EXPORT void
+pixman_disable_out_of_bounds_workaround (void)
+{
+    out_of_bounds_workaround = FALSE;
+}
+
+static pixman_bool_t
+source_image_needs_out_of_bounds_workaround (bits_image_t *image)
+{
+    if (image->common.clip_sources                      &&
+        image->common.repeat == PIXMAN_REPEAT_NONE      &&
+	image->common.have_clip_region			&&
+        out_of_bounds_workaround)
+    {
+	if (!image->common.client_clip)
+	{
+	    /* There is no client clip, so if the clip region extends beyond the
+	     * drawable geometry, it must be because the X server generated the
+	     * bogus clip region.
+	     */
+	    const pixman_box32_t *extents =
+		pixman_region32_extents (&image->common.clip_region);
+
+	    if (extents->x1 >= 0 && extents->x2 <= image->width &&
+		extents->y1 >= 0 && extents->y2 <= image->height)
+	    {
+		return FALSE;
+	    }
+	}
+
+	return TRUE;
+    }
+
+    return FALSE;
+}
+
+static void
+compute_image_info (pixman_image_t *image)
+{
+    pixman_format_code_t code;
+    uint32_t flags = 0;
+
+    /* Transform */
+    if (!image->common.transform)
+    {
+	flags |= (FAST_PATH_ID_TRANSFORM | FAST_PATH_X_UNIT_POSITIVE);
+    }
+    else
+    {
+	if (image->common.transform->matrix[0][1] == 0 &&
+	    image->common.transform->matrix[1][0] == 0 &&
+	    image->common.transform->matrix[2][0] == 0 &&
+	    image->common.transform->matrix[2][1] == 0 &&
+	    image->common.transform->matrix[2][2] == pixman_fixed_1)
+	{
+	    flags |= FAST_PATH_SCALE_TRANSFORM;
+	}
+
+	if (image->common.transform->matrix[0][0] > 0)
+	    flags |= FAST_PATH_X_UNIT_POSITIVE;
+    }
+
+    /* Alpha map */
+    if (!image->common.alpha_map)
+	flags |= FAST_PATH_NO_ALPHA_MAP;
+
+    /* Filter */
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+	flags |= (FAST_PATH_NEAREST_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+	break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+	break;
+
+    default:
+	flags |= FAST_PATH_NO_CONVOLUTION_FILTER;
+	break;
+    }
+
+    /* Repeat mode */
+    switch (image->common.repeat)
+    {
+    case PIXMAN_REPEAT_NONE:
+	flags |= FAST_PATH_NO_REFLECT_REPEAT | FAST_PATH_NO_PAD_REPEAT;
+	break;
+
+    case PIXMAN_REPEAT_REFLECT:
+	flags |= FAST_PATH_NO_PAD_REPEAT | FAST_PATH_NO_NONE_REPEAT;
+	break;
+
+    case PIXMAN_REPEAT_PAD:
+	flags |= FAST_PATH_NO_REFLECT_REPEAT | FAST_PATH_NO_NONE_REPEAT;
+	break;
+
+    default:
+	flags |= FAST_PATH_NO_REFLECT_REPEAT | FAST_PATH_NO_PAD_REPEAT | FAST_PATH_NO_NONE_REPEAT;
+	break;
+    }
+
+    /* Component alpha */
+    if (image->common.component_alpha)
+	flags |= FAST_PATH_COMPONENT_ALPHA;
+    else
+	flags |= FAST_PATH_UNIFIED_ALPHA;
+
+    flags |= (FAST_PATH_NO_ACCESSORS | FAST_PATH_NO_WIDE_FORMAT);
+
+    /* Type specific checks */
+    switch (image->type)
+    {
+    case SOLID:
+	code = PIXMAN_solid;
+
+	if (image->solid.color.alpha == 0xffff)
+	    flags |= FAST_PATH_IS_OPAQUE;
+	break;
+
+    case BITS:
+	if (image->bits.width == 1	&&
+	    image->bits.height == 1	&&
+	    image->common.repeat != PIXMAN_REPEAT_NONE)
+	{
+	    code = PIXMAN_solid;
+	}
+	else
+	{
+	    code = image->bits.format;
+
+	    if (!image->common.transform &&
+		image->common.repeat == PIXMAN_REPEAT_NORMAL)
+	    {
+		flags |= FAST_PATH_SIMPLE_REPEAT;
+	    }
+	}
+
+	if (image->common.repeat != PIXMAN_REPEAT_NONE				&&
+	    !PIXMAN_FORMAT_A (image->bits.format)				&&
+	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_GRAY		&&
+	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_COLOR)
+	{
+	    flags |= FAST_PATH_IS_OPAQUE;
+	}
+
+	if (source_image_needs_out_of_bounds_workaround (&image->bits))
+	    flags |= FAST_PATH_NEEDS_WORKAROUND;
+
+	if (image->bits.read_func || image->bits.write_func)
+	    flags &= ~FAST_PATH_NO_ACCESSORS;
+
+	if (PIXMAN_FORMAT_IS_WIDE (image->bits.format))
+	    flags &= ~FAST_PATH_NO_WIDE_FORMAT;
+	break;
+
+    case LINEAR:
+    case RADIAL:
+	code = PIXMAN_unknown;
+
+	if (image->common.repeat != PIXMAN_REPEAT_NONE)
+	{
+	    int i;
+
+	    flags |= FAST_PATH_IS_OPAQUE;
+	    for (i = 0; i < image->gradient.n_stops; ++i)
+	    {
+		if (image->gradient.stops[i].color.alpha != 0xffff)
+		{
+		    flags &= ~FAST_PATH_IS_OPAQUE;
+		    break;
+		}
+	    }
+	}
+	break;
+
+    default:
+	code = PIXMAN_unknown;
+	break;
+    }
+
+    /* Both alpha maps and convolution filters can introduce
+     * non-opaqueness in otherwise opaque images. Also
+     * an image with component alpha turned on is only opaque
+     * if all channels are opaque, so we simply clear the
+     * FAST_PATH_IS_OPAQUE flag unconditionally for those images.
+     */
+    if (image->common.alpha_map					||
+	image->common.filter == PIXMAN_FILTER_CONVOLUTION	||
+	image->common.component_alpha)
+    {
+	flags &= ~FAST_PATH_IS_OPAQUE;
+    }
+
+    image->common.flags = flags;
+    image->common.extended_format_code = code;
+}
+
 void
 _pixman_image_validate (pixman_image_t *image)
 {
     if (image->common.dirty)
     {
+	compute_image_info (image);
+
+	/* It is important that property_changed is
+	 * called *after* compute_image_info() because
+	 * property_changed() can make use of the flags
+	 * to set up accessors etc.
+	 */
 	image->common.property_changed (image);
+
 	image->common.dirty = FALSE;
     }
 
     if (image->common.alpha_map)
-	_pixman_image_validate (image->common.alpha_map);
+	_pixman_image_validate ((pixman_image_t *)image->common.alpha_map);
 }
 
 PIXMAN_EXPORT pixman_bool_t
@@ -518,25 +741,6 @@ pixman_image_get_depth (pixman_image_t *image)
     return 0;
 }
 
-pixman_bool_t
-_pixman_image_is_solid (pixman_image_t *image)
-{
-    if (image->type == SOLID)
-	return TRUE;
-
-    if (image->type != BITS     ||
-        image->bits.width != 1  ||
-        image->bits.height != 1)
-    {
-	return FALSE;
-    }
-
-    if (image->common.repeat == PIXMAN_REPEAT_NONE)
-	return FALSE;
-
-    return TRUE;
-}
-
 uint32_t
 _pixman_image_get_solid (pixman_image_t *     image,
                          pixman_format_code_t format)
@@ -556,54 +760,3 @@ _pixman_image_get_solid (pixman_image_t *     image,
 
     return result;
 }
-
-pixman_bool_t
-_pixman_image_is_opaque (pixman_image_t *image)
-{
-    int i;
-
-    if (image->common.alpha_map)
-	return FALSE;
-
-    switch (image->type)
-    {
-    case BITS:
-	if (image->common.repeat == PIXMAN_REPEAT_NONE)
-	    return FALSE;
-
-	if (PIXMAN_FORMAT_A (image->bits.format))
-	    return FALSE;
-	break;
-
-    case LINEAR:
-    case RADIAL:
-	if (image->common.repeat == PIXMAN_REPEAT_NONE)
-	    return FALSE;
-
-	for (i = 0; i < image->gradient.n_stops; ++i)
-	{
-	    if (image->gradient.stops[i].color.alpha != 0xffff)
-		return FALSE;
-	}
-	break;
-
-    case CONICAL:
-	/* Conical gradients always have a transparent border */
-	return FALSE;
-	break;
-
-    case SOLID:
-	if (ALPHA_8 (image->solid.color) != 0xff)
-	    return FALSE;
-	break;
-    }
-
-    /* Convolution filters can introduce translucency if the sum of the
-     * weights is lower than 1.
-     */
-    if (image->common.filter == PIXMAN_FILTER_CONVOLUTION)
-	return FALSE;
-
-    return TRUE;
-}
-
diff --git a/lib/pixman/pixman/pixman-implementation.c b/lib/pixman/pixman/pixman-implementation.c
index bcda9fe85..bc3749ef5 100644
--- a/lib/pixman/pixman/pixman-implementation.c
+++ b/lib/pixman/pixman/pixman-implementation.c
@@ -28,30 +28,6 @@
 #include "pixman-private.h"
 
 static void
-delegate_composite (pixman_implementation_t * imp,
-                    pixman_op_t               op,
-                    pixman_image_t *          src,
-                    pixman_image_t *          mask,
-                    pixman_image_t *          dest,
-                    int32_t                   src_x,
-                    int32_t                   src_y,
-                    int32_t                   mask_x,
-                    int32_t                   mask_y,
-                    int32_t                   dest_x,
-                    int32_t                   dest_y,
-                    int32_t                   width,
-                    int32_t                   height)
-{
-    _pixman_implementation_composite (imp->delegate,
-                                      op,
-                                      src, mask, dest,
-                                      src_x, src_y,
-                                      mask_x, mask_y,
-                                      dest_x, dest_y,
-                                      width, height);
-}
-
-static void
 delegate_combine_32 (pixman_implementation_t * imp,
                      pixman_op_t               op,
                      uint32_t *                dest,
@@ -136,7 +112,8 @@ delegate_fill (pixman_implementation_t *imp,
 }
 
 pixman_implementation_t *
-_pixman_implementation_create (pixman_implementation_t *delegate)
+_pixman_implementation_create (pixman_implementation_t *delegate,
+			       const pixman_fast_path_t *fast_paths)
 {
     pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t));
     pixman_implementation_t *d;
@@ -145,6 +122,8 @@ _pixman_implementation_create (pixman_implementation_t *delegate)
     if (!imp)
 	return NULL;
 
+    assert (fast_paths);
+
     /* Make sure the whole delegate chain has the right toplevel */
     imp->delegate = delegate;
     for (d = imp; d != NULL; d = d->delegate)
@@ -152,11 +131,10 @@ _pixman_implementation_create (pixman_implementation_t *delegate)
 
     /* Fill out function pointers with ones that just delegate
      */
-    imp->composite = delegate_composite;
     imp->blt = delegate_blt;
     imp->fill = delegate_fill;
 
-    for (i = 0; i < PIXMAN_OP_LAST; ++i)
+    for (i = 0; i < PIXMAN_N_OPERATORS; ++i)
     {
 	imp->combine_32[i] = delegate_combine_32;
 	imp->combine_64[i] = delegate_combine_64;
@@ -164,6 +142,8 @@ _pixman_implementation_create (pixman_implementation_t *delegate)
 	imp->combine_64_ca[i] = delegate_combine_64_ca;
     }
 
+    imp->fast_paths = fast_paths;
+    
     return imp;
 }
 
@@ -211,27 +191,6 @@ _pixman_implementation_combine_64_ca (pixman_implementation_t * imp,
     (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width);
 }
 
-void
-_pixman_implementation_composite (pixman_implementation_t * imp,
-                                  pixman_op_t               op,
-                                  pixman_image_t *          src,
-                                  pixman_image_t *          mask,
-                                  pixman_image_t *          dest,
-                                  int32_t                   src_x,
-                                  int32_t                   src_y,
-                                  int32_t                   mask_x,
-                                  int32_t                   mask_y,
-                                  int32_t                   dest_x,
-                                  int32_t                   dest_y,
-                                  int32_t                   width,
-                                  int32_t                   height)
-{
-    (*imp->composite) (imp, op,
-		       src, mask, dest,
-		       src_x, src_y, mask_x, mask_y, dest_x, dest_y,
-		       width, height);
-}
-
 pixman_bool_t
 _pixman_implementation_blt (pixman_implementation_t * imp,
                             uint32_t *                src_bits,
diff --git a/lib/pixman/pixman/pixman-mmx.c b/lib/pixman/pixman/pixman-mmx.c
index 7dcc1dc96..d51b40cc1 100644
--- a/lib/pixman/pixman/pixman-mmx.c
+++ b/lib/pixman/pixman/pixman-mmx.c
@@ -485,7 +485,7 @@ mmx_combine_over_reverse_u (pixman_implementation_t *imp,
     {
 	__m64 d, da;
 	uint32_t s = combine (src, mask);
-	
+
 	d = load8888 (*dest);
 	da = expand_alpha (d);
 	*dest = store8888 (over (d, da, load8888 (s)));
@@ -511,12 +511,12 @@ mmx_combine_in_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 x, a;
-	
+
 	x = load8888 (combine (src, mask));
 	a = load8888 (*dest);
 	a = expand_alpha (a);
 	x = pix_multiply (x, a);
-	
+
 	*dest = store8888 (x);
 
 	++dest;
@@ -540,7 +540,7 @@ mmx_combine_in_reverse_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 x, a;
-	
+
 	x = load8888 (*dest);
 	a = load8888 (combine (src, mask));
 	a = expand_alpha (a);
@@ -568,7 +568,7 @@ mmx_combine_out_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 x, a;
-	
+
 	x = load8888 (combine (src, mask));
 	a = load8888 (*dest);
 	a = expand_alpha (a);
@@ -597,7 +597,7 @@ mmx_combine_out_reverse_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 x, a;
-	
+
 	x = load8888 (*dest);
 	a = load8888 (combine (src, mask));
 	a = expand_alpha (a);
@@ -627,7 +627,7 @@ mmx_combine_atop_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 s, da, d, sia;
-	
+
 	s = load8888 (combine (src, mask));
 	d = load8888 (*dest);
 	sia = expand_alpha (s);
@@ -659,7 +659,7 @@ mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 s, dia, d, sa;
-	
+
 	s = load8888 (combine (src, mask));
 	d = load8888 (*dest);
 	sa = expand_alpha (s);
@@ -689,7 +689,7 @@ mmx_combine_xor_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 s, dia, d, sia;
-	
+
 	s = load8888 (combine (src, mask));
 	d = load8888 (*dest);
 	sia = expand_alpha (s);
@@ -720,7 +720,7 @@ mmx_combine_add_u (pixman_implementation_t *imp,
     while (dest < end)
     {
 	__m64 s, d;
-	
+
 	s = load8888 (combine (src, mask));
 	d = load8888 (*dest);
 	s = pix_add (s, d);
@@ -785,7 +785,7 @@ mmx_combine_src_ca (pixman_implementation_t *imp,
     {
 	__m64 a = load8888 (*mask);
 	__m64 s = load8888 (*src);
-	
+
 	s = pix_multiply (s, a);
 	*dest = store8888 (s);
 
@@ -864,7 +864,7 @@ mmx_combine_in_ca (pixman_implementation_t *imp,
 	__m64 s = load8888 (*src);
 	__m64 d = load8888 (*dest);
 	__m64 da = expand_alpha (d);
-	
+
 	s = pix_multiply (s, a);
 	s = pix_multiply (s, da);
 	*dest = store8888 (s);
@@ -892,7 +892,7 @@ mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
 	__m64 s = load8888 (*src);
 	__m64 d = load8888 (*dest);
 	__m64 sa = expand_alpha (s);
-	
+
 	a = pix_multiply (a, sa);
 	d = pix_multiply (d, a);
 	*dest = store8888 (d);
@@ -920,7 +920,7 @@ mmx_combine_out_ca (pixman_implementation_t *imp,
 	__m64 s = load8888 (*src);
 	__m64 d = load8888 (*dest);
 	__m64 da = expand_alpha (d);
-	
+
 	da = negate (da);
 	s = pix_multiply (s, a);
 	s = pix_multiply (s, da);
@@ -1102,7 +1102,7 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp,
 {
     uint32_t src;
     uint32_t    *dst_line, *dst;
-    uint16_t w;
+    int32_t w;
     int dst_stride;
     __m64 vsrc, vsrca;
 
@@ -1181,7 +1181,7 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp,
 {
     uint32_t src;
     uint16_t    *dst_line, *dst;
-    uint16_t w;
+    int32_t w;
     int dst_stride;
     __m64 vsrc, vsrca;
 
@@ -1209,7 +1209,7 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp,
 	{
 	    uint64_t d = *dst;
 	    __m64 vdest = expand565 (M64 (d), 0);
-	    
+
 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
 	    *dst = UINT64 (vdest);
 
@@ -1240,7 +1240,7 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp,
 	{
 	    uint64_t d = *dst;
 	    __m64 vdest = expand565 (M64 (d), 0);
-	    
+
 	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
 	    *dst = UINT64 (vdest);
 
@@ -1376,7 +1376,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     uint32_t mask;
     __m64 vmask;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     __m64 srca;
 
     CHECKPOINT ();
@@ -1385,6 +1385,7 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
+    mask &= 0xff000000;
     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
     vmask = load8888 (mask);
     srca = MC (4x00ff);
@@ -1461,7 +1462,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
     uint32_t mask;
     __m64 vmask;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     __m64 srca;
 
     CHECKPOINT ();
@@ -1470,6 +1471,7 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
     mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
 
+    mask &= 0xff000000;
     mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
     vmask = load8888 (mask);
     srca = MC (4x00ff);
@@ -1596,7 +1598,7 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
     uint32_t s;
     int dst_stride, src_stride;
     uint8_t a;
-    uint16_t w;
+    int32_t w;
 
     CHECKPOINT ();
 
@@ -1615,7 +1617,7 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
 	{
 	    s = *src++;
 	    a = s >> 24;
-	    
+
 	    if (a == 0xff)
 	    {
 		*dst = s;
@@ -1627,7 +1629,7 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
 		sa = expand_alpha (ms);
 		*dst = store8888 (over (ms, sa, load8888 (*dst)));
 	    }
-	    
+
 	    dst++;
 	}
     }
@@ -1652,7 +1654,7 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
     uint16_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     CHECKPOINT ();
 
@@ -1756,7 +1758,7 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
     uint32_t *dst_line, *dst;
     uint8_t *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     __m64 vsrc, vsrca;
     uint64_t srcsrc;
 
@@ -1795,7 +1797,7 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
 		__m64 vdest = in_over (vsrc, vsrca,
 				       expand_alpha_rev (M64 (m)),
 				       load8888 (*dst));
-		
+
 		*dst = store8888 (vdest);
 	    }
 
@@ -1809,7 +1811,7 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
 	while (w >= 2)
 	{
 	    uint64_t m0, m1;
-	    
+
 	    m0 = *mask;
 	    m1 = *(mask + 1);
 
@@ -1883,22 +1885,13 @@ pixman_fill_mmx (uint32_t *bits,
     if (bpp != 16 && bpp != 32 && bpp != 8)
 	return FALSE;
 
-    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
-	return FALSE;
-
-    if (bpp == 8 &&
-        ((xor >> 16 != (xor & 0xffff)) ||
-         (xor >> 24 != (xor & 0x00ff) >> 16)))
-    {
-	return FALSE;
-    }
-
     if (bpp == 8)
     {
 	stride = stride * (int) sizeof (uint32_t) / 1;
 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
 	byte_width = width;
 	stride *= 1;
+        xor = (xor & 0xff) * 0x01010101;
     }
     else if (bpp == 16)
     {
@@ -1906,6 +1899,7 @@ pixman_fill_mmx (uint32_t *bits,
 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
 	byte_width = 2 * width;
 	stride *= 2;
+        xor = (xor & 0xffff) * 0x00010001;
     }
     else
     {
@@ -1928,7 +1922,7 @@ pixman_fill_mmx (uint32_t *bits,
         "movq		%7,	%5\n"
         "movq		%7,	%6\n"
 	: "=y" (v1), "=y" (v2), "=y" (v3),
-        "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7)
+	  "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7)
 	: "y" (vfill));
 #endif
 
@@ -1936,7 +1930,7 @@ pixman_fill_mmx (uint32_t *bits,
     {
 	int w;
 	uint8_t *d = byte_line;
-	
+
 	byte_line += stride;
 	w = byte_width;
 
@@ -1976,8 +1970,8 @@ pixman_fill_mmx (uint32_t *bits,
 	        "movq	%8,	56(%0)\n"
 		:
 		: "r" (d),
-	        "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
-	        "y" (v4), "y" (v5), "y" (v6), "y" (v7)
+		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
+		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
 		: "memory");
 #else
 	    *(__m64*) (d +  0) = vfill;
@@ -2038,7 +2032,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     __m64 vsrc, vsrca;
     uint64_t srcsrc;
 
@@ -2080,7 +2074,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		__m64 vdest = in (vsrc, expand_alpha_rev (M64 (m)));
-		
+
 		*dst = store8888 (vdest);
 	    }
 	    else
@@ -2136,7 +2130,7 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		__m64 vdest = load8888 (*dst);
-		
+
 		vdest = in (vsrc, expand_alpha_rev (M64 (m)));
 		*dst = store8888 (vdest);
 	    }
@@ -2173,7 +2167,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
     uint16_t *dst_line, *dst;
     uint8_t *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     __m64 vsrc, vsrca, tmp;
     uint64_t srcsrcsrcsrc, src16;
 
@@ -2218,7 +2212,7 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
 		__m64 vd = M64 (d);
 		__m64 vdest = in_over (
 		    vsrc, vsrca, expand_alpha_rev (M64 (m)), expand565 (vd, 0));
-		
+
 		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
 		*dst = UINT64 (vd);
 	    }
@@ -2313,7 +2307,7 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
     uint16_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     CHECKPOINT ();
 
@@ -2433,7 +2427,7 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     CHECKPOINT ();
 
@@ -2641,7 +2635,7 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
     uint8_t *dst_line, *dst;
     uint8_t *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t src;
     uint8_t sa;
     __m64 vsrc, vsrca;
@@ -2723,7 +2717,7 @@ mmx_composite_in_8_8 (pixman_implementation_t *imp,
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
     int src_stride, dst_stride;
-    uint16_t w;
+    int32_t w;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
@@ -2788,7 +2782,7 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t src;
     uint8_t sa;
     __m64 vsrc, vsrca;
@@ -2868,7 +2862,7 @@ mmx_composite_add_8000_8000 (pixman_implementation_t *imp,
     uint8_t *dst_line, *dst;
     uint8_t *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint8_t s, d;
     uint16_t t;
 
@@ -2942,7 +2936,7 @@ mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     CHECKPOINT ();
 
@@ -3082,8 +3076,8 @@ pixman_blt_mmx (uint32_t *src_bits,
 		:
 		: "r" (d), "r" (s)
 		: "memory",
-	        "%mm0", "%mm1", "%mm2", "%mm3",
-	        "%mm4", "%mm5", "%mm6", "%mm7");
+		  "%mm0", "%mm1", "%mm2", "%mm3",
+		  "%mm4", "%mm5", "%mm6", "%mm7");
 #else
 	    __m64 v0 = *(__m64 *)(s + 0);
 	    __m64 v1 = *(__m64 *)(s + 8);
@@ -3153,6 +3147,7 @@ mmx_composite_copy_area (pixman_implementation_t *imp,
                     src_x, src_y, dest_x, dest_y, width, height);
 }
 
+#if 0
 static void
 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_op_t              op,
@@ -3168,11 +3163,11 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 int32_t                  width,
                                 int32_t                  height)
 {
-    uint32_t    *src, *src_line;
-    uint32_t    *dst, *dst_line;
-    uint8_t     *mask, *mask_line;
+    uint32_t  *src, *src_line;
+    uint32_t  *dst, *dst_line;
+    uint8_t  *mask, *mask_line;
     int src_stride, mask_stride, dst_stride;
-    uint16_t w;
+    int32_t w;
 
     PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
     PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
@@ -3219,116 +3214,82 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
 
     _mm_empty ();
 }
+#endif
 
 static const pixman_fast_path_t mmx_fast_paths[] =
 {
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   mmx_composite_over_n_8_0565,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   mmx_composite_over_n_8_0565,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, mmx_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, mmx_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, mmx_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, mmx_composite_over_n_8_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, mmx_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, mmx_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   mmx_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, mmx_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, mmx_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   mmx_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   mmx_composite_over_pixbuf_0565, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   mmx_composite_over_pixbuf_0565, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   mmx_composite_over_pixbuf_0565, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   mmx_composite_over_pixbuf_0565, NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, mmx_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, mmx_composite_over_x888_n_8888,           NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, mmx_composite_over_x888_n_8888,           NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, mmx_composite_over_x888_n_8888,           NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, mmx_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, mmx_composite_over_8888_n_8888,           NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, mmx_composite_over_8888_n_8888,           NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, mmx_composite_over_8888_n_8888,           NEED_SOLID_MASK },
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
 #if 0
-    /* FIXME: This code is commented out since it's apparently not actually faster than the generic code. */
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, mmx_composite_over_x888_8_8888,   0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, mmx_composite_over_x888_8_8888,   0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8r8g8, PIXMAN_a8,       PIXMAN_x8b8g8r8, mmx_composite_over_x888_8_8888,   0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8r8g8, PIXMAN_a8,       PIXMAN_a8r8g8b8, mmx_composite_over_x888_8_8888,   0 },
+    /* FIXME: This code is commented out since it's apparently
+     * not actually faster than the generic code.
+     */
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
 #endif
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, mmx_composite_over_n_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, mmx_composite_over_n_8888,        0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   mmx_composite_over_n_0565,        0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, mmx_composite_copy_area,          0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, mmx_composite_copy_area,          0 },
-
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, mmx_composite_over_8888_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, mmx_composite_over_8888_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   mmx_composite_over_8888_0565,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, mmx_composite_over_8888_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, mmx_composite_over_8888_8888,     0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   mmx_composite_over_8888_0565,     0 },
-
-    { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, mmx_composite_add_8888_8888,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, mmx_composite_add_8888_8888,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       mmx_composite_add_8000_8000,   0 },
-    { PIXMAN_OP_ADD, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       mmx_composite_add_n_8_8,    0 },
-
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, mmx_composite_src_n_8_8888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, mmx_composite_src_n_8_8888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, mmx_composite_src_n_8_8888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, mmx_composite_src_n_8_8888, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, mmx_composite_copy_area, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, mmx_composite_copy_area, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, mmx_composite_copy_area, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, mmx_composite_copy_area, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, mmx_composite_copy_area, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, mmx_composite_copy_area, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   mmx_composite_copy_area, 0 },
-    { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   mmx_composite_copy_area, 0 },
-
-    { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       mmx_composite_in_8_8,   0 },
-    { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       mmx_composite_in_n_8_8, 0 },
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
+
+    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8000_8000       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
+
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
+
+    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
+    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
 
     { PIXMAN_OP_NONE },
 };
 
-static void
-mmx_composite (pixman_implementation_t *imp,
-               pixman_op_t              op,
-               pixman_image_t *         src,
-               pixman_image_t *         mask,
-               pixman_image_t *         dest,
-               int32_t                  src_x,
-               int32_t                  src_y,
-               int32_t                  mask_x,
-               int32_t                  mask_y,
-               int32_t                  dest_x,
-               int32_t                  dest_y,
-               int32_t                  width,
-               int32_t                  height)
-{
-    if (_pixman_run_fast_path (mmx_fast_paths, imp,
-                               op, src, mask, dest,
-                               src_x, src_y,
-                               mask_x, mask_y,
-                               dest_x, dest_y,
-                               width, height))
-    {
-	return;
-    }
-
-    _pixman_implementation_composite (imp->delegate,
-                                      op, src, mask, dest, src_x, src_y,
-                                      mask_x, mask_y, dest_x, dest_y,
-                                      width, height);
-}
-
 static pixman_bool_t
 mmx_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
@@ -3350,9 +3311,9 @@ mmx_blt (pixman_implementation_t *imp,
 
     {
 	return _pixman_implementation_blt (
-	           imp->delegate,
-	           src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
-	           src_x, src_y, dst_x, dst_y, width, height);
+	    imp->delegate,
+	    src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
+	    src_x, src_y, dst_x, dst_y, width, height);
     }
 
     return TRUE;
@@ -3372,7 +3333,7 @@ mmx_fill (pixman_implementation_t *imp,
     if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
     {
 	return _pixman_implementation_fill (
-	           imp->delegate, bits, stride, bpp, x, y, width, height, xor);
+	    imp->delegate, bits, stride, bpp, x, y, width, height, xor);
     }
 
     return TRUE;
@@ -3382,7 +3343,7 @@ pixman_implementation_t *
 _pixman_implementation_create_mmx (void)
 {
     pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
-    pixman_implementation_t *imp = _pixman_implementation_create (general);
+    pixman_implementation_t *imp = _pixman_implementation_create (general, mmx_fast_paths);
 
     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
@@ -3408,7 +3369,6 @@ _pixman_implementation_create_mmx (void)
     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
 
-    imp->composite = mmx_composite;
     imp->blt = mmx_blt;
     imp->fill = mmx_fill;
 
diff --git a/lib/pixman/pixman/pixman-private.h b/lib/pixman/pixman/pixman-private.h
index ff7a65f88..d5767af48 100644
--- a/lib/pixman/pixman/pixman-private.h
+++ b/lib/pixman/pixman/pixman-private.h
@@ -5,9 +5,14 @@
 #ifndef PIXMAN_PRIVATE_H
 #define PIXMAN_PRIVATE_H
 
+#define PIXMAN_DISABLE_DEPRECATED
+#define PIXMAN_USE_INTERNAL_API
+
 #include "pixman.h"
 #include <time.h>
 #include <assert.h>
+#include <stdio.h>
+#include <string.h>
 
 #include "pixman-compiler.h"
 
@@ -83,7 +88,6 @@ struct image_common
 						     * the image is used as a source
 						     */
     pixman_bool_t		dirty;
-    pixman_bool_t               need_workaround;
     pixman_transform_t *        transform;
     pixman_repeat_t             repeat;
     pixman_filter_t             filter;
@@ -100,6 +104,9 @@ struct image_common
 
     pixman_image_destroy_func_t destroy_func;
     void *                      destroy_data;
+
+    uint32_t			flags;
+    pixman_format_code_t	extended_format_code;
 };
 
 struct source_image
@@ -111,7 +118,10 @@ struct source_image
 struct solid_fill
 {
     source_image_t common;
-    uint32_t       color;    /* FIXME: shouldn't this be a pixman_color_t? */
+    pixman_color_t color;
+    
+    uint32_t	   color_32;
+    uint64_t	   color_64;
 };
 
 struct gradient
@@ -120,8 +130,6 @@ struct gradient
     int                     n_stops;
     pixman_gradient_stop_t *stops;
     int                     stop_range;
-    uint32_t *              color_table;
-    int                     color_table_size;
 };
 
 struct linear_gradient
@@ -253,10 +261,6 @@ _pixman_image_store_scanline_32 (bits_image_t *  image,
                                  int             y,
                                  int             width,
                                  const uint32_t *buffer);
-void
-_pixman_image_fetch_pixels (bits_image_t *image,
-                            uint32_t *    buffer,
-                            int           n_pixels);
 
 /* Even though the type of buffer is uint32_t *, the function
  * actually expects a uint64_t *buffer.
@@ -281,12 +285,6 @@ _pixman_image_reset_clip_region (pixman_image_t *image);
 void
 _pixman_image_validate (pixman_image_t *image);
 
-pixman_bool_t
-_pixman_image_is_opaque (pixman_image_t *image);
-
-pixman_bool_t
-_pixman_image_is_solid (pixman_image_t *image);
-
 uint32_t
 _pixman_image_get_solid (pixman_image_t *     image,
                          pixman_format_code_t format);
@@ -349,13 +347,13 @@ _pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
 #define STEP_Y_SMALL(n) (pixman_fixed_1 / N_Y_FRAC (n))
 #define STEP_Y_BIG(n)   (pixman_fixed_1 - (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
 
-#define Y_FRAC_FIRST(n) (STEP_Y_SMALL (n) / 2)
+#define Y_FRAC_FIRST(n) (STEP_Y_BIG (n) / 2)
 #define Y_FRAC_LAST(n)  (Y_FRAC_FIRST (n) + (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
 
 #define STEP_X_SMALL(n) (pixman_fixed_1 / N_X_FRAC (n))
 #define STEP_X_BIG(n)   (pixman_fixed_1 - (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
 
-#define X_FRAC_FIRST(n) (STEP_X_SMALL (n) / 2)
+#define X_FRAC_FIRST(n) (STEP_X_BIG (n) / 2)
 #define X_FRAC_LAST(n)  (X_FRAC_FIRST (n) + (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
 
 #define RENDER_SAMPLES_X(x, n)						\
@@ -372,7 +370,6 @@ pixman_rasterize_edges_accessors (pixman_image_t *image,
 /*
  * Implementations
  */
-
 typedef struct pixman_implementation_t pixman_implementation_t;
 
 typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp,
@@ -428,23 +425,36 @@ typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
 void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
 void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp);
 
-struct pixman_implementation_t
+typedef struct
 {
-    pixman_implementation_t *toplevel;
-    pixman_implementation_t *delegate;
-
-    pixman_composite_func_t  composite;
-    pixman_blt_func_t        blt;
-    pixman_fill_func_t       fill;
+    pixman_op_t             op;
+    pixman_format_code_t    src_format;
+    uint32_t		    src_flags;
+    pixman_format_code_t    mask_format;
+    uint32_t		    mask_flags;
+    pixman_format_code_t    dest_format;
+    uint32_t		    dest_flags;
+    pixman_composite_func_t func;
+} pixman_fast_path_t;
 
-    pixman_combine_32_func_t combine_32[PIXMAN_OP_LAST];
-    pixman_combine_32_func_t combine_32_ca[PIXMAN_OP_LAST];
-    pixman_combine_64_func_t combine_64[PIXMAN_OP_LAST];
-    pixman_combine_64_func_t combine_64_ca[PIXMAN_OP_LAST];
+struct pixman_implementation_t
+{
+    pixman_implementation_t *	toplevel;
+    pixman_implementation_t *	delegate;
+    const pixman_fast_path_t *	fast_paths;
+    
+    pixman_blt_func_t		blt;
+    pixman_fill_func_t		fill;
+
+    pixman_combine_32_func_t	combine_32[PIXMAN_N_OPERATORS];
+    pixman_combine_32_func_t	combine_32_ca[PIXMAN_N_OPERATORS];
+    pixman_combine_64_func_t	combine_64[PIXMAN_N_OPERATORS];
+    pixman_combine_64_func_t	combine_64_ca[PIXMAN_N_OPERATORS];
 };
 
 pixman_implementation_t *
-_pixman_implementation_create (pixman_implementation_t *delegate);
+_pixman_implementation_create (pixman_implementation_t *delegate,
+			       const pixman_fast_path_t *fast_paths);
 
 void
 _pixman_implementation_combine_32 (pixman_implementation_t *imp,
@@ -474,20 +484,6 @@ _pixman_implementation_combine_64_ca (pixman_implementation_t *imp,
                                       const uint64_t *         src,
                                       const uint64_t *         mask,
                                       int                      width);
-void
-_pixman_implementation_composite (pixman_implementation_t *imp,
-                                  pixman_op_t              op,
-                                  pixman_image_t *         src,
-                                  pixman_image_t *         mask,
-                                  pixman_image_t *         dest,
-                                  int32_t                  src_x,
-                                  int32_t                  src_y,
-                                  int32_t                  mask_x,
-                                  int32_t                  mask_y,
-                                  int32_t                  dest_x,
-                                  int32_t                  dest_y,
-                                  int32_t                  width,
-                                  int32_t                  height);
 
 pixman_bool_t
 _pixman_implementation_blt (pixman_implementation_t *imp,
@@ -556,25 +552,85 @@ _pixman_choose_implementation (void);
  * Utilities
  */
 
-/* These "formats" both have depth 0, so they
+/* These "formats" all have depth 0, so they
  * will never clash with any real ones
  */
 #define PIXMAN_null             PIXMAN_FORMAT (0, 0, 0, 0, 0, 0)
 #define PIXMAN_solid            PIXMAN_FORMAT (0, 1, 0, 0, 0, 0)
-
-#define NEED_COMPONENT_ALPHA            (1 << 0)
-#define NEED_PIXBUF                     (1 << 1)
-#define NEED_SOLID_MASK                 (1 << 2)
-
-typedef struct
-{
-    pixman_op_t             op;
-    pixman_format_code_t    src_format;
-    pixman_format_code_t    mask_format;
-    pixman_format_code_t    dest_format;
-    pixman_composite_func_t func;
-    uint32_t                flags;
-} pixman_fast_path_t;
+#define PIXMAN_pixbuf		PIXMAN_FORMAT (0, 2, 0, 0, 0, 0)
+#define PIXMAN_rpixbuf		PIXMAN_FORMAT (0, 3, 0, 0, 0, 0)
+#define PIXMAN_unknown		PIXMAN_FORMAT (0, 4, 0, 0, 0, 0)
+#define PIXMAN_any		PIXMAN_FORMAT (0, 5, 0, 0, 0, 0)
+
+#define PIXMAN_OP_any		(PIXMAN_N_OPERATORS + 1)
+
+#define FAST_PATH_ID_TRANSFORM			(1 <<  0)
+#define FAST_PATH_NO_ALPHA_MAP			(1 <<  1)
+#define FAST_PATH_NO_CONVOLUTION_FILTER		(1 <<  2)
+#define FAST_PATH_NO_PAD_REPEAT			(1 <<  3)
+#define FAST_PATH_NO_REFLECT_REPEAT		(1 <<  4)
+#define FAST_PATH_NO_ACCESSORS			(1 <<  5)
+#define FAST_PATH_NO_WIDE_FORMAT		(1 <<  6)
+#define FAST_PATH_COVERS_CLIP			(1 <<  7)
+#define FAST_PATH_COMPONENT_ALPHA		(1 <<  8)
+#define FAST_PATH_UNIFIED_ALPHA			(1 <<  9)
+#define FAST_PATH_SCALE_TRANSFORM		(1 << 10)
+#define FAST_PATH_NEAREST_FILTER		(1 << 11)
+#define FAST_PATH_SIMPLE_REPEAT			(1 << 12)
+#define FAST_PATH_IS_OPAQUE			(1 << 13)
+#define FAST_PATH_NEEDS_WORKAROUND		(1 << 14)
+#define FAST_PATH_NO_NONE_REPEAT		(1 << 15)
+#define FAST_PATH_SAMPLES_COVER_CLIP		(1 << 16)
+#define FAST_PATH_16BIT_SAFE			(1 << 17)
+#define FAST_PATH_X_UNIT_POSITIVE		(1 << 18)
+
+#define _FAST_PATH_STANDARD_FLAGS					\
+    (FAST_PATH_ID_TRANSFORM		|				\
+     FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_CONVOLUTION_FILTER	|				\
+     FAST_PATH_NO_PAD_REPEAT		|				\
+     FAST_PATH_NO_REFLECT_REPEAT	|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_NO_WIDE_FORMAT		|				\
+     FAST_PATH_COVERS_CLIP)
+
+#define FAST_PATH_STD_SRC_FLAGS						\
+    _FAST_PATH_STANDARD_FLAGS
+#define FAST_PATH_STD_MASK_U_FLAGS					\
+    (_FAST_PATH_STANDARD_FLAGS		|				\
+     FAST_PATH_UNIFIED_ALPHA)
+#define FAST_PATH_STD_MASK_CA_FLAGS					\
+    (_FAST_PATH_STANDARD_FLAGS		|				\
+     FAST_PATH_COMPONENT_ALPHA)
+#define FAST_PATH_STD_DEST_FLAGS					\
+    (FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_NO_WIDE_FORMAT)
+
+#define FAST_PATH(op, src, src_flags, mask, mask_flags, dest, dest_flags, func) \
+    PIXMAN_OP_ ## op,							\
+    PIXMAN_ ## src,							\
+    src_flags,							        \
+    PIXMAN_ ## mask,						        \
+    mask_flags,							        \
+    PIXMAN_ ## dest,	                                                \
+    dest_flags,							        \
+    func
+
+#define PIXMAN_STD_FAST_PATH(op, src, mask, dest, func)			\
+    { FAST_PATH (							\
+	  op,								\
+	  src, FAST_PATH_STD_SRC_FLAGS,					\
+	  mask, (PIXMAN_ ## mask) ? FAST_PATH_STD_MASK_U_FLAGS : 0,	\
+	  dest, FAST_PATH_STD_DEST_FLAGS,				\
+	  func) }
+
+#define PIXMAN_STD_FAST_PATH_CA(op, src, mask, dest, func)		\
+    { FAST_PATH (							\
+	  op,								\
+	  src, FAST_PATH_STD_SRC_FLAGS,					\
+	  mask, FAST_PATH_STD_MASK_CA_FLAGS,				\
+	  dest, FAST_PATH_STD_DEST_FLAGS,				\
+	  func) }
 
 /* Memory allocation helpers */
 void *
@@ -590,38 +646,6 @@ pixman_bool_t
 pixman_addition_overflows_int (unsigned int a, unsigned int b);
 
 /* Compositing utilities */
-pixman_bool_t
-_pixman_run_fast_path (const pixman_fast_path_t *paths,
-                       pixman_implementation_t * imp,
-                       pixman_op_t               op,
-                       pixman_image_t *          src,
-                       pixman_image_t *          mask,
-                       pixman_image_t *          dest,
-                       int32_t                   src_x,
-                       int32_t                   src_y,
-                       int32_t                   mask_x,
-                       int32_t                   mask_y,
-                       int32_t                   dest_x,
-                       int32_t                   dest_y,
-                       int32_t                   width,
-                       int32_t                   height);
-
-void
-_pixman_walk_composite_region (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               pixman_image_t *         src_image,
-                               pixman_image_t *         mask_image,
-                               pixman_image_t *         dst_image,
-                               int16_t                  src_x,
-                               int16_t                  src_y,
-                               int16_t                  mask_x,
-                               int16_t                  mask_y,
-                               int16_t                  dest_x,
-                               int16_t                  dest_y,
-                               uint16_t                 width,
-                               uint16_t                 height,
-                               pixman_composite_func_t  composite_rect);
-
 void
 pixman_expand (uint64_t *           dst,
                const uint32_t *     src,
@@ -684,29 +708,62 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst,
      ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |			\
      ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)))
 
+#define CONVERT_0565_TO_8888(s) (CONVERT_0565_TO_0888(s) | 0xff000000)
+
+/* Trivial versions that are useful in macros */
+#define CONVERT_8888_TO_8888(s) (s)
+#define CONVERT_0565_TO_0565(s) (s)
+
 #define PIXMAN_FORMAT_IS_WIDE(f)					\
     (PIXMAN_FORMAT_A (f) > 8 ||						\
      PIXMAN_FORMAT_R (f) > 8 ||						\
      PIXMAN_FORMAT_G (f) > 8 ||						\
      PIXMAN_FORMAT_B (f) > 8)
 
+#ifdef WORDS_BIGENDIAN
+#   define SCREEN_SHIFT_LEFT(x,n)	((x) << (n))
+#   define SCREEN_SHIFT_RIGHT(x,n)	((x) >> (n))
+#else
+#   define SCREEN_SHIFT_LEFT(x,n)	((x) >> (n))
+#   define SCREEN_SHIFT_RIGHT(x,n)	((x) << (n))
+#endif
+
 /*
  * Various debugging code
  */
 
 #undef DEBUG
-#define DEBUG 0
 
-#if DEBUG
+/* Turn on debugging depending on what type of release this is
+ */
+#if (((PIXMAN_VERSION_MICRO % 2) == 0) && ((PIXMAN_VERSION_MINOR % 2) == 1))
+
+/* Debugging gets turned on for development releases because these
+ * are the things that end up in bleeding edge distributions such
+ * as Rawhide etc.
+ *
+ * For performance reasons we don't turn it on for stable releases or
+ * random git checkouts. (Random git checkouts are often used for
+ * performance work).
+ */
+
+#    define DEBUG
+
+#endif
+
+#ifdef DEBUG
+
+void
+_pixman_log_error (const char *function, const char *message);
 
 #define return_if_fail(expr)                                            \
     do                                                                  \
     {                                                                   \
-	if (!(expr))                                                    \
-	{                                                               \
-	    fprintf (stderr, "In %s: %s failed\n", FUNC, # expr);	\
-	    return;                                                     \
-	}                                                               \
+	if (!(expr))							\
+	{								\
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+	    return;							\
+	}								\
     }                                                                   \
     while (0)
 
@@ -714,16 +771,27 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst,
     do                                                                  \
     {                                                                   \
 	if (!(expr))                                                    \
-	{                                                               \
-	    fprintf (stderr, "In %s: %s failed\n", FUNC, # expr);	\
-	    return (retval);                                            \
-	}                                                               \
+	{								\
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+	    return (retval);						\
+	}								\
     }                                                                   \
     while (0)
 
+#define critical_if_fail(expr)						\
+    do									\
+    {									\
+	if (!(expr))							\
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+    }									\
+    while (0)
+
+
 #else
 
-#define return_if_fail(expr)                                            \
+#define _pixman_log_error(f,m) do { } while (0)				\
+
+#define return_if_fail(expr)						\
     do                                                                  \
     {                                                                   \
 	if (!(expr))							\
@@ -739,6 +807,11 @@ pixman_region16_copy_from_region32 (pixman_region16_t *dst,
     }                                                                   \
     while (0)
 
+#define critical_if_fail(expr)						\
+    do									\
+    {									\
+    }									\
+    while (0)
 #endif
 
 /*
diff --git a/lib/pixman/pixman/pixman-region.c b/lib/pixman/pixman/pixman-region.c
index 8ce5deb77..a9b835488 100644
--- a/lib/pixman/pixman/pixman-region.c
+++ b/lib/pixman/pixman/pixman-region.c
@@ -42,6 +42,25 @@
  * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
  * SOFTWARE.
  *
+ * Copyright © 1998 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
  */
 
 #include <stdlib.h>
@@ -66,74 +85,17 @@
 #define GOOD_RECT(rect) ((rect)->x1 < (rect)->x2 && (rect)->y1 < (rect)->y2)
 #define BAD_RECT(rect) ((rect)->x1 > (rect)->x2 || (rect)->y1 > (rect)->y2)
 
-/* Turn on debugging depending on what type of release this is
- */
-
-#if ((PIXMAN_VERSION_MICRO % 2) == 1)
-/* Random git checkout.
- * 
- * Those are often used for performance work, so we don't turn on the
- * full self-checking, but we do turn on the asserts.
- */
-#    define   FATAL_BUGS
-#    define noSELF_CHECKS
-#elif ((PIXMAN_VERSION_MINOR % 2) == 0)
-/* Stable release.
- *
- * We don't want assertions because the X server should stay alive
- * if possible. We also don't want self-checks for performance-reasons.
- */
-#    define noFATAL_BUGS
-#    define noSELF_CHECKS
-#else
-/* Development snapshot.
- *
- * These are the things that get shipped in development distributions
- * such as Rawhide. We want both self-checking and fatal assertions
- * to catch as many bugs as possible.
- */
-#    define FATAL_BUGS
-#    define SELF_CHECKS
-#endif
-
-#ifndef FATAL_BUGS
-#    undef assert
-#    undef abort
-#    define assert(expr)
-#    define abort()
-#endif
-
-#ifdef SELF_CHECKS
-
-static void
-log_region_error (const char *function, const char *message)
-{
-    static int n_messages = 0;
-
-    if (n_messages < 50)
-    {
-	fprintf (stderr,
-		 "*** BUG ***\n"
-		 "%s: %s\n"
-		 "Set a breakpoint on 'log_region_error' to debug\n\n",
-                 function, message);
-
-        abort (); /* This is #defined away unless FATAL_BUGS is defined */
-
-	n_messages++;
-    }
-}
+#ifdef DEBUG
 
 #define GOOD(reg)							\
     do									\
     {									\
 	if (!PREFIX (_selfcheck (reg)))					\
-	    log_region_error (FUNC, "Malformed region " # reg);         \
+	    _pixman_log_error (FUNC, "Malformed region " # reg);	\
     } while (0)
 
 #else
 
-#define log_region_error(function, name)
 #define GOOD(reg)
 
 #endif
@@ -295,7 +257,7 @@ alloc_data (size_t n)
 	}								\
 	ADDRECT (next_rect, nx1, ny1, nx2, ny2);			\
 	region->data->numRects++;					\
-	assert (region->data->numRects <= region->data->size);		\
+	critical_if_fail (region->data->numRects <= region->data->size);		\
     } while (0)
 
 #define DOWNSIZE(reg, numRects)						\
@@ -420,7 +382,7 @@ PREFIX (_init_rect) (region_type_t *	region,
     if (!GOOD_RECT (&region->extents))
     {
         if (BAD_RECT (&region->extents))
-            log_region_error (FUNC, "Invalid rectangle passed");
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
         PREFIX (_init) (region);
         return;
     }
@@ -434,7 +396,7 @@ PREFIX (_init_with_extents) (region_type_t *region, box_type_t *extents)
     if (!GOOD_RECT (extents))
     {
         if (BAD_RECT (extents))
-            log_region_error (FUNC, "Invalid rectangle passed");
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
         PREFIX (_init) (region);
         return;
     }
@@ -612,7 +574,7 @@ pixman_coalesce (region_type_t * region,      /* Region to coalesce		 */
      * Figure out how many rectangles are in the band.
      */
     numRects = cur_start - prev_start;
-    assert (numRects == region->data->numRects - cur_start);
+    critical_if_fail (numRects == region->data->numRects - cur_start);
 
     if (!numRects) return cur_start;
 
@@ -700,8 +662,8 @@ pixman_region_append_non_o (region_type_t * region,
 
     new_rects = r_end - r;
 
-    assert (y1 < y2);
-    assert (new_rects != 0);
+    critical_if_fail (y1 < y2);
+    critical_if_fail (new_rects != 0);
 
     /* Make sure we have enough space for all rectangles to be added */
     RECTALLOC (region, new_rects);
@@ -710,7 +672,7 @@ pixman_region_append_non_o (region_type_t * region,
 
     do
     {
-	assert (r->x1 < r->x2);
+	critical_if_fail (r->x1 < r->x2);
 	ADDRECT (next_rect, r->x1, y1, r->x2, y2);
 	r++;
     }
@@ -835,8 +797,8 @@ pixman_op (region_type_t *  new_reg,               /* Place to store result
     r2 = PIXREGION_RECTS (reg2);
     r2_end = r2 + numRects;
     
-    assert (r1 != r1_end);
-    assert (r2 != r2_end);
+    critical_if_fail (r1 != r1_end);
+    critical_if_fail (r2 != r2_end);
 
     old_data = (region_data_type_t *)NULL;
 
@@ -904,8 +866,8 @@ pixman_op (region_type_t *  new_reg,               /* Place to store result
 	 * rectangle after the last one in the current band for their
 	 * respective regions.
 	 */
-        assert (r1 != r1_end);
-        assert (r2 != r2_end);
+        critical_if_fail (r1 != r1_end);
+        critical_if_fail (r2 != r2_end);
 
         FIND_BAND (r1, r1_band_end, r1_end, r1y1);
         FIND_BAND (r2, r2_band_end, r2_end, r2y1);
@@ -1112,7 +1074,7 @@ pixman_set_extents (region_type_t *region)
     region->extents.x2 = box_end->x2;
     region->extents.y2 = box_end->y2;
 
-    assert (region->extents.y1 < region->extents.y2);
+    critical_if_fail (region->extents.y1 < region->extents.y2);
 
     while (box <= box_end)
     {
@@ -1123,7 +1085,7 @@ pixman_set_extents (region_type_t *region)
         box++;
     }
 
-    assert (region->extents.x1 < region->extents.x2);
+    critical_if_fail (region->extents.x1 < region->extents.x2);
 }
 
 /*======================================================================
@@ -1159,8 +1121,8 @@ pixman_region_intersect_o (region_type_t *region,
 
     next_rect = PIXREGION_TOP (region);
 
-    assert (y1 < y2);
-    assert (r1 != r1_end && r2 != r2_end);
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
 
     do
     {
@@ -1317,8 +1279,8 @@ pixman_region_union_o (region_type_t *region,
     int x1;            /* left and right side of current union */
     int x2;
 
-    assert (y1 < y2);
-    assert (r1 != r1_end && r2 != r2_end);
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
 
     next_rect = PIXREGION_TOP (region);
 
@@ -1388,10 +1350,10 @@ PREFIX (_union_rect) (region_type_t *dest,
     if (!GOOD_RECT (&region.extents))
     {
         if (BAD_RECT (&region.extents))
-            log_region_error (FUNC, "Invalid rectangle passed");
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
 	return PREFIX (_copy) (dest, source);
     }
-    
+
     region.data = NULL;
 
     return PREFIX (_union) (dest, source, &region);
@@ -1881,8 +1843,8 @@ pixman_region_subtract_o (region_type_t * region,
 
     x1 = r1->x1;
 
-    assert (y1 < y2);
-    assert (r1 != r1_end && r2 != r2_end);
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
 
     next_rect = PIXREGION_TOP (region);
 
@@ -1926,7 +1888,7 @@ pixman_region_subtract_o (region_type_t * region,
 	     * Left part of subtrahend covers part of minuend: add uncovered
 	     * part of minuend to region and skip to next subtrahend.
 	     */
-            assert (x1 < r2->x1);
+            critical_if_fail (x1 < r2->x1);
             NEWRECT (region, next_rect, x1, y1, r2->x1, y2);
 
             x1 = r2->x2;
@@ -1968,7 +1930,7 @@ pixman_region_subtract_o (region_type_t * region,
      */
     while (r1 != r1_end)
     {
-        assert (x1 < r1->x2);
+        critical_if_fail (x1 < r1->x2);
 
         NEWRECT (region, next_rect, x1, y1, r1->x2, y2);
 
@@ -2330,7 +2292,7 @@ PREFIX (_reset) (region_type_t *region, box_type_t *box)
 {
     GOOD (region);
 
-    assert (GOOD_RECT (box));
+    critical_if_fail (GOOD_RECT (box));
 
     region->extents = *box;
 
@@ -2470,7 +2432,7 @@ PREFIX (_selfcheck) (region_type_t *reg)
 
 PIXMAN_EXPORT pixman_bool_t
 PREFIX (_init_rects) (region_type_t *region,
-                      box_type_t *boxes, int count)
+                      const box_type_t *boxes, int count)
 {
     box_type_t *rects;
     int displacement;
@@ -2550,3 +2512,240 @@ PREFIX (_init_rects) (region_type_t *region,
 
     return validate (region, &i);
 }
+
+#define READ(_ptr) (*(_ptr))
+
+static inline box_type_t *
+bitmap_addrect (region_type_t *reg,
+                box_type_t *r,
+                box_type_t **first_rect,
+                int rx1, int ry1,
+                int rx2, int ry2)
+{
+    if ((rx1 < rx2) && (ry1 < ry2) &&
+	(!(reg->data->numRects &&
+	   ((r-1)->y1 == ry1) && ((r-1)->y2 == ry2) &&
+	   ((r-1)->x1 <= rx1) && ((r-1)->x2 >= rx2))))
+    {
+	if (!reg->data ||
+	    reg->data->numRects == reg->data->size)
+	{
+	    if (!pixman_rect_alloc (reg, 1))
+		return NULL;
+	    *first_rect = PIXREGION_BOXPTR(reg);
+	    r = *first_rect + reg->data->numRects;
+	}
+	r->x1 = rx1;
+	r->y1 = ry1;
+	r->x2 = rx2;
+	r->y2 = ry2;
+	reg->data->numRects++;
+	if (r->x1 < reg->extents.x1)
+	    reg->extents.x1 = r->x1;
+	if (r->x2 > reg->extents.x2)
+	    reg->extents.x2 = r->x2;
+	r++;
+    }
+    return r;
+}
+
+/* Convert bitmap clip mask into clipping region.
+ * First, goes through each line and makes boxes by noting the transitions
+ * from 0 to 1 and 1 to 0.
+ * Then it coalesces the current line with the previous if they have boxes
+ * at the same X coordinates.
+ * Stride is in number of uint32_t per line.
+ */
+PIXMAN_EXPORT void
+PREFIX (_init_from_image) (region_type_t *region,
+                           pixman_image_t *image)
+{
+    uint32_t mask0 = 0xffffffff & ~SCREEN_SHIFT_RIGHT(0xffffffff, 1);
+    box_type_t *first_rect, *rects, *prect_line_start;
+    box_type_t *old_rect, *new_rect;
+    uint32_t *pw, w, *pw_line, *pw_line_end;
+    int	irect_prev_start, irect_line_start;
+    int	h, base, rx1 = 0, crects;
+    int	ib;
+    pixman_bool_t in_box, same;
+    int width, height, stride;
+
+    PREFIX(_init) (region);
+
+    return_if_fail (image->type == BITS);
+    return_if_fail (image->bits.format == PIXMAN_a1);
+
+    pw_line = pixman_image_get_data (image);
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+    stride = pixman_image_get_stride (image) / 4;
+
+    first_rect = PIXREGION_BOXPTR(region);
+    rects = first_rect;
+
+    region->extents.x1 = width - 1;
+    region->extents.x2 = 0;
+    irect_prev_start = -1;
+    for (h = 0; h < height; h++)
+    {
+        pw = pw_line;
+        pw_line += stride;
+        irect_line_start = rects - first_rect;
+
+        /* If the Screen left most bit of the word is set, we're starting in
+         * a box */
+        if (READ(pw) & mask0)
+        {
+            in_box = TRUE;
+            rx1 = 0;
+        }
+        else
+        {
+            in_box = FALSE;
+        }
+
+        /* Process all words which are fully in the pixmap */
+        pw_line_end = pw + (width >> 5);
+        for (base = 0; pw < pw_line_end; base += 32)
+        {
+            w = READ(pw++);
+            if (in_box)
+            {
+                if (!~w)
+                    continue;
+            }
+            else
+            {
+                if (!w)
+                    continue;
+            }
+            for (ib = 0; ib < 32; ib++)
+            {
+                /* If the Screen left most bit of the word is set, we're
+                 * starting a box */
+                if (w & mask0)
+                {
+                    if (!in_box)
+                    {
+                        rx1 = base + ib;
+                        /* start new box */
+                        in_box = TRUE;
+                    }
+                }
+                else
+                {
+                    if (in_box)
+                    {
+                        /* end box */
+                        rects = bitmap_addrect (region, rects, &first_rect,
+                                                rx1, h, base + ib, h + 1);
+                        if (rects == NULL)
+                            goto error;
+                        in_box = FALSE;
+                    }
+                }
+                /* Shift the word VISUALLY left one. */
+                w = SCREEN_SHIFT_LEFT(w, 1);
+            }
+        }
+
+        if (width & 31)
+        {
+            /* Process final partial word on line */
+             w = READ(pw++);
+            for (ib = 0; ib < (width & 31); ib++)
+            {
+                /* If the Screen left most bit of the word is set, we're
+                 * starting a box */
+                if (w & mask0)
+                {
+                    if (!in_box)
+                    {
+                        rx1 = base + ib;
+                        /* start new box */
+                        in_box = TRUE;
+                    }
+                }
+                else
+                {
+                    if (in_box)
+                    {
+                        /* end box */
+                        rects = bitmap_addrect(region, rects, &first_rect,
+					       rx1, h, base + ib, h + 1);
+			if (rects == NULL)
+			    goto error;
+                        in_box = FALSE;
+                    }
+                }
+                /* Shift the word VISUALLY left one. */
+                w = SCREEN_SHIFT_LEFT(w, 1);
+            }
+        }
+        /* If scanline ended with last bit set, end the box */
+        if (in_box)
+        {
+            rects = bitmap_addrect(region, rects, &first_rect,
+				   rx1, h, base + (width & 31), h + 1);
+	    if (rects == NULL)
+		goto error;
+        }
+        /* if all rectangles on this line have the same x-coords as
+         * those on the previous line, then add 1 to all the previous  y2s and
+         * throw away all the rectangles from this line
+         */
+        same = FALSE;
+        if (irect_prev_start != -1)
+        {
+            crects = irect_line_start - irect_prev_start;
+            if (crects != 0 &&
+                crects == ((rects - first_rect) - irect_line_start))
+            {
+                old_rect = first_rect + irect_prev_start;
+                new_rect = prect_line_start = first_rect + irect_line_start;
+                same = TRUE;
+                while (old_rect < prect_line_start)
+                {
+                    if ((old_rect->x1 != new_rect->x1) ||
+                        (old_rect->x2 != new_rect->x2))
+                    {
+                          same = FALSE;
+                          break;
+                    }
+                    old_rect++;
+                    new_rect++;
+                }
+                if (same)
+                {
+                    old_rect = first_rect + irect_prev_start;
+                    while (old_rect < prect_line_start)
+                    {
+                        old_rect->y2 += 1;
+                        old_rect++;
+                    }
+                    rects -= crects;
+                    region->data->numRects -= crects;
+                }
+            }
+        }
+        if(!same)
+            irect_prev_start = irect_line_start;
+    }
+    if (!region->data->numRects)
+    {
+        region->extents.x1 = region->extents.x2 = 0;
+    }
+    else
+    {
+        region->extents.y1 = PIXREGION_BOXPTR(region)->y1;
+        region->extents.y2 = PIXREGION_END(region)->y2;
+        if (region->data->numRects == 1)
+        {
+            free (region->data);
+            region->data = NULL;
+        }
+    }
+
+ error:
+    return;
+}
diff --git a/lib/pixman/pixman/pixman-solid-fill.c b/lib/pixman/pixman/pixman-solid-fill.c
index 38675dca8..48c999a0e 100644
--- a/lib/pixman/pixman/pixman-solid-fill.c
+++ b/lib/pixman/pixman/pixman-solid-fill.c
@@ -36,7 +36,7 @@ solid_fill_get_scanline_32 (pixman_image_t *image,
                             uint32_t        mask_bits)
 {
     uint32_t *end = buffer + width;
-    register uint32_t color = ((solid_fill_t *)image)->color;
+    uint32_t color = image->solid.color_32;
 
     while (buffer < end)
 	*(buffer++) = color;
@@ -44,6 +44,23 @@ solid_fill_get_scanline_32 (pixman_image_t *image,
     return;
 }
 
+static void
+solid_fill_get_scanline_64 (pixman_image_t *image,
+			    int             x,
+			    int             y,
+			    int             width,
+			    uint32_t *      buffer,
+			    const uint32_t *mask,
+			    uint32_t        mask_bits)
+{
+    uint64_t *b = (uint64_t *)buffer;
+    uint64_t *e = b + width;
+    uint64_t color = image->solid.color_64;
+
+    while (b < e)
+	*(b++) = color;
+}
+
 static source_image_class_t
 solid_fill_classify (pixman_image_t *image,
                      int             x,
@@ -58,7 +75,7 @@ static void
 solid_fill_property_changed (pixman_image_t *image)
 {
     image->common.get_scanline_32 = solid_fill_get_scanline_32;
-    image->common.get_scanline_64 = _pixman_image_get_scanline_generic_64;
+    image->common.get_scanline_64 = solid_fill_get_scanline_64;
 }
 
 static uint32_t
@@ -71,6 +88,16 @@ color_to_uint32 (const pixman_color_t *color)
         (color->blue >> 8);
 }
 
+static uint64_t
+color_to_uint64 (const pixman_color_t *color)
+{
+    return
+        ((uint64_t)color->alpha << 48) |
+        ((uint64_t)color->red << 32) |
+        ((uint64_t)color->green << 16) |
+        ((uint64_t)color->blue);
+}
+
 PIXMAN_EXPORT pixman_image_t *
 pixman_image_create_solid_fill (pixman_color_t *color)
 {
@@ -80,7 +107,9 @@ pixman_image_create_solid_fill (pixman_color_t *color)
 	return NULL;
 
     img->type = SOLID;
-    img->solid.color = color_to_uint32 (color);
+    img->solid.color = *color;
+    img->solid.color_32 = color_to_uint32 (color);
+    img->solid.color_64 = color_to_uint64 (color);
 
     img->source.class = SOURCE_IMAGE_CLASS_UNKNOWN;
     img->common.classify = solid_fill_classify;
diff --git a/lib/pixman/pixman/pixman-sse2.c b/lib/pixman/pixman/pixman-sse2.c
index bb74882b2..946e7ba37 100644
--- a/lib/pixman/pixman/pixman-sse2.c
+++ b/lib/pixman/pixman/pixman-sse2.c
@@ -368,6 +368,22 @@ cache_prefetch_next (__m128i* addr)
     _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */
 }
 
+/* prefetching NULL is very slow on some systems. don't do that. */
+
+static force_inline void
+maybe_prefetch (__m128i* addr)
+{
+    if (addr)
+	cache_prefetch (addr);
+}
+
+static force_inline void
+maybe_prefetch_next (__m128i* addr)
+{
+    if (addr)
+	cache_prefetch_next (addr);
+}
+
 /* load 4 pixels from a 16-byte boundary aligned address */
 static force_inline __m128i
 load_128_aligned (__m128i* src)
@@ -413,9 +429,15 @@ save_128_unaligned (__m128i* dst,
  */
 
 static force_inline __m64
+load_32_1x64 (uint32_t data)
+{
+    return _mm_cvtsi32_si64 (data);
+}
+
+static force_inline __m64
 unpack_32_1x64 (uint32_t data)
 {
-    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ());
+    return _mm_unpacklo_pi8 (load_32_1x64 (data), _mm_setzero_si64 ());
 }
 
 static force_inline __m64
@@ -629,7 +651,7 @@ core_combine_over_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     /* Align dst on a 16-byte boundary */
     while (w && ((unsigned long)pd & 15))
@@ -647,14 +669,14 @@ core_combine_over_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	/* I'm loading unaligned because I'm not sure about
 	 * the address alignment.
@@ -720,7 +742,7 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     /* Align dst on a 16-byte boundary */
     while (w &&
@@ -739,14 +761,14 @@ core_combine_over_reverse_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	/* I'm loading unaligned because I'm not sure
 	 * about the address alignment.
@@ -822,7 +844,7 @@ core_combine_in_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -839,14 +861,14 @@ core_combine_in_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
@@ -896,7 +918,7 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -913,14 +935,14 @@ core_combine_reverse_in_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
@@ -965,7 +987,7 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -976,7 +998,7 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	    pix_multiply_1x64 (
 		unpack_32_1x64 (d), negate_1x64 (
 		    expand_alpha_1x64 (unpack_32_1x64 (s)))));
-	
+
 	if (pm)
 	    pm++;
 	ps++;
@@ -986,7 +1008,7 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -996,7 +1018,7 @@ core_combine_reverse_out_u_sse2 (uint32_t*       pd,
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1047,7 +1069,7 @@ core_combine_out_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1067,7 +1089,7 @@ core_combine_out_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -1077,7 +1099,7 @@ core_combine_out_u_sse2 (uint32_t*       pd,
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1147,7 +1169,7 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1164,14 +1186,14 @@ core_combine_atop_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1244,7 +1266,7 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1261,14 +1283,14 @@ core_combine_reverse_atop_u_sse2 (uint32_t*       pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
@@ -1345,7 +1367,7 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && ((unsigned long) pd & 15))
     {
@@ -1362,14 +1384,14 @@ core_combine_xor_u_sse2 (uint32_t*       dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
 	xmm_dst = load_128_aligned ((__m128i*) pd);
@@ -1430,7 +1452,7 @@ core_combine_add_u_sse2 (uint32_t*       dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && (unsigned long)pd & 15)
     {
@@ -1448,7 +1470,7 @@ core_combine_add_u_sse2 (uint32_t*       dst,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
@@ -1457,7 +1479,7 @@ core_combine_add_u_sse2 (uint32_t*       dst,
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
 
@@ -1516,7 +1538,7 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w && (unsigned long)pd & 15)
     {
@@ -1533,14 +1555,14 @@ core_combine_saturate_u_sse2 (uint32_t *      pd,
     /* call prefetch hint to optimize cache load*/
     cache_prefetch ((__m128i*)ps);
     cache_prefetch ((__m128i*)pd);
-    cache_prefetch ((__m128i*)pm);
+    maybe_prefetch ((__m128i*)pm);
 
     while (w >= 4)
     {
 	/* fill cache line with next memory */
 	cache_prefetch_next ((__m128i*)ps);
 	cache_prefetch_next ((__m128i*)pd);
-	cache_prefetch_next ((__m128i*)pm);
+	maybe_prefetch_next ((__m128i*)pm);
 
 	xmm_dst = load_128_aligned  ((__m128i*)pd);
 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
@@ -2630,8 +2652,8 @@ create_mask_2x32_64 (uint32_t mask0,
 
 /* Work around a code generation bug in Sun Studio 12. */
 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
-# define create_mask_2x32_128(mask0, mask1) \
-	(_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+# define create_mask_2x32_128(mask0, mask1)				\
+    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
 #else
 static force_inline __m128i
 create_mask_2x32_128 (uint32_t mask0,
@@ -2928,7 +2950,7 @@ sse2_composite_over_n_8888 (pixman_implementation_t *imp,
 {
     uint32_t src;
     uint32_t    *dst_line, *dst, d;
-    uint16_t w;
+    int32_t w;
     int dst_stride;
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
@@ -3019,7 +3041,7 @@ sse2_composite_over_n_0565 (pixman_implementation_t *imp,
 {
     uint32_t src;
     uint16_t    *dst_line, *dst, d;
-    uint16_t w;
+    int32_t w;
     int dst_stride;
     __m128i xmm_src, xmm_alpha;
     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
@@ -3130,7 +3152,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 
     src = _pixman_image_get_solid (src_image, dst_image->bits.format);
     srca = src >> 24;
-    
+
     if (src == 0)
 	return;
 
@@ -3165,7 +3187,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *pd;
-		
+
 		mmx_mask = unpack_32_1x64 (m);
 		mmx_dest = unpack_32_1x64 (d);
 
@@ -3204,7 +3226,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 				    &xmm_mask_lo, &xmm_mask_hi,
 				    &xmm_mask_lo, &xmm_mask_hi);
 		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
-		
+
 		save_128_aligned (
 		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
 	    }
@@ -3221,7 +3243,7 @@ sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
 	    if (m)
 	    {
 		d = *pd;
-		
+
 		mmx_mask = unpack_32_1x64 (m);
 		mmx_dest = unpack_32_1x64 (d);
 
@@ -3399,7 +3421,7 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst;
     uint32_t    *src_line, *src;
     uint32_t mask;
-    uint16_t w;
+    int32_t w;
     int dst_stride, src_stride;
 
     __m128i xmm_mask;
@@ -3412,7 +3434,7 @@ sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
+    mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
 
     xmm_mask = create_mask_16_128 (mask >> 24);
 
@@ -3517,7 +3539,7 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
     uint32_t    *src_line, *src;
     uint32_t mask;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     __m128i xmm_mask, xmm_alpha;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
@@ -3528,7 +3550,7 @@ sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
     PIXMAN_IMAGE_GET_LINE (
 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
+    mask = _pixman_image_get_solid (mask_image, PIXMAN_a8r8g8b8);
 
     xmm_mask = create_mask_16_128 (mask >> 24);
     xmm_alpha = mask_00ff;
@@ -3685,7 +3707,7 @@ sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
     uint16_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
 
     __m128i xmm_alpha_lo, xmm_alpha_hi;
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
@@ -3815,7 +3837,7 @@ sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
     uint32_t *dst_line, *dst;
     uint8_t *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t m, d;
 
     __m128i xmm_src, xmm_alpha, xmm_def;
@@ -3959,9 +3981,6 @@ pixman_fill_sse2 (uint32_t *bits,
 
     __m128i xmm_def;
 
-    if (bpp == 16 && (data >> 16 != (data & 0xffff)))
-	return FALSE;
-
     if (bpp != 16 && bpp != 32)
 	return FALSE;
 
@@ -3971,6 +3990,7 @@ pixman_fill_sse2 (uint32_t *bits,
 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
 	byte_width = 2 * width;
 	stride *= 2;
+        data = (data & 0xffff) * 0x00010001;
     }
     else
     {
@@ -4100,7 +4120,7 @@ sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t m;
 
     __m128i xmm_src, xmm_def;
@@ -4246,7 +4266,7 @@ sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
     uint16_t    *dst_line, *dst, d;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t m;
     __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest;
 
@@ -4409,7 +4429,7 @@ sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
     uint16_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t opaque, zero;
 
     __m64 ms;
@@ -4555,7 +4575,7 @@ sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
     uint32_t    *dst_line, *dst, d;
     uint32_t    *src_line, *src, s;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t opaque, zero;
 
     __m128i xmm_src_lo, xmm_src_hi;
@@ -4841,9 +4861,10 @@ sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w, d, m;
+    uint32_t d, m;
     uint32_t src;
     uint8_t sa;
+    int32_t w;
 
     __m128i xmm_alpha;
     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
@@ -4956,7 +4977,7 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
     int src_stride, dst_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t s, d;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
@@ -5033,28 +5054,28 @@ sse2_composite_in_8_8 (pixman_implementation_t *imp,
 }
 
 /* -------------------------------------------------------------------------
- * composite_add_8888_8_8
+ * composite_add_n_8_8
  */
 
 static void
-sse2_composite_add_8888_8_8 (pixman_implementation_t *imp,
-                             pixman_op_t              op,
-                             pixman_image_t *         src_image,
-                             pixman_image_t *         mask_image,
-                             pixman_image_t *         dst_image,
-                             int32_t                  src_x,
-                             int32_t                  src_y,
-                             int32_t                  mask_x,
-                             int32_t                  mask_y,
-                             int32_t                  dest_x,
-                             int32_t                  dest_y,
-                             int32_t                  width,
-                             int32_t                  height)
+sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
+			  pixman_op_t              op,
+			  pixman_image_t *         src_image,
+			  pixman_image_t *         mask_image,
+			  pixman_image_t *         dst_image,
+			  int32_t                  src_x,
+			  int32_t                  src_y,
+			  int32_t                  mask_x,
+			  int32_t                  mask_y,
+			  int32_t                  dest_x,
+			  int32_t                  dest_y,
+			  int32_t                  width,
+			  int32_t                  height)
 {
     uint8_t     *dst_line, *dst;
     uint8_t     *mask_line, *mask;
     int dst_stride, mask_stride;
-    uint16_t w;
+    int32_t w;
     uint32_t src;
     uint8_t sa;
     uint32_t m, d;
@@ -5170,7 +5191,7 @@ sse2_composite_add_8000_8000 (pixman_implementation_t *imp,
     uint8_t     *dst_line, *dst;
     uint8_t     *src_line, *src;
     int dst_stride, src_stride;
-    uint16_t w;
+    int32_t w;
     uint16_t t;
 
     PIXMAN_IMAGE_GET_LINE (
@@ -5428,9 +5449,7 @@ sse2_composite_copy_area (pixman_implementation_t *imp,
                      src_x, src_y, dest_x, dest_y, width, height);
 }
 
-#if 0
-/* This code are buggy in MMX version, now the bug was translated to SSE2 version */
-void
+static void
 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                  pixman_op_t              op,
                                  pixman_image_t *         src_image,
@@ -5450,7 +5469,8 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
     uint8_t         *mask, *mask_line;
     uint32_t m;
     int src_stride, mask_stride, dst_stride;
-    uint16_t w;
+    int32_t w;
+    __m64 ms;
 
     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
@@ -5465,258 +5485,363 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
 
     while (height--)
     {
-	src = src_line;
-	src_line += src_stride;
-	dst = dst_line;
-	dst_line += dst_stride;
-	mask = mask_line;
-	mask_line += mask_stride;
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        /* call prefetch hint to optimize cache load*/
+        cache_prefetch ((__m128i*)src);
+        cache_prefetch ((__m128i*)dst);
+        cache_prefetch ((__m128i*)mask);
+
+        while (w && (unsigned long)dst & 15)
+        {
+            s = 0xff000000 | *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+            ms = unpack_32_1x64 (s);
+
+            if (m != 0xff)
+            {
+		__m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+		__m64 md = unpack_32_1x64 (d);
+
+                ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+            }
+
+            *dst++ = pack_1x64_32 (ms);
+            w--;
+        }
+
+        /* call prefetch hint to optimize cache load*/
+        cache_prefetch ((__m128i*)src);
+        cache_prefetch ((__m128i*)dst);
+        cache_prefetch ((__m128i*)mask);
+
+        while (w >= 4)
+        {
+            /* fill cache line with next memory */
+            cache_prefetch_next ((__m128i*)src);
+            cache_prefetch_next ((__m128i*)dst);
+            cache_prefetch_next ((__m128i*)mask);
+
+            m = *(uint32_t*) mask;
+            xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+            if (m == 0xffffffff)
+            {
+                save_128_aligned ((__m128i*)dst, xmm_src);
+            }
+            else
+            {
+                xmm_dst = load_128_aligned ((__m128i*)dst);
+
+                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+            m = (uint32_t) *mask++;
+
+            if (m)
+            {
+                s = 0xff000000 | *src;
+
+                if (m == 0xff)
+                {
+                    *dst = s;
+                }
+                else
+                {
+		    __m64 ma, md, ms;
+
+                    d = *dst;
+
+		    ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+		    md = unpack_32_1x64 (d);
+		    ms = unpack_32_1x64 (s);
+
+                    *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+                }
+
+            }
+
+            src++;
+            dst++;
+            w--;
+        }
+    }
 
-	w = width;
+    _mm_empty ();
+}
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)mask);
+static void
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 pixman_image_t *         src_image,
+                                 pixman_image_t *         mask_image,
+                                 pixman_image_t *         dst_image,
+                                 int32_t                  src_x,
+                                 int32_t                  src_y,
+                                 int32_t                  mask_x,
+                                 int32_t                  mask_y,
+                                 int32_t                  dest_x,
+                                 int32_t                  dest_y,
+                                 int32_t                  width,
+                                 int32_t                  height)
+{
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
 
-	while (w && (unsigned long)dst & 15)
-	{
-	    s = 0xff000000 | *src++;
-	    m = (uint32_t) *mask++;
-	    d = *dst;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
 
-	    __m64 ms = unpack_32_1x64 (s);
+    PIXMAN_IMAGE_GET_LINE (
+	dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
 
-	    if (m != 0xff)
-	    {
-		ms = in_over_1x64 (ms,
-		                   mask_x00ff,
-		                   expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
-		                   unpack_32_1x64 (d));
-	    }
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
 
-	    *dst++ = pack_1x64_32 (ms);
-	    w--;
-	}
+        w = width;
 
-	/* call prefetch hint to optimize cache load*/
-	cache_prefetch ((__m128i*)src);
-	cache_prefetch ((__m128i*)dst);
-	cache_prefetch ((__m128i*)mask);
+        /* call prefetch hint to optimize cache load*/
+        cache_prefetch ((__m128i *)src);
+        cache_prefetch ((__m128i *)dst);
+        cache_prefetch ((__m128i *)mask);
 
-	while (w >= 4)
-	{
-	    /* fill cache line with next memory */
-	    cache_prefetch_next ((__m128i*)src);
-	    cache_prefetch_next ((__m128i*)dst);
-	    cache_prefetch_next ((__m128i*)mask);
+        while (w && (unsigned long)dst & 15)
+        {
+	    uint32_t sa;
 
-	    m = *(uint32_t*) mask;
-	    xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
 
-	    if (m == 0xffffffff)
+	    sa = s >> 24;
+
+	    if (m)
 	    {
-		save_128_aligned ((__m128i*)dst, xmm_src);
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m64 ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
+		    ms = unpack_32_1x64 (s);
+		    md = unpack_32_1x64 (d);
+
+		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
+
+		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		}
 	    }
-	    else
+
+	    dst++;
+            w--;
+        }
+
+        /* call prefetch hint to optimize cache load*/
+        cache_prefetch ((__m128i *)src);
+        cache_prefetch ((__m128i *)dst);
+        cache_prefetch ((__m128i *)mask);
+
+        while (w >= 4)
+        {
+            /* fill cache line with next memory */
+            cache_prefetch_next ((__m128i *)src);
+            cache_prefetch_next ((__m128i *)dst);
+            cache_prefetch_next ((__m128i *)mask);
+
+            m = *(uint32_t *) mask;
+
+	    if (m)
 	    {
-		xmm_dst = load_128_aligned ((__m128i*)dst);
+		xmm_src = load_128_unaligned ((__m128i*)src);
+
+		if (m == 0xffffffff && is_opaque (xmm_src))
+		{
+		    save_128_aligned ((__m128i *)dst, xmm_src);
+		}
+		else
+		{
+		    xmm_dst = load_128_aligned ((__m128i *)dst);
 
-		xmm_mask = _mm_unpacklo_epi16 (
-		    unpack_32_1x128 (m), _mm_setzero_si128 ());
+		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
 
-		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
-		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
-		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
 
-		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
-					&xmm_mask_lo, &xmm_mask_hi);
+		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
 
-		in_over_2x128 (xmm_src_lo, xmm_src_hi,
-			       mask_00ff, mask_00ff,
-			       xmm_mask_lo, xmm_mask_hi,
-			       &xmm_dst_lo, &xmm_dst_hi);
+		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
 
-		save_128_aligned (
-		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		}
 	    }
 
-	    src += 4;
-	    dst += 4;
-	    mask += 4;
-	    w -= 4;
-	}
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
 
-	while (w)
-	{
-	    m = (uint32_t) *mask++;
+        while (w)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+	    sa = s >> 24;
 
 	    if (m)
 	    {
-		s = 0xff000000 | *src;
-
-		if (m == 0xff)
+		if (sa == 0xff && m == 0xff)
 		{
 		    *dst = s;
 		}
 		else
 		{
-		    d = *dst;
+		    __m64 ms, md, ma, msa;
 
-		    *dst = pack_1x64_32 (
-			in_over_1x64 (
-			    unpack_32_1x64 (s),
-			    mask_x00ff,
-			    expand_alpha_rev_1x64 (unpack_32_1x64 (m)),
-			    unpack_32_1x64 (d)));
-		}
+		    ma = expand_alpha_rev_1x64 (load_32_1x64 (m));
+		    ms = unpack_32_1x64 (s);
+		    md = unpack_32_1x64 (d);
+
+		    msa = expand_alpha_rev_1x64 (load_32_1x64 (sa));
 
+		    *dst = pack_1x64_32 (in_over_1x64 (&ms, &msa, &ma, &md));
+		}
 	    }
 
-	    src++;
 	    dst++;
-	    w--;
-	}
+            w--;
+        }
     }
 
     _mm_empty ();
 }
 
-#endif
-
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_r5g6b5,   sse2_composite_over_n_8_0565,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_b5g6r5,   sse2_composite_over_n_8_0565,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_n_8888,         0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_n_8888,         0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_n_0565,         0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_over_8888_0565,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_over_8888_0565,      0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888,       0 },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888,       0 },
-#if 0
-    /* FIXME: This code are buggy in MMX version, now the bug was translated to SSE2 version */
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888,    0 },
-#endif
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888,    NEED_SOLID_MASK },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_solid,    PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5,   sse2_composite_over_pixbuf_0565,    NEED_PIXBUF },
-    { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
-
-    { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca,  NEED_COMPONENT_ALPHA },
-    { PIXMAN_OP_ADD,  PIXMAN_a8,       PIXMAN_null,     PIXMAN_a8,       sse2_composite_add_8000_8000,       0 },
-    { PIXMAN_OP_ADD,  PIXMAN_a8r8g8b8, PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888,       0 },
-    { PIXMAN_OP_ADD,  PIXMAN_a8b8g8r8, PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888,       0 },
-    { PIXMAN_OP_ADD,  PIXMAN_solid,    PIXMAN_a8,       PIXMAN_a8,       sse2_composite_add_8888_8_8,        0 },
-
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888,        0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888,        0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888,        0 },
-    { PIXMAN_OP_SRC, PIXMAN_solid,     PIXMAN_a8,       PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888,        0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_a8r8g8b8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_a8b8g8r8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8,  PIXMAN_null,     PIXMAN_x8r8g8b8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8,  PIXMAN_null,     PIXMAN_x8b8g8r8, sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_r5g6b5,    PIXMAN_null,     PIXMAN_r5g6b5,   sse2_composite_copy_area,           0 },
-    { PIXMAN_OP_SRC, PIXMAN_b5g6r5,    PIXMAN_null,     PIXMAN_b5g6r5,   sse2_composite_copy_area,           0 },
-
-    { PIXMAN_OP_IN,  PIXMAN_a8,        PIXMAN_null,     PIXMAN_a8,       sse2_composite_in_8_8,              0 },
-    { PIXMAN_OP_IN,  PIXMAN_solid,     PIXMAN_a8,       PIXMAN_a8,       sse2_composite_in_n_8_8,            0 },
+    /* PIXMAN_OP_OVER */
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+
+    /* PIXMAN_OP_ADD */
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8000_8000),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+    /* PIXMAN_OP_IN */
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
 
     { PIXMAN_OP_NONE },
 };
 
-/*
- * Work around GCC bug causing crashes in Mozilla with SSE2
- *
- * When using -msse, gcc generates movdqa instructions assuming that
- * the stack is 16 byte aligned. Unfortunately some applications, such
- * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
- * causes the movdqa instructions to fail.
- *
- * The __force_align_arg_pointer__ makes gcc generate a prologue that
- * realigns the stack pointer to 16 bytes.
- *
- * On x86-64 this is not necessary because the standard ABI already
- * calls for a 16 byte aligned stack.
- *
- * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
- */
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
-static void
-sse2_composite (pixman_implementation_t *imp,
-                pixman_op_t              op,
-                pixman_image_t *         src,
-                pixman_image_t *         mask,
-                pixman_image_t *         dest,
-                int32_t                  src_x,
-                int32_t                  src_y,
-                int32_t                  mask_x,
-                int32_t                  mask_y,
-                int32_t                  dest_x,
-                int32_t                  dest_y,
-                int32_t                  width,
-                int32_t                  height)
-{
-    if (_pixman_run_fast_path (sse2_fast_paths, imp,
-                               op, src, mask, dest,
-                               src_x, src_y,
-                               mask_x, mask_y,
-                               dest_x, dest_y,
-                               width, height))
-    {
-	return;
-    }
-
-    _pixman_implementation_composite (imp->delegate, op,
-                                      src, mask, dest,
-                                      src_x, src_y,
-                                      mask_x, mask_y,
-                                      dest_x, dest_y,
-                                      width, height);
-}
-
-#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
-__attribute__((__force_align_arg_pointer__))
-#endif
 static pixman_bool_t
 sse2_blt (pixman_implementation_t *imp,
           uint32_t *               src_bits,
@@ -5775,8 +5900,12 @@ __attribute__((__force_align_arg_pointer__))
 pixman_implementation_t *
 _pixman_implementation_create_sse2 (void)
 {
-    pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
-    pixman_implementation_t *imp = _pixman_implementation_create (mmx);
+#ifdef USE_MMX
+    pixman_implementation_t *fallback = _pixman_implementation_create_mmx ();
+#else
+    pixman_implementation_t *fallback = _pixman_implementation_create_fast_path ();
+#endif
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
 
     /* SSE2 constants */
     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
@@ -5834,7 +5963,6 @@ _pixman_implementation_create_sse2 (void)
     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
 
-    imp->composite = sse2_composite;
     imp->blt = sse2_blt;
     imp->fill = sse2_fill;
 
diff --git a/lib/pixman/pixman/pixman-trap.c b/lib/pixman/pixman/pixman-trap.c
index 962cbb39e..8353992c5 100644
--- a/lib/pixman/pixman/pixman-trap.c
+++ b/lib/pixman/pixman/pixman-trap.c
@@ -28,8 +28,8 @@
 #include "pixman-private.h"
 
 /*
- * Compute the smallest value no less than y which is on a
- * grid row
+ * Compute the smallest value greater than or equal to y which is on a
+ * grid row.
  */
 
 PIXMAN_EXPORT pixman_fixed_t
@@ -38,7 +38,7 @@ pixman_sample_ceil_y (pixman_fixed_t y, int n)
     pixman_fixed_t f = pixman_fixed_frac (y);
     pixman_fixed_t i = pixman_fixed_floor (y);
 
-    f = ((f + Y_FRAC_FIRST (n)) / STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+    f = DIV (f - Y_FRAC_FIRST (n) + (STEP_Y_SMALL (n) - pixman_fixed_e), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
 	Y_FRAC_FIRST (n);
     
     if (f > Y_FRAC_LAST (n))
@@ -57,8 +57,8 @@ pixman_sample_ceil_y (pixman_fixed_t y, int n)
 }
 
 /*
- * Compute the largest value no greater than y which is on a
- * grid row
+ * Compute the largest value strictly less than y which is on a
+ * grid row.
  */
 PIXMAN_EXPORT pixman_fixed_t
 pixman_sample_floor_y (pixman_fixed_t y,
@@ -67,7 +67,7 @@ pixman_sample_floor_y (pixman_fixed_t y,
     pixman_fixed_t f = pixman_fixed_frac (y);
     pixman_fixed_t i = pixman_fixed_floor (y);
 
-    f = DIV (f - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+    f = DIV (f - pixman_fixed_e - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
 	Y_FRAC_FIRST (n);
 
     if (f < Y_FRAC_FIRST (n))
@@ -380,7 +380,7 @@ pixman_rasterize_trapezoid (pixman_image_t *          image,
     if (pixman_fixed_to_int (b) >= height)
 	b = pixman_int_to_fixed (height) - 1;
     b = pixman_sample_floor_y (b, bpp);
-
+    
     if (b >= t)
     {
 	/* initialize edge walkers */
diff --git a/lib/pixman/pixman/pixman-utils.c b/lib/pixman/pixman/pixman-utils.c
index 71282062c..3ef88b753 100644
--- a/lib/pixman/pixman/pixman-utils.c
+++ b/lib/pixman/pixman/pixman-utils.c
@@ -30,211 +30,6 @@
 
 #include "pixman-private.h"
 
-/*
- * Computing composite region
- */
-#define BOUND(v)        (int16_t) ((v) < INT16_MIN ? INT16_MIN : (v) > INT16_MAX ? INT16_MAX : (v))
-
-static inline pixman_bool_t
-clip_general_image (pixman_region32_t * region,
-                    pixman_region32_t * clip,
-                    int                 dx,
-                    int                 dy)
-{
-    if (pixman_region32_n_rects (region) == 1 &&
-        pixman_region32_n_rects (clip) == 1)
-    {
-	pixman_box32_t *  rbox = pixman_region32_rectangles (region, NULL);
-	pixman_box32_t *  cbox = pixman_region32_rectangles (clip, NULL);
-	int v;
-
-	if (rbox->x1 < (v = cbox->x1 + dx))
-	    rbox->x1 = BOUND (v);
-	if (rbox->x2 > (v = cbox->x2 + dx))
-	    rbox->x2 = BOUND (v);
-	if (rbox->y1 < (v = cbox->y1 + dy))
-	    rbox->y1 = BOUND (v);
-	if (rbox->y2 > (v = cbox->y2 + dy))
-	    rbox->y2 = BOUND (v);
-	if (rbox->x1 >= rbox->x2 ||
-	    rbox->y1 >= rbox->y2)
-	{
-	    pixman_region32_init (region);
-	}
-    }
-    else if (!pixman_region32_not_empty (clip))
-    {
-	return FALSE;
-    }
-    else
-    {
-	if (dx || dy)
-	    pixman_region32_translate (region, -dx, -dy);
-	if (!pixman_region32_intersect (region, region, clip))
-	    return FALSE;
-	if (dx || dy)
-	    pixman_region32_translate (region, dx, dy);
-    }
-    return pixman_region32_not_empty (region);
-}
-
-static inline pixman_bool_t
-clip_source_image (pixman_region32_t * region,
-                   pixman_image_t *    image,
-                   int                 dx,
-                   int                 dy)
-{
-    /* Source clips are ignored, unless they are explicitly turned on
-     * and the clip in question was set by an X client. (Because if
-     * the clip was not set by a client, then it is a hierarchy
-     * clip and those should always be ignored for sources).
-     */
-    if (!image->common.clip_sources || !image->common.client_clip)
-	return TRUE;
-
-    return clip_general_image (region,
-                               &image->common.clip_region,
-                               dx, dy);
-}
-
-/*
- * returns FALSE if the final region is empty.  Indistinguishable from
- * an allocation failure, but rendering ignores those anyways.
- */
-static pixman_bool_t
-pixman_compute_composite_region32 (pixman_region32_t * region,
-                                   pixman_image_t *    src_image,
-                                   pixman_image_t *    mask_image,
-                                   pixman_image_t *    dst_image,
-                                   int16_t             src_x,
-                                   int16_t             src_y,
-                                   int16_t             mask_x,
-                                   int16_t             mask_y,
-                                   int16_t             dest_x,
-                                   int16_t             dest_y,
-                                   uint16_t            width,
-                                   uint16_t            height)
-{
-    int v;
-
-    region->extents.x1 = dest_x;
-    v = dest_x + width;
-    region->extents.x2 = BOUND (v);
-    region->extents.y1 = dest_y;
-    v = dest_y + height;
-    region->extents.y2 = BOUND (v);
-
-    region->extents.x1 = MAX (region->extents.x1, 0);
-    region->extents.y1 = MAX (region->extents.y1, 0);
-    region->extents.x2 = MIN (region->extents.x2, dst_image->bits.width);
-    region->extents.y2 = MIN (region->extents.y2, dst_image->bits.height);
-
-    region->data = 0;
-
-    /* Check for empty operation */
-    if (region->extents.x1 >= region->extents.x2 ||
-        region->extents.y1 >= region->extents.y2)
-    {
-	pixman_region32_init (region);
-	return FALSE;
-    }
-
-    if (dst_image->common.have_clip_region)
-    {
-	if (!clip_general_image (region, &dst_image->common.clip_region, 0, 0))
-	{
-	    pixman_region32_fini (region);
-	    return FALSE;
-	}
-    }
-
-    if (dst_image->common.alpha_map && dst_image->common.alpha_map->common.have_clip_region)
-    {
-	if (!clip_general_image (region, &dst_image->common.alpha_map->common.clip_region,
-	                         -dst_image->common.alpha_origin_x,
-	                         -dst_image->common.alpha_origin_y))
-	{
-	    pixman_region32_fini (region);
-	    return FALSE;
-	}
-    }
-
-    /* clip against src */
-    if (src_image->common.have_clip_region)
-    {
-	if (!clip_source_image (region, src_image, dest_x - src_x, dest_y - src_y))
-	{
-	    pixman_region32_fini (region);
-	    return FALSE;
-	}
-    }
-    if (src_image->common.alpha_map && src_image->common.alpha_map->common.have_clip_region)
-    {
-	if (!clip_source_image (region, (pixman_image_t *)src_image->common.alpha_map,
-	                        dest_x - (src_x - src_image->common.alpha_origin_x),
-	                        dest_y - (src_y - src_image->common.alpha_origin_y)))
-	{
-	    pixman_region32_fini (region);
-	    return FALSE;
-	}
-    }
-    /* clip against mask */
-    if (mask_image && mask_image->common.have_clip_region)
-    {
-	if (!clip_source_image (region, mask_image, dest_x - mask_x, dest_y - mask_y))
-	{
-	    pixman_region32_fini (region);
-	    return FALSE;
-	}
-	if (mask_image->common.alpha_map && mask_image->common.alpha_map->common.have_clip_region)
-	{
-	    if (!clip_source_image (region, (pixman_image_t *)mask_image->common.alpha_map,
-	                            dest_x - (mask_x - mask_image->common.alpha_origin_x),
-	                            dest_y - (mask_y - mask_image->common.alpha_origin_y)))
-	    {
-		pixman_region32_fini (region);
-		return FALSE;
-	    }
-	}
-    }
-
-    return TRUE;
-}
-
-PIXMAN_EXPORT pixman_bool_t
-pixman_compute_composite_region (pixman_region16_t * region,
-                                 pixman_image_t *    src_image,
-                                 pixman_image_t *    mask_image,
-                                 pixman_image_t *    dst_image,
-                                 int16_t             src_x,
-                                 int16_t             src_y,
-                                 int16_t             mask_x,
-                                 int16_t             mask_y,
-                                 int16_t             dest_x,
-                                 int16_t             dest_y,
-                                 uint16_t            width,
-                                 uint16_t            height)
-{
-    pixman_region32_t r32;
-    pixman_bool_t retval;
-
-    pixman_region32_init (&r32);
-
-    retval = pixman_compute_composite_region32 (
-	&r32, src_image, mask_image, dst_image,
-	src_x, src_y, mask_x, mask_y, dest_x, dest_y,
-	width, height);
-
-    if (retval)
-    {
-	if (!pixman_region16_copy_from_region32 (region, &r32))
-	    retval = FALSE;
-    }
-
-    pixman_region32_fini (&r32);
-    return retval;
-}
-
 pixman_bool_t
 pixman_multiply_overflows_int (unsigned int a,
                                unsigned int b)
@@ -372,401 +167,6 @@ pixman_contract (uint32_t *      dst,
     }
 }
 
-static void
-walk_region_internal (pixman_implementation_t *imp,
-                      pixman_op_t              op,
-                      pixman_image_t *         src_image,
-                      pixman_image_t *         mask_image,
-                      pixman_image_t *         dst_image,
-                      int16_t                  src_x,
-                      int16_t                  src_y,
-                      int16_t                  mask_x,
-                      int16_t                  mask_y,
-                      int16_t                  dest_x,
-                      int16_t                  dest_y,
-                      uint16_t                 width,
-                      uint16_t                 height,
-                      pixman_bool_t            src_repeat,
-                      pixman_bool_t            mask_repeat,
-                      pixman_region32_t *      region,
-                      pixman_composite_func_t  composite_rect)
-{
-    int n;
-    const pixman_box32_t *pbox;
-    int w, h, w_this, h_this;
-    int x_msk, y_msk, x_src, y_src, x_dst, y_dst;
-
-    pbox = pixman_region32_rectangles (region, &n);
-    while (n--)
-    {
-	h = pbox->y2 - pbox->y1;
-	y_src = pbox->y1 - dest_y + src_y;
-	y_msk = pbox->y1 - dest_y + mask_y;
-	y_dst = pbox->y1;
-
-	while (h)
-	{
-	    h_this = h;
-	    w = pbox->x2 - pbox->x1;
-	    x_src = pbox->x1 - dest_x + src_x;
-	    x_msk = pbox->x1 - dest_x + mask_x;
-	    x_dst = pbox->x1;
-
-	    if (mask_repeat)
-	    {
-		y_msk = MOD (y_msk, mask_image->bits.height);
-		if (h_this > mask_image->bits.height - y_msk)
-		    h_this = mask_image->bits.height - y_msk;
-	    }
-
-	    if (src_repeat)
-	    {
-		y_src = MOD (y_src, src_image->bits.height);
-		if (h_this > src_image->bits.height - y_src)
-		    h_this = src_image->bits.height - y_src;
-	    }
-
-	    while (w)
-	    {
-		w_this = w;
-
-		if (mask_repeat)
-		{
-		    x_msk = MOD (x_msk, mask_image->bits.width);
-		    if (w_this > mask_image->bits.width - x_msk)
-			w_this = mask_image->bits.width - x_msk;
-		}
-
-		if (src_repeat)
-		{
-		    x_src = MOD (x_src, src_image->bits.width);
-		    if (w_this > src_image->bits.width - x_src)
-			w_this = src_image->bits.width - x_src;
-		}
-
-		(*composite_rect) (imp, op,
-				   src_image, mask_image, dst_image,
-				   x_src, y_src, x_msk, y_msk, x_dst, y_dst,
-				   w_this, h_this);
-		w -= w_this;
-
-		x_src += w_this;
-		x_msk += w_this;
-		x_dst += w_this;
-	    }
-
-	    h -= h_this;
-	    y_src += h_this;
-	    y_msk += h_this;
-	    y_dst += h_this;
-	}
-
-	pbox++;
-    }
-}
-
-void
-_pixman_walk_composite_region (pixman_implementation_t *imp,
-                               pixman_op_t              op,
-                               pixman_image_t *         src_image,
-                               pixman_image_t *         mask_image,
-                               pixman_image_t *         dst_image,
-                               int16_t                  src_x,
-                               int16_t                  src_y,
-                               int16_t                  mask_x,
-                               int16_t                  mask_y,
-                               int16_t                  dest_x,
-                               int16_t                  dest_y,
-                               uint16_t                 width,
-                               uint16_t                 height,
-                               pixman_composite_func_t  composite_rect)
-{
-    pixman_region32_t region;
-
-    pixman_region32_init (&region);
-
-    if (pixman_compute_composite_region32 (
-            &region, src_image, mask_image, dst_image,
-            src_x, src_y, mask_x, mask_y, dest_x, dest_y,
-            width, height))
-    {
-	walk_region_internal (imp, op,
-	                      src_image, mask_image, dst_image,
-	                      src_x, src_y, mask_x, mask_y, dest_x, dest_y,
-	                      width, height, FALSE, FALSE,
-	                      &region,
-	                      composite_rect);
-
-	pixman_region32_fini (&region);
-    }
-}
-
-static pixman_bool_t
-mask_is_solid (pixman_image_t *mask)
-{
-    if (mask->type == SOLID)
-	return TRUE;
-
-    if (mask->type == BITS &&
-        mask->common.repeat == PIXMAN_REPEAT_NORMAL &&
-        mask->bits.width == 1 &&
-        mask->bits.height == 1)
-    {
-	return TRUE;
-    }
-
-    return FALSE;
-}
-
-static const pixman_fast_path_t *
-get_fast_path (const pixman_fast_path_t *fast_paths,
-               pixman_op_t               op,
-               pixman_image_t *          src_image,
-               pixman_image_t *          mask_image,
-               pixman_image_t *          dst_image,
-               pixman_bool_t             is_pixbuf)
-{
-    const pixman_fast_path_t *info;
-
-    for (info = fast_paths; info->op != PIXMAN_OP_NONE; info++)
-    {
-	pixman_bool_t valid_src = FALSE;
-	pixman_bool_t valid_mask = FALSE;
-
-	if (info->op != op)
-	    continue;
-
-	if ((info->src_format == PIXMAN_solid &&
-	     _pixman_image_is_solid (src_image)) ||
-	    (src_image->type == BITS &&
-	     info->src_format == src_image->bits.format))
-	{
-	    valid_src = TRUE;
-	}
-
-	if (!valid_src)
-	    continue;
-
-	if ((info->mask_format == PIXMAN_null && !mask_image) ||
-	    (mask_image && mask_image->type == BITS &&
-	     info->mask_format == mask_image->bits.format))
-	{
-	    valid_mask = TRUE;
-
-	    if (info->flags & NEED_SOLID_MASK)
-	    {
-		if (!mask_image || !mask_is_solid (mask_image))
-		    valid_mask = FALSE;
-	    }
-
-	    if (info->flags & NEED_COMPONENT_ALPHA)
-	    {
-		if (!mask_image || !mask_image->common.component_alpha)
-		    valid_mask = FALSE;
-	    }
-	}
-
-	if (!valid_mask)
-	    continue;
-
-	if (info->dest_format != dst_image->bits.format)
-	    continue;
-
-	if ((info->flags & NEED_PIXBUF) && !is_pixbuf)
-	    continue;
-
-	return info;
-    }
-
-    return NULL;
-}
-
-static force_inline pixman_bool_t
-image_covers (pixman_image_t *image,
-              pixman_box32_t *extents,
-              int             x,
-              int             y)
-{
-    if (image->common.type == BITS &&
-	image->common.repeat == PIXMAN_REPEAT_NONE)
-    {
-	if (x > extents->x1 || y > extents->y1 ||
-	    x + image->bits.width < extents->x2 ||
-	    y + image->bits.height < extents->y2)
-	{
-	    return FALSE;
-	}
-    }
-
-    return TRUE;
-}
-
-static force_inline pixman_bool_t
-sources_cover (pixman_image_t *src,
-	       pixman_image_t *mask,
-	       pixman_box32_t *extents,
-	       int             src_x,
-	       int             src_y,
-	       int             mask_x,
-	       int             mask_y,
-	       int             dest_x,
-	       int             dest_y)
-{
-    if (!image_covers (src, extents, dest_x - src_x, dest_y - src_y))
-	return FALSE;
-
-    if (!mask)
-	return TRUE;
-    
-    if (!image_covers (mask, extents, dest_x - mask_x, dest_y - mask_y))
-	return FALSE;
-
-    return TRUE;
-}
-
-pixman_bool_t
-_pixman_run_fast_path (const pixman_fast_path_t *paths,
-                       pixman_implementation_t * imp,
-                       pixman_op_t               op,
-                       pixman_image_t *          src,
-                       pixman_image_t *          mask,
-                       pixman_image_t *          dest,
-                       int32_t                   src_x,
-                       int32_t                   src_y,
-                       int32_t                   mask_x,
-                       int32_t                   mask_y,
-                       int32_t                   dest_x,
-                       int32_t                   dest_y,
-                       int32_t                   width,
-                       int32_t                   height)
-{
-    pixman_composite_func_t func = NULL;
-    pixman_bool_t src_repeat =
-	src->common.repeat == PIXMAN_REPEAT_NORMAL;
-    pixman_bool_t mask_repeat =
-	mask && mask->common.repeat == PIXMAN_REPEAT_NORMAL;
-    pixman_bool_t result;
-    pixman_bool_t has_fast_path;
-
-    has_fast_path = !dest->common.alpha_map &&
-		    !dest->bits.read_func &&
-		    !dest->bits.write_func;
-
-    if (has_fast_path)
-    {
-	has_fast_path = (src->type == BITS || _pixman_image_is_solid (src)) &&
-	                !src->common.transform &&
-	                !src->common.alpha_map &&
-			src->common.filter != PIXMAN_FILTER_CONVOLUTION &&
-			src->common.repeat != PIXMAN_REPEAT_PAD &&
-			src->common.repeat != PIXMAN_REPEAT_REFLECT;
-	if (has_fast_path && src->type == BITS)
-	{
-	    has_fast_path = !src->bits.read_func &&
-	                    !src->bits.write_func &&
-		            !PIXMAN_FORMAT_IS_WIDE (src->bits.format);
-	}
-    }
-
-    if (mask && has_fast_path)
-    {
-	has_fast_path =
-	    mask->type == BITS &&
-	    !mask->common.transform &&
-	    !mask->common.alpha_map &&
-	    !mask->bits.read_func &&
-	    !mask->bits.write_func &&
-	    mask->common.filter != PIXMAN_FILTER_CONVOLUTION &&
-	    mask->common.repeat != PIXMAN_REPEAT_PAD &&
-	    mask->common.repeat != PIXMAN_REPEAT_REFLECT &&
-	    !PIXMAN_FORMAT_IS_WIDE (mask->bits.format);
-    }
-
-    if (has_fast_path)
-    {
-	const pixman_fast_path_t *info;
-	pixman_bool_t pixbuf;
-
-	pixbuf =
-	    src && src->type == BITS            &&
-	    mask && mask->type == BITS          &&
-	    src->bits.bits == mask->bits.bits   &&
-	    src_x == mask_x                     &&
-	    src_y == mask_y                     &&
-	    !mask->common.component_alpha       &&
-	    !mask_repeat;
-
-	info = get_fast_path (paths, op, src, mask, dest, pixbuf);
-
-	if (info)
-	{
-	    func = info->func;
-
-	    if (info->src_format == PIXMAN_solid)
-		src_repeat = FALSE;
-
-	    if (info->mask_format == PIXMAN_solid ||
-		info->flags & NEED_SOLID_MASK)
-	    {
-		mask_repeat = FALSE;
-	    }
-
-	    if ((src_repeat                     &&
-	         src->bits.width == 1           &&
-	         src->bits.height == 1) ||
-	        (mask_repeat                    &&
-	         mask->bits.width == 1          &&
-	         mask->bits.height == 1))
-	    {
-		/* If src or mask are repeating 1x1 images and src_repeat or
-		 * mask_repeat are still TRUE, it means the fast path we
-		 * selected does not actually handle repeating images.
-		 *
-		 * So rather than call the "fast path" with a zillion
-		 * 1x1 requests, we just fall back to the general code (which
-		 * does do something sensible with 1x1 repeating images).
-		 */
-		func = NULL;
-	    }
-	}
-    }
-
-    result = FALSE;
-
-    if (func)
-    {
-	pixman_region32_t region;
-	pixman_region32_init (&region);
-
-	if (pixman_compute_composite_region32 (
-	        &region, src, mask, dest,
-	        src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height))
-	{
-	    pixman_box32_t *extents = pixman_region32_extents (&region);
-
-	    if (sources_cover (
-		    src, mask, extents,
-		    src_x, src_y, mask_x, mask_y, dest_x, dest_y))
-	    {
-		walk_region_internal (imp, op,
-		                      src, mask, dest,
-		                      src_x, src_y, mask_x, mask_y,
-		                      dest_x, dest_y,
-		                      width, height,
-		                      src_repeat, mask_repeat,
-		                      &region,
-		                      func);
-
-		result = TRUE;
-	    }
-
-	    pixman_region32_fini (&region);
-	}
-    }
-
-    return result;
-}
-
 #define N_TMP_BOXES (16)
 
 pixman_bool_t
@@ -835,3 +235,24 @@ pixman_region32_copy_from_region16 (pixman_region32_t *dst,
 
     return retval;
 }
+
+#ifdef DEBUG
+/* Report an internal error on stderr; nothing is printed after 10 messages. */
+void
+_pixman_log_error (const char *function, const char *message)
+{
+    static int n_messages = 0;	/* messages printed so far, kept across calls */
+
+    if (n_messages < 10)
+    {
+	fprintf (stderr,
+		 "*** BUG ***\n"
+		 "In %s: %s\n"
+		 "Set a breakpoint on '_pixman_log_error' to debug\n\n",
+                 function, message);
+
+	n_messages++;
+    }
+}
+
+#endif
diff --git a/lib/pixman/pixman/pixman-vmx.c b/lib/pixman/pixman/pixman-vmx.c
index 06325a7c0..e811cf733 100644
--- a/lib/pixman/pixman/pixman-vmx.c
+++ b/lib/pixman/pixman/pixman-vmx.c
@@ -1607,11 +1607,16 @@ vmx_combine_add_ca (pixman_implementation_t *imp,
     }
 }
 
+static const pixman_fast_path_t vmx_fast_paths[] =
+{
+    {   PIXMAN_OP_NONE	},	/* only entry; presumably the end-of-table marker */
+};
+
 pixman_implementation_t *
 _pixman_implementation_create_vmx (void)
 {
     pixman_implementation_t *fast = _pixman_implementation_create_fast_path ();
-    pixman_implementation_t *imp = _pixman_implementation_create (fast);
+    pixman_implementation_t *imp = _pixman_implementation_create (fast, vmx_fast_paths);
 
     /* Set up function pointers */
 
diff --git a/lib/pixman/pixman/pixman.c b/lib/pixman/pixman/pixman.c
index 0edd967cf..548242ba0 100644
--- a/lib/pixman/pixman/pixman.c
+++ b/lib/pixman/pixman/pixman.c
@@ -28,192 +28,800 @@
 #endif
 #include "pixman-private.h"
 
+#include <stdlib.h>
+
+static pixman_implementation_t *imp;
+
+typedef struct operator_info_t operator_info_t;
+
+struct operator_info_t
+{
+    uint8_t	opaque_info[4];
+};
+
+#define PACK(neither, src, dest, both)			\
+    {{	    (uint8_t)PIXMAN_OP_ ## neither,		\
+	    (uint8_t)PIXMAN_OP_ ## src,			\
+	    (uint8_t)PIXMAN_OP_ ## dest,		\
+	    (uint8_t)PIXMAN_OP_ ## both		}}
+
+static const operator_info_t operator_table[] =
+{
+    /*    Neither Opaque         Src Opaque             Dst Opaque             Both Opaque */
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (OVER,                  SRC,                   OVER,                  SRC),
+    PACK (OVER_REVERSE,          OVER_REVERSE,          DST,                   DST),
+    PACK (IN,                    IN,                    SRC,                   SRC),
+    PACK (IN_REVERSE,            DST,                   IN_REVERSE,            DST),
+    PACK (OUT,                   OUT,                   CLEAR,                 CLEAR),
+    PACK (OUT_REVERSE,           CLEAR,                 OUT_REVERSE,           CLEAR),
+    PACK (ATOP,                  IN,                    OVER,                  SRC),
+    PACK (ATOP_REVERSE,          OVER_REVERSE,          IN_REVERSE,            DST),
+    PACK (XOR,                   OUT,                   OUT_REVERSE,           CLEAR),
+    PACK (ADD,                   ADD,                   ADD,                   ADD),
+    PACK (SATURATE,              OVER_REVERSE,          DST,                   DST),
+
+    {{ 0 /* 0x0e */ }},
+    {{ 0 /* 0x0f */ }},
+
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (DISJOINT_OVER,         DISJOINT_OVER,         DISJOINT_OVER,         DISJOINT_OVER),
+    PACK (DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE),
+    PACK (DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN),
+    PACK (DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE),
+    PACK (DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT),
+    PACK (DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE),
+    PACK (DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP),
+    PACK (DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE),
+    PACK (DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR),
+
+    {{ 0 /* 0x1c */ }},
+    {{ 0 /* 0x1d */ }},
+    {{ 0 /* 0x1e */ }},
+    {{ 0 /* 0x1f */ }},
+
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER),
+    PACK (CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE),
+    PACK (CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN),
+    PACK (CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE),
+    PACK (CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT),
+    PACK (CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE),
+    PACK (CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP),
+    PACK (CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE),
+    PACK (CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR),
+
+    {{ 0 /* 0x2c */ }},
+    {{ 0 /* 0x2d */ }},
+    {{ 0 /* 0x2e */ }},
+    {{ 0 /* 0x2f */ }},
+
+    PACK (MULTIPLY,              MULTIPLY,              MULTIPLY,              MULTIPLY),
+    PACK (SCREEN,                SCREEN,                SCREEN,                SCREEN),
+    PACK (OVERLAY,               OVERLAY,               OVERLAY,               OVERLAY),
+    PACK (DARKEN,                DARKEN,                DARKEN,                DARKEN),
+    PACK (LIGHTEN,               LIGHTEN,               LIGHTEN,               LIGHTEN),
+    PACK (COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE),
+    PACK (COLOR_BURN,            COLOR_BURN,            COLOR_BURN,            COLOR_BURN),
+    PACK (HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT),
+    PACK (SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT),
+    PACK (DIFFERENCE,            DIFFERENCE,            DIFFERENCE,            DIFFERENCE),
+    PACK (EXCLUSION,             EXCLUSION,             EXCLUSION,             EXCLUSION),
+    PACK (HSL_HUE,               HSL_HUE,               HSL_HUE,               HSL_HUE),
+    PACK (HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION),
+    PACK (HSL_COLOR,             HSL_COLOR,             HSL_COLOR,             HSL_COLOR),
+    PACK (HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY),
+};
+
 /*
- * Operator optimizations based on source or destination opacity
+ * Optimize the current operator based on opacity of source or destination.
+ * The output operator should be mathematically equivalent to the input operator.
  */
-typedef struct
+static pixman_op_t
+optimize_operator (pixman_op_t     op,
+		   uint32_t        src_flags,
+		   uint32_t        mask_flags,
+		   uint32_t        dst_flags)
 {
-    pixman_op_t op;
-    pixman_op_t op_src_dst_opaque;
-    pixman_op_t op_src_opaque;
-    pixman_op_t op_dst_opaque;
-} optimized_operator_info_t;
+    pixman_bool_t is_source_opaque, is_dest_opaque;
+    int opaqueness;
+    /* The source side is opaque only when both src and mask are flagged opaque. */
+    is_source_opaque = ((src_flags & mask_flags) & FAST_PATH_IS_OPAQUE) != 0;
+    is_dest_opaque = (dst_flags & FAST_PATH_IS_OPAQUE) != 0;
 
-static const optimized_operator_info_t optimized_operators[] =
+    opaqueness = ((is_dest_opaque << 1) | is_source_opaque);
+    /* opaque_info index: bit 0 = source opaque, bit 1 = dest opaque. */
+    return operator_table[op].opaque_info[opaqueness];	/* NOTE(review): op is not range-checked here */
+}
+
+static void
+apply_workaround (pixman_image_t *image,
+		  int32_t *       x,
+		  int32_t *       y,
+		  uint32_t **     save_bits,
+		  int *           save_dx,
+		  int *           save_dy)
 {
-    /* Input Operator           SRC&DST Opaque          SRC Opaque              DST Opaque      */
-    { PIXMAN_OP_OVER,           PIXMAN_OP_SRC,          PIXMAN_OP_SRC,          PIXMAN_OP_OVER },
-    { PIXMAN_OP_OVER_REVERSE,   PIXMAN_OP_DST,          PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_DST },
-    { PIXMAN_OP_IN,             PIXMAN_OP_SRC,          PIXMAN_OP_IN,           PIXMAN_OP_SRC },
-    { PIXMAN_OP_IN_REVERSE,     PIXMAN_OP_DST,          PIXMAN_OP_DST,          PIXMAN_OP_IN_REVERSE },
-    { PIXMAN_OP_OUT,            PIXMAN_OP_CLEAR,        PIXMAN_OP_OUT,          PIXMAN_OP_CLEAR },
-    { PIXMAN_OP_OUT_REVERSE,    PIXMAN_OP_CLEAR,        PIXMAN_OP_CLEAR,        PIXMAN_OP_OUT_REVERSE },
-    { PIXMAN_OP_ATOP,           PIXMAN_OP_SRC,          PIXMAN_OP_IN,           PIXMAN_OP_OVER },
-    { PIXMAN_OP_ATOP_REVERSE,   PIXMAN_OP_DST,          PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_IN_REVERSE },
-    { PIXMAN_OP_XOR,            PIXMAN_OP_CLEAR,        PIXMAN_OP_OUT,          PIXMAN_OP_OUT_REVERSE },
-    { PIXMAN_OP_SATURATE,       PIXMAN_OP_DST,          PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_DST },
-    { PIXMAN_OP_NONE }
-};
+    if (image && (image->common.flags & FAST_PATH_NEEDS_WORKAROUND))
+    {
+	/* Some X servers generate images that point to the
+	 * wrong place in memory, but then set the clip region
+	 * to point to the right place. Because of an old bug
+	 * in pixman, this would actually work.
+	 *
+	 * Here we try and undo the damage
+	 */
+	int bpp = PIXMAN_FORMAT_BPP (image->bits.format) / 8;	/* bytes per pixel (integer division) */
+	pixman_box32_t *extents;
+	uint8_t *t;
+	int dx, dy;
+	/* The top-left corner of the clip extents is the offset to undo. */
+	extents = pixman_region32_extents (&(image->common.clip_region));
+	dx = extents->x1;
+	dy = extents->y1;
+	/* Hand the original bits pointer back through save_bits. */
+	*save_bits = image->bits.bits;
+	/* Shift the caller's coordinates and the clip region by (-dx, -dy). */
+	*x -= dx;
+	*y -= dy;
+	pixman_region32_translate (&(image->common.clip_region), -dx, -dy);
+	/* Advance bits by dy rows (rowstride * 4 bytes each) plus dx pixels. */
+	t = (uint8_t *)image->bits.bits;
+	t += dy * image->bits.rowstride * 4 + dx * bpp;
+	image->bits.bits = (uint32_t *)t;
+	/* Report the applied offset (save_* are only written on this path). */
+	*save_dx = dx;
+	*save_dy = dy;
+    }
+}
 
-static pixman_implementation_t *imp;
+static void
+unapply_workaround (pixman_image_t *image, uint32_t *bits, int dx, int dy)
+{
+    if (image && (image->common.flags & FAST_PATH_NEEDS_WORKAROUND))	/* same test as apply_workaround */
+    {
+	image->bits.bits = bits;	/* put back the bits pointer passed in */
+	pixman_region32_translate (&image->common.clip_region, dx, dy);	/* shift the clip by (dx, dy) */
+    }
+}
 
 /*
- * Check if the current operator could be optimized
+ * Computing composite region
  */
-static const optimized_operator_info_t*
-pixman_operator_can_be_optimized (pixman_op_t op)
+static inline pixman_bool_t
+clip_general_image (pixman_region32_t * region,
+                    pixman_region32_t * clip,
+                    int                 dx,
+                    int                 dy)
 {
-    const optimized_operator_info_t *info;
+    if (pixman_region32_n_rects (region) == 1 &&
+        pixman_region32_n_rects (clip) == 1)
+    {
+	pixman_box32_t *  rbox = pixman_region32_rectangles (region, NULL);
+	pixman_box32_t *  cbox = pixman_region32_rectangles (clip, NULL);
+	int v;
 
-    for (info = optimized_operators; info->op != PIXMAN_OP_NONE; info++)
+	if (rbox->x1 < (v = cbox->x1 + dx))
+	    rbox->x1 = v;
+	if (rbox->x2 > (v = cbox->x2 + dx))
+	    rbox->x2 = v;
+	if (rbox->y1 < (v = cbox->y1 + dy))
+	    rbox->y1 = v;
+	if (rbox->y2 > (v = cbox->y2 + dy))
+	    rbox->y2 = v;
+	if (rbox->x1 >= rbox->x2 || rbox->y1 >= rbox->y2)
+	{
+	    pixman_region32_init (region);
+	    return FALSE;
+	}
+    }
+    else if (!pixman_region32_not_empty (clip))
+    {
+	return FALSE;
+    }
+    else
     {
-	if (info->op == op)
-	    return info;
+	if (dx || dy)
+	    pixman_region32_translate (region, -dx, -dy);
+
+	if (!pixman_region32_intersect (region, region, clip))
+	    return FALSE;
+
+	if (dx || dy)
+	    pixman_region32_translate (region, dx, dy);
     }
-    return NULL;
+
+    return pixman_region32_not_empty (region);
+}
+
+static inline pixman_bool_t
+clip_source_image (pixman_region32_t * region,
+                   pixman_image_t *    image,
+                   int                 dx,
+                   int                 dy)
+{
+    /* Source clips are ignored unless they are explicitly turned on
+     * and the clip in question was set by an X client. A clip that
+     * was not set by a client is a hierarchy clip, and those should
+     * always be ignored for sources.
+     */
+    if (!image->common.clip_sources || !image->common.client_clip)
+	return TRUE;
+
+    return clip_general_image (region,
+                               &image->common.clip_region,
+                               dx, dy);
 }
 
 /*
- * Optimize the current operator based on opacity of source or destination
- * The output operator should be mathematically equivalent to the source.
+ * Returns FALSE if the final region is empty. This is indistinguishable
+ * from an allocation failure, but rendering ignores those anyway.
  */
-static pixman_op_t
-pixman_optimize_operator (pixman_op_t     op,
-                          pixman_image_t *src_image,
-                          pixman_image_t *mask_image,
-                          pixman_image_t *dst_image)
+static pixman_bool_t
+pixman_compute_composite_region32 (pixman_region32_t * region,
+                                   pixman_image_t *    src_image,
+                                   pixman_image_t *    mask_image,
+                                   pixman_image_t *    dst_image,
+                                   int32_t             src_x,
+                                   int32_t             src_y,
+                                   int32_t             mask_x,
+                                   int32_t             mask_y,
+                                   int32_t             dest_x,
+                                   int32_t             dest_y,
+                                   int32_t             width,
+                                   int32_t             height)
 {
-    pixman_bool_t is_source_opaque;
-    pixman_bool_t is_dest_opaque;
-    const optimized_operator_info_t *info = pixman_operator_can_be_optimized (op);
+    region->extents.x1 = dest_x;
+    region->extents.x2 = dest_x + width;
+    region->extents.y1 = dest_y;
+    region->extents.y2 = dest_y + height;
+
+    region->extents.x1 = MAX (region->extents.x1, 0);
+    region->extents.y1 = MAX (region->extents.y1, 0);
+    region->extents.x2 = MIN (region->extents.x2, dst_image->bits.width);
+    region->extents.y2 = MIN (region->extents.y2, dst_image->bits.height);
 
-    if (!info || mask_image)
-	return op;
+    region->data = 0;
 
-    is_source_opaque = _pixman_image_is_opaque (src_image);
-    is_dest_opaque = _pixman_image_is_opaque (dst_image);
+    /* Check for empty operation */
+    if (region->extents.x1 >= region->extents.x2 ||
+        region->extents.y1 >= region->extents.y2)
+    {
+	pixman_region32_init (region);
+	return FALSE;
+    }
 
-    if (is_source_opaque == FALSE && is_dest_opaque == FALSE)
-	return op;
+    if (dst_image->common.have_clip_region)
+    {
+	if (!clip_general_image (region, &dst_image->common.clip_region, 0, 0))
+	{
+	    pixman_region32_fini (region);
+	    return FALSE;
+	}
+    }
 
-    if (is_source_opaque && is_dest_opaque)
-	return info->op_src_dst_opaque;
-    else if (is_source_opaque)
-	return info->op_src_opaque;
-    else if (is_dest_opaque)
-	return info->op_dst_opaque;
+    if (dst_image->common.alpha_map && dst_image->common.alpha_map->common.have_clip_region)
+    {
+	if (!clip_general_image (region, &dst_image->common.alpha_map->common.clip_region,
+	                         -dst_image->common.alpha_origin_x,
+	                         -dst_image->common.alpha_origin_y))
+	{
+	    pixman_region32_fini (region);
+	    return FALSE;
+	}
+    }
 
-    return op;
+    /* clip against src */
+    if (src_image->common.have_clip_region)
+    {
+	if (!clip_source_image (region, src_image, dest_x - src_x, dest_y - src_y))
+	{
+	    pixman_region32_fini (region);
+	    return FALSE;
+	}
+    }
+    if (src_image->common.alpha_map && src_image->common.alpha_map->common.have_clip_region)
+    {
+	if (!clip_source_image (region, (pixman_image_t *)src_image->common.alpha_map,
+	                        dest_x - (src_x - src_image->common.alpha_origin_x),
+	                        dest_y - (src_y - src_image->common.alpha_origin_y)))
+	{
+	    pixman_region32_fini (region);
+	    return FALSE;
+	}
+    }
+    /* clip against mask */
+    if (mask_image && mask_image->common.have_clip_region)
+    {
+	if (!clip_source_image (region, mask_image, dest_x - mask_x, dest_y - mask_y))
+	{
+	    pixman_region32_fini (region);
+	    return FALSE;
+	}
+	if (mask_image->common.alpha_map && mask_image->common.alpha_map->common.have_clip_region)
+	{
+	    if (!clip_source_image (region, (pixman_image_t *)mask_image->common.alpha_map,
+	                            dest_x - (mask_x - mask_image->common.alpha_origin_x),
+	                            dest_y - (mask_y - mask_image->common.alpha_origin_y)))
+	    {
+		pixman_region32_fini (region);
+		return FALSE;
+	    }
+	}
+    }
 
+    return TRUE;
 }
 
 static void
-apply_workaround (pixman_image_t *image,
-		  int16_t *       x,
-		  int16_t *       y,
-		  uint32_t **     save_bits,
-		  int *           save_dx,
-		  int *           save_dy)
+walk_region_internal (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      pixman_image_t *         src_image,
+                      pixman_image_t *         mask_image,
+                      pixman_image_t *         dst_image,
+                      int32_t                  src_x,
+                      int32_t                  src_y,
+                      int32_t                  mask_x,
+                      int32_t                  mask_y,
+                      int32_t                  dest_x,
+                      int32_t                  dest_y,
+                      int32_t                  width,
+                      int32_t                  height,
+                      pixman_bool_t            src_repeat,
+                      pixman_bool_t            mask_repeat,
+                      pixman_region32_t *      region,
+                      pixman_composite_func_t  composite_rect)
 {
-    /* Some X servers generate images that point to the
-     * wrong place in memory, but then set the clip region
-     * to point to the right place. Because of an old bug
-     * in pixman, this would actually work.
-     *
-     * Here we try and undo the damage
-     */
-    int bpp = PIXMAN_FORMAT_BPP (image->bits.format) / 8;
-    pixman_box32_t *extents;
-    uint8_t *t;
-    int dx, dy;
+    int w, h, w_this, h_this;
+    int x_msk, y_msk, x_src, y_src, x_dst, y_dst;
+    int src_dy = src_y - dest_y;
+    int src_dx = src_x - dest_x;
+    int mask_dy = mask_y - dest_y;
+    int mask_dx = mask_x - dest_x;
+    const pixman_box32_t *pbox;
+    int n;
+
+    pbox = pixman_region32_rectangles (region, &n);
+
+    /* Fast path for non-repeating sources */
+    if (!src_repeat && !mask_repeat)
+    {
+       while (n--)
+       {
+           (*composite_rect) (imp, op,
+                              src_image, mask_image, dst_image,
+                              pbox->x1 + src_dx,
+                              pbox->y1 + src_dy,
+                              pbox->x1 + mask_dx,
+                              pbox->y1 + mask_dy,
+                              pbox->x1,
+                              pbox->y1,
+                              pbox->x2 - pbox->x1,
+                              pbox->y2 - pbox->y1);
+           
+           pbox++;
+       }
+
+       return;
+    }
+    
+    while (n--)
+    {
+	h = pbox->y2 - pbox->y1;
+	y_src = pbox->y1 + src_dy;
+	y_msk = pbox->y1 + mask_dy;
+	y_dst = pbox->y1;
 
-    extents = pixman_region32_extents (&(image->common.clip_region));
-    dx = extents->x1;
-    dy = extents->y1;
+	while (h)
+	{
+	    h_this = h;
+	    w = pbox->x2 - pbox->x1;
+	    x_src = pbox->x1 + src_dx;
+	    x_msk = pbox->x1 + mask_dx;
+	    x_dst = pbox->x1;
 
-    *save_bits = image->bits.bits;
+	    if (mask_repeat)
+	    {
+		y_msk = MOD (y_msk, mask_image->bits.height);
+		if (h_this > mask_image->bits.height - y_msk)
+		    h_this = mask_image->bits.height - y_msk;
+	    }
 
-    *x -= dx;
-    *y -= dy;
-    pixman_region32_translate (&(image->common.clip_region), -dx, -dy);
+	    if (src_repeat)
+	    {
+		y_src = MOD (y_src, src_image->bits.height);
+		if (h_this > src_image->bits.height - y_src)
+		    h_this = src_image->bits.height - y_src;
+	    }
 
-    t = (uint8_t *)image->bits.bits;
-    t += dy * image->bits.rowstride * 4 + dx * bpp;
-    image->bits.bits = (uint32_t *)t;
+	    while (w)
+	    {
+		w_this = w;
 
-    *save_dx = dx;
-    *save_dy = dy;
+		if (mask_repeat)
+		{
+		    x_msk = MOD (x_msk, mask_image->bits.width);
+		    if (w_this > mask_image->bits.width - x_msk)
+			w_this = mask_image->bits.width - x_msk;
+		}
+
+		if (src_repeat)
+		{
+		    x_src = MOD (x_src, src_image->bits.width);
+		    if (w_this > src_image->bits.width - x_src)
+			w_this = src_image->bits.width - x_src;
+		}
+
+		(*composite_rect) (imp, op,
+				   src_image, mask_image, dst_image,
+				   x_src, y_src, x_msk, y_msk, x_dst, y_dst,
+				   w_this, h_this);
+		w -= w_this;
+
+		x_src += w_this;
+		x_msk += w_this;
+		x_dst += w_this;
+	    }
+
+	    h -= h_this;
+	    y_src += h_this;
+	    y_msk += h_this;
+	    y_dst += h_this;
+	}
+
+	pbox++;
+    }
 }
 
-static void
-unapply_workaround (pixman_image_t *image, uint32_t *bits, int dx, int dy)
+#define IS_16BIT(x) (((x) >= INT16_MIN) && ((x) <= INT16_MAX))
+
+static force_inline uint32_t
+compute_src_extents_flags (pixman_image_t *image,
+			   pixman_box32_t *extents,
+			   int             x,
+			   int             y)
 {
-    image->bits.bits = bits;
-    pixman_region32_translate (&image->common.clip_region, dx, dy);
+    pixman_box16_t extents16;
+    uint32_t flags;
+
+    flags = FAST_PATH_COVERS_CLIP;
+
+    if (image->common.type != BITS)
+	return flags;
+
+    if (image->common.repeat == PIXMAN_REPEAT_NONE &&
+	(x > extents->x1 || y > extents->y1 ||
+	 x + image->bits.width < extents->x2 ||
+	 y + image->bits.height < extents->y2))
+    {
+	flags &= ~FAST_PATH_COVERS_CLIP;
+    }
+
+    if (IS_16BIT (extents->x1 - x) &&
+	IS_16BIT (extents->y1 - y) &&
+	IS_16BIT (extents->x2 - x) &&
+	IS_16BIT (extents->y2 - y))
+    {
+	extents16.x1 = extents->x1 - x;
+	extents16.y1 = extents->y1 - y;
+	extents16.x2 = extents->x2 - x;
+	extents16.y2 = extents->y2 - y;
+
+	if (!image->common.transform ||
+	    pixman_transform_bounds (image->common.transform, &extents16))
+	{
+	    if (extents16.x1 >= 0  && extents16.y1 >= 0 &&
+		extents16.x2 <= image->bits.width &&
+		extents16.y2 <= image->bits.height)
+	    {
+		flags |= FAST_PATH_SAMPLES_COVER_CLIP;
+	    }
+	}
+    }
+
+    if (IS_16BIT (extents->x1 - x - 1) &&
+	IS_16BIT (extents->y1 - y - 1) &&
+	IS_16BIT (extents->x2 - x + 1) &&
+	IS_16BIT (extents->y2 - y + 1))
+    {
+	extents16.x1 = extents->x1 - x - 1;
+	extents16.y1 = extents->y1 - y - 1;
+	extents16.x2 = extents->x2 - x + 1;
+	extents16.y2 = extents->y2 - y + 1;
+
+	if (/* src space expanded by one in dest space fits in 16 bit */
+	    (!image->common.transform ||
+	     pixman_transform_bounds (image->common.transform, &extents16)) &&
+	    /* And src image size can be used as 16.16 fixed point */
+	    image->bits.width < 0x7fff &&
+	    image->bits.height < 0x7fff)
+	{
+	    /* Then we're "16bit safe" */
+	    flags |= FAST_PATH_16BIT_SAFE;
+	}
+    }
+
+    return flags;
 }
 
-PIXMAN_EXPORT void
-pixman_image_composite (pixman_op_t      op,
-                        pixman_image_t * src,
-                        pixman_image_t * mask,
-                        pixman_image_t * dest,
-                        int16_t          src_x,
-                        int16_t          src_y,
-                        int16_t          mask_x,
-                        int16_t          mask_y,
-                        int16_t          dest_x,
-                        int16_t          dest_y,
-                        uint16_t         width,
-                        uint16_t         height)
+#define N_CACHED_FAST_PATHS 8
+
+typedef struct
 {
+    pixman_fast_path_t cache [N_CACHED_FAST_PATHS];
+} cache_t;
+
+PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
+
+static void
+do_composite (pixman_implementation_t *imp,
+	      pixman_op_t	       op,
+	      pixman_image_t	      *src,
+	      pixman_image_t	      *mask,
+	      pixman_image_t	      *dest,
+	      int		       src_x,
+	      int		       src_y,
+	      int		       mask_x,
+	      int		       mask_y,
+	      int		       dest_x,
+	      int		       dest_y,
+	      int		       width,
+	      int		       height)
+{
+    pixman_format_code_t src_format, mask_format, dest_format;
+    uint32_t src_flags, mask_flags, dest_flags;
+    pixman_region32_t region;
+    pixman_box32_t *extents;
     uint32_t *src_bits;
     int src_dx, src_dy;
     uint32_t *mask_bits;
     int mask_dx, mask_dy;
     uint32_t *dest_bits;
     int dest_dx, dest_dy;
+    pixman_bool_t need_workaround;
+    const pixman_fast_path_t *info;
+    cache_t *cache;
+    int i;
+
+    src_format = src->common.extended_format_code;
+    src_flags = src->common.flags;
 
-    _pixman_image_validate (src);
     if (mask)
-	_pixman_image_validate (mask);
-    _pixman_image_validate (dest);
+    {
+	mask_format = mask->common.extended_format_code;
+	mask_flags = mask->common.flags;
+    }
+    else
+    {
+	mask_format = PIXMAN_null;
+	mask_flags = FAST_PATH_IS_OPAQUE;
+    }
+
+    dest_format = dest->common.extended_format_code;
+    dest_flags = dest->common.flags;
+
+    /* Check for pixbufs */
+    if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) &&
+	(src->type == BITS && src->bits.bits == mask->bits.bits)	   &&
+	(src->common.repeat == mask->common.repeat)			   &&
+	(src_x == mask_x && src_y == mask_y))
+    {
+	if (src_format == PIXMAN_x8b8g8r8)
+	    src_format = mask_format = PIXMAN_pixbuf;
+	else if (src_format == PIXMAN_x8r8g8b8)
+	    src_format = mask_format = PIXMAN_rpixbuf;
+    }
+
+    /* Check for workaround */
+    need_workaround = (src_flags | mask_flags | dest_flags) & FAST_PATH_NEEDS_WORKAROUND;
+
+    if (need_workaround)
+    {
+	apply_workaround (src, &src_x, &src_y, &src_bits, &src_dx, &src_dy);
+	apply_workaround (mask, &mask_x, &mask_y, &mask_bits, &mask_dx, &mask_dy);
+	apply_workaround (dest, &dest_x, &dest_y, &dest_bits, &dest_dx, &dest_dy);
+    }
+
+    pixman_region32_init (&region);
+    
+    if (!pixman_compute_composite_region32 (
+	    &region, src, mask, dest,
+	    src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height))
+    {
+	goto out;
+    }
     
+    extents = pixman_region32_extents (&region);
+    
+    src_flags |= compute_src_extents_flags (src, extents, dest_x - src_x, dest_y - src_y);
+
+    if (mask)
+	mask_flags |= compute_src_extents_flags (mask, extents, dest_x - mask_x, dest_y - mask_y);
+
     /*
      * Check if we can replace our operator by a simpler one
      * if the src or dest are opaque. The output operator should be
      * mathematically equivalent to the source.
      */
-    op = pixman_optimize_operator(op, src, mask, dest);
-    if (op == PIXMAN_OP_DST		||
-	op == PIXMAN_OP_CONJOINT_DST	||
-	op == PIXMAN_OP_DISJOINT_DST)
+    op = optimize_operator (op, src_flags, mask_flags, dest_flags);
+    if (op == PIXMAN_OP_DST)
+	goto out;
+
+    /* Check cache for fast paths */
+    cache = PIXMAN_GET_THREAD_LOCAL (fast_path_cache);
+
+    for (i = 0; i < N_CACHED_FAST_PATHS; ++i)
     {
-        return;
+	info = &(cache->cache[i]);
+
+	/* Note that we check for equality here, not whether
+	 * the cached fast path matches. This is to prevent
+	 * us from selecting an overly general fast path
+	 * when a more specific one would work.
+	 */
+	if (info->op == op			&&
+	    info->src_format == src_format	&&
+	    info->mask_format == mask_format	&&
+	    info->dest_format == dest_format	&&
+	    info->src_flags == src_flags	&&
+	    info->mask_flags == mask_flags	&&
+	    info->dest_flags == dest_flags	&&
+	    info->func)
+	{
+	    goto found;
+	}
     }
 
-    if (!imp)
-	imp = _pixman_choose_implementation ();
+    while (imp)
+    {
+	info = imp->fast_paths;
 
-    if (src->common.need_workaround)
-	apply_workaround (src, &src_x, &src_y, &src_bits, &src_dx, &src_dy);
-    if (mask && mask->common.need_workaround)
-	apply_workaround (mask, &mask_x, &mask_y, &mask_bits, &mask_dx, &mask_dy);
-    if (dest->common.need_workaround)
-	apply_workaround (dest, &dest_x, &dest_y, &dest_bits, &dest_dx, &dest_dy);
+	while (info->op != PIXMAN_OP_NONE)
+	{
+	    if ((info->op == op || info->op == PIXMAN_OP_any)		&&
+		/* Formats */
+		((info->src_format == src_format) ||
+		 (info->src_format == PIXMAN_any))			&&
+		((info->mask_format == mask_format) ||
+		 (info->mask_format == PIXMAN_any))			&&
+		((info->dest_format == dest_format) ||
+		 (info->dest_format == PIXMAN_any))			&&
+		/* Flags */
+		(info->src_flags & src_flags) == info->src_flags	&&
+		(info->mask_flags & mask_flags) == info->mask_flags	&&
+		(info->dest_flags & dest_flags) == info->dest_flags)
+	    {
+		/* Set i to the last spot in the cache so that the
+		 * move-to-front code below will work
+		 */
+		i = N_CACHED_FAST_PATHS - 1;
+
+		goto found;
+	    }
+
+	    ++info;
+	}
+
+	imp = imp->delegate;
+    }
+
+    /* We didn't find a compositing routine. This should not happen, but if
+     * it somehow does, just exit rather than crash.
+     */
+    goto out;
 
-    _pixman_implementation_composite (imp, op,
-                                      src, mask, dest,
-                                      src_x, src_y,
-                                      mask_x, mask_y,
-                                      dest_x, dest_y,
-                                      width, height);
+found:
+    walk_region_internal (imp, op,
+			  src, mask, dest,
+			  src_x, src_y, mask_x, mask_y,
+			  dest_x, dest_y,
+			  width, height,
+			  (src_flags & FAST_PATH_SIMPLE_REPEAT),
+			  (mask_flags & FAST_PATH_SIMPLE_REPEAT),
+			  &region, info->func);
+
+    if (i)
+    {
+	/* Make a copy of info->func, because info->func may change when
+	 * we update the cache.
+	 */
+	pixman_composite_func_t func = info->func;
+	
+	while (i--)
+	    cache->cache[i + 1] = cache->cache[i];
 
-    if (src->common.need_workaround)
+	cache->cache[0].op = op;
+	cache->cache[0].src_format = src_format;
+	cache->cache[0].src_flags = src_flags;
+	cache->cache[0].mask_format = mask_format;
+	cache->cache[0].mask_flags = mask_flags;
+	cache->cache[0].dest_format = dest_format;
+	cache->cache[0].dest_flags = dest_flags;
+	cache->cache[0].func = func;
+    }
+
+out:
+    if (need_workaround)
+    {
 	unapply_workaround (src, src_bits, src_dx, src_dy);
-    if (mask && mask->common.need_workaround)
 	unapply_workaround (mask, mask_bits, mask_dx, mask_dy);
-    if (dest->common.need_workaround)
 	unapply_workaround (dest, dest_bits, dest_dx, dest_dy);
+    }
+
+    pixman_region32_fini (&region);
+}
+
+PIXMAN_EXPORT void
+pixman_image_composite (pixman_op_t      op,
+                        pixman_image_t * src,
+                        pixman_image_t * mask,
+                        pixman_image_t * dest,
+                        int16_t          src_x,
+                        int16_t          src_y,
+                        int16_t          mask_x,
+                        int16_t          mask_y,
+                        int16_t          dest_x,
+                        int16_t          dest_y,
+                        uint16_t         width,
+                        uint16_t         height)
+{
+    pixman_image_composite32 (op, src, mask, dest, src_x, src_y, 
+                              mask_x, mask_y, dest_x, dest_y, width, height);
+}
+
+/*
+ * Work around GCC bug causing crashes in Mozilla with SSE2
+ *
+ * When using -msse, gcc generates movdqa instructions assuming that
+ * the stack is 16 byte aligned. Unfortunately some applications, such
+ * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
+ * causes the movdqa instructions to fail.
+ *
+ * The __force_align_arg_pointer__ attribute makes gcc generate a
+ * prologue that realigns the stack pointer to 16 bytes.
+ *
+ * On x86-64 this is not necessary because the standard ABI already
+ * calls for a 16 byte aligned stack.
+ *
+ * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
+ */
+#if defined (USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+PIXMAN_EXPORT void
+pixman_image_composite32 (pixman_op_t      op,
+                          pixman_image_t * src,
+                          pixman_image_t * mask,
+                          pixman_image_t * dest,
+                          int32_t          src_x,
+                          int32_t          src_y,
+                          int32_t          mask_x,
+                          int32_t          mask_y,
+                          int32_t          dest_x,
+                          int32_t          dest_y,
+                          int32_t          width,
+                          int32_t          height)
+{
+    _pixman_image_validate (src);
+    if (mask)
+	_pixman_image_validate (mask);
+    _pixman_image_validate (dest);
+
+    if (!imp)
+	imp = _pixman_choose_implementation ();
+
+    do_composite (imp, op,
+		  src, mask, dest,
+		  src_x, src_y,
+		  mask_x, mask_y,
+		  dest_x, dest_y,
+		  width, height);
 }
 
 PIXMAN_EXPORT pixman_bool_t
@@ -323,6 +931,45 @@ pixman_image_fill_rectangles (pixman_op_t                 op,
                               int                         n_rects,
                               const pixman_rectangle16_t *rects)
 {
+    pixman_box32_t stack_boxes[6];
+    pixman_box32_t *boxes;
+    pixman_bool_t result;
+    int i;
+
+    if (n_rects > 6)
+    {
+        boxes = pixman_malloc_ab (sizeof (pixman_box32_t), n_rects);
+        if (boxes == NULL)
+            return FALSE;
+    }
+    else
+    {
+        boxes = stack_boxes;
+    }
+
+    for (i = 0; i < n_rects; ++i)
+    {
+        boxes[i].x1 = rects[i].x;
+        boxes[i].y1 = rects[i].y;
+        boxes[i].x2 = boxes[i].x1 + rects[i].width;
+        boxes[i].y2 = boxes[i].y1 + rects[i].height;
+    }
+
+    result = pixman_image_fill_boxes (op, dest, color, n_rects, boxes);
+
+    if (boxes != stack_boxes)
+        free (boxes);
+    
+    return result;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_fill_boxes (pixman_op_t           op,
+                         pixman_image_t *      dest,
+                         pixman_color_t *      color,
+                         int                   n_boxes,
+                         const pixman_box32_t *boxes)
+{
     pixman_image_t *solid;
     pixman_color_t c;
     int i;
@@ -331,71 +978,69 @@ pixman_image_fill_rectangles (pixman_op_t                 op,
     
     if (color->alpha == 0xffff)
     {
-	if (op == PIXMAN_OP_OVER)
-	    op = PIXMAN_OP_SRC;
+        if (op == PIXMAN_OP_OVER)
+            op = PIXMAN_OP_SRC;
     }
 
     if (op == PIXMAN_OP_CLEAR)
     {
-	c.red = 0;
-	c.green = 0;
-	c.blue = 0;
-	c.alpha = 0;
+        c.red = 0;
+        c.green = 0;
+        c.blue = 0;
+        c.alpha = 0;
 
-	color = &c;
+        color = &c;
 
-	op = PIXMAN_OP_SRC;
+        op = PIXMAN_OP_SRC;
     }
 
     if (op == PIXMAN_OP_SRC)
     {
-	uint32_t pixel;
+        uint32_t pixel;
 
-	if (color_to_pixel (color, &pixel, dest->bits.format))
-	{
-	    for (i = 0; i < n_rects; ++i)
-	    {
-		pixman_region32_t fill_region;
-		int n_boxes, j;
-		pixman_box32_t *boxes;
+        if (color_to_pixel (color, &pixel, dest->bits.format))
+        {
+            pixman_region32_t fill_region;
+            int n_rects, j;
+            pixman_box32_t *rects;
 
-		pixman_region32_init_rect (&fill_region, rects[i].x, rects[i].y, rects[i].width, rects[i].height);
+            if (!pixman_region32_init_rects (&fill_region, boxes, n_boxes))
+                return FALSE;
 
-		if (dest->common.have_clip_region)
-		{
-		    if (!pixman_region32_intersect (&fill_region,
-		                                    &fill_region,
-		                                    &dest->common.clip_region))
-			return FALSE;
-		}
+            if (dest->common.have_clip_region)
+            {
+                if (!pixman_region32_intersect (&fill_region,
+                                                &fill_region,
+                                                &dest->common.clip_region))
+                    return FALSE;
+            }
 
-		boxes = pixman_region32_rectangles (&fill_region, &n_boxes);
-		for (j = 0; j < n_boxes; ++j)
-		{
-		    const pixman_box32_t *box = &(boxes[j]);
-		    pixman_fill (dest->bits.bits, dest->bits.rowstride, PIXMAN_FORMAT_BPP (dest->bits.format),
-		                 box->x1, box->y1, box->x2 - box->x1, box->y2 - box->y1,
-		                 pixel);
-		}
+            rects = pixman_region32_rectangles (&fill_region, &n_rects);
+            for (j = 0; j < n_rects; ++j)
+            {
+                const pixman_box32_t *rect = &(rects[j]);
+                pixman_fill (dest->bits.bits, dest->bits.rowstride, PIXMAN_FORMAT_BPP (dest->bits.format),
+                             rect->x1, rect->y1, rect->x2 - rect->x1, rect->y2 - rect->y1,
+                             pixel);
+            }
 
-		pixman_region32_fini (&fill_region);
-	    }
-	    return TRUE;
-	}
+            pixman_region32_fini (&fill_region);
+            return TRUE;
+        }
     }
 
     solid = pixman_image_create_solid_fill (color);
     if (!solid)
-	return FALSE;
+        return FALSE;
 
-    for (i = 0; i < n_rects; ++i)
+    for (i = 0; i < n_boxes; ++i)
     {
-	const pixman_rectangle16_t *rect = &(rects[i]);
+        const pixman_box32_t *box = &(boxes[i]);
 
-	pixman_image_composite (op, solid, NULL, dest,
-	                        0, 0, 0, 0,
-	                        rect->x, rect->y,
-	                        rect->width, rect->height);
+        pixman_image_composite32 (op, solid, NULL, dest,
+                                  0, 0, 0, 0,
+                                  box->x1, box->y1,
+                                  box->x2 - box->x1, box->y2 - box->y1);
     }
 
     pixman_image_unref (solid);
@@ -541,3 +1186,36 @@ pixman_format_supported_destination (pixman_format_code_t format)
     return pixman_format_supported_source (format);
 }
 
+PIXMAN_EXPORT pixman_bool_t
+pixman_compute_composite_region (pixman_region16_t * region,
+                                 pixman_image_t *    src_image,
+                                 pixman_image_t *    mask_image,
+                                 pixman_image_t *    dst_image,
+                                 int16_t             src_x,
+                                 int16_t             src_y,
+                                 int16_t             mask_x,
+                                 int16_t             mask_y,
+                                 int16_t             dest_x,
+                                 int16_t             dest_y,
+                                 uint16_t            width,
+                                 uint16_t            height)
+{
+    pixman_region32_t r32;
+    pixman_bool_t retval;
+
+    pixman_region32_init (&r32);
+
+    retval = pixman_compute_composite_region32 (
+	&r32, src_image, mask_image, dst_image,
+	src_x, src_y, mask_x, mask_y, dest_x, dest_y,
+	width, height);
+
+    if (retval)
+    {
+	if (!pixman_region16_copy_from_region32 (region, &r32))
+	    retval = FALSE;
+    }
+
+    pixman_region32_fini (&r32);
+    return retval;
+}
diff --git a/lib/pixman/pixman/pixman.h b/lib/pixman/pixman/pixman.h
index 5b90a0c8d..964d04ab9 100644
--- a/lib/pixman/pixman/pixman.h
+++ b/lib/pixman/pixman/pixman.h
@@ -71,12 +71,26 @@ SOFTWARE.
 
 #include <pixman-version.h>
 
+#ifdef  __cplusplus
+#define PIXMAN_BEGIN_DECLS extern "C" {
+#define PIXMAN_END_DECLS }
+#else
+#define PIXMAN_BEGIN_DECLS
+#define PIXMAN_END_DECLS
+#endif
+
+PIXMAN_BEGIN_DECLS
+
 /*
  * Standard integers
  */
-#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || defined (_sgi) || defined (__sun) || defined (sun) || defined (__digital__)
+
+#if !defined (PIXMAN_DONT_DEFINE_STDINT)
+
+#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || defined (_sgi) || defined (__sun) || defined (sun) || defined (__digital__) || defined (__HP_cc)
 #  include <inttypes.h>
-#elif defined (_MSC_VER)
+/* VS 2010 (_MSC_VER 1600) has stdint.h */
+#elif defined (_MSC_VER) && _MSC_VER < 1600
 typedef __int8 int8_t;
 typedef unsigned __int8 uint8_t;
 typedef __int16 int16_t;
@@ -91,6 +105,8 @@ typedef unsigned __int64 uint64_t;
 #  include <stdint.h>
 #endif
 
+#endif
+
 /*
  * Boolean
  */
@@ -109,6 +125,7 @@ typedef pixman_fixed_16_16_t	pixman_fixed_t;
 #define pixman_fixed_e			((pixman_fixed_t) 1)
 #define pixman_fixed_1			(pixman_int_to_fixed(1))
 #define pixman_fixed_1_minus_e		(pixman_fixed_1 - pixman_fixed_e)
+#define pixman_fixed_minus_1		(pixman_int_to_fixed(-1))
 #define pixman_fixed_to_int(f)		((int) ((f) >> 16))
 #define pixman_int_to_fixed(i)		((pixman_fixed_t) ((i) << 16))
 #define pixman_fixed_to_double(f)	(double) ((f) / (double) pixman_fixed_1)
@@ -165,6 +182,7 @@ struct pixman_transform
 
 /* forward declaration (sorry) */
 struct pixman_box16;
+typedef  union pixman_image		pixman_image_t;
 
 void          pixman_transform_init_identity    (struct pixman_transform       *matrix);
 pixman_bool_t pixman_transform_point_3d         (const struct pixman_transform *transform,
@@ -331,10 +349,13 @@ typedef enum
     PIXMAN_OP_HSL_HUE			= 0x3b,
     PIXMAN_OP_HSL_SATURATION		= 0x3c,
     PIXMAN_OP_HSL_COLOR			= 0x3d,
-    PIXMAN_OP_HSL_LUMINOSITY		= 0x3e,
+    PIXMAN_OP_HSL_LUMINOSITY		= 0x3e
 
-    PIXMAN_OP_NONE,
-    PIXMAN_OP_LAST = PIXMAN_OP_NONE
+#ifdef PIXMAN_USE_INTERNAL_API
+    ,
+    PIXMAN_N_OPERATORS,
+    PIXMAN_OP_NONE = PIXMAN_N_OPERATORS
+#endif
 } pixman_op_t;
 
 /*
@@ -390,10 +411,12 @@ void                    pixman_region_init_rect          (pixman_region16_t *reg
 							  unsigned int       width,
 							  unsigned int       height);
 pixman_bool_t           pixman_region_init_rects         (pixman_region16_t *region,
-							  pixman_box16_t    *boxes,
+							  const pixman_box16_t *boxes,
 							  int                count);
 void                    pixman_region_init_with_extents  (pixman_region16_t *region,
 							  pixman_box16_t    *extents);
+void                    pixman_region_init_from_image    (pixman_region16_t *region,
+							  pixman_image_t    *image);
 void                    pixman_region_fini               (pixman_region16_t *region);
 
 
@@ -426,7 +449,7 @@ pixman_bool_t           pixman_region_contains_point     (pixman_region16_t *reg
 							  int                x,
 							  int                y,
 							  pixman_box16_t    *box);
-pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *pixman_region16_t,
+pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region,
 							  pixman_box16_t    *prect);
 pixman_bool_t           pixman_region_not_empty          (pixman_region16_t *region);
 pixman_box16_t *        pixman_region_extents            (pixman_region16_t *region);
@@ -477,10 +500,12 @@ void                    pixman_region32_init_rect          (pixman_region32_t *r
 							    unsigned int       width,
 							    unsigned int       height);
 pixman_bool_t           pixman_region32_init_rects         (pixman_region32_t *region,
-							    pixman_box32_t    *boxes,
+							    const pixman_box32_t *boxes,
 							    int                count);
 void                    pixman_region32_init_with_extents  (pixman_region32_t *region,
 							    pixman_box32_t    *extents);
+void                    pixman_region32_init_from_image    (pixman_region32_t *region,
+							    pixman_image_t    *image);
 void                    pixman_region32_fini               (pixman_region32_t *region);
 
 
@@ -554,7 +579,6 @@ const char*   pixman_version_string     (void);
 /*
  * Images
  */
-typedef  union pixman_image		pixman_image_t;
 typedef struct pixman_indexed		pixman_indexed_t;
 typedef struct pixman_gradient_stop	pixman_gradient_stop_t;
 
@@ -637,11 +661,11 @@ typedef enum {
 /* 24bpp formats */
     PIXMAN_r8g8b8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
     PIXMAN_b8g8r8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
-    
+
 /* 16bpp formats */
     PIXMAN_r5g6b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5),
     PIXMAN_b5g6r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5),
-    
+
     PIXMAN_a1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5),
     PIXMAN_x1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5),
     PIXMAN_a1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5),
@@ -650,35 +674,35 @@ typedef enum {
     PIXMAN_x4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4),
     PIXMAN_a4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4),
     PIXMAN_x4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4),
-    
+
 /* 8bpp formats */
     PIXMAN_a8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0),
     PIXMAN_r3g3b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2),
     PIXMAN_b2g3r3 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2),
     PIXMAN_a2r2g2b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2),
     PIXMAN_a2b2g2r2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2),
-    
+
     PIXMAN_c8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
     PIXMAN_g8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
-    
+
     PIXMAN_x4a4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0),
-    
+
     PIXMAN_x4c4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
     PIXMAN_x4g4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
-    
+
 /* 4bpp formats */
     PIXMAN_a4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0),
     PIXMAN_r1g2b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1),
     PIXMAN_b1g2r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1),
     PIXMAN_a1r1g1b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1),
     PIXMAN_a1b1g1r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1),
-    
+
     PIXMAN_c4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0),
     PIXMAN_g4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0),
-    
+
 /* 1bpp formats */
     PIXMAN_a1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0),
-    
+
     PIXMAN_g1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0),
 
 /* YUV formats */
@@ -719,6 +743,7 @@ pixman_bool_t   pixman_image_unref                   (pixman_image_t
 void		pixman_image_set_destroy_function    (pixman_image_t		   *image,
 						      pixman_image_destroy_func_t   function,
 						      void			   *data);
+void *		pixman_image_get_destroy_data        (pixman_image_t		   *image);
 
 /* Set properties */
 pixman_bool_t   pixman_image_set_clip_region         (pixman_image_t               *image,
@@ -758,6 +783,11 @@ pixman_bool_t	pixman_image_fill_rectangles	     (pixman_op_t		    op,
 						      pixman_color_t		   *color,
 						      int			    n_rects,
 						      const pixman_rectangle16_t   *rects);
+pixman_bool_t   pixman_image_fill_boxes              (pixman_op_t                   op,
+                                                      pixman_image_t               *dest,
+                                                      pixman_color_t               *color,
+                                                      int                           n_boxes,
+                                                      const pixman_box32_t         *boxes);
 
 /* Composite */
 pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
@@ -784,6 +814,18 @@ void          pixman_image_composite          (pixman_op_t        op,
 					       int16_t            dest_y,
 					       uint16_t           width,
 					       uint16_t           height);
+void          pixman_image_composite32        (pixman_op_t        op,
+					       pixman_image_t    *src,
+					       pixman_image_t    *mask,
+					       pixman_image_t    *dest,
+					       int32_t            src_x,
+					       int32_t            src_y,
+					       int32_t            mask_x,
+					       int32_t            mask_y,
+					       int32_t            dest_x,
+					       int32_t            dest_y,
+					       int32_t            width,
+					       int32_t            height);
 
 /* Old X servers rely on out-of-bounds accesses when they are asked
  * to composite with a window as the source. They create a pixman image
@@ -889,4 +931,6 @@ void           pixman_rasterize_trapezoid  (pixman_image_t            *image,
 					    int                        x_off,
 					    int                        y_off);
 
+PIXMAN_END_DECLS
+
 #endif /* PIXMAN_H__ */
diff --git a/lib/pixman/pixman/solaris-hwcap.mapfile b/lib/pixman/pixman/solaris-hwcap.mapfile
index 7f439a95a..3605ca79f 100644
--- a/lib/pixman/pixman/solaris-hwcap.mapfile
+++ b/lib/pixman/pixman/solaris-hwcap.mapfile
@@ -3,29 +3,23 @@
 # Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the
-# "Software"), to deal in the Software without restriction, including
-# without limitation the rights to use, copy, modify, merge, publish,
-# distribute, and/or sell copies of the Software, and to permit persons
-# to whom the Software is furnished to do so, provided that the above
-# copyright notice(s) and this permission notice appear in all copies of
-# the Software and that both the above copyright notice(s) and this
-# permission notice appear in supporting documentation.
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
 #
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
-# OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
-# HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
-# INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
-# FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
-# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
-# WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
 #
-# Except as contained in this notice, the name of a copyright holder
-# shall not be used in advertising or otherwise to promote the sale, use
-# or other dealings in this Software without prior written authorization
-# of the copyright holder.
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
 #
 ###############################################################################
 #
diff --git a/lib/pixman/test/Makefile.am b/lib/pixman/test/Makefile.am
index c56f62de7..841ff8d7d 100644
--- a/lib/pixman/test/Makefile.am
+++ b/lib/pixman/test/Makefile.am
@@ -2,29 +2,44 @@ TEST_LDADD = $(top_builddir)/pixman/libpixman-1.la
 INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman
 
 TESTPROGRAMS =			\
+	a1-trap-test		\
 	region-test		\
-	scaling-test		\
-	blitters-test		\
 	fetch-test		\
 	oob-test		\
 	window-test		\
-	trap-crasher
+	trap-crasher		\
+	alphamap		\
+	blitters-test		\
+	scaling-test		\
+	composite
 
+a1_trap_test_LDADD = $(TEST_LDADD)
 fetch_test_LDADD = $(TEST_LDADD)
-region_test_LDADD = $(TEST_LDADD)
-scaling_test_LDADD = $(TEST_LDADD)
-blitters_test_LDADD = $(TEST_LDADD)
+composite_LDADD = $(TEST_LDADD)
 trap_crasher_LDADD = $(TEST_LDADD)
 oob_test_LDADD = $(TEST_LDADD)
 window_test_LDADD = $(TEST_LDADD)
 
+region_test_LDADD = $(TEST_LDADD)
+region_test_SOURCES = region-test.c utils.c utils.h
+
+blitters_test_LDADD = $(TEST_LDADD)
+blitters_test_SOURCES = blitters-test.c utils.c utils.h
+
+scaling_test_LDADD = $(TEST_LDADD)
+scaling_test_SOURCES = scaling-test.c utils.c utils.h
+
+alphamap_LDADD = $(TEST_LDADD)
+alphamap_SOURCES = alphamap.c utils.c utils.h
+
 # GTK using test programs
 
 if HAVE_GTK
 
 GTK_LDADD = $(TEST_LDADD) $(GTK_LIBS)
+GTK_UTILS = gtk-utils.c gtk-utils.h
 
-TESTPROGRAMS +=			\
+TESTPROGRAMS_GTK =		\
 	clip-test		\
 	clip-in			\
 	composite-test		\
@@ -32,35 +47,38 @@ TESTPROGRAMS +=			\
 	alpha-test		\
 	screen-test		\
 	convolution-test	\
-	trap-test
+	trap-test		\
+	alphamap
 
 INCLUDES += $(GTK_CFLAGS)
 
 gradient_test_LDADD = $(GTK_LDADD)
-gradient_test_SOURCES = gradient-test.c utils.c utils.h
+gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
 
 alpha_test_LDADD = $(GTK_LDADD)
-alpha_test_SOURCES = alpha-test.c utils.c utils.h
+alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
 
 composite_test_LDADD = $(GTK_LDADD)
-composite_test_SOURCES = composite-test.c utils.c utils.h
+composite_test_SOURCES = composite-test.c $(GTK_UTILS)
 
 clip_test_LDADD = $(GTK_LDADD)
-clip_test_SOURCES = clip-test.c utils.c utils.h
+clip_test_SOURCES = clip-test.c $(GTK_UTILS)
 
 clip_in_LDADD = $(GTK_LDADD)
-clip_in_SOURCES = clip-in.c utils.c utils.h
+clip_in_SOURCES = clip-in.c $(GTK_UTILS)
 
 trap_test_LDADD = $(GTK_LDADD)
-trap_test_SOURCES = trap-test.c utils.c utils.h
+trap_test_SOURCES = trap-test.c $(GTK_UTILS)
 
 screen_test_LDADD = $(GTK_LDADD)
-screen_test_SOURCES = screen-test.c utils.c utils.h
+screen_test_SOURCES = screen-test.c $(GTK_UTILS)
 
 convolution_test_LDADD = $(GTK_LDADD)
-convolution_test_SOURCES = convolution-test.c utils.c utils.h
+convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
 
 endif
 
-noinst_PROGRAMS = $(TESTPROGRAMS)
+noinst_PROGRAMS = $(TESTPROGRAMS) $(TESTPROGRAMS_GTK)
+
+TESTS = $(TESTPROGRAMS)
 
diff --git a/lib/pixman/test/Makefile.in b/lib/pixman/test/Makefile.in
index f270165db..3991ea13d 100644
--- a/lib/pixman/test/Makefile.in
+++ b/lib/pixman/test/Makefile.in
@@ -36,18 +36,8 @@ PRE_UNINSTALL = :
 POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
-@HAVE_GTK_TRUE@am__append_1 = \
-@HAVE_GTK_TRUE@	clip-test		\
-@HAVE_GTK_TRUE@	clip-in			\
-@HAVE_GTK_TRUE@	composite-test		\
-@HAVE_GTK_TRUE@	gradient-test		\
-@HAVE_GTK_TRUE@	alpha-test		\
-@HAVE_GTK_TRUE@	screen-test		\
-@HAVE_GTK_TRUE@	convolution-test	\
-@HAVE_GTK_TRUE@	trap-test
-
-@HAVE_GTK_TRUE@am__append_2 = $(GTK_CFLAGS)
-noinst_PROGRAMS = $(am__EXEEXT_2)
+@HAVE_GTK_TRUE@am__append_1 = $(GTK_CFLAGS)
+noinst_PROGRAMS = $(am__EXEEXT_1) $(am__EXEEXT_2)
 subdir = test
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -57,73 +47,88 @@ am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
 mkinstalldirs = $(SHELL) $(install_sh) -d
 CONFIG_HEADER = $(top_builddir)/config.h
 CONFIG_CLEAN_FILES =
-@HAVE_GTK_TRUE@am__EXEEXT_1 = clip-test$(EXEEXT) clip-in$(EXEEXT) \
+am__EXEEXT_1 = a1-trap-test$(EXEEXT) region-test$(EXEEXT) \
+	fetch-test$(EXEEXT) oob-test$(EXEEXT) window-test$(EXEEXT) \
+	trap-crasher$(EXEEXT) alphamap$(EXEEXT) blitters-test$(EXEEXT) \
+	scaling-test$(EXEEXT) composite$(EXEEXT)
+@HAVE_GTK_TRUE@am__EXEEXT_2 = clip-test$(EXEEXT) clip-in$(EXEEXT) \
 @HAVE_GTK_TRUE@	composite-test$(EXEEXT) gradient-test$(EXEEXT) \
 @HAVE_GTK_TRUE@	alpha-test$(EXEEXT) screen-test$(EXEEXT) \
-@HAVE_GTK_TRUE@	convolution-test$(EXEEXT) trap-test$(EXEEXT)
-am__EXEEXT_2 = region-test$(EXEEXT) scaling-test$(EXEEXT) \
-	blitters-test$(EXEEXT) fetch-test$(EXEEXT) oob-test$(EXEEXT) \
-	window-test$(EXEEXT) trap-crasher$(EXEEXT) $(am__EXEEXT_1)
+@HAVE_GTK_TRUE@	convolution-test$(EXEEXT) trap-test$(EXEEXT) \
+@HAVE_GTK_TRUE@	alphamap$(EXEEXT)
 PROGRAMS = $(noinst_PROGRAMS)
-am__alpha_test_SOURCES_DIST = alpha-test.c utils.c utils.h
+a1_trap_test_SOURCES = a1-trap-test.c
+a1_trap_test_OBJECTS = a1-trap-test.$(OBJEXT)
+am__DEPENDENCIES_1 = $(top_builddir)/pixman/libpixman-1.la
+a1_trap_test_DEPENDENCIES = $(am__DEPENDENCIES_1)
+am__alpha_test_SOURCES_DIST = alpha-test.c gtk-utils.c gtk-utils.h
+@HAVE_GTK_TRUE@am__objects_1 = gtk-utils.$(OBJEXT)
 @HAVE_GTK_TRUE@am_alpha_test_OBJECTS = alpha-test.$(OBJEXT) \
-@HAVE_GTK_TRUE@	utils.$(OBJEXT)
+@HAVE_GTK_TRUE@	$(am__objects_1)
 alpha_test_OBJECTS = $(am_alpha_test_OBJECTS)
-am__DEPENDENCIES_1 = $(top_builddir)/pixman/libpixman-1.la
 am__DEPENDENCIES_2 =
 @HAVE_GTK_TRUE@am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1) \
 @HAVE_GTK_TRUE@	$(am__DEPENDENCIES_2)
 @HAVE_GTK_TRUE@alpha_test_DEPENDENCIES = $(am__DEPENDENCIES_3)
-blitters_test_SOURCES = blitters-test.c
-blitters_test_OBJECTS = blitters-test.$(OBJEXT)
+am_alphamap_OBJECTS = alphamap.$(OBJEXT) utils.$(OBJEXT)
+alphamap_OBJECTS = $(am_alphamap_OBJECTS)
+alphamap_DEPENDENCIES = $(am__DEPENDENCIES_1)
+am_blitters_test_OBJECTS = blitters-test.$(OBJEXT) utils.$(OBJEXT)
+blitters_test_OBJECTS = $(am_blitters_test_OBJECTS)
 blitters_test_DEPENDENCIES = $(am__DEPENDENCIES_1)
-am__clip_in_SOURCES_DIST = clip-in.c utils.c utils.h
-@HAVE_GTK_TRUE@am_clip_in_OBJECTS = clip-in.$(OBJEXT) utils.$(OBJEXT)
+am__clip_in_SOURCES_DIST = clip-in.c gtk-utils.c gtk-utils.h
+@HAVE_GTK_TRUE@am_clip_in_OBJECTS = clip-in.$(OBJEXT) $(am__objects_1)
 clip_in_OBJECTS = $(am_clip_in_OBJECTS)
 @HAVE_GTK_TRUE@clip_in_DEPENDENCIES = $(am__DEPENDENCIES_3)
-am__clip_test_SOURCES_DIST = clip-test.c utils.c utils.h
+am__clip_test_SOURCES_DIST = clip-test.c gtk-utils.c gtk-utils.h
 @HAVE_GTK_TRUE@am_clip_test_OBJECTS = clip-test.$(OBJEXT) \
-@HAVE_GTK_TRUE@	utils.$(OBJEXT)
+@HAVE_GTK_TRUE@	$(am__objects_1)
 clip_test_OBJECTS = $(am_clip_test_OBJECTS)
 @HAVE_GTK_TRUE@clip_test_DEPENDENCIES = $(am__DEPENDENCIES_3)
-am__composite_test_SOURCES_DIST = composite-test.c utils.c utils.h
+composite_SOURCES = composite.c
+composite_OBJECTS = composite.$(OBJEXT)
+composite_DEPENDENCIES = $(am__DEPENDENCIES_1)
+am__composite_test_SOURCES_DIST = composite-test.c gtk-utils.c \
+	gtk-utils.h
 @HAVE_GTK_TRUE@am_composite_test_OBJECTS = composite-test.$(OBJEXT) \
-@HAVE_GTK_TRUE@	utils.$(OBJEXT)
+@HAVE_GTK_TRUE@	$(am__objects_1)
 composite_test_OBJECTS = $(am_composite_test_OBJECTS)
 @HAVE_GTK_TRUE@composite_test_DEPENDENCIES = $(am__DEPENDENCIES_3)
-am__convolution_test_SOURCES_DIST = convolution-test.c utils.c utils.h
+am__convolution_test_SOURCES_DIST = convolution-test.c gtk-utils.c \
+	gtk-utils.h
 @HAVE_GTK_TRUE@am_convolution_test_OBJECTS =  \
-@HAVE_GTK_TRUE@	convolution-test.$(OBJEXT) utils.$(OBJEXT)
+@HAVE_GTK_TRUE@	convolution-test.$(OBJEXT) $(am__objects_1)
 convolution_test_OBJECTS = $(am_convolution_test_OBJECTS)
 @HAVE_GTK_TRUE@convolution_test_DEPENDENCIES = $(am__DEPENDENCIES_3)
 fetch_test_SOURCES = fetch-test.c
 fetch_test_OBJECTS = fetch-test.$(OBJEXT)
 fetch_test_DEPENDENCIES = $(am__DEPENDENCIES_1)
-am__gradient_test_SOURCES_DIST = gradient-test.c utils.c utils.h
+am__gradient_test_SOURCES_DIST = gradient-test.c gtk-utils.c \
+	gtk-utils.h
 @HAVE_GTK_TRUE@am_gradient_test_OBJECTS = gradient-test.$(OBJEXT) \
-@HAVE_GTK_TRUE@	utils.$(OBJEXT)
+@HAVE_GTK_TRUE@	$(am__objects_1)
 gradient_test_OBJECTS = $(am_gradient_test_OBJECTS)
 @HAVE_GTK_TRUE@gradient_test_DEPENDENCIES = $(am__DEPENDENCIES_3)
 oob_test_SOURCES = oob-test.c
 oob_test_OBJECTS = oob-test.$(OBJEXT)
 oob_test_DEPENDENCIES = $(am__DEPENDENCIES_1)
-region_test_SOURCES = region-test.c
-region_test_OBJECTS = region-test.$(OBJEXT)
+am_region_test_OBJECTS = region-test.$(OBJEXT) utils.$(OBJEXT)
+region_test_OBJECTS = $(am_region_test_OBJECTS)
 region_test_DEPENDENCIES = $(am__DEPENDENCIES_1)
-scaling_test_SOURCES = scaling-test.c
-scaling_test_OBJECTS = scaling-test.$(OBJEXT)
+am_scaling_test_OBJECTS = scaling-test.$(OBJEXT) utils.$(OBJEXT)
+scaling_test_OBJECTS = $(am_scaling_test_OBJECTS)
 scaling_test_DEPENDENCIES = $(am__DEPENDENCIES_1)
-am__screen_test_SOURCES_DIST = screen-test.c utils.c utils.h
+am__screen_test_SOURCES_DIST = screen-test.c gtk-utils.c gtk-utils.h
 @HAVE_GTK_TRUE@am_screen_test_OBJECTS = screen-test.$(OBJEXT) \
-@HAVE_GTK_TRUE@	utils.$(OBJEXT)
+@HAVE_GTK_TRUE@	$(am__objects_1)
 screen_test_OBJECTS = $(am_screen_test_OBJECTS)
 @HAVE_GTK_TRUE@screen_test_DEPENDENCIES = $(am__DEPENDENCIES_3)
 trap_crasher_SOURCES = trap-crasher.c
 trap_crasher_OBJECTS = trap-crasher.$(OBJEXT)
 trap_crasher_DEPENDENCIES = $(am__DEPENDENCIES_1)
-am__trap_test_SOURCES_DIST = trap-test.c utils.c utils.h
+am__trap_test_SOURCES_DIST = trap-test.c gtk-utils.c gtk-utils.h
 @HAVE_GTK_TRUE@am_trap_test_OBJECTS = trap-test.$(OBJEXT) \
-@HAVE_GTK_TRUE@	utils.$(OBJEXT)
+@HAVE_GTK_TRUE@	$(am__objects_1)
 trap_test_OBJECTS = $(am_trap_test_OBJECTS)
 @HAVE_GTK_TRUE@trap_test_DEPENDENCIES = $(am__DEPENDENCIES_3)
 window_test_SOURCES = window-test.c
@@ -140,18 +145,21 @@ LTCOMPILE = $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) \
 CCLD = $(CC)
 LINK = $(LIBTOOL) --tag=CC --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
 	$(AM_LDFLAGS) $(LDFLAGS) -o $@
-SOURCES = $(alpha_test_SOURCES) blitters-test.c $(clip_in_SOURCES) \
-	$(clip_test_SOURCES) $(composite_test_SOURCES) \
+SOURCES = a1-trap-test.c $(alpha_test_SOURCES) $(alphamap_SOURCES) \
+	$(blitters_test_SOURCES) $(clip_in_SOURCES) \
+	$(clip_test_SOURCES) composite.c $(composite_test_SOURCES) \
 	$(convolution_test_SOURCES) fetch-test.c \
-	$(gradient_test_SOURCES) oob-test.c region-test.c \
-	scaling-test.c $(screen_test_SOURCES) trap-crasher.c \
+	$(gradient_test_SOURCES) oob-test.c $(region_test_SOURCES) \
+	$(scaling_test_SOURCES) $(screen_test_SOURCES) trap-crasher.c \
 	$(trap_test_SOURCES) window-test.c
-DIST_SOURCES = $(am__alpha_test_SOURCES_DIST) blitters-test.c \
+DIST_SOURCES = a1-trap-test.c $(am__alpha_test_SOURCES_DIST) \
+	$(alphamap_SOURCES) $(blitters_test_SOURCES) \
 	$(am__clip_in_SOURCES_DIST) $(am__clip_test_SOURCES_DIST) \
-	$(am__composite_test_SOURCES_DIST) \
+	composite.c $(am__composite_test_SOURCES_DIST) \
 	$(am__convolution_test_SOURCES_DIST) fetch-test.c \
-	$(am__gradient_test_SOURCES_DIST) oob-test.c region-test.c \
-	scaling-test.c $(am__screen_test_SOURCES_DIST) trap-crasher.c \
+	$(am__gradient_test_SOURCES_DIST) oob-test.c \
+	$(region_test_SOURCES) $(scaling_test_SOURCES) \
+	$(am__screen_test_SOURCES_DIST) trap-crasher.c \
 	$(am__trap_test_SOURCES_DIST) window-test.c
 ETAGS = etags
 CTAGS = ctags
@@ -161,13 +169,13 @@ AMDEP_FALSE = @AMDEP_FALSE@
 AMDEP_TRUE = @AMDEP_TRUE@
 AMTAR = @AMTAR@
 AR = @AR@
-ARM_NEON_CFLAGS = @ARM_NEON_CFLAGS@
-ARM_SIMD_CFLAGS = @ARM_SIMD_CFLAGS@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
 AUTOMAKE = @AUTOMAKE@
 AWK = @AWK@
 CC = @CC@
+CCAS = @CCAS@
+CCASFLAGS = @CCASFLAGS@
 CCDEPMODE = @CCDEPMODE@
 CFLAGS = @CFLAGS@
 CPP = @CPP@
@@ -195,6 +203,7 @@ GTK_CFLAGS = @GTK_CFLAGS@
 GTK_LIBS = @GTK_LIBS@
 HAVE_GTK_FALSE = @HAVE_GTK_FALSE@
 HAVE_GTK_TRUE = @HAVE_GTK_TRUE@
+HAVE_PTHREAD_SETSPECIFIC = @HAVE_PTHREAD_SETSPECIFIC@
 INSTALL_DATA = @INSTALL_DATA@
 INSTALL_PROGRAM = @INSTALL_PROGRAM@
 INSTALL_SCRIPT = @INSTALL_SCRIPT@
@@ -224,6 +233,7 @@ PIXMAN_VERSION_MAJOR = @PIXMAN_VERSION_MAJOR@
 PIXMAN_VERSION_MICRO = @PIXMAN_VERSION_MICRO@
 PIXMAN_VERSION_MINOR = @PIXMAN_VERSION_MINOR@
 PKG_CONFIG = @PKG_CONFIG@
+PTHREAD_LDFLAGS = @PTHREAD_LDFLAGS@
 RANLIB = @RANLIB@
 SED = @SED@
 SET_MAKE = @SET_MAKE@
@@ -231,6 +241,9 @@ SHELL = @SHELL@
 SSE2_CFLAGS = @SSE2_CFLAGS@
 SSE2_LDFLAGS = @SSE2_LDFLAGS@
 STRIP = @STRIP@
+STUBS_CFLAGS = @STUBS_CFLAGS@
+STUBS_LIBS = @STUBS_LIBS@
+TOOLCHAIN_SUPPORTS__THREAD = @TOOLCHAIN_SUPPORTS__THREAD@
 USE_ARM_NEON_FALSE = @USE_ARM_NEON_FALSE@
 USE_ARM_NEON_TRUE = @USE_ARM_NEON_TRUE@
 USE_ARM_SIMD_FALSE = @USE_ARM_SIMD_FALSE@
@@ -294,35 +307,65 @@ sysconfdir = @sysconfdir@
 target_alias = @target_alias@
 TEST_LDADD = $(top_builddir)/pixman/libpixman-1.la
 INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman \
-	$(am__append_2)
-TESTPROGRAMS = region-test scaling-test blitters-test fetch-test \
-	oob-test window-test trap-crasher $(am__append_1)
+	$(am__append_1)
+TESTPROGRAMS = \
+	a1-trap-test		\
+	region-test		\
+	fetch-test		\
+	oob-test		\
+	window-test		\
+	trap-crasher		\
+	alphamap		\
+	blitters-test		\
+	scaling-test		\
+	composite
+
+a1_trap_test_LDADD = $(TEST_LDADD)
 fetch_test_LDADD = $(TEST_LDADD)
-region_test_LDADD = $(TEST_LDADD)
-scaling_test_LDADD = $(TEST_LDADD)
-blitters_test_LDADD = $(TEST_LDADD)
+composite_LDADD = $(TEST_LDADD)
 trap_crasher_LDADD = $(TEST_LDADD)
 oob_test_LDADD = $(TEST_LDADD)
 window_test_LDADD = $(TEST_LDADD)
+region_test_LDADD = $(TEST_LDADD)
+region_test_SOURCES = region-test.c utils.c utils.h
+blitters_test_LDADD = $(TEST_LDADD)
+blitters_test_SOURCES = blitters-test.c utils.c utils.h
+scaling_test_LDADD = $(TEST_LDADD)
+scaling_test_SOURCES = scaling-test.c utils.c utils.h
+alphamap_LDADD = $(TEST_LDADD)
+alphamap_SOURCES = alphamap.c utils.c utils.h
 
 # GTK using test programs
 @HAVE_GTK_TRUE@GTK_LDADD = $(TEST_LDADD) $(GTK_LIBS)
+@HAVE_GTK_TRUE@GTK_UTILS = gtk-utils.c gtk-utils.h
+@HAVE_GTK_TRUE@TESTPROGRAMS_GTK = \
+@HAVE_GTK_TRUE@	clip-test		\
+@HAVE_GTK_TRUE@	clip-in			\
+@HAVE_GTK_TRUE@	composite-test		\
+@HAVE_GTK_TRUE@	gradient-test		\
+@HAVE_GTK_TRUE@	alpha-test		\
+@HAVE_GTK_TRUE@	screen-test		\
+@HAVE_GTK_TRUE@	convolution-test	\
+@HAVE_GTK_TRUE@	trap-test		\
+@HAVE_GTK_TRUE@	alphamap
+
 @HAVE_GTK_TRUE@gradient_test_LDADD = $(GTK_LDADD)
-@HAVE_GTK_TRUE@gradient_test_SOURCES = gradient-test.c utils.c utils.h
+@HAVE_GTK_TRUE@gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@alpha_test_LDADD = $(GTK_LDADD)
-@HAVE_GTK_TRUE@alpha_test_SOURCES = alpha-test.c utils.c utils.h
+@HAVE_GTK_TRUE@alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@composite_test_LDADD = $(GTK_LDADD)
-@HAVE_GTK_TRUE@composite_test_SOURCES = composite-test.c utils.c utils.h
+@HAVE_GTK_TRUE@composite_test_SOURCES = composite-test.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@clip_test_LDADD = $(GTK_LDADD)
-@HAVE_GTK_TRUE@clip_test_SOURCES = clip-test.c utils.c utils.h
+@HAVE_GTK_TRUE@clip_test_SOURCES = clip-test.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@clip_in_LDADD = $(GTK_LDADD)
-@HAVE_GTK_TRUE@clip_in_SOURCES = clip-in.c utils.c utils.h
+@HAVE_GTK_TRUE@clip_in_SOURCES = clip-in.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@trap_test_LDADD = $(GTK_LDADD)
-@HAVE_GTK_TRUE@trap_test_SOURCES = trap-test.c utils.c utils.h
+@HAVE_GTK_TRUE@trap_test_SOURCES = trap-test.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@screen_test_LDADD = $(GTK_LDADD)
-@HAVE_GTK_TRUE@screen_test_SOURCES = screen-test.c utils.c utils.h
+@HAVE_GTK_TRUE@screen_test_SOURCES = screen-test.c $(GTK_UTILS)
 @HAVE_GTK_TRUE@convolution_test_LDADD = $(GTK_LDADD)
-@HAVE_GTK_TRUE@convolution_test_SOURCES = convolution-test.c utils.c utils.h
+@HAVE_GTK_TRUE@convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
+TESTS = $(TESTPROGRAMS)
 all: all-am
 
 .SUFFIXES:
@@ -336,9 +379,9 @@ $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	      exit 1;; \
 	  esac; \
 	done; \
-	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu  test/Makefile'; \
+	echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign  test/Makefile'; \
 	cd $(top_srcdir) && \
-	  $(AUTOMAKE) --gnu  test/Makefile
+	  $(AUTOMAKE) --foreign  test/Makefile
 .PRECIOUS: Makefile
 Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
 	@case '$?' in \
@@ -363,9 +406,15 @@ clean-noinstPROGRAMS:
 	  echo " rm -f $$p $$f"; \
 	  rm -f $$p $$f ; \
 	done
+a1-trap-test$(EXEEXT): $(a1_trap_test_OBJECTS) $(a1_trap_test_DEPENDENCIES) 
+	@rm -f a1-trap-test$(EXEEXT)
+	$(LINK) $(a1_trap_test_LDFLAGS) $(a1_trap_test_OBJECTS) $(a1_trap_test_LDADD) $(LIBS)
 alpha-test$(EXEEXT): $(alpha_test_OBJECTS) $(alpha_test_DEPENDENCIES) 
 	@rm -f alpha-test$(EXEEXT)
 	$(LINK) $(alpha_test_LDFLAGS) $(alpha_test_OBJECTS) $(alpha_test_LDADD) $(LIBS)
+alphamap$(EXEEXT): $(alphamap_OBJECTS) $(alphamap_DEPENDENCIES) 
+	@rm -f alphamap$(EXEEXT)
+	$(LINK) $(alphamap_LDFLAGS) $(alphamap_OBJECTS) $(alphamap_LDADD) $(LIBS)
 blitters-test$(EXEEXT): $(blitters_test_OBJECTS) $(blitters_test_DEPENDENCIES) 
 	@rm -f blitters-test$(EXEEXT)
 	$(LINK) $(blitters_test_LDFLAGS) $(blitters_test_OBJECTS) $(blitters_test_LDADD) $(LIBS)
@@ -375,6 +424,9 @@ clip-in$(EXEEXT): $(clip_in_OBJECTS) $(clip_in_DEPENDENCIES)
 clip-test$(EXEEXT): $(clip_test_OBJECTS) $(clip_test_DEPENDENCIES) 
 	@rm -f clip-test$(EXEEXT)
 	$(LINK) $(clip_test_LDFLAGS) $(clip_test_OBJECTS) $(clip_test_LDADD) $(LIBS)
+composite$(EXEEXT): $(composite_OBJECTS) $(composite_DEPENDENCIES) 
+	@rm -f composite$(EXEEXT)
+	$(LINK) $(composite_LDFLAGS) $(composite_OBJECTS) $(composite_LDADD) $(LIBS)
 composite-test$(EXEEXT): $(composite_test_OBJECTS) $(composite_test_DEPENDENCIES) 
 	@rm -f composite-test$(EXEEXT)
 	$(LINK) $(composite_test_LDFLAGS) $(composite_test_OBJECTS) $(composite_test_LDADD) $(LIBS)
@@ -415,14 +467,18 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/a1-trap-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alpha-test.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alphamap.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blitters-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/clip-in.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/clip-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/composite-test.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/composite.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/convolution-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fetch-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gradient-test.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gtk-utils.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oob-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/region-test.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scaling-test.Po@am__quote@
@@ -511,6 +567,79 @@ GTAGS:
 distclean-tags:
 	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
 
+check-TESTS: $(TESTS)
+	@failed=0; all=0; xfail=0; xpass=0; skip=0; \
+	srcdir=$(srcdir); export srcdir; \
+	list='$(TESTS)'; \
+	if test -n "$$list"; then \
+	  for tst in $$list; do \
+	    if test -f ./$$tst; then dir=./; \
+	    elif test -f $$tst; then dir=; \
+	    else dir="$(srcdir)/"; fi; \
+	    if $(TESTS_ENVIRONMENT) $${dir}$$tst; then \
+	      all=`expr $$all + 1`; \
+	      case " $(XFAIL_TESTS) " in \
+	      *" $$tst "*) \
+		xpass=`expr $$xpass + 1`; \
+		failed=`expr $$failed + 1`; \
+		echo "XPASS: $$tst"; \
+	      ;; \
+	      *) \
+		echo "PASS: $$tst"; \
+	      ;; \
+	      esac; \
+	    elif test $$? -ne 77; then \
+	      all=`expr $$all + 1`; \
+	      case " $(XFAIL_TESTS) " in \
+	      *" $$tst "*) \
+		xfail=`expr $$xfail + 1`; \
+		echo "XFAIL: $$tst"; \
+	      ;; \
+	      *) \
+		failed=`expr $$failed + 1`; \
+		echo "FAIL: $$tst"; \
+	      ;; \
+	      esac; \
+	    else \
+	      skip=`expr $$skip + 1`; \
+	      echo "SKIP: $$tst"; \
+	    fi; \
+	  done; \
+	  if test "$$failed" -eq 0; then \
+	    if test "$$xfail" -eq 0; then \
+	      banner="All $$all tests passed"; \
+	    else \
+	      banner="All $$all tests behaved as expected ($$xfail expected failures)"; \
+	    fi; \
+	  else \
+	    if test "$$xpass" -eq 0; then \
+	      banner="$$failed of $$all tests failed"; \
+	    else \
+	      banner="$$failed of $$all tests did not behave as expected ($$xpass unexpected passes)"; \
+	    fi; \
+	  fi; \
+	  dashes="$$banner"; \
+	  skipped=""; \
+	  if test "$$skip" -ne 0; then \
+	    skipped="($$skip tests were not run)"; \
+	    test `echo "$$skipped" | wc -c` -le `echo "$$banner" | wc -c` || \
+	      dashes="$$skipped"; \
+	  fi; \
+	  report=""; \
+	  if test "$$failed" -ne 0 && test -n "$(PACKAGE_BUGREPORT)"; then \
+	    report="Please report to $(PACKAGE_BUGREPORT)"; \
+	    test `echo "$$report" | wc -c` -le `echo "$$banner" | wc -c` || \
+	      dashes="$$report"; \
+	  fi; \
+	  dashes=`echo "$$dashes" | sed s/./=/g`; \
+	  echo "$$dashes"; \
+	  echo "$$banner"; \
+	  test -z "$$skipped" || echo "$$skipped"; \
+	  test -z "$$report" || echo "$$report"; \
+	  echo "$$dashes"; \
+	  test "$$failed" -eq 0; \
+	else :; fi
+
 distdir: $(DISTFILES)
 	@srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
 	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
@@ -539,6 +668,7 @@ distdir: $(DISTFILES)
 	  fi; \
 	done
 check-am: all-am
+	$(MAKE) $(AM_MAKEFLAGS) check-TESTS
 check: check-am
 all-am: Makefile $(PROGRAMS)
 installdirs:
@@ -617,17 +747,17 @@ ps-am:
 
 uninstall-am: uninstall-info-am
 
-.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
-	clean-libtool clean-noinstPROGRAMS ctags distclean \
-	distclean-compile distclean-generic distclean-libtool \
-	distclean-tags distdir dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am install-exec \
-	install-exec-am install-info install-info-am install-man \
-	install-strip installcheck installcheck-am installdirs \
-	maintainer-clean maintainer-clean-generic mostlyclean \
-	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
-	pdf pdf-am ps ps-am tags uninstall uninstall-am \
-	uninstall-info-am
+.PHONY: CTAGS GTAGS all all-am check check-TESTS check-am clean \
+	clean-generic clean-libtool clean-noinstPROGRAMS ctags \
+	distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-exec install-exec-am install-info \
+	install-info-am install-man install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags uninstall uninstall-am uninstall-info-am
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/lib/pixman/test/a1-trap-test.c b/lib/pixman/test/a1-trap-test.c
new file mode 100644
index 000000000..6163e7c61
--- /dev/null
+++ b/lib/pixman/test/a1-trap-test.c
@@ -0,0 +1,50 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 20
+#define HEIGHT 20
+    /* Rasterizes one trapezoid into an a1 mask, composites red through it, checks four pixels. */
+    pixman_image_t *src_img;
+    pixman_image_t *mask_img;
+    pixman_image_t *dest_img;
+    pixman_trap_t trap;
+    pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);	/* NOTE(review): neither malloc result is checked */
+    uint32_t *mbits = malloc (WIDTH * HEIGHT);
+    /* Mask bytes start at 0; every destination word starts as 0xffffffff. */
+    memset (mbits, 0, WIDTH * HEIGHT);
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    /* Both trapezoid edges have l = 0.5 and r = 1.5; top is at y = 0.5, bottom at y = 1.5. */
+    trap.top.l = pixman_double_to_fixed (0.5);
+    trap.top.r = pixman_double_to_fixed (1.5);
+    trap.top.y = pixman_double_to_fixed (0.5);
+
+    trap.bot.l = pixman_double_to_fixed (0.5);
+    trap.bot.r = pixman_double_to_fixed (1.5);
+    trap.bot.y = pixman_double_to_fixed (1.5);
+    /* Wrap the buffers: a1 mask over mbits, solid red source, a8r8g8b8 destination over bits. */
+    mask_img = pixman_image_create_bits (
+	PIXMAN_a1, WIDTH, HEIGHT, mbits, WIDTH);
+    src_img = pixman_image_create_solid_fill (&red);
+    dest_img = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    /* Add the single trapezoid to the mask with offsets (0, 0). */
+    pixman_add_traps (mask_img, 0, 0, 1, &trap);
+    /* OVER-composite the red source through the mask onto the full WIDTH x HEIGHT destination. */
+    pixman_image_composite (PIXMAN_OP_OVER,
+			    src_img, mask_img, dest_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    /* Only pixel (0, 0) may have turned opaque red; its three neighbours must still be 0xffffffff. */
+    assert (bits[0] == 0xffff0000);
+    assert (bits[1] == 0xffffffff);
+    assert (bits[1 * WIDTH + 0] == 0xffffffff);
+    assert (bits[1 * WIDTH + 1] == 0xffffffff);
+    /* NOTE(review): the assert () checks compile away under NDEBUG; images and buffers are not released. */
+    return 0;
+}
diff --git a/lib/pixman/test/alpha-test.c b/lib/pixman/test/alpha-test.c
index e2b97c789..92c208142 100644
--- a/lib/pixman/test/alpha-test.c
+++ b/lib/pixman/test/alpha-test.c
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "pixman.h"
-#include "utils.h"
+#include "gtk-utils.h"
 
 int
 main (int argc, char **argv)
@@ -14,7 +14,6 @@ main (int argc, char **argv)
     uint32_t *src = malloc (WIDTH * HEIGHT * 4);
     pixman_image_t *grad_img;
     pixman_image_t *alpha_img;
-    pixman_image_t *solid_img;
     pixman_image_t *dest_img;
     pixman_image_t *src_img;
     int i;
@@ -26,24 +25,25 @@ main (int argc, char **argv)
     pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 };
     pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH),
 				pixman_int_to_fixed (0) };
+#if 0
     pixman_transform_t trans = {
 	{ { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
 	  { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
 	  { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
 	}
     };
-
-    pixman_transform_t id = {
+#else
+    pixman_transform_t trans = {
 	{ { pixman_fixed_1, 0, 0 },
 	  { 0, pixman_fixed_1, 0 },
 	  { 0, 0, pixman_fixed_1 } }
     };
+#endif
 
     pixman_point_fixed_t c_inner;
     pixman_point_fixed_t c_outer;
     pixman_fixed_t r_inner;
     pixman_fixed_t r_outer;
-    pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
     
     for (i = 0; i < WIDTH * HEIGHT; ++i)
 	alpha[i] = 0x4f00004f; /* pale blue */
@@ -91,7 +91,7 @@ main (int argc, char **argv)
     grad_img = pixman_image_create_linear_gradient  (&p1, &p2,
 						    stops, 2);
 
-    pixman_image_set_transform (grad_img, &id);
+    pixman_image_set_transform (grad_img, &trans);
     pixman_image_set_repeat (grad_img, PIXMAN_REPEAT_PAD);
     
     pixman_image_composite (PIXMAN_OP_OVER, grad_img, NULL, alpha_img,
diff --git a/lib/pixman/test/alphamap.c b/lib/pixman/test/alphamap.c
new file mode 100644
index 000000000..e6a25efcb
--- /dev/null
+++ b/lib/pixman/test/alphamap.c
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+#define WIDTH 400
+#define HEIGHT 200
+
+int
+main (int argc, char **argv)
+{
+    uint8_t *alpha = make_random_bytes (WIDTH * HEIGHT);
+    uint32_t *src = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * 4);
+    uint32_t *dest = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * 4);
+    int i;
+    /* a: a8 image over the alpha bytes; d: a8r8g8b8 image over dest. */
+    pixman_image_t *a = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, (uint32_t *)alpha, WIDTH);
+    pixman_image_t *d = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4);
+    /* Pass 0 views src as a8r8g8b8; pass 1 views the same buffer as a2r10g10b10. */
+    for (i = 0; i < 2; ++i)
+    {
+	pixman_format_code_t sformat = (i == 0)? PIXMAN_a8r8g8b8 : PIXMAN_a2r10g10b10;
+	pixman_image_t *s = pixman_image_create_bits (sformat, WIDTH, HEIGHT, src, WIDTH * 4);
+	int j, k;
+	/* Attach a as the alpha map of s with offsets (0, 0). */
+	pixman_image_set_alpha_map (s, a, 0, 0);
+	/* Composite s onto d with the SRC operator over the full WIDTH x HEIGHT area. */
+	pixman_image_composite (PIXMAN_OP_SRC, s, NULL, d, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+	/* The top byte of every destination pixel must now equal the matching alpha-map byte. */
+	for (j = 0; j < HEIGHT; ++j)
+	{
+	    for (k = 0; k < WIDTH; ++k)
+	    {
+		uint8_t ap = ((uint8_t *)alpha)[j * WIDTH + k];
+		uint32_t dap = (dest[j * WIDTH + k] >> 24);
+		uint32_t sap = (src[j * WIDTH + k] >> 24);
+		/* sap (top 8 bits of the source word) is only reported, never compared. */
+		if (ap != dap)
+		{
+		    printf ("Wrong alpha value at (%d, %d). Should be %d; got %d (src was %d)\n", k, j, ap, dap, sap);
+		    return 1;
+		}
+	    }
+	}
+
+	pixman_image_unref (s);
+    }
+    /* NOTE(review): a, d and the three buffers are not released before returning. */
+    return 0;
+}
diff --git a/lib/pixman/test/blitters-test.c b/lib/pixman/test/blitters-test.c
index d5201e541..1ebf6d9ca 100644
--- a/lib/pixman/test/blitters-test.c
+++ b/lib/pixman/test/blitters-test.c
@@ -25,30 +25,9 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <config.h>
-#include "pixman.h"
+#include "utils.h"
 
-/* A primitive pseudorandom number generator, taken from POSIX.1-2001 example */
-
-static uint32_t lcg_seed;
-
-static inline uint32_t
-lcg_rand (void)
-{
-    lcg_seed = lcg_seed * 1103515245 + 12345;
-    return ((uint32_t)(lcg_seed / 65536) % 32768);
-}
-
-static inline void
-lcg_srand (uint32_t seed)
-{
-    lcg_seed = seed;
-}
-
-static inline uint32_t
-lcg_rand_n (int max)
-{
-    return lcg_rand () % max;
-}
+static pixman_indexed_t palette;
 
 static void *
 aligned_malloc (size_t align, size_t size)
@@ -56,7 +35,8 @@ aligned_malloc (size_t align, size_t size)
     void *result;
 
 #ifdef HAVE_POSIX_MEMALIGN
-    posix_memalign (&result, align, size);
+    if (posix_memalign (&result, align, size) != 0)
+      result = NULL;
 #else
     result = malloc (size);
 #endif
@@ -64,192 +44,6 @@ aligned_malloc (size_t align, size_t size)
     return result;
 }
 
-/*----------------------------------------------------------------------------*\
- *  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
- *
- *  This program generates the CRC-32 values for the files named in the
- *  command-line arguments.  These are the same CRC-32 values used by GZIP,
- *  PKZIP, and ZMODEM.  The Crc32_ComputeBuf () can also be detached and
- *  used independently.
- *
- *  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
- *
- *  Based on the byte-oriented implementation "File Verification Using CRC"
- *  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
- *
- *  v1.0.0: original release.
- *  v1.0.1: fixed printf formats.
- *  v1.0.2: fixed something else.
- *  v1.0.3: replaced CRC constant table by generator function.
- *  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
- *  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
-\*----------------------------------------------------------------------------*/
-
-/*----------------------------------------------------------------------------*\
- *  NAME:
- *     Crc32_ComputeBuf () - computes the CRC-32 value of a memory buffer
- *  DESCRIPTION:
- *     Computes or accumulates the CRC-32 value for a memory buffer.
- *     The 'inCrc32' gives a previously accumulated CRC-32 value to allow
- *     a CRC to be generated for multiple sequential buffer-fuls of data.
- *     The 'inCrc32' for the first buffer must be zero.
- *  ARGUMENTS:
- *     inCrc32 - accumulated CRC-32 value, must be 0 on first call
- *     buf     - buffer to compute CRC-32 value for
- *     bufLen  - number of bytes in buffer
- *  RETURNS:
- *     crc32 - computed CRC-32 value
- *  ERRORS:
- *     (no errors are possible)
-\*----------------------------------------------------------------------------*/
-
-static uint32_t
-compute_crc32 (uint32_t    in_crc32,
-	       const void *buf,
-	       size_t      buf_len)
-{
-    static const uint32_t crc_table[256] = {
-	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
-	0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
-	0x09B64C2B, 0x7EB17CBD,	0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
-	0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
-	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,	0x14015C4F, 0x63066CD9,
-	0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
-	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
-	0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
-	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
-	0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
-	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
-	0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
-	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
-	0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
-	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
-	0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
-	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
-	0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
-	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
-	0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
-	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
-	0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
-	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
-	0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
-	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
-	0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
-	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
-	0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
-	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
-	0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
-	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
-	0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
-	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
-	0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
-	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
-	0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
-	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
-	0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
-	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
-	0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
-	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
-	0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
-	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
-    };
-
-    uint32_t              crc32;
-    unsigned char *       byte_buf;
-    size_t                i;
-
-    /* accumulate crc32 for buffer */
-    crc32 = in_crc32 ^ 0xFFFFFFFF;
-    byte_buf = (unsigned char*) buf;
-
-    for (i = 0; i < buf_len; i++)
-	crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF];
-
-    return (crc32 ^ 0xFFFFFFFF);
-}
-
-/* perform endian conversion of pixel data */
-static void
-image_endian_swap (pixman_image_t *img, int bpp)
-{
-    int stride = pixman_image_get_stride (img);
-    uint32_t *data = pixman_image_get_data (img);
-    int height = pixman_image_get_height (img);
-    int i, j;
-
-    /* swap bytes only on big endian systems */
-    volatile uint16_t endian_check_var = 0x1234;
-    if (*(volatile uint8_t *)&endian_check_var != 0x12)
-	return;
-
-    for (i = 0; i < height; i++)
-    {
-	uint8_t *line_data = (uint8_t *)data + stride * i;
-	/* swap bytes only for 16, 24 and 32 bpp for now */
-	switch (bpp)
-	{
-	case 1:
-	    for (j = 0; j < stride; j++)
-	    {
-		line_data[j] =
-		    ((line_data[j] & 0x80) >> 7) |
-		    ((line_data[j] & 0x40) >> 5) |
-		    ((line_data[j] & 0x20) >> 3) |
-		    ((line_data[j] & 0x10) >> 1) |
-		    ((line_data[j] & 0x08) << 1) |
-		    ((line_data[j] & 0x04) << 3) |
-		    ((line_data[j] & 0x02) << 5) |
-		    ((line_data[j] & 0x01) << 7);
-	    }
-	    break;
-	case 4:
-	    for (j = 0; j < stride; j++)
-	    {
-		line_data[j] = (line_data[j] >> 4) | (line_data[j] << 4);
-	    }
-	    break;
-	case 16:
-	    for (j = 0; j + 2 <= stride; j += 2)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-
-		line_data[j + 1] = t1;
-		line_data[j + 0] = t2;
-	    }
-	    break;
-	case 24:
-	    for (j = 0; j + 3 <= stride; j += 3)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-		char t3 = line_data[j + 2];
-
-		line_data[j + 2] = t1;
-		line_data[j + 1] = t2;
-		line_data[j + 0] = t3;
-	    }
-	    break;
-	case 32:
-	    for (j = 0; j + 4 <= stride; j += 4)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-		char t3 = line_data[j + 2];
-		char t4 = line_data[j + 3];
-
-		line_data[j + 3] = t1;
-		line_data[j + 2] = t2;
-		line_data[j + 1] = t3;
-		line_data[j + 0] = t4;
-	    }
-	    break;
-	default:
-	    break;
-	}
-    }
-}
-
 /* Create random image for testing purposes */
 static pixman_image_t *
 create_random_image (pixman_format_code_t *allowed_formats,
@@ -266,6 +60,7 @@ create_random_image (pixman_format_code_t *allowed_formats,
     while (allowed_formats[n] != -1)
 	n++;
     fmt = allowed_formats[lcg_rand_n (n)];
+
     width = lcg_rand_n (max_width) + 1;
     height = lcg_rand_n (max_height) + 1;
     stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 +
@@ -287,6 +82,12 @@ create_random_image (pixman_format_code_t *allowed_formats,
 
     img = pixman_image_create_bits (fmt, width, height, buf, stride);
 
+    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR	||
+	PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+    {
+	pixman_image_set_indexed (img, &palette);
+    }
+
     image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
 
     if (used_fmt) *used_fmt = fmt;
@@ -302,7 +103,7 @@ free_random_image (uint32_t initcrc,
     uint32_t crc32 = 0;
     int stride = pixman_image_get_stride (img);
     uint32_t *data = pixman_image_get_data (img);
-    int height = pixman_image_get_height (img);;
+    int height = pixman_image_get_height (img);
 
     if (fmt != -1)
     {
@@ -429,7 +230,6 @@ static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_b2g3r3,
     PIXMAN_a2r2g2b2,
     PIXMAN_a2b2g2r2,
-#if 0 /* using these crashes the test */
     PIXMAN_c8,
     PIXMAN_g8,
     PIXMAN_x4c4,
@@ -437,7 +237,6 @@ static pixman_format_code_t img_fmt_list[] = {
     PIXMAN_c4,
     PIXMAN_g4,
     PIXMAN_g1,
-#endif
     PIXMAN_x4a4,
     PIXMAN_a4,
     PIXMAN_r1g2b1,
@@ -472,10 +271,11 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     int src_stride, dst_stride;
     int src_x, src_y;
     int dst_x, dst_y;
+    int mask_x, mask_y;
     int w, h;
     int op;
     pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
-    uint32_t *dstbuf;
+    uint32_t *dstbuf, *srcbuf, *maskbuf;
     uint32_t crc32;
     int max_width, max_height, max_extra_stride;
 
@@ -513,10 +313,43 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     dst_img = create_random_image (img_fmt_list, max_width, max_height,
 				   max_extra_stride, &dst_fmt);
 
+    src_width = pixman_image_get_width (src_img);
+    src_height = pixman_image_get_height (src_img);
+    src_stride = pixman_image_get_stride (src_img);
+
+    dst_width = pixman_image_get_width (dst_img);
+    dst_height = pixman_image_get_height (dst_img);
+    dst_stride = pixman_image_get_stride (dst_img);
+
+    dstbuf = pixman_image_get_data (dst_img);
+    srcbuf = pixman_image_get_data (src_img);
+
+    src_x = lcg_rand_n (src_width);
+    src_y = lcg_rand_n (src_height);
+    dst_x = lcg_rand_n (dst_width);
+    dst_y = lcg_rand_n (dst_height);
+
     mask_img = NULL;
     mask_fmt = -1;
+    mask_x = 0;
+    mask_y = 0;
+    maskbuf = NULL;
 
-    if (lcg_rand_n (2))
+    if ((src_fmt == PIXMAN_x8r8g8b8 || src_fmt == PIXMAN_x8b8g8r8) &&
+	(lcg_rand_n (4) == 0))
+    {
+	/* PIXBUF */
+	mask_fmt = lcg_rand_n (2) ? PIXMAN_a8r8g8b8 : PIXMAN_a8b8g8r8;
+	mask_img = pixman_image_create_bits (mask_fmt,
+	                                     src_width,
+	                                     src_height,
+	                                     srcbuf,
+	                                     src_stride);
+	mask_x = src_x;
+	mask_y = src_y;
+	maskbuf = srcbuf;
+    }
+    else if (lcg_rand_n (2))
     {
 	if (lcg_rand_n (2))
 	{
@@ -533,22 +366,11 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
 
 	if (lcg_rand_n (2))
 	    pixman_image_set_component_alpha (mask_img, 1);
-    }
 
-    src_width = pixman_image_get_width (src_img);
-    src_height = pixman_image_get_height (src_img);
-    src_stride = pixman_image_get_stride (src_img);
-
-    dst_width = pixman_image_get_width (dst_img);
-    dst_height = pixman_image_get_height (dst_img);
-    dst_stride = pixman_image_get_stride (dst_img);
-
-    dstbuf = pixman_image_get_data (dst_img);
+	mask_x = lcg_rand_n (pixman_image_get_width (mask_img));
+	mask_y = lcg_rand_n (pixman_image_get_height (mask_img));
+    }
 
-    src_x = lcg_rand_n (src_width);
-    src_y = lcg_rand_n (src_height);
-    dst_x = lcg_rand_n (dst_width);
-    dst_y = lcg_rand_n (dst_height);
 
     w = lcg_rand_n (dst_width - dst_x + 1);
     h = lcg_rand_n (dst_height - dst_y + 1);
@@ -567,7 +389,7 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     }
 
     pixman_image_composite (op, src_img, mask_img, dst_img,
-			    src_x, src_y, src_x, src_y, dst_x, dst_y, w, h);
+			    src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
 
     if (verbose)
     {
@@ -592,11 +414,29 @@ test_composite (uint32_t initcrc, int testnum, int verbose)
     crc32 = free_random_image (initcrc, dst_img, dst_fmt);
 
     if (mask_img)
-	free_random_image (initcrc, mask_img, -1);
+    {
+	if (srcbuf == maskbuf)
+	    pixman_image_unref(mask_img);
+	else
+	    free_random_image (initcrc, mask_img, -1);
+    }
+
 
     return crc32;
 }
 
+static void
+initialize_palette (void)
+{
+    int i;
+    /* Fill the file-scope palette (attached to COLOR/GRAY-type images) from lcg_rand (). */
+    for (i = 0; i < PIXMAN_MAX_INDEXED; ++i)
+	palette.rgba[i] = lcg_rand ();
+    /* Each of the 32768 ent[] slots keeps only the low 8 bits of an lcg_rand () value. */
+    for (i = 0; i < 32768; ++i)
+	palette.ent[i] = lcg_rand() & 0xff;
+}
+
 int
 main (int argc, char *argv[])
 {
@@ -604,6 +444,8 @@ main (int argc, char *argv[])
     uint32_t crc = 0;
     int verbose = getenv ("VERBOSE") != NULL;
 
+    initialize_palette();
+
     if (argc >= 3)
     {
 	n1 = atoi (argv[1]);
@@ -640,7 +482,7 @@ main (int argc, char *argv[])
 	    /* Predefined value for running with all the fastpath functions
 	       disabled. It needs to be updated every time when changes are
 	       introduced to this program or behavior of pixman changes! */
-	    if (crc == 0x06D8EDB6)
+	    if (crc == 0xBBACC28D)
 	    {
 		printf ("blitters test passed\n");
 	    }
diff --git a/lib/pixman/test/clip-in.c b/lib/pixman/test/clip-in.c
index 55459b204..51579811f 100644
--- a/lib/pixman/test/clip-in.c
+++ b/lib/pixman/test/clip-in.c
@@ -2,7 +2,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "pixman.h"
-#include "utils.h"
+#include "gtk-utils.h"
 
 /* This test demonstrates that clipping is done totally different depending
  * on whether the source is transformed or not.
diff --git a/lib/pixman/test/clip-test.c b/lib/pixman/test/clip-test.c
index 900013718..aa0df4482 100644
--- a/lib/pixman/test/clip-test.c
+++ b/lib/pixman/test/clip-test.c
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "pixman.h"
-#include "utils.h"
+#include "gtk-utils.h"
 
 #define WIDTH 200
 #define HEIGHT 200
@@ -31,9 +31,11 @@ main (int argc, char **argv)
 	    { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } },
 	    { pixman_int_to_fixed (1), { 0xffff, 0xffff, 0x0000, 0xffff } }
 	};
+#if 0
     pixman_point_fixed_t p1 = { 0, 0 };
     pixman_point_fixed_t p2 = { pixman_int_to_fixed (WIDTH),
 				pixman_int_to_fixed (HEIGHT) };
+#endif
     pixman_point_fixed_t c_inner;
     pixman_point_fixed_t c_outer;
     pixman_fixed_t r_inner;
diff --git a/lib/pixman/test/composite-test.c b/lib/pixman/test/composite-test.c
index 49e0220a4..5401abfdf 100644
--- a/lib/pixman/test/composite-test.c
+++ b/lib/pixman/test/composite-test.c
@@ -2,7 +2,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include "pixman.h"
-#include "utils.h"
+#include "gtk-utils.h"
 
 #define WIDTH	60
 #define HEIGHT	60
@@ -77,6 +77,9 @@ writer (void *src, uint32_t value, int size)
     case 4:
 	*(uint32_t *)src = value;
 	break;
+
+    default:
+        break;
     }
 }
 
@@ -113,7 +116,7 @@ main (int argc, char **argv)
 
     window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
 
-    gtk_window_set_default_size (window, 800, 600);
+    gtk_window_set_default_size (GTK_WINDOW (window), 800, 600);
     
     g_signal_connect (window, "delete-event",
 		      G_CALLBACK (gtk_main_quit),
diff --git a/lib/pixman/test/composite.c b/lib/pixman/test/composite.c
new file mode 100644
index 000000000..9e8c0fbd4
--- /dev/null
+++ b/lib/pixman/test/composite.c
@@ -0,0 +1,901 @@
+/*
+ * Copyright © 2005 Eric Anholt
+ * Copyright © 2009 Chris Wilson
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Eric Anholt not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Eric Anholt makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * ERIC ANHOLT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL ERIC ANHOLT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <pixman.h>
+#include <stdio.h>
+#include <stdlib.h> /* abort() */
+#include <math.h>
+#include <config.h>
+
+#define FALSE 0
+#define TRUE !FALSE
+
+#define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
+#define min(a,b) ((a) <= (b) ? (a) : (b))
+#define max(a,b) ((a) >= (b) ? (a) : (b))
+
+typedef struct color_t color_t;
+typedef struct format_t format_t;
+typedef struct image_t image_t;
+typedef struct operator_t operator_t;
+
+struct color_t
+{
+    double r, g, b, a;	/* red, green, blue, alpha; the colors[] table uses values from 0.0 to 1.0 */
+};
+
+struct format_t
+{
+    pixman_format_code_t format;	/* pixman format code (PIXMAN_<x>) */
+    const char *name;			/* the same <x> as a string, for printing */
+};
+
+static color_t colors[] =
+{
+    /* these are premultiplied in main() */
+    { 1.0, 1.0, 1.0, 1.0 },	/* r, g, b, a: opaque white */
+    { 1.0, 0.0, 0.0, 1.0 },	/* opaque red */
+    { 0.0, 1.0, 0.0, 1.0 },	/* opaque green */
+    { 0.0, 0.0, 1.0, 1.0 },	/* opaque blue */
+    { 0.0, 0.0, 0.0, 1.0 },	/* opaque black */
+    { 0.5, 0.0, 0.0, 0.5 },	/* r = 0.5 with a = 0.5 */
+};
+
+static uint16_t
+_color_double_to_short (double d)
+{
+    uint32_t i;
+    /* Scale by 65536, then subtract bit 16 so that d = 1.0 (65536) becomes 65535. */
+    i = (uint32_t) (d * 65536);
+    i -= (i >> 16);
+    /* The 32-bit value is implicitly narrowed to uint16_t on return. */
+    return i;
+}
+
+static void
+compute_pixman_color (const color_t *color,
+		      pixman_color_t *out)	/* out receives the four converted channels */
+{	/* Converts each double channel of color with _color_double_to_short (). */
+    out->red   = _color_double_to_short (color->r);
+    out->green = _color_double_to_short (color->g);
+    out->blue  = _color_double_to_short (color->b);
+    out->alpha = _color_double_to_short (color->a);
+}
+
+static const format_t formats[] =
+{	/* P(x) expands to { PIXMAN_x, "x" } */
+#define P(x) { PIXMAN_##x, #x }
+    P(a8),
+
+    /* 32bpp formats */
+    P(a8r8g8b8),
+    P(x8r8g8b8),
+    P(a8b8g8r8),
+    P(x8b8g8r8),
+    P(b8g8r8a8),
+    P(b8g8r8x8),
+
+    /* XXX: and here the errors begin! */
+#if 0	/* every entry down to #endif is compiled out; only the seven above are active */
+    P(x2r10g10b10),
+    P(a2r10g10b10),
+    P(x2b10g10r10),
+    P(a2b10g10r10),
+
+    /* 24bpp formats */
+    P(r8g8b8),
+    P(b8g8r8),
+
+    /* 16bpp formats */
+    P(r5g6b5),
+    P(b5g6r5),
+
+    P(a1r5g5b5),
+    P(x1r5g5b5),
+    P(a1b5g5r5),
+    P(x1b5g5r5),
+    P(a4r4g4b4),
+    P(x4r4g4b4),
+    P(a4b4g4r4),
+    P(x4b4g4r4),
+
+    /* 8bpp formats */
+    P(a8),
+    P(r3g3b2),
+    P(b2g3r3),
+    P(a2r2g2b2),
+    P(a2b2g2r2),
+
+    P(x4a4),
+
+    /* 4bpp formats */
+    P(a4),
+    P(r1g2b1),
+    P(b1g2r1),
+    P(a1r1g1b1),
+    P(a1b1g1r1),
+
+    /* 1bpp formats */
+    P(a1)
+#endif
+#undef P
+};
+
+struct image_t
+{
+    pixman_image_t *image;	/* the pixman image itself */
+    const format_t *format;	/* format descriptor - presumably an entry of formats[]; confirm */
+    const color_t *color;	/* color descriptor - presumably an entry of colors[]; confirm */
+    pixman_repeat_t repeat;	/* pixman repeat mode */
+    int size;			/* NOTE(review): meaning not visible in this part of the file */
+};
+
+struct operator_t
+{
+    pixman_op_t op;	/* pixman operator code (PIXMAN_OP_<x>) */
+    const char *name;	/* the same <x> as a string, for printing */
+};
+
+static const operator_t operators[] =
+{	/* P(x) expands to { PIXMAN_OP_x, "x" } */
+#define P(x) { PIXMAN_OP_##x, #x }
+    P(CLEAR),
+    P(SRC),
+    P(DST),
+    P(OVER),
+    P(OVER_REVERSE),
+    P(IN),
+    P(IN_REVERSE),
+    P(OUT),
+    P(OUT_REVERSE),
+    P(ATOP),
+    P(ATOP_REVERSE),
+    P(XOR),
+    P(ADD),
+    P(SATURATE),
+    /* DISJOINT_* operators */
+    P(DISJOINT_CLEAR),
+    P(DISJOINT_SRC),
+    P(DISJOINT_DST),
+    P(DISJOINT_OVER),
+    P(DISJOINT_OVER_REVERSE),
+    P(DISJOINT_IN),
+    P(DISJOINT_IN_REVERSE),
+    P(DISJOINT_OUT),
+    P(DISJOINT_OUT_REVERSE),
+    P(DISJOINT_ATOP),
+    P(DISJOINT_ATOP_REVERSE),
+    P(DISJOINT_XOR),
+    /* CONJOINT_* operators */
+    P(CONJOINT_CLEAR),
+    P(CONJOINT_SRC),
+    P(CONJOINT_DST),
+    P(CONJOINT_OVER),
+    P(CONJOINT_OVER_REVERSE),
+    P(CONJOINT_IN),
+    P(CONJOINT_IN_REVERSE),
+    P(CONJOINT_OUT),
+    P(CONJOINT_OUT_REVERSE),
+    P(CONJOINT_ATOP),
+    P(CONJOINT_ATOP_REVERSE),
+    P(CONJOINT_XOR),
+#undef P
+};
+
+static double
+calc_op (pixman_op_t op, double src, double dst, double srca, double dsta)
+{	/* src/dst: one channel value; srca/dsta: presumably source/destination alpha - confirm at call site */
+#define mult_chan(src, dst, Fa, Fb) min ((src) * (Fa) + (dst) * (Fb), 1.0)
+    /* mult_chan yields src * Fa + dst * Fb, capped at 1.0. */
+    double Fa, Fb;
+    /* Each case picks the source factor Fa and destination factor Fb for op and returns the blend. */
+    switch (op)
+    {
+    case PIXMAN_OP_CLEAR:
+    case PIXMAN_OP_DISJOINT_CLEAR:
+    case PIXMAN_OP_CONJOINT_CLEAR:
+	return mult_chan (src, dst, 0.0, 0.0);
+
+    case PIXMAN_OP_SRC:
+    case PIXMAN_OP_DISJOINT_SRC:
+    case PIXMAN_OP_CONJOINT_SRC:
+	return mult_chan (src, dst, 1.0, 0.0);
+
+    case PIXMAN_OP_DST:
+    case PIXMAN_OP_DISJOINT_DST:
+    case PIXMAN_OP_CONJOINT_DST:
+	return mult_chan (src, dst, 0.0, 1.0);
+
+    case PIXMAN_OP_OVER:
+	return mult_chan (src, dst, 1.0, 1.0 - srca);
+
+    case PIXMAN_OP_OVER_REVERSE:
+	return mult_chan (src, dst, 1.0 - dsta, 1.0);
+
+    case PIXMAN_OP_IN:
+	return mult_chan (src, dst, dsta, 0.0);
+
+    case PIXMAN_OP_IN_REVERSE:
+	return mult_chan (src, dst, 0.0, srca);
+
+    case PIXMAN_OP_OUT:
+	return mult_chan (src, dst, 1.0 - dsta, 0.0);
+
+    case PIXMAN_OP_OUT_REVERSE:
+	return mult_chan (src, dst, 0.0, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP:
+	return mult_chan (src, dst, dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP_REVERSE:
+	return mult_chan (src, dst, 1.0 - dsta,  srca);
+
+    case PIXMAN_OP_XOR:
+	return mult_chan (src, dst, 1.0 - dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ADD:
+	return mult_chan (src, dst, 1.0, 1.0);
+
+    case PIXMAN_OP_SATURATE:
+    case PIXMAN_OP_DISJOINT_OVER_REVERSE:
+	if (srca == 0.0)	/* here and below: a zero alpha gets a fixed factor instead of a division by zero */
+	    Fa = 1.0;
+	else
+	    Fa = min (1.0, (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_DISJOINT_OVER:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = min (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_IN:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = max (0.0, 1.0 - (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_IN_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = max (0.0, 1.0 - (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_OUT:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = min (1.0, (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_OUT_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = min (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = max (0.0, 1.0 - (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = min (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP_REVERSE:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = min (1.0, (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = max (0.0, 1.0 - (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_XOR:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = min (1.0, (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = min (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = max (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER_REVERSE:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = max (0.0, 1.0 - dsta / srca);
+	return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_CONJOINT_IN:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = min (1.0, dsta / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_IN_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = min (1.0, srca / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OUT:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = max (0.0, 1.0 - dsta / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_OUT_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = max (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = min (1.0, dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = max (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP_REVERSE:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = max (0.0, 1.0 - dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = min (1.0, srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_XOR:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = max (0.0, 1.0 - dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = max (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_MULTIPLY:
+    case PIXMAN_OP_SCREEN:
+    case PIXMAN_OP_OVERLAY:
+    case PIXMAN_OP_DARKEN:
+    case PIXMAN_OP_LIGHTEN:
+    case PIXMAN_OP_COLOR_DODGE:
+    case PIXMAN_OP_COLOR_BURN:
+    case PIXMAN_OP_HARD_LIGHT:
+    case PIXMAN_OP_SOFT_LIGHT:
+    case PIXMAN_OP_DIFFERENCE:
+    case PIXMAN_OP_EXCLUSION:
+    case PIXMAN_OP_HSL_HUE:
+    case PIXMAN_OP_HSL_SATURATION:
+    case PIXMAN_OP_HSL_COLOR:
+    case PIXMAN_OP_HSL_LUMINOSITY:
+    default:
+	abort();	/* the operators listed above and any unknown value have no reference formula here */
+    }
+#undef mult_chan
+}
+
+/* Reference composite of a single pixel: fold the optional mask into the
+ * source color and source alpha, then evaluate calc_op() separately for
+ * the r, g, b and a channels.
+ */
+static void
+do_composite (pixman_op_t op,
+	      const color_t *src,
+	      const color_t *mask,
+	      const color_t *dst,
+	      color_t *result,
+	      pixman_bool_t component_alpha)
+{
+    color_t msrc, malpha;
+
+    if (mask != NULL && component_alpha)
+    {
+	/* Component alpha: each mask channel scales its own channel. */
+	malpha.r = src->a * mask->r;
+	malpha.g = src->a * mask->g;
+	malpha.b = src->a * mask->b;
+	malpha.a = src->a * mask->a;
+
+	msrc.r = src->r * mask->r;
+	msrc.g = src->g * mask->g;
+	msrc.b = src->b * mask->b;
+	msrc.a = src->a * mask->a;
+    }
+    else if (mask != NULL)
+    {
+	/* Unified alpha: only the alpha channel of the mask is used. */
+	malpha.r = malpha.g = malpha.b = malpha.a = src->a * mask->a;
+
+	msrc.r = src->r * mask->a;
+	msrc.g = src->g * mask->a;
+	msrc.b = src->b * mask->a;
+	msrc.a = src->a * mask->a;
+    }
+    else
+    {
+	/* No mask: the source is used unchanged. */
+	msrc = *src;
+	malpha.r = malpha.g = malpha.b = malpha.a = src->a;
+    }
+
+    result->r = calc_op (op, msrc.r, dst->r, malpha.r, dst->a);
+    result->g = calc_op (op, msrc.g, dst->g, malpha.g, dst->a);
+    result->b = calc_op (op, msrc.b, dst->b, malpha.b, dst->a);
+    result->a = calc_op (op, msrc.a, dst->a, malpha.a, dst->a);
+}
+
+/* Round color to what a pixel of the given format can actually hold. */
+static void
+color_correct (pixman_format_code_t format, color_t *color)
+{
+/* Quantize pix to an n-bit channel; n is a bit count, so scale by 2^n - 1. */
+#define round_pix(pix, n) \
+    ((int)((pix) * ((1 << (n)) - 1) + .5) / (double) ((1 << (n)) - 1))
+    if (PIXMAN_FORMAT_R (format) == 0)
+    {
+	color->r = 0.0;
+	color->g = 0.0;
+	color->b = 0.0;
+    }
+    else
+    {
+	color->r = round_pix (color->r, PIXMAN_FORMAT_R (format));
+	color->g = round_pix (color->g, PIXMAN_FORMAT_G (format));
+	color->b = round_pix (color->b, PIXMAN_FORMAT_B (format));
+    }
+
+    if (PIXMAN_FORMAT_A (format) == 0)
+	color->a = 1.0;
+    else
+	color->a = round_pix (color->a, PIXMAN_FORMAT_A (format));
+
+#undef round_pix
+}
+
+/* Decode the first pixel of image, laid out as described by format, into
+ * color; every channel is scaled to the range [0, 1].
+ */
+static void
+get_pixel (pixman_image_t *image, pixman_format_code_t format, color_t *color)
+{
+#define MASK(N) ((1UL << (N))-1)
+    unsigned long rs, gs, bs, as;
+    int a, r, g, b;
+    uint32_t val;
+
+    /* pixman_image_get_data() returns uint32_t words; load exactly one.  An
+     * unsigned long load is 8 bytes where long is 64 bits, which can run past
+     * the end of a one-pixel image. */
+    val = *pixman_image_get_data (image);
+#ifdef WORDS_BIGENDIAN
+    val >>= 8 * sizeof(val) - PIXMAN_FORMAT_BPP (format);
+#endif
+
+    /* Number of bits in each channel */
+    a = PIXMAN_FORMAT_A (format);
+    r = PIXMAN_FORMAT_R (format);
+    g = PIXMAN_FORMAT_G (format);
+    b = PIXMAN_FORMAT_B (format);
+
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_ARGB:
+        bs = 0;
+        gs = b + bs;
+        rs = g + gs;
+        as = r + rs;
+	break;
+
+    case PIXMAN_TYPE_ABGR:
+        rs = 0;
+        gs = r + rs;
+        bs = g + gs;
+        as = b + bs;
+	break;
+
+    case PIXMAN_TYPE_BGRA:
+        as = 0;
+	rs = PIXMAN_FORMAT_BPP (format) - (b + g + r);
+        gs = r + rs;
+        bs = g + gs;
+	break;
+
+    case PIXMAN_TYPE_A:
+        as = 0;
+        rs = 0;
+        gs = 0;
+        bs = 0;
+	break;
+
+    case PIXMAN_TYPE_OTHER:
+    case PIXMAN_TYPE_COLOR:
+    case PIXMAN_TYPE_GRAY:
+    case PIXMAN_TYPE_YUY2:
+    case PIXMAN_TYPE_YV12:
+    default:
+	abort ();
+        as = rs = gs = bs = 0;	/* not reached; presumably silences warnings */
+	break;
+    }
+    /* A channel the format lacks reads as 1.0 for alpha and 0.0 for color. */
+    if (MASK (a) != 0)
+	color->a = ((val >> as) & MASK (a)) / (double) MASK (a);
+    else
+	color->a = 1.0;
+
+    if (MASK (r) != 0)
+    {
+	color->r = ((val >> rs) & MASK (r)) / (double) MASK (r);
+	color->g = ((val >> gs) & MASK (g)) / (double) MASK (g);
+	color->b = ((val >> bs) & MASK (b)) / (double) MASK (b);
+    }
+    else
+    {
+	color->r = 0.0;
+	color->g = 0.0;
+	color->b = 0.0;
+    }
+
+#undef MASK
+}
+
+/* Return the largest per-channel |test - expected|, with r, b and a
+ * weighted by 32 and g by 64. */
+static double
+eval_diff (color_t *expected, color_t *test)
+{
+    /* XXX: Need to be provided mask shifts so we can produce useful error
+     * values.
+     */
+    const double r_weight = 1.0 * (1 << 5);
+    const double g_weight = 1.0 * (1 << 6);
+    const double b_weight = 1.0 * (1 << 5);
+    const double a_weight = 1.0 * 32;
+    double r_err, g_err, b_err, a_err;
+
+    r_err = fabs (test->r - expected->r) * r_weight;
+    g_err = fabs (test->g - expected->g) * g_weight;
+    b_err = fabs (test->b - expected->b) * b_weight;
+    a_err = fabs (test->a - expected->a) * a_weight;
+
+    return max (max (max (r_err, b_err), g_err), a_err);
+}
+
+/* Describe info in buf (at most buflen bytes) and return buf: "solid", or
+ * the format name and NxN size, followed by "R" for a repeating image. */
+static char *
+describe_image (image_t *info, char *buf, int buflen)
+{
+    const char *repeat_tag = info->repeat ? "R" : "";
+
+    if (info->size == 0)
+    {
+	snprintf (buf, buflen, "solid");
+	return buf;
+    }
+
+    snprintf (buf, buflen, "%s %dx%d%s",
+	      info->format->name, info->size, info->size, repeat_tag);
+    return buf;
+}
+
+/* Test a composite of a given operation, source, mask, and destination
+ * picture.  Fills the destination, composites, then compares its 0,0 pixel
+ * with the reference result; prints a report and returns FALSE on mismatch.
+ */
+static pixman_bool_t
+composite_test (image_t *dst,
+		const operator_t *op,
+		image_t *src,
+		image_t *mask,
+		pixman_bool_t component_alpha)
+{
+    pixman_color_t fill;
+    pixman_rectangle16_t rect;
+    color_t expected, result, tdst, tsrc, tmsk;
+    double diff;
+    pixman_bool_t success = TRUE;
+
+    compute_pixman_color (dst->color, &fill);
+    rect.x = rect.y = 0;
+    rect.width = rect.height = dst->size;
+    pixman_image_fill_rectangles (PIXMAN_OP_SRC, dst->image,
+				  &fill, 1, &rect);
+
+    if (mask != NULL)
+    {
+	pixman_image_set_component_alpha (mask->image, component_alpha);
+	pixman_image_composite (op->op, src->image, mask->image, dst->image,
+				0, 0,
+				0, 0,
+				0, 0,
+				dst->size, dst->size);
+
+	tmsk = *mask->color;
+	if (mask->size)
+	{
+	    color_correct (mask->format->format, &tmsk);
+
+	    if (component_alpha &&
+		PIXMAN_FORMAT_R (mask->format->format) == 0)
+	    {
+		/* Ax component-alpha masks expand alpha into
+		 * all color channels.
+		 */
+		tmsk.r = tmsk.g = tmsk.b = tmsk.a;
+	    }
+	}
+    }
+    else
+    {
+	pixman_image_composite (op->op, src->image, NULL, dst->image,
+				0, 0,
+				0, 0,
+				0, 0,
+				dst->size, dst->size);
+    }
+    get_pixel (dst->image, dst->format->format, &result);
+    /* Reference result, from the colors as each format can store them. */
+    tdst = *dst->color;
+    color_correct (dst->format->format, &tdst);
+    tsrc = *src->color;
+    if (src->size)
+	color_correct (src->format->format, &tsrc);
+    do_composite (op->op, &tsrc, mask ? &tmsk : NULL, &tdst,
+		  &expected, component_alpha);
+    color_correct (dst->format->format, &expected);
+
+    diff = eval_diff (&expected, &result);
+    if (diff > 3.0)
+    {
+	char buf[40];
+
+	snprintf (buf, sizeof (buf),
+		  "%s %scomposite",
+		  op->name,
+		  component_alpha ? "CA " : "");
+	/* The raw pixel is loaded as one uint32_t, then widened for %08lx. */
+	printf ("%s test error of %.4f --\n"
+		"           R    G    B    A\n"
+		"got:       %.2f %.2f %.2f %.2f [%08lx]\n"
+		"expected:  %.2f %.2f %.2f %.2f\n",
+		buf, diff,
+		result.r, result.g, result.b, result.a,
+		(unsigned long) *pixman_image_get_data (dst->image),
+		expected.r, expected.g, expected.b, expected.a);
+
+	if (mask != NULL)
+	{
+	    printf ("src color: %.2f %.2f %.2f %.2f\n"
+		    "msk color: %.2f %.2f %.2f %.2f\n"
+		    "dst color: %.2f %.2f %.2f %.2f\n",
+		    src->color->r, src->color->g,
+		    src->color->b, src->color->a,
+		    mask->color->r, mask->color->g,
+		    mask->color->b, mask->color->a,
+		    dst->color->r, dst->color->g,
+		    dst->color->b, dst->color->a);
+	    printf ("src: %s, ", describe_image (src, buf, sizeof (buf)));
+	    printf ("mask: %s, ", describe_image (mask, buf, sizeof (buf)));
+	    printf ("dst: %s\n\n", describe_image (dst, buf, sizeof (buf)));
+	}
+	else
+	{
+	    printf ("src color: %.2f %.2f %.2f %.2f\n"
+		    "dst color: %.2f %.2f %.2f %.2f\n",
+		    src->color->r, src->color->g,
+		    src->color->b, src->color->a,
+		    dst->color->r, dst->color->g,
+		    dst->color->b, dst->color->a);
+	    printf ("src: %s, ", describe_image (src, buf, sizeof (buf)));
+	    printf ("dst: %s\n\n", describe_image (dst, buf, sizeof (buf)));
+	}
+
+	success = FALSE;
+    }
+
+    return success;
+}
+
+#define REPEAT 0x01000000	/* size flag: image repeats */
+#define FLAGS  0xff000000	/* bits of a size argument that hold flags */
+
+/* Set up info with colors[color] and formats[format].  A size of 0 gives a
+ * solid fill; otherwise a size x size bits image filled with the color, set
+ * to repeat when size carries the REPEAT flag.
+ */
+static void
+image_init (image_t *info, int color, int format, int size)
+{
+    const int wants_repeat = size & REPEAT;
+    pixman_rectangle16_t area;
+    pixman_color_t fill;
+
+    info->color = &colors[color];
+    compute_pixman_color (info->color, &fill);
+    info->format = &formats[format];
+    info->size = size & ~FLAGS;
+    info->repeat = PIXMAN_REPEAT_NONE;
+
+    if (info->size == 0)
+    {
+	info->image = pixman_image_create_solid_fill (&fill);
+	return;
+    }
+
+    info->image = pixman_image_create_bits (info->format->format,
+					    info->size, info->size,
+					    NULL, 0);
+
+    area.x = 0;
+    area.y = 0;
+    area.width = info->size;
+    area.height = info->size;
+    pixman_image_fill_rectangles (PIXMAN_OP_SRC, info->image, &fill, 1, &area);
+
+    if (wants_repeat)
+    {
+	pixman_image_set_repeat (info->image, PIXMAN_REPEAT_NORMAL);
+	info->repeat = PIXMAN_REPEAT_NORMAL;
+    }
+}
+
+static void
+image_fini (image_t *info)	/* counterpart of image_init() */
+{
+    pixman_image_unref (info->image);	/* drop the reference held in info */
+}
+
+int
+main (void)
+{
+    pixman_bool_t ok, group_ok = TRUE, ca;
+    int i, d, m, s;
+    int tests_passed = 0, tests_total = 0;
+    int sizes[] = { 1, 1 | REPEAT, 10 };	/* size arguments for image_init() */
+    int num_tests;
+    /* Premultiply the test colors by their alpha. */
+    for (i = 0; i < ARRAY_LENGTH (colors); i++)
+    {
+	colors[i].r *= colors[i].a;
+	colors[i].g *= colors[i].a;
+	colors[i].b *= colors[i].a;
+    }
+
+    num_tests = ARRAY_LENGTH (colors) * ARRAY_LENGTH (formats);
+    /* d enumerates every (color, format) pair, used for a 1x1 destination. */
+    for (d = 0; d < num_tests; d++)
+    {
+	image_t dst;
+
+	image_init (
+	    &dst, d / ARRAY_LENGTH (formats), d % ARRAY_LENGTH (formats), 1);
+
+	/* s < 0 picks a solid source; s >= 0 encodes (color, format, size). */
+	for (s = -ARRAY_LENGTH (colors);
+	     s < ARRAY_LENGTH (sizes) * num_tests;
+	     s++)
+	{
+	    image_t src;
+
+	    if (s < 0)
+	    {
+		image_init (&src, -s - 1, 0, 0);
+	    }
+	    else
+	    {
+		image_init (&src,
+			    s / ARRAY_LENGTH (sizes) / ARRAY_LENGTH (formats),
+			    s / ARRAY_LENGTH (sizes) % ARRAY_LENGTH (formats),
+			    sizes[s % ARRAY_LENGTH (sizes)]);
+	    }
+	    /* m is decoded into a mask image the same way s is. */
+	    for (m = -ARRAY_LENGTH (colors);
+		 m < ARRAY_LENGTH (sizes) * num_tests;
+		 m++)
+	    {
+		image_t mask;
+
+		if (m < 0)
+		{
+		    image_init (&mask, -m - 1, 0, 0);
+		}
+		else
+		{
+		    image_init (
+			&mask,
+			m / ARRAY_LENGTH (sizes) / ARRAY_LENGTH (formats),
+			m / ARRAY_LENGTH (sizes) % ARRAY_LENGTH (formats),
+			sizes[m % ARRAY_LENGTH (sizes)]);
+		}
+		/* ca: -1 = no mask, 0 = unified alpha, 1 = component alpha */
+		for (ca = -1; ca <= 1; ca++)
+		{
+		    for (i = 0; i < ARRAY_LENGTH (operators); i++)
+		    {
+			const operator_t *op = &operators[i];
+
+			switch (ca)
+			{
+			case -1:
+			    ok = composite_test (&dst, op, &src, NULL, FALSE);
+			    break;
+			case 0:
+			    ok = composite_test (&dst, op, &src, &mask, FALSE);
+			    break;
+			case 1:	/* a solid mask (size 0) is tested without CA */
+			    ok = composite_test (&dst, op, &src, &mask,
+						 mask.size? TRUE : FALSE);
+			    break;
+                        default:
+                            break;
+			}
+			group_ok = group_ok && ok;
+			tests_passed += ok;
+			tests_total++;
+		    }
+		}
+
+		image_fini (&mask);
+	    }
+	    image_fini (&src);
+	}
+	image_fini (&dst);
+    }
+    /* The exit status is non-zero if any test failed. */
+    return group_ok == FALSE;
+}
diff --git a/lib/pixman/test/convolution-test.c b/lib/pixman/test/convolution-test.c
index 8609d38a0..da284af7b 100644
--- a/lib/pixman/test/convolution-test.c
+++ b/lib/pixman/test/convolution-test.c
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "pixman.h"
-#include "utils.h"
+#include "gtk-utils.h"
 
 int
 main (int argc, char **argv)
diff --git a/lib/pixman/test/fetch-test.c b/lib/pixman/test/fetch-test.c
index 6306a4c42..2ca16ddbf 100644
--- a/lib/pixman/test/fetch-test.c
+++ b/lib/pixman/test/fetch-test.c
@@ -6,7 +6,8 @@
 
 #define SIZE 1024
 
-pixman_indexed_t mono_pallete = {
+static pixman_indexed_t mono_palette =
+{
     .rgba = { 0x00000000, 0x00ffffff },
 };
 
@@ -20,14 +21,15 @@ typedef struct {
     pixman_indexed_t *indexed;
 } testcase_t;
 
-testcase_t testcases[] = {
+static testcase_t testcases[] =
+{
     {
 	.format = PIXMAN_a8r8g8b8,
 	.width = 2, .height = 2,
 	.stride = 8,
-	.src = { 0x00112233, 0x44556677, 
+	.src = { 0x00112233, 0x44556677,
 	         0x8899aabb, 0xccddeeff },
-	.dst = { 0x00112233, 0x44556677, 
+	.dst = { 0x00112233, 0x44556677,
 	         0x8899aabb, 0xccddeeff },
 	.indexed = NULL,
     },
@@ -36,24 +38,33 @@ testcase_t testcases[] = {
 	.width = 8, .height = 2,
 	.stride = 4,
 #ifdef WORDS_BIGENDIAN
-	.src = { 0xaa000000,
-		 0x55000000 },
+	.src =
+	{
+	    0xaa000000,
+	    0x55000000
+	},
 #else
-	.src = { 0x00000055, 
-	         0x000000aa },
+	.src =
+	{
+	    0x00000055,
+	    0x000000aa
+	},
 #endif
-	.dst = { 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000,
-	         0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff },
-	.indexed = &mono_pallete,
+	.dst =
+	{
+	    0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000,
+	    0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff
+	},
+	.indexed = &mono_palette,
     },
 #if 0
     {
 	.format = PIXMAN_g8,
 	.width = 4, .height = 2,
 	.stride = 4,
-	.src = { 0x01234567, 
+	.src = { 0x01234567,
 	         0x89abcdef },
-	.dst = { 0x00010101, 0x00232323, 0x00454545, 0x00676767, 
+	.dst = { 0x00010101, 0x00232323, 0x00454545, 0x00676767,
 	         0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, },
     },
 #endif
@@ -63,28 +74,33 @@ testcase_t testcases[] = {
 	.width = 8, .height = 2,
 	.stride = 8,
 #ifdef WORDS_BIGENDIAN
-	.src = { 0x00ff00ff, 0x00ff00ff, 
-	         0xff00ff00, 0xff00ff00, 
-	         0x80ff8000, 
-		 0x800080ff
+	.src =
+	{
+	    0x00ff00ff, 0x00ff00ff,
+	    0xff00ff00, 0xff00ff00,
+	    0x80ff8000,
+	    0x800080ff
 	},
 #else
-	.src = { 0xff00ff00, 0xff00ff00, 
-	         0x00ff00ff, 0x00ff00ff, 
-	         0x0080ff80, 
-		 0xff800080
-	 },
+	.src =
+	{
+	    0xff00ff00, 0xff00ff00,
+	    0x00ff00ff, 0x00ff00ff,
+	    0x0080ff80,
+	    0xff800080
+	},
 #endif
-	.dst = { 
-		0xff000000, 0xffffffff, 0xffb80000, 0xffffe113,
-		0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff,
-		0xffffffff, 0xff000000, 0xffffe113, 0xffb80000,
-		0xffffffff, 0xff000000, 0xff4affff, 0xff0023ee,
+	.dst =
+	{
+	    0xff000000, 0xffffffff, 0xffb80000, 0xffffe113,
+	    0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff,
+	    0xffffffff, 0xff000000, 0xffffe113, 0xffb80000,
+	    0xffffffff, 0xff000000, 0xff4affff, 0xff0023ee,
 	},
     },
 };
 
-const int ntestcases = sizeof(testcases)/sizeof(testcases[0]);
+int n_test_cases = sizeof(testcases)/sizeof(testcases[0]);
 
 
 static uint32_t
@@ -133,26 +149,29 @@ main (int argc, char **argv)
     int i, j, x, y;
     int ret = 0;
 
-    for (i = 0; i < ntestcases; ++i) {
-	for (j = 0; j < 2; ++j) {
+    for (i = 0; i < n_test_cases; ++i)
+    {
+	for (j = 0; j < 2; ++j)
+	{
 	    src_img = pixman_image_create_bits (testcases[i].format,
-						testcases[i].width, 
+						testcases[i].width,
 						testcases[i].height,
 						testcases[i].src,
 						testcases[i].stride);
 	    pixman_image_set_indexed(src_img, testcases[i].indexed);
 
 	    dst_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
-						testcases[i].width, 
+						testcases[i].width,
 						testcases[i].height,
 						dst,
 						testcases[i].width*4);
 
-	    if (j) {
+	    if (j)
+	    {
 		pixman_image_set_accessors (src_img, reader, writer);
 		pixman_image_set_accessors (dst_img, reader, writer);
 	    }
-	    
+
 	    pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
 				    0, 0, 0, 0, 0, 0, testcases[i].width, testcases[i].height);
 
@@ -160,18 +179,23 @@ main (int argc, char **argv)
 	    pixman_image_unref (dst_img);
 
 	    for (y = 0; y < testcases[i].height; ++y)
-		for (x = 0; x < testcases[i].width; ++x) {
-		    int offset = y*testcases[i].width + x;
-		    if (dst[offset] != testcases[i].dst[offset]) {
+	    {
+		for (x = 0; x < testcases[i].width; ++x)
+		{
+		    int offset = y * testcases[i].width + x;
+
+		    if (dst[offset] != testcases[i].dst[offset])
+		    {
 			printf ("test %i%c: pixel mismatch at (x=%d,y=%d): %08x expected, %08x obtained\n",
 			        i + 1, 'a' + j,
-			        x, y, 
+			        x, y,
 			        testcases[i].dst[offset], dst[offset]);
 			ret = 1;
 		    }
 		}
+	    }
 	}
     }
-    
+
     return ret;
 }
diff --git a/lib/pixman/test/gradient-test.c b/lib/pixman/test/gradient-test.c
index 2593ee38a..fc84844b0 100644
--- a/lib/pixman/test/gradient-test.c
+++ b/lib/pixman/test/gradient-test.c
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "pixman.h"
-#include "utils.h"
+#include "gtk-utils.h"
 
 int
 main (int argc, char **argv)
@@ -21,18 +21,20 @@ main (int argc, char **argv)
     pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 };
     pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH / 8.),
 				pixman_int_to_fixed (0) };
+#if 0
     pixman_transform_t trans = {
 	{ { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
 	  { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
 	  { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
 	}
     };
-
-    pixman_transform_t id = {
+#else
+    pixman_transform_t trans = {
 	{ { pixman_fixed_1, 0, 0 },
 	  { 0, pixman_fixed_1, 0 },
 	  { 0, 0, pixman_fixed_1 } }
     };
+#endif
 
     pixman_point_fixed_t c_inner;
     pixman_point_fixed_t c_outer;
@@ -67,7 +69,7 @@ main (int argc, char **argv)
     src_img = pixman_image_create_linear_gradient  (&p1, &p2,
 						    stops, 2);
     
-    pixman_image_set_transform (src_img, &id);
+    pixman_image_set_transform (src_img, &trans);
     pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD);
     
     pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
diff --git a/lib/pixman/test/gtk-utils.c b/lib/pixman/test/gtk-utils.c
new file mode 100644
index 000000000..751a164c0
--- /dev/null
+++ b/lib/pixman/test/gtk-utils.c
@@ -0,0 +1,113 @@
+#include <gtk/gtk.h>
+#include <config.h>
+#include "pixman-private.h"	/* For image->bits.format
+				 * FIXME: there should probably be public API for this
+				 */
+#include "gtk-utils.h"
+
+/* Convert premultiplied ARGB32 pixels into a new RGBA GdkPixbuf.  stride is
+ * in bytes.  Returns NULL if the pixbuf could not be allocated. */
+GdkPixbuf *
+pixbuf_from_argb32 (uint32_t *bits, gboolean has_alpha,
+		    int width, int height, int stride)
+{
+    GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE,
+					8, width, height);
+    guchar *p_bits;
+    int p_stride, w, h;
+
+    /* gdk_pixbuf_new() returns NULL when the buffer cannot be allocated. */
+    if (!pixbuf)
+	return NULL;
+
+    p_stride = gdk_pixbuf_get_rowstride (pixbuf);
+    p_bits = gdk_pixbuf_get_pixels (pixbuf);
+
+    for (h = 0; h < height; ++h)
+    {
+	for (w = 0; w < width; ++w)
+	{
+	    uint32_t argb = bits[h * (stride / 4) + w];
+	    guchar *pb = p_bits + h * p_stride + w * 4;
+	    guint r, g, b, a;
+
+	    r = (argb & 0x00ff0000) >> 16;
+	    g = (argb & 0x0000ff00) >> 8;
+	    b = (argb & 0x000000ff) >> 0;
+	    a = has_alpha? (argb & 0xff000000) >> 24 : 0xff;
+
+	    /* Undo the premultiplication; a == 0 leaves the color alone. */
+	    if (a)
+	    {
+		r = (r * 255) / a;
+		g = (g * 255) / a;
+		b = (b * 255) / a;
+	    }
+
+	    pb[0] = r > 255 ? 255 : r;	/* clamp: a color may exceed alpha */
+	    pb[1] = g > 255 ? 255 : g;
+	    pb[2] = b > 255 ? 255 : b;
+	    pb[3] = a;
+	}
+    }
+
+    return pixbuf;
+}
+
+
+/* Expose handler: draws the whole pixbuf passed as user data at 0,0. */
+static gboolean
+on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
+{
+    GdkPixbuf *image = data;
+    int image_width = gdk_pixbuf_get_width (image);
+    int image_height = gdk_pixbuf_get_height (image);
+
+    gdk_draw_pixbuf (widget->window, NULL, image,
+		     0, 0, 0, 0, image_width, image_height,
+		     GDK_RGB_DITHER_NONE, 0, 0);
+
+    return TRUE;
+}
+
+/* Show image in a GTK+ window and run the main loop until the window is
+ * closed.  Only a8r8g8b8 and x8r8g8b8 images are supported. */
+void
+show_image (pixman_image_t *image)
+{
+    char *program_name = g_strdup ("pixman-test-program");
+    char **argv = (char **)&program_name;
+    int argc = 1;
+    GtkWidget *window;
+    GdkPixbuf *pixbuf;
+    int width, height, stride;
+    gboolean has_alpha;
+    pixman_format_code_t format;
+
+    gtk_init (&argc, &argv);
+    window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+    stride = pixman_image_get_stride (image);
+    format = image->bits.format;
+
+    switch (format)
+    {
+    case PIXMAN_a8r8g8b8:
+	has_alpha = TRUE;
+	break;
+    case PIXMAN_x8r8g8b8:
+	has_alpha = FALSE;
+	break;
+    default:
+	g_error ("Can't deal with this format: %x\n", format);
+    }
+
+    pixbuf = pixbuf_from_argb32 (pixman_image_get_data (image), has_alpha,
+				 width, height, stride);
+
+    g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), pixbuf);
+    g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
+    gtk_widget_show (window);
+    gtk_main ();
+}
diff --git a/lib/pixman/test/gtk-utils.h b/lib/pixman/test/gtk-utils.h
new file mode 100644
index 000000000..2cb13bcf0
--- /dev/null
+++ b/lib/pixman/test/gtk-utils.h
@@ -0,0 +1,13 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <glib.h>
+#include <gtk/gtk.h>
+#include "pixman.h"
+/* Display image in a GTK+ window; returns once the window is closed. */
+void show_image (pixman_image_t *image);
+/* Build a new GdkPixbuf from ARGB32 pixel data; stride is in bytes. */
+GdkPixbuf *pixbuf_from_argb32 (uint32_t *bits,
+		               gboolean has_alpha,
+                               int width,
+                               int height,
+                               int stride);
diff --git a/lib/pixman/test/region-test.c b/lib/pixman/test/region-test.c
index 3568969f1..9d5a41eb9 100644
--- a/lib/pixman/test/region-test.c
+++ b/lib/pixman/test/region-test.c
@@ -1,7 +1,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include "pixman.h"
+#include "utils.h"
 
 int
 main ()
@@ -22,8 +22,15 @@ main ()
 	{ 2, 6, 7, 6 },
 	{ 4, 1, 6, 1 },
     };
-    int i;
+    int i, j;
     pixman_box32_t *b;
+    pixman_image_t *image, *fill;
+    pixman_color_t white = {
+	0xffff,
+	0xffff,
+	0xffff,
+	0xffff
+    };
 
     /* This used to go into an infinite loop before pixman-region.c
      * was fixed to not use explict "short" variables
@@ -74,5 +81,43 @@ main ()
 
     assert (i == 0);
 
+    fill = pixman_image_create_solid_fill (&white);
+    for (i = 0; i < 100; i++)
+    {
+	int image_size = 128;
+
+	pixman_region32_init (&r1);
+
+	/* Add some random rectangles */
+	for (j = 0; j < 64; j++)
+	    pixman_region32_union_rect (&r1, &r1,
+					lcg_rand_n (image_size),
+					lcg_rand_n (image_size),
+					lcg_rand_n (25),
+					lcg_rand_n (25));
+
+	/* Clip to image size */
+	pixman_region32_init_rect (&r2, 0, 0, image_size, image_size);
+	pixman_region32_intersect (&r1, &r1, &r2);
+	pixman_region32_fini (&r2);
+
+	/* render region to a1 mask */
+	image = pixman_image_create_bits (PIXMAN_a1, image_size, image_size, NULL, 0);
+	pixman_image_set_clip_region32 (image, &r1);
+	pixman_image_composite32 (PIXMAN_OP_SRC,
+				  fill, NULL, image,
+				  0, 0, 0, 0, 0, 0,
+				  image_size, image_size);
+	pixman_region32_init_from_image (&r2, image);
+
+	pixman_image_unref (image);
+
+	assert (pixman_region32_equal (&r1, &r2));
+	pixman_region32_fini (&r1);
+	pixman_region32_fini (&r2);
+
+    }
+    pixman_image_unref (fill);
+
     return 0;
 }
diff --git a/lib/pixman/test/scaling-test.c b/lib/pixman/test/scaling-test.c
index 8899c594f..29772906d 100644
--- a/lib/pixman/test/scaling-test.c
+++ b/lib/pixman/test/scaling-test.c
@@ -23,198 +23,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include "pixman.h"
-
-/* A primitive pseudorandom number generator, taken from POSIX.1-2001 example */
-
-static uint32_t lcg_seed;
-
-uint32_t
-lcg_rand (void)
-{
-    lcg_seed = lcg_seed * 1103515245 + 12345;
-    return ((uint32_t)(lcg_seed / 65536) % 32768);
-}
-
-void
-lcg_srand (uint32_t seed)
-{
-    lcg_seed = seed;
-}
-
-uint32_t
-lcg_rand_n (int max)
-{
-    return lcg_rand () % max;
-}
-
-/*----------------------------------------------------------------------------*\
-*  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
-*
-*  This program generates the CRC-32 values for the files named in the
-*  command-line arguments.  These are the same CRC-32 values used by GZIP,
-*  PKZIP, and ZMODEM.  The compute_crc32() can also be detached and
-*  used independently.
-*
-*  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
-*
-*  Based on the byte-oriented implementation "File Verification Using CRC"
-*  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
-*
-*  v1.0.0: original release.
-*  v1.0.1: fixed printf formats.
-*  v1.0.2: fixed something else.
-*  v1.0.3: replaced CRC constant table by generator function.
-*  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
-*  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
-\*----------------------------------------------------------------------------*/
-
-/*----------------------------------------------------------------------------*\
-*  NAME:
-*     compute_crc32() - computes the CRC-32 value of a memory buffer
-*  DESCRIPTION:
-*     Computes or accumulates the CRC-32 value for a memory buffer.
-*     The 'in_crc32' gives a previously accumulated CRC-32 value to allow
-*     a CRC to be generated for multiple sequential buffer-fuls of data.
-*     The 'in_crc32' for the first buffer must be zero.
-*  ARGUMENTS:
-*     in_crc32 - accumulated CRC-32 value, must be 0 on first call
-*     buf     - buffer to compute CRC-32 value for
-*     buf_len  - number of bytes in buffer
-*  RETURNS:
-*     crc32 - computed CRC-32 value
-*  ERRORS:
-*     (no errors are possible)
-\*----------------------------------------------------------------------------*/
-
-static uint32_t
-compute_crc32 (uint32_t    in_crc32,
-		  const void *buf,
-		  size_t      buf_len)
-{
-    static const uint32_t crc_table[256] = {
-	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
-	0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
-	0x09B64C2B, 0x7EB17CBD,	0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
-	0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
-	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,	0x14015C4F, 0x63066CD9,
-	0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
-	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
-	0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
-	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
-	0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
-	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
-	0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
-	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
-	0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
-	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
-	0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
-	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
-	0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
-	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
-	0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
-	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
-	0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
-	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
-	0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
-	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
-	0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
-	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
-	0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
-	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
-	0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
-	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
-	0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
-	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
-	0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
-	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
-	0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
-	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
-	0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
-	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
-	0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
-	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
-	0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
-	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
-    };
-    
-    uint32_t              crc32;
-    unsigned char *       byte_buf;
-    size_t                i;
-
-    /** accumulate crc32 for buffer **/
-    crc32 = in_crc32 ^ 0xFFFFFFFF;
-    byte_buf = (unsigned char*) buf;
-
-    for (i = 0; i < buf_len; i++)
-	crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF];
-    
-    return (crc32 ^ 0xFFFFFFFF);
-}
-
-/* perform endian conversion of pixel data */
-static void
-image_endian_swap (pixman_image_t *img,
-		   int             bpp)
-{
-    int       stride = pixman_image_get_stride (img);
-    uint32_t *data = pixman_image_get_data (img);
-    int       height = pixman_image_get_height (img);
-    int i, j;
-
-    /* swap bytes only on big endian systems */
-    volatile uint16_t endian_check_var = 0x1234;
-    if (*(volatile uint8_t *)&endian_check_var != 0x12)
-	return;
-
-    for (i = 0; i < height; i++)
-    {
-	char *line_data = (char *)data + stride * i;
-	
-	/* swap bytes only for 16, 24 and 32 bpp for now */
-	switch (bpp)
-	{
-	case 16:
-	    for (j = 0; j + 2 <= stride; j += 2)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-		line_data[j + 1] = t1;
-		line_data[j + 0] = t2;
-	    }
-	    break;
-
-	case 24:
-	    for (j = 0; j + 3 <= stride; j += 3)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-		char t3 = line_data[j + 2];
-		line_data[j + 2] = t1;
-		line_data[j + 1] = t2;
-		line_data[j + 0] = t3;
-	    }
-	    break;
-
-	case 32:
-	    for (j = 0; j + 4 <= stride; j += 4)
-	    {
-		char t1 = line_data[j + 0];
-		char t2 = line_data[j + 1];
-		char t3 = line_data[j + 2];
-		char t4 = line_data[j + 3];
-		line_data[j + 3] = t1;
-		line_data[j + 2] = t2;
-		line_data[j + 1] = t3;
-		line_data[j + 0] = t4;
-	    }
-	    break;
-
-	default:
-	    break;
-	}
-    }
-}
+#include "utils.h"
 
 #define MAX_SRC_WIDTH  10
 #define MAX_SRC_HEIGHT 10
@@ -266,7 +75,7 @@ test_composite (uint32_t initcrc,
 
     if (src_stride & 3)
 	src_stride += 2;
-    
+
     if (dst_stride & 3)
 	dst_stride += 2;
 
@@ -326,9 +135,17 @@ test_composite (uint32_t initcrc,
     case 3:
 	repeat = PIXMAN_REPEAT_REFLECT;
 	break;
+
+    default:
+        break;
     }
     pixman_image_set_repeat (src_img, repeat);
 
+    if (lcg_rand_n (2))
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
     if (verbose)
     {
 	printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt);
@@ -362,7 +179,7 @@ test_composite (uint32_t initcrc,
 		        clip_boxes[i].x2, clip_boxes[i].y2);
 	    }
 	}
-	
+
 	pixman_region_init_rects (&clip, clip_boxes, n);
 	pixman_image_set_clip_region (src_img, &clip);
 	pixman_image_set_source_clipping (src_img, 1);
@@ -458,7 +275,7 @@ main (int   argc, char *argv[])
 	    /* predefined value for running with all the fastpath functions disabled  */
 	    /* it needs to be updated every time changes are introduced to this program! */
 
-	    if (crc == 0x0B633CF4)
+	    if (crc == 0x2168ACD1)
 	    {
 		printf ("scaling test passed\n");
 	    }
diff --git a/lib/pixman/test/screen-test.c b/lib/pixman/test/screen-test.c
index 5e02eee08..e69dba3de 100644
--- a/lib/pixman/test/screen-test.c
+++ b/lib/pixman/test/screen-test.c
@@ -1,7 +1,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "pixman.h"
-#include "utils.h"
+#include "gtk-utils.h"
 
 int
 main (int argc, char **argv)
diff --git a/lib/pixman/test/trap-test.c b/lib/pixman/test/trap-test.c
index 1da439bd6..19295e7a5 100644
--- a/lib/pixman/test/trap-test.c
+++ b/lib/pixman/test/trap-test.c
@@ -2,7 +2,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "pixman.h"
-#include "utils.h"
+#include "gtk-utils.h"
 
 int
 main (int argc, char **argv)
diff --git a/lib/pixman/test/utils.c b/lib/pixman/test/utils.c
index a609315c5..58cd100e2 100644
--- a/lib/pixman/test/utils.c
+++ b/lib/pixman/test/utils.c
@@ -1,113 +1,208 @@
-#include <gtk/gtk.h>
-#include <config.h>
-#include "pixman-private.h"	/* For image->bits.format
-				 * FIXME: there should probably be public API for this
-				 */
 #include "utils.h"
 
-GdkPixbuf *
-pixbuf_from_argb32 (uint32_t *bits,
-		    gboolean has_alpha,
-		    int width,
-		    int height,
-		    int stride)
+/* Random number seed
+ */
+
+uint32_t lcg_seed;
+
+/*----------------------------------------------------------------------------*\
+ *  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
+ *
+ *  This program generates the CRC-32 values for the files named in the
+ *  command-line arguments.  These are the same CRC-32 values used by GZIP,
+ *  PKZIP, and ZMODEM.  The Crc32_ComputeBuf () can also be detached and
+ *  used independently.
+ *
+ *  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
+ *
+ *  Based on the byte-oriented implementation "File Verification Using CRC"
+ *  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
+ *
+ *  v1.0.0: original release.
+ *  v1.0.1: fixed printf formats.
+ *  v1.0.2: fixed something else.
+ *  v1.0.3: replaced CRC constant table by generator function.
+ *  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
+ *  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
+\*----------------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------------*\
+ *  NAME:
+ *     compute_crc32 () (was Crc32_ComputeBuf ()) - CRC-32 value of a memory buffer
+ *  DESCRIPTION:
+ *     Computes or accumulates the CRC-32 value for a memory buffer.
+ *     The 'in_crc32' gives a previously accumulated CRC-32 value to allow
+ *     a CRC to be generated for multiple sequential buffer-fuls of data.
+ *     The 'in_crc32' for the first buffer must be zero.
+ *  ARGUMENTS:
+ *     in_crc32 - accumulated CRC-32 value, must be 0 on first call
+ *     buf      - buffer to compute CRC-32 value for
+ *     buf_len  - number of bytes in buffer
+ *  RETURNS:
+ *     crc32 - computed CRC-32 value
+ *  ERRORS:
+ *     (no errors are possible)
+\*----------------------------------------------------------------------------*/
+
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+	       const void *buf,
+	       size_t      buf_len)
 {
-    GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE,
-					8, width, height);
-    int p_stride = gdk_pixbuf_get_rowstride (pixbuf);
-    guint32 *p_bits = (guint32 *)gdk_pixbuf_get_pixels (pixbuf);
-    int w, h;
-    
-    for (h = 0; h < height; ++h)
-    {
-	for (w = 0; w < width; ++w)
-	{
-	    uint32_t argb = bits[h * (stride / 4) + w];
-	    guint r, g, b, a;
-	    char *pb = (char *)p_bits;
+    static const uint32_t crc_table[256] = {
+	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
+	0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+	0x09B64C2B, 0x7EB17CBD,	0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
+	0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,	0x14015C4F, 0x63066CD9,
+	0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+	0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
+	0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
+	0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
+	0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
+	0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
+	0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
+	0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+	0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
+	0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
+	0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
+	0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
+	0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
+	0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
+	0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+	0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
+	0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
+	0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
+	0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+    };
+
+    uint32_t              crc32;
+    unsigned char *       byte_buf;
+    size_t                i;
+
+    /* accumulate crc32 for buffer */
+    crc32 = in_crc32 ^ 0xFFFFFFFF;
+    byte_buf = (unsigned char*) buf;
 
-	    pb += h * p_stride + w * 4;
+    for (i = 0; i < buf_len; i++)
+	crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF];
 
-	    r = (argb & 0x00ff0000) >> 16;
-	    g = (argb & 0x0000ff00) >> 8;
-	    b = (argb & 0x000000ff) >> 0;
-	    a = has_alpha? (argb & 0xff000000) >> 24 : 0xff;
+    return (crc32 ^ 0xFFFFFFFF);
+}
+
+/* perform endian conversion of pixel data
+ */
+void
+image_endian_swap (pixman_image_t *img, int bpp)
+{
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+    int i, j;
 
-	    if (a)
+    /* swap bytes only on big endian systems */
+    volatile uint16_t endian_check_var = 0x1234;
+    if (*(volatile uint8_t *)&endian_check_var != 0x12)
+	return;
+
+    for (i = 0; i < height; i++)
+    {
+	uint8_t *line_data = (uint8_t *)data + stride * i;
+	/* only 1, 4, 16, 24 and 32 bpp are converted for now */
+	switch (bpp)
+	{
+	case 1:
+	    for (j = 0; j < stride; j++)
 	    {
-		r = (r * 255) / a;
-		g = (g * 255) / a;
-		b = (b * 255) / a;
+		line_data[j] =
+		    ((line_data[j] & 0x80) >> 7) |
+		    ((line_data[j] & 0x40) >> 5) |
+		    ((line_data[j] & 0x20) >> 3) |
+		    ((line_data[j] & 0x10) >> 1) |
+		    ((line_data[j] & 0x08) << 1) |
+		    ((line_data[j] & 0x04) << 3) |
+		    ((line_data[j] & 0x02) << 5) |
+		    ((line_data[j] & 0x01) << 7);
 	    }
+	    break;
+	case 4:
+	    for (j = 0; j < stride; j++)
+	    {
+		line_data[j] = (line_data[j] >> 4) | (line_data[j] << 4);
+	    }
+	    break;
+	case 16:
+	    for (j = 0; j + 2 <= stride; j += 2)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+
+		line_data[j + 1] = t1;
+		line_data[j + 0] = t2;
+	    }
+	    break;
+	case 24:
+	    for (j = 0; j + 3 <= stride; j += 3)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+		char t3 = line_data[j + 2];
+
+		line_data[j + 2] = t1;
+		line_data[j + 1] = t2;
+		line_data[j + 0] = t3;
+	    }
+	    break;
+	case 32:
+	    for (j = 0; j + 4 <= stride; j += 4)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+		char t3 = line_data[j + 2];
+		char t4 = line_data[j + 3];
 
-	    if (r > 255) r = 255;
-	    if (g > 255) g = 255;
-	    if (b > 255) b = 255;
-	    
-	    pb[0] = r;
-	    pb[1] = g;
-	    pb[2] = b;
-	    pb[3] = a;
+		line_data[j + 3] = t1;
+		line_data[j + 2] = t2;
+		line_data[j + 1] = t3;
+		line_data[j + 0] = t4;
+	    }
+	    break;
+	default:
+	    break;
 	}
     }
-    
-    return pixbuf;
 }
 
-
-static gboolean
-on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
+uint8_t *
+make_random_bytes (int n_bytes)
 {
-    GdkPixbuf *pixbuf = data;
-    
-    gdk_draw_pixbuf (widget->window, NULL,
-		     pixbuf, 0, 0, 0, 0,
-		     gdk_pixbuf_get_width (pixbuf),
-		     gdk_pixbuf_get_height (pixbuf),
-		     GDK_RGB_DITHER_NONE,
-		     0, 0);
-    
-    return TRUE;
-}
+    uint8_t *bytes = malloc (n_bytes);
+    int i;
 
-void
-show_image (pixman_image_t *image)
-{
-    GtkWidget *window;
-    GdkPixbuf *pixbuf;
-    int width, height, stride;
-    int argc;
-    char **argv;
-    char *arg0 = g_strdup ("pixman-test-program");
-    gboolean has_alpha;
-    pixman_format_code_t format;
-
-    argc = 1;
-    argv = (char **)&arg0;
-
-    gtk_init (&argc, &argv);
-    
-    window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
-    width = pixman_image_get_width (image);
-    height = pixman_image_get_height (image);
-    stride = pixman_image_get_stride (image);
-
-    format = image->bits.format;
-    
-    if (format == PIXMAN_a8r8g8b8)
-	has_alpha = TRUE;
-    else if (format == PIXMAN_x8r8g8b8)
-	has_alpha = FALSE;
-    else
-	g_error ("Can't deal with this format: %x\n", format);
-    
-    pixbuf = pixbuf_from_argb32 (pixman_image_get_data (image), has_alpha,
-				 width, height, stride);
-    
-    g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), pixbuf);
-    g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
-    
-    gtk_widget_show (window);
-    
-    gtk_main ();
+    if (!bytes)
+	return NULL;
+
+    for (i = 0; i < n_bytes; ++i)
+	bytes[i] = lcg_rand () & 0xff;
+
+    return bytes;
 }
diff --git a/lib/pixman/test/utils.h b/lib/pixman/test/utils.h
index bc110d847..fb1ccec48 100644
--- a/lib/pixman/test/utils.h
+++ b/lib/pixman/test/utils.h
@@ -1,6 +1,45 @@
-#include <stdio.h>
 #include <stdlib.h>
-#include <glib.h>
-#include "pixman.h"
+#include <config.h>
+#include "pixman-private.h" /* For 'inline' definition */
 
-void show_image (pixman_image_t *image);
+/* A primitive pseudorandom number generator,
+ * taken from POSIX.1-2001 example
+ */
+
+extern uint32_t lcg_seed;
+
+static inline uint32_t
+lcg_rand (void)
+{
+    lcg_seed = lcg_seed * 1103515245 + 12345;
+    return ((uint32_t)(lcg_seed / 65536) % 32768);
+}
+
+static inline void
+lcg_srand (uint32_t seed)
+{
+    lcg_seed = seed;
+}
+
+static inline uint32_t
+lcg_rand_n (int max)
+{
+    return lcg_rand () % max;
+}
+
+
+/* CRC 32 computation
+ */
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+	       const void *buf,
+	       size_t      buf_len);
+
+/* perform endian conversion of pixel data
+ */
+void
+image_endian_swap (pixman_image_t *img, int bpp);
+
+/* Generate n_bytes random bytes in malloced memory */
+uint8_t *
+make_random_bytes (int n_bytes);
diff --git a/lib/pixman/test/window-test.c b/lib/pixman/test/window-test.c
index bbaa3e211..919fc16ed 100644
--- a/lib/pixman/test/window-test.c
+++ b/lib/pixman/test/window-test.c
@@ -1,8 +1,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <config.h>
-#include "pixman.h"
 #include "pixman-private.h"
+#include "pixman.h"
 
 #define FALSE 0
 #define TRUE 1
@@ -137,8 +137,8 @@ main ()
     pixman_image_t *src, *dest;
     int src_x, src_y, dest_x, dest_y;
     int i, j;
-    int width = get_rand (500);
-    int height = get_rand (500);
+    int width = get_rand (499) + 1;
+    int height = get_rand (499) + 1;
 
     src = make_image (width, height, TRUE, &src_x, &src_y);
     dest = make_image (width, height, FALSE, &dest_x, &dest_y);
author	Matthieu Herrb <matthieu@cvs.openbsd.org>	2010-10-03 18:30:05 +0000
committer	Matthieu Herrb <matthieu@cvs.openbsd.org>	2010-10-03 18:30:05 +0000
commit	519bd19882b18b3cfcccca5fe8e0e6ab6eb3b937 (patch)
tree	1ed8f61276ba41eeaf1ffa509465cd2f767cc3aa /lib/pixman
parent	9b631ded21a25e9a701bb5c1be5a29597ce2e3c9 (diff)