author | Matthieu Herrb <matthieu@cvs.openbsd.org> | 2010-03-25 21:58:53 +0000
---|---|---
committer | Matthieu Herrb <matthieu@cvs.openbsd.org> | 2010-03-25 21:58:53 +0000
commit | cea3749b11718d3b585f653d4acbb6c5287794cb (patch) |
tree | ab4cf3134d3a5b6e9049cde32c7b44ba677554ff /lib |
parent | 1a68a9b7a165123cd605727933898146a409555c (diff) |
Update to pixman 0.16.6. Tested on a full ports build by naddy@.
Diffstat (limited to 'lib')
66 files changed, 24103 insertions, 17591 deletions
diff --git a/lib/pixman/Makefile.bsd-wrapper b/lib/pixman/Makefile.bsd-wrapper
index 310364807..8f45708dc 100644
--- a/lib/pixman/Makefile.bsd-wrapper
+++ b/lib/pixman/Makefile.bsd-wrapper
@@ -1,6 +1,6 @@
-# $OpenBSD: Makefile.bsd-wrapper,v 1.8 2009/06/12 09:16:54 matthieu Exp $
+# $OpenBSD: Makefile.bsd-wrapper,v 1.9 2010/03/25 21:58:52 matthieu Exp $
 
-SHARED_LIBS=	pixman-1	15.8
+SHARED_LIBS=	pixman-1	16.6
 
 .if ${MACHINE} == amd64
 CONFIGURE_ARGS += --disable-sse2
diff --git a/lib/pixman/Makefile.in b/lib/pixman/Makefile.in
index 49e307285..d51ee56ad 100644
--- a/lib/pixman/Makefile.in
+++ b/lib/pixman/Makefile.in
@@ -146,6 +146,7 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@
 PACKAGE_VERSION = @PACKAGE_VERSION@
 PATH_SEPARATOR = @PATH_SEPARATOR@
 PERL = @PERL@
+PIXMAN_TIMERS = @PIXMAN_TIMERS@
 PIXMAN_VERSION_MAJOR = @PIXMAN_VERSION_MAJOR@
 PIXMAN_VERSION_MICRO = @PIXMAN_VERSION_MICRO@
 PIXMAN_VERSION_MINOR = @PIXMAN_VERSION_MINOR@
diff --git a/lib/pixman/README b/lib/pixman/README
index e69de29bb..843b06980 100644
--- a/lib/pixman/README
+++ b/lib/pixman/README
@@ -0,0 +1,26 @@
+pixman is a library that provides low-level pixel manipulation
+features such as image compositing and trapezoid rasterization.
+
+Please submit bugs & patches to the libpixman bugzilla:
+
+	https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
+
+All questions regarding this software should be directed to either the
+Xorg mailing list:
+
+	http://lists.freedesktop.org/mailman/listinfo/xorg
+
+or the cairo mailing list:
+
+	http://lists.freedesktop.org/mailman/listinfo/cairo
+
+The master development code repository can be found at:
+
+	git://anongit.freedesktop.org/git/pixman
+
+	http://gitweb.freedesktop.org/?p=pixman;a=summary
+
+For more information on the git code manager, see:
+
+	http://wiki.x.org/wiki/GitPage
+
diff --git a/lib/pixman/TODO b/lib/pixman/TODO
index 6abeb0b0d..52d737706 100644
--- a/lib/pixman/TODO
+++ b/lib/pixman/TODO
@@ -14,6 +14,8 @@
    the required precision by simply adding offset_x/y to the
    relevant rendering API?
 
+ - Get rid of workaround for X server bug.
+
  - pixman_image_set_indexed() should copy its argument, and
    X should be ported over to use a pixman_image as the
    representation of a Picture, rather than creating one on each
diff --git a/lib/pixman/config.h.in b/lib/pixman/config.h.in
index 01d2b4edf..283eb1a1b 100644
--- a/lib/pixman/config.h.in
+++ b/lib/pixman/config.h.in
@@ -15,6 +15,9 @@
 /* Define to 1 if you have the <memory.h> header file. */
 #undef HAVE_MEMORY_H
 
+/* Whether we have posix_memalign() */
+#undef HAVE_POSIX_MEMALIGN
+
 /* Define to 1 if you have the <stdint.h> header file. */
 #undef HAVE_STDINT_H
 
@@ -54,6 +57,9 @@
 /* Define to the version of this package. */
 #undef PACKAGE_VERSION
 
+/* enable TIMER_BEGIN/TIMER_END macros */
+#undef PIXMAN_TIMERS
+
 /* Define to 1 if you have the ANSI C header files. */
 #undef STDC_HEADERS
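The two new config.h.in entries are feature gates: HAVE_POSIX_MEMALIGN is filled in by a posix_memalign link probe added to configure further down in this diff, and PIXMAN_TIMERS turns on the TIMER_BEGIN/TIMER_END instrumentation macros. As a rough sketch of how such a gate is typically consumed (illustrative only, not pixman's actual source; aligned_malloc_16 is a made-up name):

#include <stdlib.h>

/* Illustration only: allocate 16-byte-aligned memory when the
 * configure probe found posix_memalign(), else fall back to malloc(). */
static void *
aligned_malloc_16 (size_t size)
{
#ifdef HAVE_POSIX_MEMALIGN
    void *p;

    /* posix_memalign() returns 0 on success and stores the pointer in p. */
    if (posix_memalign (&p, 16, size) != 0)
	return NULL;
    return p;
#else
    /* Without the gate, callers must tolerate malloc()'s weaker alignment. */
    return malloc (size);
#endif
}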
diff --git a/lib/pixman/configure b/lib/pixman/configure
index 9a0567cee..262b1f57d 100644
--- a/lib/pixman/configure
+++ b/lib/pixman/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.62 for pixman 0.15.8.
+# Generated by GNU Autoconf 2.62 for pixman 0.16.6.
 #
 # Report bugs to <"sandmann@daimi.au.dk">.
 #
@@ -750,8 +750,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
 
 # Identity of this package.
 PACKAGE_NAME='pixman'
 PACKAGE_TARNAME='pixman'
-PACKAGE_VERSION='0.15.8'
-PACKAGE_STRING='pixman 0.15.8'
+PACKAGE_VERSION='0.16.6'
+PACKAGE_STRING='pixman 0.16.6'
 PACKAGE_BUGREPORT='"sandmann@daimi.au.dk"'
 
 # Factoring default headers for most tests.
@@ -910,13 +910,14 @@ VMX_CFLAGS
 USE_VMX_TRUE
 USE_VMX_FALSE
 ARM_SIMD_CFLAGS
-ARM_NEON_CFLAGS
 USE_ARM_SIMD_TRUE
 USE_ARM_SIMD_FALSE
+ARM_NEON_CFLAGS
 USE_ARM_NEON_TRUE
 USE_ARM_NEON_FALSE
 USE_GCC_INLINE_ASM_TRUE
 USE_GCC_INLINE_ASM_FALSE
+PIXMAN_TIMERS
 PKG_CONFIG
 GTK_CFLAGS
 GTK_LIBS
@@ -943,6 +944,7 @@ enable_vmx
 enable_arm_simd
 enable_arm_neon
 enable_gcc_inline_asm
+enable_timers
 enable_gtk
 '
 ac_precious_vars='build_alias
@@ -1515,7 +1517,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures pixman 0.15.8 to adapt to many kinds of systems.
+\`configure' configures pixman 0.16.6 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1585,7 +1587,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of pixman 0.15.8:";;
+     short | recursive ) echo "Configuration of pixman 0.16.6:";;
   esac
   cat <<\_ACEOF
 
@@ -1607,6 +1609,7 @@ Optional Features:
   --disable-arm-neon      disable ARM NEON fast paths
   --disable-gcc-inline-asm
                           disable GNU-style inline assembler
+  --enable-timers         enable TIMER_BEGIN and TIMER_END macros [default=no]
   --enable-gtk            enable tests using GTK+ [default=auto]
 
 Optional Packages:
@@ -1701,7 +1704,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-pixman configure 0.15.8
+pixman configure 0.16.6
 generated by GNU Autoconf 2.62
 
 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1715,7 +1718,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by pixman $as_me 0.15.8, which was
+It was created by pixman $as_me 0.16.6, which was
 generated by GNU Autoconf 2.62.  Invocation command line was
 
   $ $0 $@
 
@@ -2364,7 +2367,7 @@ fi
 
 # Define the identity of the package.
  PACKAGE='pixman'
- VERSION='0.15.8'
+ VERSION='0.16.6'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -2598,6 +2601,8 @@ case $host_os in
 *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;;
 esac
 
+test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
+
 ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
@@ -4400,7 +4405,7 @@ ia64-*-hpux*)
   ;;
 *-*-irix6*)
   # Find out which ABI we are using.
-  echo '#line 4403 "configure"' > conftest.$ac_ext
+  echo '#line 4408 "configure"' > conftest.$ac_ext
   if { (eval echo "$as_me:$LINENO: \"$ac_compile\"") >&5
   (eval $ac_compile) 2>&5
   ac_status=$?
@@ -7508,11 +7513,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7511: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7516: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>conftest.err)
   ac_status=$?
   cat conftest.err >&5
-   echo "$as_me:7515: \$? = $ac_status" >&5
+   echo "$as_me:7520: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s "$ac_outfile"; then
     # The compiler can only warn and ignore the option if not recognized
     # So say no if there are warnings other than the usual output.
@@ -7798,11 +7803,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7801: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7806: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>conftest.err)
   ac_status=$?
   cat conftest.err >&5
-   echo "$as_me:7805: \$? = $ac_status" >&5
+   echo "$as_me:7810: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s "$ac_outfile"; then
     # The compiler can only warn and ignore the option if not recognized
     # So say no if there are warnings other than the usual output.
@@ -7902,11 +7907,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:7905: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:7910: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>out/conftest.err)
   ac_status=$?
   cat out/conftest.err >&5
-   echo "$as_me:7909: \$? = $ac_status" >&5
+   echo "$as_me:7914: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s out/conftest2.$ac_objext
   then
     # The compiler can only warn and ignore the option if not recognized
@@ -10302,7 +10307,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10305 "configure"
+#line 10310 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -10402,7 +10407,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10405 "configure"
+#line 10410 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -12811,11 +12816,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:12814: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:12819: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>conftest.err)
   ac_status=$?
   cat conftest.err >&5
-   echo "$as_me:12818: \$? = $ac_status" >&5
+   echo "$as_me:12823: \$? = $ac_status" >&5
  if (exit $ac_status) && test -s "$ac_outfile"; then
     # The compiler can only warn and ignore the option if not recognized
     # So say no if there are warnings other than the usual output.
@@ -12915,11 +12920,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:12918: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:12923: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>out/conftest.err)
   ac_status=$?
   cat out/conftest.err >&5
-   echo "$as_me:12922: \$? = $ac_status" >&5
+   echo "$as_me:12927: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s out/conftest2.$ac_objext
   then
     # The compiler can only warn and ignore the option if not recognized
@@ -14498,11 +14503,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:14501: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:14506: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>conftest.err)
   ac_status=$?
   cat conftest.err >&5
-   echo "$as_me:14505: \$? = $ac_status" >&5
+   echo "$as_me:14510: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s "$ac_outfile"; then
     # The compiler can only warn and ignore the option if not recognized
     # So say no if there are warnings other than the usual output.
@@ -14602,11 +14607,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:14605: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:14610: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>out/conftest.err)
   ac_status=$?
   cat out/conftest.err >&5
-   echo "$as_me:14609: \$? = $ac_status" >&5
+   echo "$as_me:14614: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s out/conftest2.$ac_objext
   then
     # The compiler can only warn and ignore the option if not recognized
@@ -16817,11 +16822,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:16820: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:16825: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>conftest.err)
   ac_status=$?
   cat conftest.err >&5
-   echo "$as_me:16824: \$? = $ac_status" >&5
+   echo "$as_me:16829: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s "$ac_outfile"; then
     # The compiler can only warn and ignore the option if not recognized
     # So say no if there are warnings other than the usual output.
@@ -17107,11 +17112,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:17110: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:17115: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>conftest.err)
   ac_status=$?
   cat conftest.err >&5
-   echo "$as_me:17114: \$? = $ac_status" >&5
+   echo "$as_me:17119: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s "$ac_outfile"; then
     # The compiler can only warn and ignore the option if not recognized
     # So say no if there are warnings other than the usual output.
@@ -17211,11 +17216,11 @@ else
    -e 's:.*FLAGS}\{0,1\} :&$lt_compiler_flag :; t' \
    -e 's: [^ ]*conftest\.: $lt_compiler_flag&:; t' \
    -e 's:$: $lt_compiler_flag:'`
-   (eval echo "\"\$as_me:17214: $lt_compile\"" >&5)
+   (eval echo "\"\$as_me:17219: $lt_compile\"" >&5)
   (eval "$lt_compile" 2>out/conftest.err)
   ac_status=$?
   cat out/conftest.err >&5
-   echo "$as_me:17218: \$? = $ac_status" >&5
+   echo "$as_me:17223: \$? = $ac_status" >&5
   if (exit $ac_status) && test -s out/conftest2.$ac_objext
   then
     # The compiler can only warn and ignore the option if not recognized
@@ -20593,6 +20598,16 @@ else
 fi
 
+# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
+# if we're using Sun Studio and neither the user nor a config.site
+# has set CFLAGS.
+if test $SUNCC = yes && \
+   test "$test_CFLAGS" == "" && \
+   test "$CFLAGS" = "-g"
+then
+  CFLAGS="-O -g"
+fi
+
 #
 # We ignore pixman_major in the version here because the major version should
 # always be encoded in the actual library name. Ie., the soname is:
@@ -20603,13 +20618,13 @@ fi
 
-LT_VERSION_INFO="15:8:15"
+LT_VERSION_INFO="16:6:16"
 
 PIXMAN_VERSION_MAJOR=0
 
-PIXMAN_VERSION_MINOR=15
+PIXMAN_VERSION_MINOR=16
 
-PIXMAN_VERSION_MICRO=8
+PIXMAN_VERSION_MICRO=6
 
@@ -20618,10 +20633,18 @@ PIXMAN_VERSION_MICRO=8
 #PKG_CHECK_MODULES(DEP, x11)
 
 if test "x$GCC" = "xyes"; then
+
    case " $CFLAGS " in
    *[\ \ ]-Wall[\ \ ]*) ;;
    *) CFLAGS="$CFLAGS -Wall" ;;
-   esac fi
+   esac
+
+   case " $CFLAGS " in
+   *[\ \ ]-fno-strict-aliasing[\ \ ]*) ;;
+   *) CFLAGS="$CFLAGS -fno-strict-aliasing" ;;
+   esac
+
+fi
 
 # Extract the first word of "perl", so it can be a program name with args.
 set dummy perl; ac_word=$2
 { $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
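The hunk above changes two compiler-flag defaults: Sun Studio builds get -O -g when the user left CFLAGS alone, and GCC builds now always add -fno-strict-aliasing alongside -Wall. The latter matters because pixel code commonly reinterprets buffers through differently typed pointers, which C's strict-aliasing rule would otherwise let the optimizer miscompile. A minimal illustration of the hazard (a sketch, not pixman code):

#include <stdint.h>
#include <string.h>

uint32_t
load_pixel (const uint8_t *row, int x)
{
    /* Reading through (const uint32_t *) (row + 4 * x) breaks the
     * strict-aliasing rule; -fno-strict-aliasing tells the compiler
     * such loads may alias.  memcpy() is the portable spelling. */
    uint32_t px;
    memcpy (&px, row + 4 * x, sizeof px);
    return px;
}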
@@ -20969,9 +20992,53 @@ case $host_os in
 solaris*)
     # When building 32-bit binaries, apply a mapfile to ensure that the
     # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
-    # since they check at runtime before using those instructions
+    # since they check at runtime before using those instructions.
+    # Not all linkers grok the mapfile format so we check for that first.
     if test "$AMD64_ABI" = "no" ; then
-        HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+        use_hwcap_mapfile=no
+        { $as_echo "$as_me:$LINENO: checking whether to use a hardware capability map file" >&5
+$as_echo_n "checking whether to use a hardware capability map file... " >&6; }
+        hwcap_save_LDFLAGS="$LDFLAGS"
+        HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+        LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
+        cat >conftest.$ac_ext <<_ACEOF
+int main() { return 0; }
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_test_x conftest$ac_exeext
+       }; then
+  use_hwcap_mapfile=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	HWCAP_LDFLAGS=""
+fi
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
+        LDFLAGS="$hwcap_save_LDFLAGS"
+        { $as_echo "$as_me:$LINENO: result: $use_hwcap_mapfile" >&5
+$as_echo "$use_hwcap_mapfile" >&6; }
     fi
     if test "x$MMX_LDFLAGS" = "x" ; then
         MMX_LDFLAGS="$HWCAP_LDFLAGS"
@@ -20997,7 +21064,7 @@ have_vmx_intrinsics=no
 { $as_echo "$as_me:$LINENO: checking whether to use VMX/Altivec intrinsics" >&5
 $as_echo_n "checking whether to use VMX/Altivec intrinsics... " >&6; }
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $VMX_CFLAGS"
+CFLAGS="$VMX_CFLAGS $CFLAGS"
 cat >conftest.$ac_ext <<_ACEOF
 
 #if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
 error "Need GCC >= 3.4 for sane altivec support"
@@ -21082,13 +21149,13 @@ else
 fi
 
-ARM_SIMD_CFLAGS=""
+ARM_SIMD_CFLAGS="-mcpu=arm1136j-s"
 
 have_arm_simd=no
 { $as_echo "$as_me:$LINENO: checking whether to use ARM SIMD assembler" >&5
 $as_echo_n "checking whether to use ARM SIMD assembler... " >&6; }
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $ARM_SIMD_CFLAGS"
+CFLAGS="$ARM_SIMD_CFLAGS $CFLAGS"
 cat >conftest.$ac_ext <<_ACEOF
 
 int main () {
@@ -21155,13 +21222,26 @@ $as_echo "$as_me: error: ARM SIMD intrinsics not detected" >&2;}
    { (exit 1); exit 1; }; }
 fi
 
-ARM_NEON_CFLAGS="-mfpu=neon -mfloat-abi=softfp"
+
+
+
+
+if test $have_arm_simd = yes; then
+  USE_ARM_SIMD_TRUE=
+  USE_ARM_SIMD_FALSE='#'
+else
+  USE_ARM_SIMD_TRUE='#'
+  USE_ARM_SIMD_FALSE=
+fi
+
+
+ARM_NEON_CFLAGS="-mfpu=neon -mcpu=cortex-a8"
 
 have_arm_neon=no
 { $as_echo "$as_me:$LINENO: checking whether to use ARM NEON" >&5
 $as_echo_n "checking whether to use ARM NEON... " >&6; }
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $ARM_NEON_CFLAGS"
+CFLAGS="$ARM_NEON_CFLAGS $CFLAGS"
 cat >conftest.$ac_ext <<_ACEOF
 
 #include <arm_neon.h>
 int main () {
@@ -21221,6 +21301,19 @@ else
    ARM_NEON_CFLAGS=
 fi
 
+
+
+
+
+if test $have_arm_neon = yes; then
+  USE_ARM_NEON_TRUE=
+  USE_ARM_NEON_FALSE='#'
+else
+  USE_ARM_NEON_TRUE='#'
+  USE_ARM_NEON_FALSE=
+fi
+
+
 { $as_echo "$as_me:$LINENO: result: $have_arm_neon" >&5
 $as_echo "$have_arm_neon" >&6; }
 if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
@@ -21299,38 +21392,31 @@ fi
 
 
-
-
-
-
-if test $have_arm_simd = yes; then
-  USE_ARM_SIMD_TRUE=
-  USE_ARM_SIMD_FALSE='#'
+if test $have_gcc_inline_asm = yes; then
+  USE_GCC_INLINE_ASM_TRUE=
+  USE_GCC_INLINE_ASM_FALSE='#'
 else
-  USE_ARM_SIMD_TRUE='#'
-  USE_ARM_SIMD_FALSE=
+  USE_GCC_INLINE_ASM_TRUE='#'
+  USE_GCC_INLINE_ASM_FALSE=
 fi
 
-if test $have_arm_neon = yes; then
-  USE_ARM_NEON_TRUE=
-  USE_ARM_NEON_FALSE='#'
+# Check whether --enable-timers was given.
+if test "${enable_timers+set}" = set; then
+  enableval=$enable_timers; enable_timers=$enableval
 else
-  USE_ARM_NEON_TRUE='#'
-  USE_ARM_NEON_FALSE=
+  enable_timers=no
 fi
 
+if test $enable_timers = yes ; then
 
-if test $have_gcc_inline_asm = yes; then
-  USE_GCC_INLINE_ASM_TRUE=
-  USE_GCC_INLINE_ASM_FALSE='#'
-else
-  USE_GCC_INLINE_ASM_TRUE='#'
-  USE_GCC_INLINE_ASM_FALSE=
-fi
+cat >>confdefs.h <<\_ACEOF
+#define PIXMAN_TIMERS 1
+_ACEOF
 
+fi
@@ -21604,6 +21690,106 @@ fi
 
 
+
+{ $as_echo "$as_me:$LINENO: checking for posix_memalign" >&5
+$as_echo_n "checking for posix_memalign... " >&6; }
+if test "${ac_cv_func_posix_memalign+set}" = set; then
+  $as_echo_n "(cached) " >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+/* Define posix_memalign to an innocuous variant, in case <limits.h> declares posix_memalign.
+   For example, HP-UX 11i <limits.h> declares gettimeofday.  */
+#define posix_memalign innocuous_posix_memalign
+
+/* System header to define __stub macros and hopefully few prototypes,
+    which can conflict with char posix_memalign (); below.
+    Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+    <limits.h> exists even on freestanding compilers.  */
+
+#ifdef __STDC__
+# include <limits.h>
+#else
+# include <assert.h>
+#endif
+
+#undef posix_memalign
+
+/* Override any GCC internal prototype to avoid an error.
+   Use char because int might match the return type of a GCC
+   builtin and then its argument prototype would still apply.  */
+#ifdef __cplusplus
+extern "C"
+#endif
+char posix_memalign ();
+/* The GNU C library defines this for functions which it implements
+    to always fail with ENOSYS.  Some functions are actually named
+    something starting with __ and the normal name is an alias.  */
+#if defined __stub_posix_memalign || defined __stub___posix_memalign
+choke me
+#endif
+
+int
+main ()
+{
+return posix_memalign ();
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_test_x conftest$ac_exeext
+       }; then
+  ac_cv_func_posix_memalign=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_func_posix_memalign=no
+fi
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_func_posix_memalign" >&5
+$as_echo "$ac_cv_func_posix_memalign" >&6; }
+if test $ac_cv_func_posix_memalign = yes; then
+  have_posix_memalign=yes
+else
+  have_posix_memalign=no
+fi
+
+if test x$have_posix_memalign = xyes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_POSIX_MEMALIGN 1
+_ACEOF
+
+fi
+
 ac_config_files="$ac_config_files pixman-1.pc pixman-1-uninstalled.pc Makefile pixman/Makefile pixman/pixman-version.h test/Makefile"
 
 cat >confcache <<\_ACEOF
@@ -22096,7 +22282,7 @@ exec 6>&1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by pixman $as_me 0.15.8, which was
+This file was extended by pixman $as_me 0.16.6, which was
 generated by GNU Autoconf 2.62.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -22149,7 +22335,7 @@ Report bugs to <bug-autoconf@gnu.org>."
 
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_version="\\
-pixman config.status 0.15.8
+pixman config.status 0.16.6
 configured by $0, generated by GNU Autoconf 2.62,
   with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
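The posix_memalign test that configure expands above is a pure link check: it declares the symbol with a deliberately meaningless prototype and sees whether the program links, without ever running the call. Stripped of the __stub guards, the conftest boils down to something like:

/* Minimal form of the link probe above (a sketch; only linked, never run). */
char posix_memalign ();

int
main ()
{
    /* If this links, the C library exports a posix_memalign symbol. */
    return posix_memalign ();
}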
diff --git a/lib/pixman/configure.ac b/lib/pixman/configure.ac
index c9085d20b..8fa959ae4 100644
--- a/lib/pixman/configure.ac
+++ b/lib/pixman/configure.ac
@@ -31,7 +31,7 @@ AC_PREREQ([2.57])
 #
 #   - Released development versions have an odd MINOR number
 #
-#   - Released stable versions have an event MINOR number
+#   - Released stable versions have an even MINOR number
 #
 #   - Versions that break ABI must have a new MAJOR number
 #
@@ -53,8 +53,8 @@ AC_PREREQ([2.57])
 #
 m4_define([pixman_major], 0)
-m4_define([pixman_minor], 15)
-m4_define([pixman_micro], 8)
+m4_define([pixman_minor], 16)
+m4_define([pixman_micro], 6)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 
@@ -65,6 +65,8 @@ AM_CONFIG_HEADER(config.h)
 
 AC_CANONICAL_HOST
 
+test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
+
 AC_PROG_CC
 AC_PROG_LIBTOOL
 AC_CHECK_FUNCS([getisax])
@@ -75,6 +77,16 @@ AC_C_INLINE
 AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
 AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
 
+# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
+# if we're using Sun Studio and neither the user nor a config.site
+# has set CFLAGS.
+if test $SUNCC = yes && \
+   test "$test_CFLAGS" == "" && \
+   test "$CFLAGS" = "-g"
+then
+  CFLAGS="-O -g"
+fi
+
 #
 # We ignore pixman_major in the version here because the major version should
 # always be encoded in the actual library name. Ie., the soname is:
@@ -101,10 +113,18 @@ AC_SUBST(LT_VERSION_INFO)
 
 changequote(,)dnl
 if test "x$GCC" = "xyes"; then
+
    case " $CFLAGS " in
    *[\ \ ]-Wall[\ \ ]*) ;;
    *) CFLAGS="$CFLAGS -Wall" ;;
-   esac fi changequote([,])dnl
+   esac
+
+   case " $CFLAGS " in
+   *[\ \ ]-fno-strict-aliasing[\ \ ]*) ;;
+   *) CFLAGS="$CFLAGS -fno-strict-aliasing" ;;
+   esac
+
+fi changequote([,])dnl
 
 AC_PATH_PROG(PERL, perl, no)
 if test "x$PERL" = xno; then
@@ -259,9 +279,19 @@ case $host_os in
 solaris*)
     # When building 32-bit binaries, apply a mapfile to ensure that the
     # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
-    # since they check at runtime before using those instructions
+    # since they check at runtime before using those instructions.
+    # Not all linkers grok the mapfile format so we check for that first.
     if test "$AMD64_ABI" = "no" ; then
-        HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+        use_hwcap_mapfile=no
+        AC_MSG_CHECKING(whether to use a hardware capability map file)
+        hwcap_save_LDFLAGS="$LDFLAGS"
+        HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+        LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
+        AC_LINK_IFELSE([int main() { return 0; }],
+                       use_hwcap_mapfile=yes,
+                       HWCAP_LDFLAGS="")
+        LDFLAGS="$hwcap_save_LDFLAGS"
+        AC_MSG_RESULT($use_hwcap_mapfile)
     fi
     if test "x$MMX_LDFLAGS" = "x" ; then
         MMX_LDFLAGS="$HWCAP_LDFLAGS"
@@ -288,7 +318,7 @@ fi
 have_vmx_intrinsics=no
 AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $VMX_CFLAGS"
+CFLAGS="$VMX_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([
 #if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
 error "Need GCC >= 3.4 for sane altivec support"
@@ -325,13 +355,14 @@ AC_SUBST(VMX_CFLAGS)
 
 AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
 
+dnl ===========================================================================
 dnl Check for ARM SIMD instructions
-ARM_SIMD_CFLAGS=""
+ARM_SIMD_CFLAGS="-mcpu=arm1136j-s"
 
 have_arm_simd=no
 AC_MSG_CHECKING(whether to use ARM SIMD assembler)
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $ARM_SIMD_CFLAGS"
+CFLAGS="$ARM_SIMD_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([
 int main () {
     asm("uqadd8 r1, r1, r2");
@@ -359,13 +390,18 @@ if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
    AC_MSG_ERROR([ARM SIMD intrinsics not detected])
 fi
 
+AC_SUBST(ARM_SIMD_CFLAGS)
+
+AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
+
+dnl ==========================================================================
 dnl Check for ARM NEON instructions
-ARM_NEON_CFLAGS="-mfpu=neon -mfloat-abi=softfp"
+ARM_NEON_CFLAGS="-mfpu=neon -mcpu=cortex-a8"
 
 have_arm_neon=no
 AC_MSG_CHECKING(whether to use ARM NEON)
 xserver_save_CFLAGS=$CFLAGS
-CFLAGS="$CFLAGS $ARM_NEON_CFLAGS"
+CFLAGS="$ARM_NEON_CFLAGS $CFLAGS"
 AC_COMPILE_IFELSE([
 #include <arm_neon.h>
 int main () {
@@ -389,11 +425,16 @@ else
    ARM_NEON_CFLAGS=
 fi
 
+AC_SUBST(ARM_NEON_CFLAGS)
+
+AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
+
 AC_MSG_RESULT($have_arm_neon)
 if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
    AC_MSG_ERROR([ARM NEON intrinsics not detected])
 fi
 
+dnl =========================================================================================
 dnl Check for GNU-style inline assembly support
 
 have_gcc_inline_asm=no
@@ -423,15 +464,23 @@ if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
    AC_MSG_ERROR([GNU-style inline assembler not detected])
 fi
 
+AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
 
-AC_SUBST(ARM_SIMD_CFLAGS)
-AC_SUBST(ARM_NEON_CFLAGS)
+dnl ==============================
+dnl Timers
 
-AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
-AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
-AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
+AC_ARG_ENABLE(timers,
+   [AC_HELP_STRING([--enable-timers],
+                   [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
+   [enable_timers=$enableval], [enable_timers=no])
+if test $enable_timers = yes ; then
+   AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
+fi
+AC_SUBST(PIXMAN_TIMERS)
 
+dnl ===================================
+dnl GTK+
 
 AC_ARG_ENABLE(gtk,
    [AC_HELP_STRING([--enable-gtk],
@@ -452,7 +501,15 @@ AC_SUBST(GTK_CFLAGS)
 AC_SUBST(GTK_LIBS)
 AC_SUBST(DEP_CFLAGS)
 AC_SUBST(DEP_LIBS)
-
+
+dnl =====================================
+dnl posix_memalign
+
+AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
+if test x$have_posix_memalign = xyes; then
+   AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
+fi
+
 AC_OUTPUT([pixman-1.pc
 	   pixman-1-uninstalled.pc
 	   Makefile
diff --git a/lib/pixman/ltmain.sh b/lib/pixman/ltmain.sh
index 248cd4047..fccf69e28 100644
--- a/lib/pixman/ltmain.sh
+++ b/lib/pixman/ltmain.sh
@@ -2127,17 +2127,6 @@ EOF
 	;;
 	esac
 	for pass in $passes; do
-	  # The preopen pass in lib mode reverses $deplibs; put it back here
-	  # so that -L comes before libs that need it for instance...
-	  if test "$linkmode,$pass" = "lib,link"; then
-	    ## FIXME: Find the place where the list is rebuilt in the wrong
-	    ## order, and fix it there properly
-	    tmp_deplibs=
-	    for deplib in $deplibs; do
-	      tmp_deplibs="$deplib $tmp_deplibs"
-	    done
-	    deplibs="$tmp_deplibs"
-	  fi
 	  if test "$linkmode,$pass" = "lib,link" ||
 	     test "$linkmode,$pass" = "prog,scan"; then
 	    libs="$deplibs"
diff --git a/lib/pixman/pixman/Makefile.am b/lib/pixman/pixman/Makefile.am
index 863caa35f..e19fa6e7f 100644
--- a/lib/pixman/pixman/Makefile.am
+++ b/lib/pixman/pixman/Makefile.am
@@ -4,12 +4,14 @@ libpixman_1_la_LIBADD = @DEP_LIBS@ -lm
 libpixman_1_la_CFLAGS = -DPIXMAN_DISABLE_DEPRECATED
 libpixman_1_la_SOURCES = \
 	pixman.h \
+	pixman-accessor.h \
 	pixman-access.c \
 	pixman-access-accessors.c \
 	pixman-cpu.c \
 	pixman-gradient-walker.c \
 	pixman-region16.c \
 	pixman-region32.c \
+	pixman-compiler.h \
 	pixman-private.h \
 	pixman-image.c \
 	pixman-implementation.c \
@@ -18,21 +20,18 @@ libpixman_1_la_SOURCES = \
 	pixman-combine64.c \
 	pixman-combine64.h \
 	pixman-general.c \
-	pixman-pict.c \
+	pixman.c \
 	pixman-fast-path.c \
 	pixman-solid-fill.c \
 	pixman-conical-gradient.c \
 	pixman-linear-gradient.c \
 	pixman-radial-gradient.c \
 	pixman-bits-image.c \
-	pixman-transformed.c \
-	pixman-transformed-accessors.c \
 	pixman-utils.c \
 	pixman-edge.c \
 	pixman-edge-accessors.c \
 	pixman-edge-imp.h \
 	pixman-trap.c \
-	pixman-compute-region.c \
 	pixman-timer.c \
 	pixman-matrix.c
 
@@ -40,18 +39,20 @@ libpixmanincludedir = $(includedir)/pixman-1/
 libpixmaninclude_HEADERS = pixman.h pixman-version.h
 noinst_LTLIBRARIES = 
 
-pixman-combine32.c : combine.inc pixman-combine32.h combine.pl
-	$(PERL) $(srcdir)/combine.pl 8 < $(srcdir)/combine.inc > $@ || ($(RM) $@; exit 1)
-pixman-combine32.h : combine.h.inc combine.pl
-	$(PERL) $(srcdir)/combine.pl 8 < $(srcdir)/combine.h.inc > $@ || ($(RM) $@; exit 1)
+BUILT_SOURCES = pixman-combine32.h pixman-combine32.c pixman-combine64.h pixman-combine64.c
 
-pixman-combine64.c : combine.inc pixman-combine64.h combine.pl
-	$(PERL) $(srcdir)/combine.pl 16 < $(srcdir)/combine.inc > $@ || ($(RM) $@; exit 1)
-pixman-combine64.h : combine.h.inc combine.pl
-	$(PERL) $(srcdir)/combine.pl 16 < $(srcdir)/combine.h.inc > $@ || ($(RM) $@; exit 1)
+pixman-combine32.c : pixman-combine.c.template pixman-combine32.h make-combine.pl
+	$(PERL) $(srcdir)/make-combine.pl 8 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1)
+pixman-combine32.h : pixman-combine.h.template make-combine.pl
+	$(PERL) $(srcdir)/make-combine.pl 8 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1)
 
-EXTRA_DIST = Makefile.win32 combine.inc combine.pl pixman-region.c \
-	combine.h.inc solaris-hwcap.mapfile
+pixman-combine64.c : pixman-combine.c.template pixman-combine64.h make-combine.pl
+	$(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1)
+pixman-combine64.h : pixman-combine.h.template make-combine.pl
+	$(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1)
+
+EXTRA_DIST = Makefile.win32 pixman-combine.c.template make-combine.pl pixman-region.c \
+	pixman-combine.h.template solaris-hwcap.mapfile pixman-x64-mmx-emulation.h
 
 CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h
 
 # mmx code
@@ -91,8 +92,7 @@ endif
 if USE_ARM_SIMD
 noinst_LTLIBRARIES += libpixman-arm-simd.la
 libpixman_arm_simd_la_SOURCES = \
-	pixman-arm-simd.c \
-	pixman-arm-simd.h
+	pixman-arm-simd.c
 libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS) $(ARM_SIMD_CFLAGS)
 libpixman_arm_simd_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LIBADD += libpixman-arm-simd.la
@@ -102,8 +102,7 @@ endif
 if USE_ARM_NEON
 noinst_LTLIBRARIES += libpixman-arm-neon.la
 libpixman_arm_neon_la_SOURCES = \
-	pixman-arm-neon.c \
-	pixman-arm-neon.h
+	pixman-arm-neon.c
 libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS) $(ARM_NEON_CFLAGS)
 libpixman_arm_neon_la_LIBADD = $(DEP_LIBS)
 libpixman_1_la_LIBADD += libpixman-arm-neon.la
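The build changes above replace the old combine.inc/combine.pl pair with pixman-combine.{c,h}.template and make-combine.pl, and register the generated files as BUILT_SOURCES so they are created before the normal targets are built. The same template is instantiated twice, once per component width; roughly (the comp*_t names appear in the generated code below, the concrete widths are inferred from the 8/16 arguments above):

#include <stdint.h>

/* 8 bits per component -> 32-bit pixels (pixman-combine32) */
typedef uint8_t  comp1_t;   /* one color component       */
typedef uint16_t comp2_t;   /* holds a component product */
typedef uint32_t comp4_t;   /* one whole pixel           */

/* The 16-bit instantiation (pixman-combine64) would presumably use
 * uint16_t/uint32_t/uint64_t, with MASK, G_SHIFT etc. scaled to match. */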
libpixman_arm_neon_la_OBJECTS = $(am_libpixman_arm_neon_la_OBJECTS) @USE_ARM_NEON_TRUE@am_libpixman_arm_neon_la_rpath = @USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_DEPENDENCIES = \ @USE_ARM_SIMD_TRUE@ $(am__DEPENDENCIES_6) -am__libpixman_arm_simd_la_SOURCES_DIST = pixman-arm-simd.c \ - pixman-arm-simd.h +am__libpixman_arm_simd_la_SOURCES_DIST = pixman-arm-simd.c @USE_ARM_SIMD_TRUE@am_libpixman_arm_simd_la_OBJECTS = \ @USE_ARM_SIMD_TRUE@ libpixman_arm_simd_la-pixman-arm-simd.lo libpixman_arm_simd_la_OBJECTS = $(am_libpixman_arm_simd_la_OBJECTS) @@ -235,6 +230,7 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ PERL = @PERL@ +PIXMAN_TIMERS = @PIXMAN_TIMERS@ PIXMAN_VERSION_MAJOR = @PIXMAN_VERSION_MAJOR@ PIXMAN_VERSION_MICRO = @PIXMAN_VERSION_MICRO@ PIXMAN_VERSION_MINOR = @PIXMAN_VERSION_MINOR@ @@ -315,12 +311,14 @@ libpixman_1_la_LIBADD = @DEP_LIBS@ -lm $(am__append_3) $(am__append_5) \ libpixman_1_la_CFLAGS = -DPIXMAN_DISABLE_DEPRECATED libpixman_1_la_SOURCES = \ pixman.h \ + pixman-accessor.h \ pixman-access.c \ pixman-access-accessors.c \ pixman-cpu.c \ pixman-gradient-walker.c \ pixman-region16.c \ pixman-region32.c \ + pixman-compiler.h \ pixman-private.h \ pixman-image.c \ pixman-implementation.c \ @@ -329,21 +327,18 @@ libpixman_1_la_SOURCES = \ pixman-combine64.c \ pixman-combine64.h \ pixman-general.c \ - pixman-pict.c \ + pixman.c \ pixman-fast-path.c \ pixman-solid-fill.c \ pixman-conical-gradient.c \ pixman-linear-gradient.c \ pixman-radial-gradient.c \ pixman-bits-image.c \ - pixman-transformed.c \ - pixman-transformed-accessors.c \ pixman-utils.c \ pixman-edge.c \ pixman-edge-accessors.c \ pixman-edge-imp.h \ pixman-trap.c \ - pixman-compute-region.c \ pixman-timer.c \ pixman-matrix.c @@ -351,8 +346,9 @@ libpixmanincludedir = $(includedir)/pixman-1/ libpixmaninclude_HEADERS = pixman.h pixman-version.h noinst_LTLIBRARIES = $(am__append_1) $(am__append_4) $(am__append_6) \ $(am__append_9) $(am__append_11) -EXTRA_DIST = Makefile.win32 combine.inc combine.pl pixman-region.c \ - combine.h.inc solaris-hwcap.mapfile +BUILT_SOURCES = pixman-combine32.h pixman-combine32.c pixman-combine64.h pixman-combine64.c +EXTRA_DIST = Makefile.win32 pixman-combine.c.template make-combine.pl pixman-region.c \ + pixman-combine.h.template solaris-hwcap.mapfile pixman-x64-mmx-emulation.h CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h @USE_MMX_TRUE@libpixman_mmx_la_SOURCES = \ @@ -372,18 +368,17 @@ CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-com @USE_SSE2_TRUE@libpixman_sse2_la_CFLAGS = $(DEP_CFLAGS) $(SSE2_CFLAGS) @USE_SSE2_TRUE@libpixman_sse2_la_LIBADD = $(DEP_LIBS) @USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_SOURCES = \ -@USE_ARM_SIMD_TRUE@ pixman-arm-simd.c \ -@USE_ARM_SIMD_TRUE@ pixman-arm-simd.h +@USE_ARM_SIMD_TRUE@ pixman-arm-simd.c @USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_CFLAGS = $(DEP_CFLAGS) $(ARM_SIMD_CFLAGS) @USE_ARM_SIMD_TRUE@libpixman_arm_simd_la_LIBADD = $(DEP_LIBS) @USE_ARM_NEON_TRUE@libpixman_arm_neon_la_SOURCES = \ -@USE_ARM_NEON_TRUE@ pixman-arm-neon.c \ -@USE_ARM_NEON_TRUE@ pixman-arm-neon.h +@USE_ARM_NEON_TRUE@ pixman-arm-neon.c @USE_ARM_NEON_TRUE@libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS) $(ARM_NEON_CFLAGS) @USE_ARM_NEON_TRUE@libpixman_arm_neon_la_LIBADD = $(DEP_LIBS) -all: all-am +all: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) all-am .SUFFIXES: .SUFFIXES: .c .lo .o .obj @@ -478,7 +473,6 @@ distclean-compile: @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/libpixman_1_la-pixman-bits-image.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-combine32.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-combine64.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-compute-region.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-conical-gradient.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-cpu.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-edge-accessors.Plo@am__quote@ @@ -490,16 +484,14 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-implementation.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-linear-gradient.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-matrix.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-pict.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-radial-gradient.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-region16.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-region32.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-solid-fill.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-timer.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-transformed-accessors.Plo@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-transformed.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-trap.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman-utils.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_1_la-pixman.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_arm_neon_la-pixman-arm-neon.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_arm_simd_la-pixman-arm-simd.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libpixman_mmx_la-pixman-mmx.Plo@am__quote@ @@ -604,12 +596,12 @@ libpixman_1_la-pixman-general.lo: pixman-general.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-general.lo `test -f 'pixman-general.c' || echo '$(srcdir)/'`pixman-general.c -libpixman_1_la-pixman-pict.lo: pixman-pict.c -@am__fastdepCC_TRUE@ if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-pict.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-pict.Tpo" -c -o libpixman_1_la-pixman-pict.lo `test -f 'pixman-pict.c' || echo '$(srcdir)/'`pixman-pict.c; \ -@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/libpixman_1_la-pixman-pict.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-pict.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-pict.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='pixman-pict.c' object='libpixman_1_la-pixman-pict.lo' libtool=yes @AMDEPBACKSLASH@ +libpixman_1_la-pixman.lo: pixman.c +@am__fastdepCC_TRUE@ if 
$(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman.Tpo" -c -o libpixman_1_la-pixman.lo `test -f 'pixman.c' || echo '$(srcdir)/'`pixman.c; \ +@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/libpixman_1_la-pixman.Tpo" "$(DEPDIR)/libpixman_1_la-pixman.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman.Tpo"; exit 1; fi +@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='pixman.c' object='libpixman_1_la-pixman.lo' libtool=yes @AMDEPBACKSLASH@ @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-pict.lo `test -f 'pixman-pict.c' || echo '$(srcdir)/'`pixman-pict.c +@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman.lo `test -f 'pixman.c' || echo '$(srcdir)/'`pixman.c libpixman_1_la-pixman-fast-path.lo: pixman-fast-path.c @am__fastdepCC_TRUE@ if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-fast-path.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-fast-path.Tpo" -c -o libpixman_1_la-pixman-fast-path.lo `test -f 'pixman-fast-path.c' || echo '$(srcdir)/'`pixman-fast-path.c; \ @@ -653,20 +645,6 @@ libpixman_1_la-pixman-bits-image.lo: pixman-bits-image.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-bits-image.lo `test -f 'pixman-bits-image.c' || echo '$(srcdir)/'`pixman-bits-image.c -libpixman_1_la-pixman-transformed.lo: pixman-transformed.c -@am__fastdepCC_TRUE@ if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-transformed.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-transformed.Tpo" -c -o libpixman_1_la-pixman-transformed.lo `test -f 'pixman-transformed.c' || echo '$(srcdir)/'`pixman-transformed.c; \ -@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/libpixman_1_la-pixman-transformed.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-transformed.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-transformed.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='pixman-transformed.c' object='libpixman_1_la-pixman-transformed.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-transformed.lo `test -f 'pixman-transformed.c' || echo '$(srcdir)/'`pixman-transformed.c - -libpixman_1_la-pixman-transformed-accessors.lo: pixman-transformed-accessors.c -@am__fastdepCC_TRUE@ if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-transformed-accessors.lo 
-MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-transformed-accessors.Tpo" -c -o libpixman_1_la-pixman-transformed-accessors.lo `test -f 'pixman-transformed-accessors.c' || echo '$(srcdir)/'`pixman-transformed-accessors.c; \ -@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/libpixman_1_la-pixman-transformed-accessors.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-transformed-accessors.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-transformed-accessors.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='pixman-transformed-accessors.c' object='libpixman_1_la-pixman-transformed-accessors.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-transformed-accessors.lo `test -f 'pixman-transformed-accessors.c' || echo '$(srcdir)/'`pixman-transformed-accessors.c - libpixman_1_la-pixman-utils.lo: pixman-utils.c @am__fastdepCC_TRUE@ if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-utils.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-utils.Tpo" -c -o libpixman_1_la-pixman-utils.lo `test -f 'pixman-utils.c' || echo '$(srcdir)/'`pixman-utils.c; \ @am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/libpixman_1_la-pixman-utils.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-utils.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-utils.Tpo"; exit 1; fi @@ -695,13 +673,6 @@ libpixman_1_la-pixman-trap.lo: pixman-trap.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-trap.lo `test -f 'pixman-trap.c' || echo '$(srcdir)/'`pixman-trap.c -libpixman_1_la-pixman-compute-region.lo: pixman-compute-region.c -@am__fastdepCC_TRUE@ if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-compute-region.lo -MD -MP -MF "$(DEPDIR)/libpixman_1_la-pixman-compute-region.Tpo" -c -o libpixman_1_la-pixman-compute-region.lo `test -f 'pixman-compute-region.c' || echo '$(srcdir)/'`pixman-compute-region.c; \ -@am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/libpixman_1_la-pixman-compute-region.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-compute-region.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-compute-region.Tpo"; exit 1; fi -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='pixman-compute-region.c' object='libpixman_1_la-pixman-compute-region.lo' libtool=yes @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -c -o libpixman_1_la-pixman-compute-region.lo `test -f 'pixman-compute-region.c' || echo '$(srcdir)/'`pixman-compute-region.c - libpixman_1_la-pixman-timer.lo: pixman-timer.c @am__fastdepCC_TRUE@ if $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libpixman_1_la_CFLAGS) $(CFLAGS) -MT libpixman_1_la-pixman-timer.lo -MD -MP -MF 
"$(DEPDIR)/libpixman_1_la-pixman-timer.Tpo" -c -o libpixman_1_la-pixman-timer.lo `test -f 'pixman-timer.c' || echo '$(srcdir)/'`pixman-timer.c; \ @am__fastdepCC_TRUE@ then mv -f "$(DEPDIR)/libpixman_1_la-pixman-timer.Tpo" "$(DEPDIR)/libpixman_1_la-pixman-timer.Plo"; else rm -f "$(DEPDIR)/libpixman_1_la-pixman-timer.Tpo"; exit 1; fi @@ -854,13 +825,15 @@ distdir: $(DISTFILES) fi; \ done check-am: all-am -check: check-am +check: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) check-am all-am: Makefile $(LTLIBRARIES) $(HEADERS) installdirs: for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(libpixmanincludedir)"; do \ test -z "$$dir" || $(mkdir_p) "$$dir"; \ done -install: install-am +install: $(BUILT_SOURCES) + $(MAKE) $(AM_MAKEFLAGS) install-am install-exec: install-exec-am install-data: install-data-am uninstall: uninstall-am @@ -885,6 +858,7 @@ distclean-generic: maintainer-clean-generic: @echo "This command is intended for maintainers to use" @echo "it deletes files that may require special tools to rebuild." + -test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES) clean: clean-am clean-am: clean-generic clean-libLTLIBRARIES clean-libtool \ @@ -952,15 +926,15 @@ uninstall-am: uninstall-info-am uninstall-libLTLIBRARIES \ uninstall-libLTLIBRARIES uninstall-libpixmanincludeHEADERS -pixman-combine32.c : combine.inc pixman-combine32.h combine.pl - $(PERL) $(srcdir)/combine.pl 8 < $(srcdir)/combine.inc > $@ || ($(RM) $@; exit 1) -pixman-combine32.h : combine.h.inc combine.pl - $(PERL) $(srcdir)/combine.pl 8 < $(srcdir)/combine.h.inc > $@ || ($(RM) $@; exit 1) +pixman-combine32.c : pixman-combine.c.template pixman-combine32.h make-combine.pl + $(PERL) $(srcdir)/make-combine.pl 8 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1) +pixman-combine32.h : pixman-combine.h.template make-combine.pl + $(PERL) $(srcdir)/make-combine.pl 8 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1) -pixman-combine64.c : combine.inc pixman-combine64.h combine.pl - $(PERL) $(srcdir)/combine.pl 16 < $(srcdir)/combine.inc > $@ || ($(RM) $@; exit 1) -pixman-combine64.h : combine.h.inc combine.pl - $(PERL) $(srcdir)/combine.pl 16 < $(srcdir)/combine.h.inc > $@ || ($(RM) $@; exit 1) +pixman-combine64.c : pixman-combine.c.template pixman-combine64.h make-combine.pl + $(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.c.template > $@ || ($(RM) $@; exit 1) +pixman-combine64.h : pixman-combine.h.template make-combine.pl + $(PERL) $(srcdir)/make-combine.pl 16 < $(srcdir)/pixman-combine.h.template > $@ || ($(RM) $@; exit 1) # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. 
.NOEXPORT: diff --git a/lib/pixman/pixman/Makefile.win32 b/lib/pixman/pixman/Makefile.win32 index 208bb2e39..388bee23a 100644 --- a/lib/pixman/pixman/Makefile.win32 +++ b/lib/pixman/pixman/Makefile.win32 @@ -29,27 +29,30 @@ else CFLAGS += -O2 endif -SOURCES = \ - pixman-image.c \ - pixman-access.c \ - pixman-access-accessors.c \ +SOURCES = \ + pixman-image.c \ + pixman-access.c \ + pixman-access-accessors.c \ pixman-region16.c \ pixman-region32.c \ - pixman-compose.c \ - pixman-compose-accessors.c \ - pixman-combine32.c \ - pixman-combine64.c \ - pixman-pict.c \ - pixman-source.c \ - pixman-transformed.c \ - pixman-transformed-accessors.c \ - pixman-utils.c \ - pixman-edge.c \ + pixman-combine32.c \ + pixman-combine64.c \ + pixman-utils.c \ + pixman-edge.c \ pixman-edge-accessors.c \ - pixman-trap.c \ - pixman-compute-region.c \ - pixman-timer.c \ - pixman-matrix.c \ + pixman-trap.c \ + pixman-timer.c \ + pixman-matrix.c \ + pixman-gradient-walker.c \ + pixman-linear-gradient.c \ + pixman-radial-gradient.c \ + pixman-bits-image.c \ + pixman.c \ + pixman-cpu.c \ + pixman-fast-path.c \ + pixman-implementation.c \ + pixman-solid-fill.c \ + pixman-general.c \ $(NULL) # MMX compilation flags @@ -125,16 +128,16 @@ $(CFG_VAR)/%.obj: %.c $(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS) lib -NOLOGO -OUT:$@ $(OBJECTS) || exit 0 -pixman-combine32.c: combine.inc pixman-combine32.h combine.pl - perl ./combine.pl 8 < $< > $@ || ($(RM) $@; exit 1) -pixman-combine32.h: combine.h.inc combine.pl - perl ./combine.pl 8 < $< > $@ || ($(RM) $@; exit 1) +pixman-combine32.c: pixman-combine.c.template pixman-combine32.h make-combine.pl + perl ./make-combine.pl 8 < $< > $@ || ($(RM) $@; exit 1) +pixman-combine32.h: pixman-combine.h.template make-combine.pl + perl ./make-combine.pl 8 < $< > $@ || ($(RM) $@; exit 1) -pixman-combine64.c: combine.inc pixman-combine64.h combine.pl - perl ./combine.pl 16 < $< > $@ || ($(RM) $@; exit 1) -pixman-combine64.h: combine.h.inc combine.pl - perl ./combine.pl 16 < $< > $@ || ($(RM) $@; exit 1) +pixman-combine64.c: pixman-combine.c.template pixman-combine64.h make-combine.pl + perl ./make-combine.pl 16 < $< > $@ || ($(RM) $@; exit 1) +pixman-combine64.h: pixman-combine.h.template make-combine.pl + perl ./make-combine.pl 16 < $< > $@ || ($(RM) $@; exit 1) clean_r: @rm -f $(CFG_VAR)/*.obj $(CFG_VAR)/*.lib $(CFG_VAR)/*.pdb $(CFG)/*.ilk || exit 0 - @rm -f $(CFG)/*.obj $(CFG)/*.lib $(CFG)/*.pdb $(CFG)/*.ilk pixman-combine32.c pixman-combine64.c || exit 0 + @rm -f $(CFG)/*.obj $(CFG)/*.lib $(CFG)/*.pdb $(CFG)/*.ilk pixman-combine32.c pixman-combine64.c pixman-combine64.c pixman-combine64.h || exit 0 diff --git a/lib/pixman/pixman/combine.h.inc b/lib/pixman/pixman/combine.h.inc deleted file mode 100644 index 6ecd30139..000000000 --- a/lib/pixman/pixman/combine.h.inc +++ /dev/null @@ -1,213 +0,0 @@ - -#define COMPONENT_SIZE -#define MASK -#define ONE_HALF - -#define A_SHIFT -#define R_SHIFT -#define G_SHIFT -#define A_MASK -#define R_MASK -#define G_MASK - -#define RB_MASK -#define AG_MASK -#define RB_ONE_HALF -#define RB_MASK_PLUS_ONE - -#define Alpha(x) ((x) >> A_SHIFT) - -/* - * Helper macros. 
- */ - -#define IntMult(a,b,t) ( (t) = (a) * (b) + ONE_HALF, ( ( ( (t)>>G_SHIFT ) + (t) )>>G_SHIFT ) ) -#define IntDiv(a,b) (((comp2_t) (a) * MASK) / (b)) - -#define GetComp(v,i) ((comp2_t) (comp1_t) ((v) >> i)) - -#define Add(x,y,i,t) ((t) = GetComp(x,i) + GetComp(y,i), \ - (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i)) - -#define FbGen(x,y,i,ax,ay,t,u,v) ((t) = (IntMult(GetComp(y,i),ay,(u)) + \ - IntMult(GetComp(x,i),ax,(v))), \ - (comp4_t) ((comp1_t) ((t) | \ - (0 - ((t) >> G_SHIFT)))) << (i)) - -/* - The methods below use some tricks to be able to do two color - components at the same time. -*/ - -/* - x_c = (x_c * a) / 255 -*/ -#define FbByteMul(x, a) do { \ - comp4_t t = ((x & RB_MASK) * a) + RB_ONE_HALF; \ - t = (t + ((t >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \ - t &= RB_MASK; \ - \ - x = (((x >> COMPONENT_SIZE) & RB_MASK) * a) + RB_ONE_HALF; \ - x = (x + ((x >> COMPONENT_SIZE) & RB_MASK)); \ - x &= RB_MASK << COMPONENT_SIZE; \ - x += t; \ - } while (0) - -/* - x_c = (x_c * a) / 255 + y -*/ -#define FbByteMulAdd(x, a, y) do { \ - /* multiply and divide: trunc((i + 128)*257/65536) */ \ - comp4_t t = ((x & RB_MASK) * a) + RB_ONE_HALF; \ - t = (t + ((t >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \ - t &= RB_MASK; \ - \ - /* add */ \ - t += y & RB_MASK; \ - \ - /* saturate */ \ - t |= RB_MASK_PLUS_ONE - ((t >> COMPONENT_SIZE) & RB_MASK); \ - t &= RB_MASK; \ - \ - /* multiply and divide */ \ - x = (((x >> COMPONENT_SIZE) & RB_MASK) * a) + RB_ONE_HALF; \ - x = (x + ((x >> COMPONENT_SIZE) & RB_MASK)) >> COMPONENT_SIZE; \ - x &= RB_MASK; \ - \ - /* add */ \ - x += (y >> COMPONENT_SIZE) & RB_MASK; \ - \ - /* saturate */ \ - x |= RB_MASK_PLUS_ONE - ((x >> COMPONENT_SIZE) & RB_MASK); \ - x &= RB_MASK; \ - \ - /* recombine */ \ - x <<= COMPONENT_SIZE; \ - x += t; \ - } while (0) - -/* - x_c = (x_c * a + y_c * b) / 255 -*/ -#define FbByteAddMul(x, a, y, b) do { \ - comp4_t t; \ - comp4_t r = (x >> A_SHIFT) * a + (y >> A_SHIFT) * b + ONE_HALF; \ - r += (r >> G_SHIFT); \ - r >>= G_SHIFT; \ - \ - t = (x & G_MASK) * a + (y & G_MASK) * b; \ - t += (t >> G_SHIFT) + (ONE_HALF << G_SHIFT); \ - t >>= R_SHIFT; \ - \ - t |= r << R_SHIFT; \ - t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \ - t &= RB_MASK; \ - t <<= G_SHIFT; \ - \ - r = ((x >> R_SHIFT) & MASK) * a + \ - ((y >> R_SHIFT) & MASK) * b + ONE_HALF; \ - r += (r >> G_SHIFT); \ - r >>= G_SHIFT; \ - \ - x = (x & MASK) * a + (y & MASK) * b + ONE_HALF; \ - x += (x >> G_SHIFT); \ - x >>= G_SHIFT; \ - x |= r << R_SHIFT; \ - x |= RB_MASK_PLUS_ONE - ((x >> G_SHIFT) & RB_MASK); \ - x &= RB_MASK; \ - x |= t; \ - } while (0) - -/* - x_c = (x_c * a_c) / 255 -*/ -#define FbByteMulC(x, a) do { \ - comp4_t t; \ - comp4_t r = (x & MASK) * (a & MASK); \ - r |= (x & R_MASK) * ((a >> R_SHIFT) & MASK); \ - r += RB_ONE_HALF; \ - r = (r + ((r >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \ - r &= RB_MASK; \ - \ - x >>= G_SHIFT; \ - t = (x & MASK) * ((a >> G_SHIFT) & MASK); \ - t |= (x & R_MASK) * (a >> A_SHIFT); \ - t += RB_ONE_HALF; \ - t = t + ((t >> G_SHIFT) & RB_MASK); \ - x = r | (t & AG_MASK); \ - } while (0) - -/* - x_c = (x_c * a) / 255 + y -*/ -#define FbByteMulAddC(x, a, y) do { \ - comp4_t t; \ - comp4_t r = (x & MASK) * (a & MASK); \ - r |= (x & R_MASK) * ((a >> R_SHIFT) & MASK); \ - r += RB_ONE_HALF; \ - r = (r + ((r >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \ - r &= RB_MASK; \ - r += y & RB_MASK; \ - r |= RB_MASK_PLUS_ONE - ((r >> G_SHIFT) & RB_MASK); \ - r &= RB_MASK; \ - \ - x >>= G_SHIFT; \ - t = (x & MASK) * ((a >> G_SHIFT) & MASK); 
\ - t |= (x & R_MASK) * (a >> A_SHIFT); \ - t += RB_ONE_HALF; \ - t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \ - t &= RB_MASK; \ - t += (y >> G_SHIFT) & RB_MASK; \ - t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \ - t &= RB_MASK; \ - x = r | (t << G_SHIFT); \ - } while (0) - -/* - x_c = (x_c * a_c + y_c * b) / 255 -*/ -#define FbByteAddMulC(x, a, y, b) do { \ - comp4_t t; \ - comp4_t r = (x >> A_SHIFT) * (a >> A_SHIFT) + \ - (y >> A_SHIFT) * b; \ - r += (r >> G_SHIFT) + ONE_HALF; \ - r >>= G_SHIFT; \ - \ - t = (x & G_MASK) * ((a >> G_SHIFT) & MASK) + (y & G_MASK) * b; \ - t += (t >> G_SHIFT) + (ONE_HALF << G_SHIFT); \ - t >>= R_SHIFT; \ - \ - t |= r << R_SHIFT; \ - t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \ - t &= RB_MASK; \ - t <<= G_SHIFT; \ - \ - r = ((x >> R_SHIFT) & MASK) * ((a >> R_SHIFT) & MASK) + \ - ((y >> R_SHIFT) & MASK) * b + ONE_HALF; \ - r += (r >> G_SHIFT); \ - r >>= G_SHIFT; \ - \ - x = (x & MASK) * (a & MASK) + (y & MASK) * b + ONE_HALF; \ - x += (x >> G_SHIFT); \ - x >>= G_SHIFT; \ - x |= r << R_SHIFT; \ - x |= RB_MASK_PLUS_ONE - ((x >> G_SHIFT) & RB_MASK); \ - x &= RB_MASK; \ - x |= t; \ - } while (0) - -/* - x_c = min(x_c + y_c, 255) -*/ -#define FbByteAdd(x, y) do { \ - comp4_t t; \ - comp4_t r = (x & RB_MASK) + (y & RB_MASK); \ - r |= RB_MASK_PLUS_ONE - ((r >> G_SHIFT) & RB_MASK); \ - r &= RB_MASK; \ - \ - t = ((x >> G_SHIFT) & RB_MASK) + ((y >> G_SHIFT) & RB_MASK); \ - t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \ - r |= (t & RB_MASK) << G_SHIFT; \ - x = r; \ - } while (0) - diff --git a/lib/pixman/pixman/combine.inc b/lib/pixman/pixman/combine.inc deleted file mode 100644 index 0d5569400..000000000 --- a/lib/pixman/pixman/combine.inc +++ /dev/null @@ -1,1339 +0,0 @@ -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include <string.h> - -#include "pixman-private.h" - -#include "pixman-combine.h" - -/* - * There are two ways of handling alpha -- either as a single unified value or - * a separate value for each component, hence each macro must have two - * versions. The unified alpha version has a 'U' at the end of the name, - * the component version has a 'C'. Similarly, functions which deal with - * this difference will have two versions using the same convention. 
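- *
- * For example (an illustrative gloss): with s = 0xAARRGGBB and mask m,
- * the unified ('U') path scales every channel of s by the single alpha
- * byte m >> 24 (see combineMask() below), while the component ('C')
- * path scales each channel by the matching byte of m, as
- * fbCombineMaskValueC() does further down.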
- */ - - -/* - * All of the composing functions - */ - -static force_inline comp4_t -combineMask (const comp4_t *src, const comp4_t *mask, int i) -{ - comp4_t s, m; - - if (mask) - { - m = *(mask + i) >> A_SHIFT; - - if (!m) - return 0; - } - - s = *(src + i); - - if (mask) - FbByteMul (s, m); - - return s; -} - -FASTCALL static void -fbCombineClear (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - memset(dest, 0, width*sizeof(comp4_t)); -} - -FASTCALL static void -fbCombineSrcU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - if (!mask) - memcpy (dest, src, width * sizeof (comp4_t)); - else - { - for (i = 0; i < width; ++i) - { - comp4_t s = combineMask (src, mask, i); - - *(dest + i) = s; - } - } -} - -/* if the Src is opaque, call fbCombineSrcU */ -FASTCALL static void -fbCombineOverU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t ia = Alpha(~s); - - FbByteMulAdd(d, ia, s); - *(dest + i) = d; - } -} - -/* if the Dst is opaque, this is a noop */ -FASTCALL static void -fbCombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t ia = Alpha(~*(dest + i)); - FbByteMulAdd(s, ia, d); - *(dest + i) = s; - } -} - -/* if the Dst is opaque, call fbCombineSrcU */ -FASTCALL static void -fbCombineInU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t a = Alpha(*(dest + i)); - FbByteMul(s, a); - *(dest + i) = s; - } -} - -/* if the Src is opaque, this is a noop */ -FASTCALL static void -fbCombineInReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t a = Alpha(s); - FbByteMul(d, a); - *(dest + i) = d; - } -} - -/* if the Dst is opaque, call fbCombineClear */ -FASTCALL static void -fbCombineOutU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t a = Alpha(~*(dest + i)); - FbByteMul(s, a); - *(dest + i) = s; - } -} - -/* if the Src is opaque, call fbCombineClear */ -FASTCALL static void -fbCombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t a = Alpha(~s); - FbByteMul(d, a); - *(dest + i) = d; - } -} - -/* if the Src is opaque, call fbCombineInU */ -/* if the Dst is opaque, call fbCombineOverU */ -/* if both the Src and Dst are opaque, call fbCombineSrcU */ -FASTCALL static void -fbCombineAtopU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - 
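/* ATOP: Fa is the dest alpha and Fb is (1 - src alpha); the
-	   FbByteAddMul() below computes (s*da + d*(255 - sa)) / 255
-	   per channel. */
-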
comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t dest_a = Alpha(d); - comp4_t src_ia = Alpha(~s); - - FbByteAddMul(s, dest_a, d, src_ia); - *(dest + i) = s; - } -} - -/* if the Src is opaque, call fbCombineOverReverseU */ -/* if the Dst is opaque, call fbCombineInReverseU */ -/* if both the Src and Dst are opaque, call fbCombineDstU */ -FASTCALL static void -fbCombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t src_a = Alpha(s); - comp4_t dest_ia = Alpha(~d); - - FbByteAddMul(s, dest_ia, d, src_a); - *(dest + i) = s; - } -} - -/* if the Src is opaque, call fbCombineOverU */ -/* if the Dst is opaque, call fbCombineOverReverseU */ -/* if both the Src and Dst are opaque, call fbCombineClear */ -FASTCALL static void -fbCombineXorU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t src_ia = Alpha(~s); - comp4_t dest_ia = Alpha(~d); - - FbByteAddMul(s, dest_ia, d, src_ia); - *(dest + i) = s; - } -} - -FASTCALL static void -fbCombineAddU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - FbByteAdd(d, s); - *(dest + i) = d; - } -} - -/* if the Src is opaque, call fbCombineAddU */ -/* if the Dst is opaque, call fbCombineAddU */ -/* if both the Src and Dst are opaque, call fbCombineAddU */ -FASTCALL static void -fbCombineSaturateU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp2_t sa, da; - - sa = s >> A_SHIFT; - da = ~d >> A_SHIFT; - if (sa > da) - { - sa = IntDiv(da, sa); - FbByteMul(s, sa); - }; - FbByteAdd(d, s); - *(dest + i) = d; - } -} - - -/* - * All of the disjoint composing functions - - The four entries in the first column indicate what source contributions - come from each of the four areas of the picture -- areas covered by neither - A nor B, areas covered only by A, areas covered only by B and finally - areas covered by both A and B. 
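-
-  (Here a and b are the alphas of the two operands and Fa/Fb are the
-  fractions of each operand that contribute to the result.  The
-  disjoint operators assume the operand geometries overlap as little
-  as their alphas allow, the conjoint ones that they overlap as much
-  as possible; that is where the min()/max() clamps in the table
-  below come from.)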
- - Disjoint Conjoint - Fa Fb Fa Fb - (0,0,0,0) 0 0 0 0 - (0,A,0,A) 1 0 1 0 - (0,0,B,B) 0 1 0 1 - (0,A,B,A) 1 min((1-a)/b,1) 1 max(1-a/b,0) - (0,A,B,B) min((1-b)/a,1) 1 max(1-b/a,0) 1 - (0,0,0,A) max(1-(1-b)/a,0) 0 min(1,b/a) 0 - (0,0,0,B) 0 max(1-(1-a)/b,0) 0 min(a/b,1) - (0,A,0,0) min(1,(1-b)/a) 0 max(1-b/a,0) 0 - (0,0,B,0) 0 min(1,(1-a)/b) 0 max(1-a/b,0) - (0,0,B,A) max(1-(1-b)/a,0) min(1,(1-a)/b) min(1,b/a) max(1-a/b,0) - (0,A,0,B) min(1,(1-b)/a) max(1-(1-a)/b,0) max(1-b/a,0) min(1,a/b) - (0,A,B,0) min(1,(1-b)/a) min(1,(1-a)/b) max(1-b/a,0) max(1-a/b,0) - -*/ - -#define CombineAOut 1 -#define CombineAIn 2 -#define CombineBOut 4 -#define CombineBIn 8 - -#define CombineClear 0 -#define CombineA (CombineAOut|CombineAIn) -#define CombineB (CombineBOut|CombineBIn) -#define CombineAOver (CombineAOut|CombineBOut|CombineAIn) -#define CombineBOver (CombineAOut|CombineBOut|CombineBIn) -#define CombineAAtop (CombineBOut|CombineAIn) -#define CombineBAtop (CombineAOut|CombineBIn) -#define CombineXor (CombineAOut|CombineBOut) - -/* portion covered by a but not b */ -FASTCALL static comp1_t -fbCombineDisjointOutPart (comp1_t a, comp1_t b) -{ - /* min (1, (1-b) / a) */ - - b = ~b; /* 1 - b */ - if (b >= a) /* 1 - b >= a -> (1-b)/a >= 1 */ - return MASK; /* 1 */ - return IntDiv(b,a); /* (1-b) / a */ -} - -/* portion covered by both a and b */ -FASTCALL static comp1_t -fbCombineDisjointInPart (comp1_t a, comp1_t b) -{ - /* max (1-(1-b)/a,0) */ - /* = - min ((1-b)/a - 1, 0) */ - /* = 1 - min (1, (1-b)/a) */ - - b = ~b; /* 1 - b */ - if (b >= a) /* 1 - b >= a -> (1-b)/a >= 1 */ - return 0; /* 1 - 1 */ - return ~IntDiv(b,a); /* 1 - (1-b) / a */ -} - -/* portion covered by a but not b */ -FASTCALL static comp1_t -fbCombineConjointOutPart (comp1_t a, comp1_t b) -{ - /* max (1-b/a,0) */ - /* = 1-min(b/a,1) */ - - /* min (1, (1-b) / a) */ - - if (b >= a) /* b >= a -> b/a >= 1 */ - return 0x00; /* 0 */ - return ~IntDiv(b,a); /* 1 - b/a */ -} - -/* portion covered by both a and b */ -FASTCALL static comp1_t -fbCombineConjointInPart (comp1_t a, comp1_t b) -{ - /* min (1,b/a) */ - - if (b >= a) /* b >= a -> b/a >= 1 */ - return MASK; /* 1 */ - return IntDiv(b,a); /* b/a */ -} - -FASTCALL static void -fbCombineDisjointGeneralU (comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width, comp1_t combine) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t m,n,o,p; - comp2_t Fa, Fb, t, u, v; - comp1_t sa = s >> A_SHIFT; - comp1_t da = d >> A_SHIFT; - - switch (combine & CombineA) { - default: - Fa = 0; - break; - case CombineAOut: - Fa = fbCombineDisjointOutPart (sa, da); - break; - case CombineAIn: - Fa = fbCombineDisjointInPart (sa, da); - break; - case CombineA: - Fa = MASK; - break; - } - - switch (combine & CombineB) { - default: - Fb = 0; - break; - case CombineBOut: - Fb = fbCombineDisjointOutPart (da, sa); - break; - case CombineBIn: - Fb = fbCombineDisjointInPart (da, sa); - break; - case CombineB: - Fb = MASK; - break; - } - m = FbGen (s,d,0,Fa,Fb,t, u, v); - n = FbGen (s,d,G_SHIFT,Fa,Fb,t, u, v); - o = FbGen (s,d,R_SHIFT,Fa,Fb,t, u, v); - p = FbGen (s,d,A_SHIFT,Fa,Fb,t, u, v); - s = m|n|o|p; - *(dest + i) = s; - } -} - -FASTCALL static void -fbCombineDisjointOverU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp2_t a = s >> A_SHIFT; - - if (a != 0x00) - { - if (a != MASK) 
- { - comp4_t d = *(dest + i); - a = fbCombineDisjointOutPart (d >> A_SHIFT, a); - FbByteMulAdd(d, a, s); - s = d; - } - *(dest + i) = s; - } - } -} - -FASTCALL static void -fbCombineDisjointInU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralU (dest, src, mask, width, CombineAIn); -} - -FASTCALL static void -fbCombineDisjointInReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralU (dest, src, mask, width, CombineBIn); -} - -FASTCALL static void -fbCombineDisjointOutU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralU (dest, src, mask, width, CombineAOut); -} - -FASTCALL static void -fbCombineDisjointOutReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralU (dest, src, mask, width, CombineBOut); -} - -FASTCALL static void -fbCombineDisjointAtopU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralU (dest, src, mask, width, CombineAAtop); -} - -FASTCALL static void -fbCombineDisjointAtopReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralU (dest, src, mask, width, CombineBAtop); -} - -FASTCALL static void -fbCombineDisjointXorU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralU (dest, src, mask, width, CombineXor); -} - -FASTCALL static void -fbCombineConjointGeneralU (comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width, comp1_t combine) -{ - int i; - for (i = 0; i < width; ++i) { - comp4_t s = combineMask (src, mask, i); - comp4_t d = *(dest + i); - comp4_t m,n,o,p; - comp2_t Fa, Fb, t, u, v; - comp1_t sa = s >> A_SHIFT; - comp1_t da = d >> A_SHIFT; - - switch (combine & CombineA) { - default: - Fa = 0; - break; - case CombineAOut: - Fa = fbCombineConjointOutPart (sa, da); - break; - case CombineAIn: - Fa = fbCombineConjointInPart (sa, da); - break; - case CombineA: - Fa = MASK; - break; - } - - switch (combine & CombineB) { - default: - Fb = 0; - break; - case CombineBOut: - Fb = fbCombineConjointOutPart (da, sa); - break; - case CombineBIn: - Fb = fbCombineConjointInPart (da, sa); - break; - case CombineB: - Fb = MASK; - break; - } - m = FbGen (s,d,0,Fa,Fb,t, u, v); - n = FbGen (s,d,G_SHIFT,Fa,Fb,t, u, v); - o = FbGen (s,d,R_SHIFT,Fa,Fb,t, u, v); - p = FbGen (s,d,A_SHIFT,Fa,Fb,t, u, v); - s = m|n|o|p; - *(dest + i) = s; - } -} - -FASTCALL static void -fbCombineConjointOverU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, src, mask, width, CombineAOver); -} - - -FASTCALL static void -fbCombineConjointOverReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, src, mask, width, CombineBOver); -} - - -FASTCALL static void -fbCombineConjointInU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, 
src, mask, width, CombineAIn); -} - - -FASTCALL static void -fbCombineConjointInReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, src, mask, width, CombineBIn); -} - -FASTCALL static void -fbCombineConjointOutU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, src, mask, width, CombineAOut); -} - -FASTCALL static void -fbCombineConjointOutReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, src, mask, width, CombineBOut); -} - -FASTCALL static void -fbCombineConjointAtopU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, src, mask, width, CombineAAtop); -} - -FASTCALL static void -fbCombineConjointAtopReverseU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, src, mask, width, CombineBAtop); -} - -FASTCALL static void -fbCombineConjointXorU (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralU (dest, src, mask, width, CombineXor); -} - -/********************************************************************************/ -/*************************** Per Channel functions ******************************/ -/********************************************************************************/ - -FASTCALL static void -fbCombineMaskC (comp4_t *src, comp4_t *mask) -{ - comp4_t a = *mask; - - comp4_t x; - comp2_t xa; - - if (!a) - { - *(src) = 0; - return; - } - - x = *(src); - if (a == ~0) - { - x = x >> A_SHIFT; - x |= x << G_SHIFT; - x |= x << R_SHIFT; - *(mask) = x; - return; - } - - xa = x >> A_SHIFT; - FbByteMulC(x, a); - *(src) = x; - FbByteMul(a, xa); - *(mask) = a; -} - -FASTCALL static void -fbCombineMaskValueC (comp4_t *src, const comp4_t *mask) -{ - comp4_t a = *mask; - comp4_t x; - - if (!a) - { - *(src) = 0; - return; - } - - if (a == ~0) - return; - - x = *(src); - FbByteMulC(x, a); - *(src) =x; -} - -FASTCALL static void -fbCombineMaskAlphaC (const comp4_t *src, comp4_t *mask) -{ - comp4_t a = *(mask); - comp4_t x; - - if (!a) - return; - - x = *(src) >> A_SHIFT; - if (x == MASK) - return; - if (a == ~0) - { - x = x >> A_SHIFT; - x |= x << G_SHIFT; - x |= x << R_SHIFT; - *(mask) = x; - return; - } - - FbByteMul(a, x); - *(mask) = a; -} - -FASTCALL static void -fbCombineClearC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - memset(dest, 0, width*sizeof(comp4_t)); -} - -FASTCALL static void -fbCombineSrcC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - - fbCombineMaskValueC (&s, &m); - - *(dest) = s; - } -} - -FASTCALL static void -fbCombineOverC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - comp4_t a; - - fbCombineMaskC (&s, &m); - - a = ~m; - if (a != ~0) - { - if 
(a) - { - comp4_t d = *(dest + i); - FbByteMulAddC(d, a, s); - s = d; - } - *(dest + i) = s; - } - } -} - -FASTCALL static void -fbCombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t d = *(dest + i); - comp4_t a = ~d >> A_SHIFT; - - if (a) - { - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - - fbCombineMaskValueC (&s, &m); - - if (a != MASK) - { - FbByteMulAdd(s, a, d); - } - *(dest + i) = s; - } - } -} - -FASTCALL static void -fbCombineInC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t d = *(dest + i); - comp2_t a = d >> A_SHIFT; - comp4_t s = 0; - if (a) - { - comp4_t m = *(mask + i); - - s = *(src + i); - fbCombineMaskValueC (&s, &m); - if (a != MASK) - { - FbByteMul(s, a); - } - } - *(dest + i) = s; - } -} - -FASTCALL static void -fbCombineInReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - comp4_t a; - - fbCombineMaskAlphaC (&s, &m); - - a = m; - if (a != ~0) - { - comp4_t d = 0; - if (a) - { - d = *(dest + i); - FbByteMulC(d, a); - } - *(dest + i) = d; - } - } -} - -FASTCALL static void -fbCombineOutC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t d = *(dest + i); - comp2_t a = ~d >> A_SHIFT; - comp4_t s = 0; - if (a) - { - comp4_t m = *(mask + i); - - s = *(src + i); - fbCombineMaskValueC (&s, &m); - - if (a != MASK) - { - FbByteMul(s, a); - } - } - *(dest + i) = s; - } -} - -FASTCALL static void -fbCombineOutReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - comp4_t a; - - fbCombineMaskAlphaC (&s, &m); - - a = ~m; - if (a != ~0) - { - comp4_t d = 0; - if (a) - { - d = *(dest + i); - FbByteMulC(d, a); - } - *(dest + i) = d; - } - } -} - -FASTCALL static void -fbCombineAtopC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t d = *(dest + i); - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - comp4_t ad; - comp2_t as = d >> A_SHIFT; - - fbCombineMaskC (&s, &m); - - ad = ~m; - - FbByteAddMulC(d, ad, s, as); - *(dest + i) = d; - } -} - -FASTCALL static void -fbCombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - - comp4_t d = *(dest + i); - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - comp4_t ad; - comp2_t as = ~d >> A_SHIFT; - - fbCombineMaskC (&s, &m); - - ad = m; - - FbByteAddMulC(d, ad, s, as); - *(dest + i) = d; - } -} - -FASTCALL static void -fbCombineXorC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t d = *(dest + i); - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - comp4_t ad; - comp2_t as = ~d >> A_SHIFT; - - fbCombineMaskC (&s, &m); - - ad = ~m; - - 
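/* Component-alpha XOR: d_c = (d_c*(255 - m_c) + s_c*(255 - da)) / 255,
-	   with the mask m already folded into s by fbCombineMaskC() above. */
-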
FbByteAddMulC(d, ad, s, as); - *(dest + i) = d; - } -} - -FASTCALL static void -fbCombineAddC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t s = *(src + i); - comp4_t m = *(mask + i); - comp4_t d = *(dest + i); - - fbCombineMaskValueC (&s, &m); - - FbByteAdd(d, s); - *(dest + i) = d; - } -} - -FASTCALL static void -fbCombineSaturateC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t s, d; - comp2_t sa, sr, sg, sb, da; - comp2_t t, u, v; - comp4_t m,n,o,p; - - d = *(dest + i); - s = *(src + i); - m = *(mask + i); - - fbCombineMaskC (&s, &m); - - sa = (m >> A_SHIFT); - sr = (m >> R_SHIFT) & MASK; - sg = (m >> G_SHIFT) & MASK; - sb = m & MASK; - da = ~d >> A_SHIFT; - - if (sb <= da) - m = Add(s,d,0,t); - else - m = FbGen (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v); - - if (sg <= da) - n = Add(s,d,G_SHIFT,t); - else - n = FbGen (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v); - - if (sr <= da) - o = Add(s,d,R_SHIFT,t); - else - o = FbGen (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v); - - if (sa <= da) - p = Add(s,d,A_SHIFT,t); - else - p = FbGen (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v); - - *(dest + i) = m|n|o|p; - } -} - -FASTCALL static void -fbCombineDisjointGeneralC (comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width, comp1_t combine) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t s, d; - comp4_t m,n,o,p; - comp4_t Fa, Fb; - comp2_t t, u, v; - comp4_t sa; - comp1_t da; - - s = *(src + i); - m = *(mask + i); - d = *(dest + i); - da = d >> A_SHIFT; - - fbCombineMaskC (&s, &m); - - sa = m; - - switch (combine & CombineA) { - default: - Fa = 0; - break; - case CombineAOut: - m = (comp4_t)fbCombineDisjointOutPart ((comp1_t) (sa >> 0), da); - n = (comp4_t)fbCombineDisjointOutPart ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT; - o = (comp4_t)fbCombineDisjointOutPart ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT; - p = (comp4_t)fbCombineDisjointOutPart ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT; - Fa = m|n|o|p; - break; - case CombineAIn: - m = (comp4_t)fbCombineDisjointInPart ((comp1_t) (sa >> 0), da); - n = (comp4_t)fbCombineDisjointInPart ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT; - o = (comp4_t)fbCombineDisjointInPart ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT; - p = (comp4_t)fbCombineDisjointInPart ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT; - Fa = m|n|o|p; - break; - case CombineA: - Fa = ~0; - break; - } - - switch (combine & CombineB) { - default: - Fb = 0; - break; - case CombineBOut: - m = (comp4_t)fbCombineDisjointOutPart (da, (comp1_t) (sa >> 0)); - n = (comp4_t)fbCombineDisjointOutPart (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT; - o = (comp4_t)fbCombineDisjointOutPart (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT; - p = (comp4_t)fbCombineDisjointOutPart (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT; - Fb = m|n|o|p; - break; - case CombineBIn: - m = (comp4_t)fbCombineDisjointInPart (da, (comp1_t) (sa >> 0)); - n = (comp4_t)fbCombineDisjointInPart (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT; - o = (comp4_t)fbCombineDisjointInPart (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT; - p = (comp4_t)fbCombineDisjointInPart (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT; - Fb = m|n|o|p; - break; - case CombineB: - Fb = ~0; - break; - } - m = FbGen (s,d,0,GetComp(Fa,0),GetComp(Fb,0),t, u, v); - n = FbGen 
(s,d,G_SHIFT,GetComp(Fa,G_SHIFT),GetComp(Fb,G_SHIFT),t, u, v); - o = FbGen (s,d,R_SHIFT,GetComp(Fa,R_SHIFT),GetComp(Fb,R_SHIFT),t, u, v); - p = FbGen (s,d,A_SHIFT,GetComp(Fa,A_SHIFT),GetComp(Fb,A_SHIFT),t, u, v); - s = m|n|o|p; - *(dest + i) = s; - } -} - -FASTCALL static void -fbCombineDisjointOverC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralC (dest, src, mask, width, CombineAOver); -} - -FASTCALL static void -fbCombineDisjointInC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralC (dest, src, mask, width, CombineAIn); -} - -FASTCALL static void -fbCombineDisjointInReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralC (dest, src, mask, width, CombineBIn); -} - -FASTCALL static void -fbCombineDisjointOutC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralC (dest, src, mask, width, CombineAOut); -} - -FASTCALL static void -fbCombineDisjointOutReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralC (dest, src, mask, width, CombineBOut); -} - -FASTCALL static void -fbCombineDisjointAtopC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralC (dest, src, mask, width, CombineAAtop); -} - -FASTCALL static void -fbCombineDisjointAtopReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralC (dest, src, mask, width, CombineBAtop); -} - -FASTCALL static void -fbCombineDisjointXorC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineDisjointGeneralC (dest, src, mask, width, CombineXor); -} - -FASTCALL static void -fbCombineConjointGeneralC (comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width, comp1_t combine) -{ - int i; - - for (i = 0; i < width; ++i) { - comp4_t s, d; - comp4_t m,n,o,p; - comp4_t Fa, Fb; - comp2_t t, u, v; - comp4_t sa; - comp1_t da; - - s = *(src + i); - m = *(mask + i); - d = *(dest + i); - da = d >> A_SHIFT; - - fbCombineMaskC (&s, &m); - - sa = m; - - switch (combine & CombineA) { - default: - Fa = 0; - break; - case CombineAOut: - m = (comp4_t)fbCombineConjointOutPart ((comp1_t) (sa >> 0), da); - n = (comp4_t)fbCombineConjointOutPart ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT; - o = (comp4_t)fbCombineConjointOutPart ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT; - p = (comp4_t)fbCombineConjointOutPart ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT; - Fa = m|n|o|p; - break; - case CombineAIn: - m = (comp4_t)fbCombineConjointInPart ((comp1_t) (sa >> 0), da); - n = (comp4_t)fbCombineConjointInPart ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT; - o = (comp4_t)fbCombineConjointInPart ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT; - p = (comp4_t)fbCombineConjointInPart ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT; - Fa = m|n|o|p; - break; - case CombineA: - Fa = ~0; - break; - } - - switch (combine & CombineB) { - default: - Fb = 0; - break; - case CombineBOut: - m = (comp4_t)fbCombineConjointOutPart (da, (comp1_t) (sa >> 0)); - n = 
(comp4_t)fbCombineConjointOutPart (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT; - o = (comp4_t)fbCombineConjointOutPart (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT; - p = (comp4_t)fbCombineConjointOutPart (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT; - Fb = m|n|o|p; - break; - case CombineBIn: - m = (comp4_t)fbCombineConjointInPart (da, (comp1_t) (sa >> 0)); - n = (comp4_t)fbCombineConjointInPart (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT; - o = (comp4_t)fbCombineConjointInPart (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT; - p = (comp4_t)fbCombineConjointInPart (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT; - Fb = m|n|o|p; - break; - case CombineB: - Fb = ~0; - break; - } - m = FbGen (s,d,0,GetComp(Fa,0),GetComp(Fb,0),t, u, v); - n = FbGen (s,d,G_SHIFT,GetComp(Fa,G_SHIFT),GetComp(Fb,G_SHIFT),t, u, v); - o = FbGen (s,d,R_SHIFT,GetComp(Fa,R_SHIFT),GetComp(Fb,R_SHIFT),t, u, v); - p = FbGen (s,d,A_SHIFT,GetComp(Fa,A_SHIFT),GetComp(Fb,A_SHIFT),t, u, v); - s = m|n|o|p; - *(dest + i) = s; - } -} - -FASTCALL static void -fbCombineConjointOverC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineAOver); -} - -FASTCALL static void -fbCombineConjointOverReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineBOver); -} - -FASTCALL static void -fbCombineConjointInC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineAIn); -} - -FASTCALL static void -fbCombineConjointInReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineBIn); -} - -FASTCALL static void -fbCombineConjointOutC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineAOut); -} - -FASTCALL static void -fbCombineConjointOutReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineBOut); -} - -FASTCALL static void -fbCombineConjointAtopC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineAAtop); -} - -FASTCALL static void -fbCombineConjointAtopReverseC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineBAtop); -} - -FASTCALL static void -fbCombineConjointXorC (pixman_implementation_t *imp, pixman_op_t op, - comp4_t *dest, const comp4_t *src, const comp4_t *mask, int width) -{ - fbCombineConjointGeneralC (dest, src, mask, width, CombineXor); -} - -void -_pixman_setup_combiner_functions_width (pixman_implementation_t *imp) -{ - /* Unified alpha */ - imp->combine_width[PIXMAN_OP_CLEAR] = fbCombineClear; - imp->combine_width[PIXMAN_OP_SRC] = fbCombineSrcU; - /* dest */ - imp->combine_width[PIXMAN_OP_OVER] = fbCombineOverU; - imp->combine_width[PIXMAN_OP_OVER_REVERSE] = fbCombineOverReverseU; - imp->combine_width[PIXMAN_OP_IN] = fbCombineInU; - 
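/* The _REVERSE variants swap the roles of source and destination
-     * in the corresponding operator. */
-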
imp->combine_width[PIXMAN_OP_IN_REVERSE] = fbCombineInReverseU; - imp->combine_width[PIXMAN_OP_OUT] = fbCombineOutU; - imp->combine_width[PIXMAN_OP_OUT_REVERSE] = fbCombineOutReverseU; - imp->combine_width[PIXMAN_OP_ATOP] = fbCombineAtopU; - imp->combine_width[PIXMAN_OP_ATOP_REVERSE] = fbCombineAtopReverseU; - imp->combine_width[PIXMAN_OP_XOR] = fbCombineXorU; - imp->combine_width[PIXMAN_OP_ADD] = fbCombineAddU; - imp->combine_width[PIXMAN_OP_SATURATE] = fbCombineSaturateU; - - /* Disjoint, unified */ - imp->combine_width[PIXMAN_OP_DISJOINT_CLEAR] = fbCombineClear; - imp->combine_width[PIXMAN_OP_DISJOINT_SRC] = fbCombineSrcU; - /* dest */ - imp->combine_width[PIXMAN_OP_DISJOINT_OVER] = fbCombineDisjointOverU; - imp->combine_width[PIXMAN_OP_DISJOINT_OVER_REVERSE] = fbCombineSaturateU; - imp->combine_width[PIXMAN_OP_DISJOINT_IN] = fbCombineDisjointInU; - imp->combine_width[PIXMAN_OP_DISJOINT_IN_REVERSE] = fbCombineDisjointInReverseU; - imp->combine_width[PIXMAN_OP_DISJOINT_OUT] = fbCombineDisjointOutU; - imp->combine_width[PIXMAN_OP_DISJOINT_OUT_REVERSE] = fbCombineDisjointOutReverseU; - imp->combine_width[PIXMAN_OP_DISJOINT_ATOP] = fbCombineDisjointAtopU; - imp->combine_width[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = fbCombineDisjointAtopReverseU; - imp->combine_width[PIXMAN_OP_DISJOINT_XOR] = fbCombineDisjointXorU; - - /* Conjoint, unified */ - imp->combine_width[PIXMAN_OP_CONJOINT_CLEAR] = fbCombineClear; - imp->combine_width[PIXMAN_OP_CONJOINT_SRC] = fbCombineSrcU; - /* dest */ - imp->combine_width[PIXMAN_OP_CONJOINT_OVER] = fbCombineConjointOverU; - imp->combine_width[PIXMAN_OP_CONJOINT_OVER_REVERSE] = fbCombineConjointOverReverseU; - imp->combine_width[PIXMAN_OP_CONJOINT_IN] = fbCombineConjointInU; - imp->combine_width[PIXMAN_OP_CONJOINT_IN_REVERSE] = fbCombineConjointInReverseU; - imp->combine_width[PIXMAN_OP_CONJOINT_OUT] = fbCombineConjointOutU; - imp->combine_width[PIXMAN_OP_CONJOINT_OUT_REVERSE] = fbCombineConjointOutReverseU; - imp->combine_width[PIXMAN_OP_CONJOINT_ATOP] = fbCombineConjointAtopU; - imp->combine_width[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = fbCombineConjointAtopReverseU; - imp->combine_width[PIXMAN_OP_CONJOINT_XOR] = fbCombineConjointXorU; - - /* Component alpha combiners */ - imp->combine_width_ca[PIXMAN_OP_CLEAR] = fbCombineClearC; - imp->combine_width_ca[PIXMAN_OP_SRC] = fbCombineSrcC; - /* dest */ - imp->combine_width_ca[PIXMAN_OP_OVER] = fbCombineOverC; - imp->combine_width_ca[PIXMAN_OP_OVER_REVERSE] = fbCombineOverReverseC; - imp->combine_width_ca[PIXMAN_OP_IN] = fbCombineInC; - imp->combine_width_ca[PIXMAN_OP_IN_REVERSE] = fbCombineInReverseC; - imp->combine_width_ca[PIXMAN_OP_OUT] = fbCombineOutC; - imp->combine_width_ca[PIXMAN_OP_OUT_REVERSE] = fbCombineOutReverseC; - imp->combine_width_ca[PIXMAN_OP_ATOP] = fbCombineAtopC; - imp->combine_width_ca[PIXMAN_OP_ATOP_REVERSE] = fbCombineAtopReverseC; - imp->combine_width_ca[PIXMAN_OP_XOR] = fbCombineXorC; - imp->combine_width_ca[PIXMAN_OP_ADD] = fbCombineAddC; - imp->combine_width_ca[PIXMAN_OP_SATURATE] = fbCombineSaturateC; - - /* Disjoint CA */ - imp->combine_width_ca[PIXMAN_OP_DISJOINT_CLEAR] = fbCombineClearC; - imp->combine_width_ca[PIXMAN_OP_DISJOINT_SRC] = fbCombineSrcC; - /* dest */ - imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER] = fbCombineDisjointOverC; - imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = fbCombineSaturateC, - imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN] = fbCombineDisjointInC; - imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = fbCombineDisjointInReverseC; - 
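/* Note: the PIXMAN_OP_DISJOINT_OVER_REVERSE entry above ends in a
-     * comma rather than a semicolon; the comma operator harmlessly
-     * chains it onto the next assignment, but it is a typo. */
-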
imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT] = fbCombineDisjointOutC; - imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = fbCombineDisjointOutReverseC; - imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP] = fbCombineDisjointAtopC; - imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = fbCombineDisjointAtopReverseC; - imp->combine_width_ca[PIXMAN_OP_DISJOINT_XOR] = fbCombineDisjointXorC; - - /* Conjoint CA */ - imp->combine_width_ca[PIXMAN_OP_CONJOINT_CLEAR] = fbCombineClearC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_SRC] = fbCombineSrcC; - /* dest */ - imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER] = fbCombineConjointOverC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = fbCombineConjointOverReverseC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN] = fbCombineConjointInC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = fbCombineConjointInReverseC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT] = fbCombineConjointOutC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = fbCombineConjointOutReverseC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP] = fbCombineConjointAtopC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = fbCombineConjointAtopReverseC; - imp->combine_width_ca[PIXMAN_OP_CONJOINT_XOR] = fbCombineConjointXorC; -} diff --git a/lib/pixman/pixman/combine.pl b/lib/pixman/pixman/make-combine.pl index 3b7536205..210a5da12 100644 --- a/lib/pixman/pixman/combine.pl +++ b/lib/pixman/pixman/make-combine.pl @@ -1,4 +1,4 @@ -$usage = "Usage: combine.pl { 8 | 16 } < combine.inc"; +$usage = "Usage: combine.pl { 8 | 16 } < pixman-combine.c.template"; $#ARGV == 0 or die $usage; @@ -27,7 +27,7 @@ print "/* WARNING: This file is generated by combine.pl from combine.inc.\n"; print " Please edit one of those files rather than this one. */\n"; print "\n"; -print "#line 1 \"combine.inc\"\n"; +print "#line 1 \"pixman-combine.c.template\"\n"; $mask_ = mask($mask); $one_half_ = mask($one_half); @@ -64,6 +64,11 @@ while (<STDIN>) { s/\bFbComposeFunctions\b/FbComposeFunctions$pixel_size/; s/combine_width/combine_$pixel_size/; s/_pixman_setup_combiner_functions_width/_pixman_setup_combiner_functions_$pixel_size/; + s/UNc/UN$size/g; + s/ALPHA_c/ALPHA_$size/g; + s/RED_c/RED_$size/g; + s/GREEN_c/GREEN_$size/g; + s/BLUE_c/BLUE_$size/g; # Convert comp*_t values into the appropriate real types. s/comp1_t/uint${size}_t/g; diff --git a/lib/pixman/pixman/pixman-access.c b/lib/pixman/pixman/pixman-access.c index 6b3ce34fa..d9fd38c15 100644 --- a/lib/pixman/pixman/pixman-access.c +++ b/lib/pixman/pixman/pixman-access.c @@ -33,674 +33,1092 @@ #include <assert.h> #include "pixman-private.h" +#include "pixman-accessor.h" -#define Red(x) (((x) >> 16) & 0xff) -#define Green(x) (((x) >> 8) & 0xff) -#define Blue(x) ((x) & 0xff) +#define CONVERT_RGB24_TO_Y15(s) \ + (((((s) >> 16) & 0xff) * 153 + \ + (((s) >> 8) & 0xff) * 301 + \ + (((s) ) & 0xff) * 58) >> 2) + +#define CONVERT_RGB24_TO_RGB15(s) \ + ((((s) >> 3) & 0x001f) | \ + (((s) >> 6) & 0x03e0) | \ + (((s) >> 9) & 0x7c00)) + +#define RGB15_TO_ENTRY(mif,rgb15) \ + ((mif)->ent[rgb15]) + +#define RGB24_TO_ENTRY(mif,rgb24) \ + RGB15_TO_ENTRY (mif,CONVERT_RGB24_TO_RGB15 (rgb24)) + +#define RGB24_TO_ENTRY_Y(mif,rgb24) \ + ((mif)->ent[CONVERT_RGB24_TO_Y15 (rgb24)]) /* * YV12 setup and access macros */ -#define YV12_SETUP(pict) \ - uint32_t *bits = pict->bits; \ - int stride = pict->rowstride; \ - int offset0 = stride < 0 ? 
\ - ((-stride) >> 1) * ((pict->height - 1) >> 1) - stride : \ - stride * pict->height; \ - int offset1 = stride < 0 ? \ - offset0 + ((-stride) >> 1) * ((pict->height) >> 1) : \ - offset0 + (offset0 >> 2) +#define YV12_SETUP(image) \ + bits_image_t *__bits_image = (bits_image_t *)image; \ + uint32_t *bits = __bits_image->bits; \ + int stride = __bits_image->rowstride; \ + int offset0 = stride < 0 ? \ + ((-stride) >> 1) * ((__bits_image->height - 1) >> 1) - stride : \ + stride * __bits_image->height; \ + int offset1 = stride < 0 ? \ + offset0 + ((-stride) >> 1) * ((__bits_image->height) >> 1) : \ + offset0 + (offset0 >> 2) + /* Note no trailing semicolon on the above macro; if it's there, then - * the typical usage of YV12_SETUP(pict); will have an extra trailing ; + * the typical usage of YV12_SETUP(image); will have an extra trailing ; * that some compilers will interpret as a statement -- and then any further * variable declarations will cause an error. */ -#define YV12_Y(line) \ +#define YV12_Y(line) \ ((uint8_t *) ((bits) + (stride) * (line))) -#define YV12_U(line) \ - ((uint8_t *) ((bits) + offset1 + \ - ((stride) >> 1) * ((line) >> 1))) - -#define YV12_V(line) \ - ((uint8_t *) ((bits) + offset0 + \ - ((stride) >> 1) * ((line) >> 1))) - -/*********************************** Fetch ************************************/ - -static FASTCALL void -fbFetch_a8r8g8b8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) -{ - const uint32_t *bits = pict->bits + y*pict->rowstride; - MEMCPY_WRAPPED(pict, - buffer, (const uint32_t *)bits + x, - width*sizeof(uint32_t)); -} - -static FASTCALL void -fbFetch_x8r8g8b8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) -{ - const uint32_t *bits = pict->bits + y*pict->rowstride; +#define YV12_U(line) \ + ((uint8_t *) ((bits) + offset1 + \ + ((stride) >> 1) * ((line) >> 1))) + +#define YV12_V(line) \ + ((uint8_t *) ((bits) + offset0 + \ + ((stride) >> 1) * ((line) >> 1))) + +/********************************** Fetch ************************************/ + +static void +fetch_scanline_a8r8g8b8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + + MEMCPY_WRAPPED (image, + buffer, (const uint32_t *)bits + x, + width * sizeof(uint32_t)); +} + +static void +fetch_scanline_x8r8g8b8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint32_t *pixel = (const uint32_t *)bits + x; const uint32_t *end = pixel + width; - while (pixel < end) { - *buffer++ = READ(pict, pixel++) | 0xff000000; - } + + while (pixel < end) + *buffer++ = READ (image, pixel++) | 0xff000000; } -static FASTCALL void -fbFetch_a8b8g8r8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a8b8g8r8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint32_t *pixel = (uint32_t *)bits + x; const uint32_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - *buffer++ = (p & 0xff00ff00) | - ((p >> 16) & 0xff) | + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + + *buffer++ = (p & 0xff00ff00) | + ((p 
>> 16) & 0xff) | ((p & 0xff) << 16); } } -static FASTCALL void -fbFetch_x8b8g8r8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_x8b8g8r8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint32_t *pixel = (uint32_t *)bits + x; const uint32_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - *buffer++ = 0xff000000 | - (p & 0x0000ff00) | - ((p >> 16) & 0xff) | + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + + *buffer++ = 0xff000000 | + (p & 0x0000ff00) | + ((p >> 16) & 0xff) | ((p & 0xff) << 16); } } -static FASTCALL void -fbFetch_b8g8r8a8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_b8g8r8a8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint32_t *pixel = (uint32_t *)bits + x; const uint32_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - *buffer++ = ((p & 0xff000000) >> 24) | - ((p & 0x00ff0000) >> 8) | - ((p & 0x0000ff00) << 8) | - ((p & 0x000000ff) << 24); + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + + *buffer++ = (((p & 0xff000000) >> 24) | + ((p & 0x00ff0000) >> 8) | + ((p & 0x0000ff00) << 8) | + ((p & 0x000000ff) << 24)); } } -static FASTCALL void -fbFetch_b8g8r8x8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_b8g8r8x8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint32_t *pixel = (uint32_t *)bits + x; const uint32_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - *buffer++ = 0xff000000 | - ((p & 0xff000000) >> 24) | - ((p & 0x00ff0000) >> 8) | - ((p & 0x0000ff00) << 8); - } -} - -static FASTCALL void -fbFetch_a2b10g10r10 (bits_image_t *pict, int x, int y, int width, uint64_t *buffer) -{ - const uint32_t *bits = pict->bits + y*pict->rowstride; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + + *buffer++ = (0xff000000 | + ((p & 0xff000000) >> 24) | + ((p & 0x00ff0000) >> 8) | + ((p & 0x0000ff00) << 8)); + } +} + +/* Expects a uint64_t buffer */ +static void +fetch_scanline_a2r10g10b10 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask, + uint32_t mask_bits) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint32_t *pixel = bits + x; const uint32_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - uint64_t a = p >> 30; - uint64_t b = (p >> 20) & 0x3ff; - uint64_t g = (p >> 10) & 0x3ff; - uint64_t r = p & 0x3ff; + uint64_t *buffer = (uint64_t *)b; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint64_t a = p >> 30; + uint64_t r = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t b = p & 0x3ff; - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; + r = r << 6 | r >> 4; + g = g << 6 | g >> 4; + b = b << 6 | b 
>> 4; - a <<= 62; - a |= a >> 2; - a |= a >> 4; - a |= a >> 8; + a <<= 14; + a |= a >> 2; + a |= a >> 4; + a |= a >> 8; - *buffer++ = a << 48 | r << 32 | g << 16 | b; + *buffer++ = a << 48 | r << 32 | g << 16 | b; } } -static FASTCALL void -fbFetch_x2b10g10r10 (bits_image_t *pict, int x, int y, int width, uint64_t *buffer) +/* Expects a uint64_t buffer */ +static void +fetch_scanline_x2r10g10b10 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint32_t *pixel = (uint32_t *)bits + x; const uint32_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - uint64_t b = (p >> 20) & 0x3ff; - uint64_t g = (p >> 10) & 0x3ff; - uint64_t r = p & 0x3ff; - - r = r << 6 | r >> 4; - g = g << 6 | g >> 4; - b = b << 6 | b >> 4; - - *buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b; - } -} - -static FASTCALL void -fbFetch_r8g8b8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) -{ - const uint32_t *bits = pict->bits + y*pict->rowstride; - const uint8_t *pixel = (const uint8_t *)bits + 3*x; - const uint8_t *end = pixel + 3*width; - while (pixel < end) { - uint32_t b = Fetch24(pict, pixel) | 0xff000000; - pixel += 3; + uint64_t *buffer = (uint64_t *)b; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint64_t r = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t b = p & 0x3ff; + + r = r << 6 | r >> 4; + g = g << 6 | g >> 4; + b = b << 6 | b >> 4; + + *buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b; + } +} + +/* Expects a uint64_t buffer */ +static void +fetch_scanline_a2b10g10r10 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask, + uint32_t mask_bits) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const uint32_t *pixel = bits + x; + const uint32_t *end = pixel + width; + uint64_t *buffer = (uint64_t *)b; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint64_t a = p >> 30; + uint64_t b = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t r = p & 0x3ff; + + r = r << 6 | r >> 4; + g = g << 6 | g >> 4; + b = b << 6 | b >> 4; + + a <<= 14; + a |= a >> 2; + a |= a >> 4; + a |= a >> 8; + + *buffer++ = a << 48 | r << 32 | g << 16 | b; + } +} + +/* Expects a uint64_t buffer */ +static void +fetch_scanline_x2b10g10r10 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t *mask, + uint32_t mask_bits) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const uint32_t *pixel = (uint32_t *)bits + x; + const uint32_t *end = pixel + width; + uint64_t *buffer = (uint64_t *)b; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint64_t b = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t r = p & 0x3ff; + + r = r << 6 | r >> 4; + g = g << 6 | g >> 4; + b = b << 6 | b >> 4; + + *buffer++ = 0xffffULL << 48 | r << 32 | g << 16 | b; + } +} + +static void +fetch_scanline_r8g8b8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) +{ + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const uint8_t *pixel = (const uint8_t *)bits + 3 * x; + const uint8_t *end = pixel + 3 * width; + + while (pixel < end) + { + uint32_t b = 0xff000000; + +#ifdef WORDS_BIGENDIAN + b |= (READ 
(image, pixel++) << 16); + b |= (READ (image, pixel++) << 8); + b |= (READ (image, pixel++)); +#else + b |= (READ (image, pixel++)); + b |= (READ (image, pixel++) << 8); + b |= (READ (image, pixel++) << 16); +#endif + *buffer++ = b; } } -static FASTCALL void -fbFetch_b8g8r8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_b8g8r8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; - const uint8_t *pixel = (const uint8_t *)bits + 3*x; - const uint8_t *end = pixel + 3*width; - while (pixel < end) { + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const uint8_t *pixel = (const uint8_t *)bits + 3 * x; + const uint8_t *end = pixel + 3 * width; + + while (pixel < end) + { uint32_t b = 0xff000000; -#if IMAGE_BYTE_ORDER == MSBFirst - b |= (READ(pict, pixel++)); - b |= (READ(pict, pixel++) << 8); - b |= (READ(pict, pixel++) << 16); +#ifdef WORDS_BIGENDIAN + b |= (READ (image, pixel++)); + b |= (READ (image, pixel++) << 8); + b |= (READ (image, pixel++) << 16); #else - b |= (READ(pict, pixel++) << 16); - b |= (READ(pict, pixel++) << 8); - b |= (READ(pict, pixel++)); + b |= (READ (image, pixel++) << 16); + b |= (READ (image, pixel++) << 8); + b |= (READ (image, pixel++)); #endif *buffer++ = b; } } -static FASTCALL void -fbFetch_r5g6b5 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_r5g6b5 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); uint32_t r = (((p) << 3) & 0xf8) | (((p) << 5) & 0xfc00) | (((p) << 8) & 0xf80000); + r |= (r >> 5) & 0x70007; r |= (r >> 6) & 0x300; + *buffer++ = 0xff000000 | r; } } -static FASTCALL void -fbFetch_b5g6r5 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_b5g6r5 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b; + b = ((p & 0xf800) | ((p & 0xe000) >> 5)) >> 8; g = ((p & 0x07e0) | ((p & 0x0600) >> 6)) << 5; r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14; + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_a1r5g5b5 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a1r5g5b5 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b, a; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = 
READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b, a; + a = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24; r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9; g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6; b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2; + *buffer++ = a | r | g | b; } } -static FASTCALL void -fbFetch_x1r5g5b5 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_x1r5g5b5 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b; + r = ((p & 0x7c00) | ((p & 0x7000) >> 5)) << 9; g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6; b = ((p & 0x001c) | ((p & 0x001f) << 5)) >> 2; + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_a1b5g5r5 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a1b5g5r5 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b, a; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + uint32_t r, g, b, a; + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + a = (uint32_t) ((uint8_t) (0 - ((p & 0x8000) >> 15))) << 24; b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7; g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6; r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14; + *buffer++ = a | r | g | b; } } -static FASTCALL void -fbFetch_x1b5g5r5 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_x1b5g5r5 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b; + b = ((p & 0x7c00) | ((p & 0x7000) >> 5)) >> 7; g = ((p & 0x03e0) | ((p & 0x0380) >> 5)) << 6; r = ((p & 0x001c) | ((p & 0x001f) << 5)) << 14; + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_a4r4g4b4 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a4r4g4b4 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b, a; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t 
r, g, b, a; + a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16; r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12; g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8; b = ((p & 0x000f) | ((p & 0x000f) << 4)); + *buffer++ = a | r | g | b; } } -static FASTCALL void -fbFetch_x4r4g4b4 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_x4r4g4b4 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b; + r = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) << 12; g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8; b = ((p & 0x000f) | ((p & 0x000f) << 4)); + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_a4b4g4r4 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a4b4g4r4 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b, a; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b, a; + a = ((p & 0xf000) | ((p & 0xf000) >> 4)) << 16; b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4; g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8; r = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16; + *buffer++ = a | r | g | b; } } -static FASTCALL void -fbFetch_x4b4g4r4 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_x4b4g4r4 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint16_t *pixel = (const uint16_t *)bits + x; const uint16_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b; + b = ((p & 0x0f00) | ((p & 0x0f00) >> 4)) >> 4; g = ((p & 0x00f0) | ((p & 0x00f0) >> 4)) << 8; r = ((p & 0x000f) | ((p & 0x000f) << 4)) << 16; + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_a8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint8_t *pixel = (const uint8_t *)bits + x; const uint8_t *end = pixel + width; - while (pixel < end) { - *buffer++ = READ(pict, pixel++) << 24; - } + + while (pixel < end) + *buffer++ = READ (image, pixel++) << 24; } -static FASTCALL void -fbFetch_r3g3b2 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_r3g3b2 (pixman_image_t *image, + int x, 
+ int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint8_t *pixel = (const uint8_t *)bits + x; const uint8_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b; + r = ((p & 0xe0) | ((p & 0xe0) >> 3) | ((p & 0xc0) >> 6)) << 16; g = ((p & 0x1c) | ((p & 0x18) >> 3) | ((p & 0x1c) << 3)) << 8; b = (((p & 0x03) ) | ((p & 0x03) << 2) | ((p & 0x03) << 4) | ((p & 0x03) << 6)); + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_b2g3r3 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_b2g3r3 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint8_t *pixel = (const uint8_t *)bits + x; const uint8_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t r, g, b; + b = (((p & 0xc0) ) | ((p & 0xc0) >> 2) | ((p & 0xc0) >> 4) | ((p & 0xc0) >> 6)); + g = ((p & 0x38) | ((p & 0x38) >> 3) | ((p & 0x30) << 2)) << 8; + r = (((p & 0x07) ) | ((p & 0x07) << 3) | ((p & 0x06) << 6)) << 16; + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_a2r2g2b2 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a2r2g2b2 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t a,r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint8_t *pixel = (const uint8_t *)bits + x; const uint8_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t a, r, g, b; + a = ((p & 0xc0) * 0x55) << 18; r = ((p & 0x30) * 0x55) << 12; g = ((p & 0x0c) * 0x55) << 6; b = ((p & 0x03) * 0x55); - *buffer++ = a|r|g|b; + + *buffer++ = a | r | g | b; } } -static FASTCALL void -fbFetch_a2b2g2r2 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a2b2g2r2 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t a,r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint8_t *pixel = (const uint8_t *)bits + x; const uint8_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); - + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + uint32_t a, r, g, b; + a = ((p & 0xc0) * 0x55) << 18; b = ((p & 0x30) * 0x55) >> 6; g = ((p & 0x0c) * 0x55) << 6; r = ((p & 0x03) * 0x55) << 16; - *buffer++ = a|r|g|b; + + *buffer++ = a | r | g | b; } } -static FASTCALL void -fbFetch_c8 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_c8 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const 
uint32_t *bits = pict->bits + y*pict->rowstride; - const pixman_indexed_t * indexed = pict->indexed; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const pixman_indexed_t * indexed = image->bits.indexed; const uint8_t *pixel = (const uint8_t *)bits + x; const uint8_t *end = pixel + width; - while (pixel < end) { - uint32_t p = READ(pict, pixel++); + + while (pixel < end) + { + uint32_t p = READ (image, pixel++); + *buffer++ = indexed->rgba[p]; } } -static FASTCALL void -fbFetch_x4a4 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_x4a4 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; const uint8_t *pixel = (const uint8_t *)bits + x; const uint8_t *end = pixel + width; - while (pixel < end) { - uint8_t p = READ(pict, pixel++) & 0xf; + + while (pixel < end) + { + uint8_t p = READ (image, pixel++) & 0xf; + *buffer++ = (p | (p << 4)) << 24; } } -#define Fetch8(img,l,o) (READ(img, (uint8_t *)(l) + ((o) >> 2))) -#if IMAGE_BYTE_ORDER == MSBFirst -#define Fetch4(img,l,o) ((o) & 2 ? Fetch8(img,l,o) & 0xf : Fetch8(img,l,o) >> 4) +#define FETCH_8(img,l,o) (READ (img, (uint8_t *)(l) + ((o) >> 2))) +#ifdef WORDS_BIGENDIAN +#define FETCH_4(img,l,o) ((o) & 2 ? FETCH_8 (img,l,o) & 0xf : FETCH_8 (img,l,o) >> 4) #else -#define Fetch4(img,l,o) ((o) & 2 ? Fetch8(img,l,o) >> 4 : Fetch8(img,l,o) & 0xf) +#define FETCH_4(img,l,o) ((o) & 2 ? FETCH_8 (img,l,o) >> 4 : FETCH_8 (img,l,o) & 0xf) #endif -static FASTCALL void -fbFetch_a4 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a4 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; int i; - for (i = 0; i < width; ++i) { - uint32_t p = Fetch4(pict, bits, i + x); - + + for (i = 0; i < width; ++i) + { + uint32_t p = FETCH_4 (image, bits, i + x); + p |= p << 4; + *buffer++ = p << 24; } } -static FASTCALL void -fbFetch_r1g2b1 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_r1g2b1 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; int i; - for (i = 0; i < width; ++i) { - uint32_t p = Fetch4(pict, bits, i + x); - + + for (i = 0; i < width; ++i) + { + uint32_t p = FETCH_4 (image, bits, i + x); + uint32_t r, g, b; + r = ((p & 0x8) * 0xff) << 13; g = ((p & 0x6) * 0x55) << 7; b = ((p & 0x1) * 0xff); - *buffer++ = 0xff000000|r|g|b; + + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_b1g2r1 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_b1g2r1 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; int i; - for (i = 0; i < width; ++i) { - uint32_t p = Fetch4(pict, bits, i + x); - + + for (i = 0; i < width; ++i) + { + uint32_t p = 
FETCH_4 (image, bits, i + x); + uint32_t r, g, b; + b = ((p & 0x8) * 0xff) >> 3; g = ((p & 0x6) * 0x55) << 7; r = ((p & 0x1) * 0xff) << 16; - *buffer++ = 0xff000000|r|g|b; + + *buffer++ = 0xff000000 | r | g | b; } } -static FASTCALL void -fbFetch_a1r1g1b1 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a1r1g1b1 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t a,r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + uint32_t a, r, g, b; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; int i; - for (i = 0; i < width; ++i) { - uint32_t p = Fetch4(pict, bits, i + x); - + + for (i = 0; i < width; ++i) + { + uint32_t p = FETCH_4 (image, bits, i + x); + a = ((p & 0x8) * 0xff) << 21; r = ((p & 0x4) * 0xff) << 14; g = ((p & 0x2) * 0xff) << 7; b = ((p & 0x1) * 0xff); - *buffer++ = a|r|g|b; + + *buffer++ = a | r | g | b; } } -static FASTCALL void -fbFetch_a1b1g1r1 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a1b1g1r1 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - uint32_t a,r,g,b; - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; int i; - for (i = 0; i < width; ++i) { - uint32_t p = Fetch4(pict, bits, i + x); - + + for (i = 0; i < width; ++i) + { + uint32_t p = FETCH_4 (image, bits, i + x); + uint32_t a, r, g, b; + a = ((p & 0x8) * 0xff) << 21; r = ((p & 0x4) * 0xff) >> 3; g = ((p & 0x2) * 0xff) << 7; b = ((p & 0x1) * 0xff) << 16; - *buffer++ = a|r|g|b; + + *buffer++ = a | r | g | b; } } -static FASTCALL void -fbFetch_c4 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_c4 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; - const pixman_indexed_t * indexed = pict->indexed; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const pixman_indexed_t * indexed = image->bits.indexed; int i; - for (i = 0; i < width; ++i) { - uint32_t p = Fetch4(pict, bits, i + x); - + + for (i = 0; i < width; ++i) + { + uint32_t p = FETCH_4 (image, bits, i + x); + *buffer++ = indexed->rgba[p]; } } - -static FASTCALL void -fbFetch_a1 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_a1 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; int i; - for (i = 0; i < width; ++i) { - uint32_t p = READ(pict, bits + ((i + x) >> 5)); - uint32_t a; -#if BITMAP_BIT_ORDER == MSBFirst - a = p >> (0x1f - ((i+x) & 0x1f)); + + for (i = 0; i < width; ++i) + { + uint32_t p = READ (image, bits + ((i + x) >> 5)); + uint32_t a; + +#ifdef WORDS_BIGENDIAN + a = p >> (0x1f - ((i + x) & 0x1f)); #else - a = p >> ((i+x) & 0x1f); + a = p >> ((i + x) & 0x1f); #endif a = a & 1; a |= a << 1; a |= a << 2; a |= a << 4; + *buffer++ = a << 24; } } -static FASTCALL void -fbFetch_g1 (bits_image_t *pict, int x, int y, int width, uint32_t *buffer) +static void +fetch_scanline_g1 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t 
*mask, + uint32_t mask_bits) { - const uint32_t *bits = pict->bits + y*pict->rowstride; - const pixman_indexed_t * indexed = pict->indexed; + const uint32_t *bits = image->bits.bits + y * image->bits.rowstride; + const pixman_indexed_t * indexed = image->bits.indexed; int i; - for (i = 0; i < width; ++i) { - uint32_t p = READ(pict, bits + ((i+x) >> 5)); + + for (i = 0; i < width; ++i) + { + uint32_t p = READ (image, bits + ((i + x) >> 5)); uint32_t a; -#if BITMAP_BIT_ORDER == MSBFirst - a = p >> (0x1f - ((i+x) & 0x1f)); + +#ifdef WORDS_BIGENDIAN + a = p >> (0x1f - ((i + x) & 0x1f)); #else - a = p >> ((i+x) & 0x1f); + a = p >> ((i + x) & 0x1f); #endif a = a & 1; + *buffer++ = indexed->rgba[a]; } } -static FASTCALL void -fbFetch_yuy2 (bits_image_t *pict, int x, int line, int width, uint32_t *buffer) +static void +fetch_scanline_yuy2 (pixman_image_t *image, + int x, + int line, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - int16_t y, u, v; - int32_t r, g, b; - int i; - - const uint32_t *bits = pict->bits + pict->rowstride * line; - + const uint32_t *bits = image->bits.bits + image->bits.rowstride * line; + int i; + for (i = 0; i < width; i++) { + int16_t y, u, v; + int32_t r, g, b; + y = ((uint8_t *) bits)[(x + i) << 1] - 16; - u = ((uint8_t *) bits)[(((x + i) << 1) & -4) + 1] - 128; - v = ((uint8_t *) bits)[(((x + i) << 1) & -4) + 3] - 128; - + u = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 1] - 128; + v = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 3] - 128; + /* R = 1.164(Y - 16) + 1.596(V - 128) */ r = 0x012b27 * y + 0x019a2e * v; /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */ g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u; /* B = 1.164(Y - 16) + 2.018(U - 128) */ b = 0x012b27 * y + 0x0206a2 * u; - - WRITE(pict, buffer++, 0xff000000 | - (r >= 0 ? r < 0x1000000 ? r & 0xff0000 : 0xff0000 : 0) | - (g >= 0 ? g < 0x1000000 ? (g >> 8) & 0x00ff00 : 0x00ff00 : 0) | - (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0)); + + *buffer++ = 0xff000000 | + (r >= 0 ? r < 0x1000000 ? r & 0xff0000 : 0xff0000 : 0) | + (g >= 0 ? g < 0x1000000 ? (g >> 8) & 0x00ff00 : 0x00ff00 : 0) | + (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0); } } -static FASTCALL void -fbFetch_yv12 (bits_image_t *pict, int x, int line, int width, uint32_t *buffer) +static void +fetch_scanline_yv12 (pixman_image_t *image, + int x, + int line, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { - YV12_SETUP(pict); - uint8_t *pY = YV12_Y (line); - uint8_t *pU = YV12_U (line); - uint8_t *pV = YV12_V (line); - int16_t y, u, v; - int32_t r, g, b; - int i; - + YV12_SETUP (image); + uint8_t *y_line = YV12_Y (line); + uint8_t *u_line = YV12_U (line); + uint8_t *v_line = YV12_V (line); + int i; + for (i = 0; i < width; i++) { - y = pY[x + i] - 16; - u = pU[(x + i) >> 1] - 128; - v = pV[(x + i) >> 1] - 128; + int16_t y, u, v; + int32_t r, g, b; + + y = y_line[x + i] - 16; + u = u_line[(x + i) >> 1] - 128; + v = v_line[(x + i) >> 1] - 128; /* R = 1.164(Y - 16) + 1.596(V - 128) */ r = 0x012b27 * y + 0x019a2e * v; @@ -709,537 +1127,601 @@ fbFetch_yv12 (bits_image_t *pict, int x, int line, int width, uint32_t *buffer) /* B = 1.164(Y - 16) + 2.018(U - 128) */ b = 0x012b27 * y + 0x0206a2 * u; - WRITE(pict, buffer++, 0xff000000 | + *buffer++ = 0xff000000 | (r >= 0 ? r < 0x1000000 ? r & 0xff0000 : 0xff0000 : 0) | (g >= 0 ? g < 0x1000000 ? (g >> 8) & 0x00ff00 : 0x00ff00 : 0) | - (b >= 0 ? b < 0x1000000 ? 
(b >> 16) & 0x0000ff : 0x0000ff : 0)); + (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0); } } -fetchProc32 ACCESS(pixman_fetchProcForPicture32) (bits_image_t * pict) -{ - switch(pict->format) { - case PIXMAN_a8r8g8b8: return fbFetch_a8r8g8b8; - case PIXMAN_x8r8g8b8: return fbFetch_x8r8g8b8; - case PIXMAN_a8b8g8r8: return fbFetch_a8b8g8r8; - case PIXMAN_x8b8g8r8: return fbFetch_x8b8g8r8; - case PIXMAN_b8g8r8a8: return fbFetch_b8g8r8a8; - case PIXMAN_b8g8r8x8: return fbFetch_b8g8r8x8; - /* These two require wide compositing */ - case PIXMAN_a2b10g10r10: return NULL; - case PIXMAN_x2b10g10r10: return NULL; - - /* 24bpp formats */ - case PIXMAN_r8g8b8: return fbFetch_r8g8b8; - case PIXMAN_b8g8r8: return fbFetch_b8g8r8; - - /* 16bpp formats */ - case PIXMAN_r5g6b5: return fbFetch_r5g6b5; - case PIXMAN_b5g6r5: return fbFetch_b5g6r5; - - case PIXMAN_a1r5g5b5: return fbFetch_a1r5g5b5; - case PIXMAN_x1r5g5b5: return fbFetch_x1r5g5b5; - case PIXMAN_a1b5g5r5: return fbFetch_a1b5g5r5; - case PIXMAN_x1b5g5r5: return fbFetch_x1b5g5r5; - case PIXMAN_a4r4g4b4: return fbFetch_a4r4g4b4; - case PIXMAN_x4r4g4b4: return fbFetch_x4r4g4b4; - case PIXMAN_a4b4g4r4: return fbFetch_a4b4g4r4; - case PIXMAN_x4b4g4r4: return fbFetch_x4b4g4r4; - - /* 8bpp formats */ - case PIXMAN_a8: return fbFetch_a8; - case PIXMAN_r3g3b2: return fbFetch_r3g3b2; - case PIXMAN_b2g3r3: return fbFetch_b2g3r3; - case PIXMAN_a2r2g2b2: return fbFetch_a2r2g2b2; - case PIXMAN_a2b2g2r2: return fbFetch_a2b2g2r2; - case PIXMAN_c8: return fbFetch_c8; - case PIXMAN_g8: return fbFetch_c8; - case PIXMAN_x4a4: return fbFetch_x4a4; - - /* 4bpp formats */ - case PIXMAN_a4: return fbFetch_a4; - case PIXMAN_r1g2b1: return fbFetch_r1g2b1; - case PIXMAN_b1g2r1: return fbFetch_b1g2r1; - case PIXMAN_a1r1g1b1: return fbFetch_a1r1g1b1; - case PIXMAN_a1b1g1r1: return fbFetch_a1b1g1r1; - case PIXMAN_c4: return fbFetch_c4; - case PIXMAN_g4: return fbFetch_c4; - - /* 1bpp formats */ - case PIXMAN_a1: return fbFetch_a1; - case PIXMAN_g1: return fbFetch_g1; +/**************************** Pixel wise fetching *****************************/ - /* YUV formats */ - case PIXMAN_yuy2: return fbFetch_yuy2; - case PIXMAN_yv12: return fbFetch_yv12; - } +/* Despite the type, expects a uint64_t buffer */ +static uint64_t +fetch_pixel_a2r10g10b10 (bits_image_t *image, + int offset, + int line) +{ + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t p = READ (image, bits + offset); + uint64_t a = p >> 30; + uint64_t r = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t b = p & 0x3ff; - return NULL; -} + r = r << 6 | r >> 4; + g = g << 6 | g >> 4; + b = b << 6 | b >> 4; -static FASTCALL void -fbFetch64_generic (bits_image_t *pict, int x, int y, int width, uint64_t *buffer) -{ - fetchProc32 fetch32 = ACCESS(pixman_fetchProcForPicture32) (pict); + a <<= 14; + a |= a >> 2; + a |= a >> 4; + a |= a >> 8; - // Fetch the pixels into the first half of buffer and then expand them in - // place. 
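
The removed fbFetch64_generic (its body continues below with the fetch32 call) and the 0.16 code that replaces it take the same approach: fetch the scanline at 32 bpp, then widen every channel to 16 bits in place with pixman_expand. pixman_expand lives in pixman-utils.c and is not part of this hunk; the sketch below only illustrates the kind of 8-to-16-bit replication it performs, with the helper name invented here:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical helper: replicate an 8-bit channel into 16 bits,
     * so 0x00 -> 0x0000 and 0xff -> 0xffff. */
    static uint16_t
    widen_8_to_16 (uint8_t c)
    {
        return (uint16_t) (c * 0x0101);    /* same as (c << 8) | c */
    }

    int
    main (void)
    {
        uint32_t argb8 = 0x80ff40c0;       /* an a8r8g8b8 pixel */
        uint64_t a = widen_8_to_16 (argb8 >> 24);
        uint64_t r = widen_8_to_16 ((argb8 >> 16) & 0xff);
        uint64_t g = widen_8_to_16 ((argb8 >> 8) & 0xff);
        uint64_t b = widen_8_to_16 (argb8 & 0xff);

        /* one 16-bit-per-channel pixel, as held in the uint64_t buffers */
        printf ("%016llx\n",
                (unsigned long long) (a << 48 | r << 32 | g << 16 | b));
        return 0;
    }

Replication rather than zero-padding keeps full-intensity channels at full intensity: 0xff becomes 0xffff, not 0xff00.
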
- fetch32(pict, x, y, width, (uint32_t*)buffer); - pixman_expand(buffer, (uint32_t*)buffer, pict->format, width); + return a << 48 | r << 32 | g << 16 | b; } -fetchProc64 ACCESS(pixman_fetchProcForPicture64) (bits_image_t * pict) +/* Despite the type, this function expects a uint64_t buffer */ +static uint64_t +fetch_pixel_x2r10g10b10 (bits_image_t *image, + int offset, + int line) { - switch(pict->format) { - case PIXMAN_a2b10g10r10: return fbFetch_a2b10g10r10; - case PIXMAN_x2b10g10r10: return fbFetch_x2b10g10r10; - default: return fbFetch64_generic; - } + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t p = READ (image, bits + offset); + uint64_t r = (p >> 20) & 0x3ff; + uint64_t g = (p >> 10) & 0x3ff; + uint64_t b = p & 0x3ff; + + r = r << 6 | r >> 4; + g = g << 6 | g >> 4; + b = b << 6 | b >> 4; + + return 0xffffULL << 48 | r << 32 | g << 16 | b; } -/**************************** Pixel wise fetching *****************************/ - -static FASTCALL uint64_t -fbFetchPixel_a2b10g10r10 (bits_image_t *pict, int offset, int line) +/* Despite the type, expects a uint64_t buffer */ +static uint64_t +fetch_pixel_a2b10g10r10 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t p = READ(pict, bits + offset); + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t p = READ (image, bits + offset); uint64_t a = p >> 30; uint64_t b = (p >> 20) & 0x3ff; uint64_t g = (p >> 10) & 0x3ff; uint64_t r = p & 0x3ff; - + r = r << 6 | r >> 4; g = g << 6 | g >> 4; b = b << 6 | b >> 4; - - a <<= 62; + + a <<= 14; a |= a >> 2; a |= a >> 4; a |= a >> 8; - + return a << 48 | r << 32 | g << 16 | b; } -static FASTCALL uint64_t -fbFetchPixel_x2b10g10r10 (bits_image_t *pict, int offset, int line) +/* Despite the type, this function expects a uint64_t buffer */ +static uint64_t +fetch_pixel_x2b10g10r10 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t p = READ(pict, bits + offset); + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t p = READ (image, bits + offset); uint64_t b = (p >> 20) & 0x3ff; uint64_t g = (p >> 10) & 0x3ff; uint64_t r = p & 0x3ff; - + r = r << 6 | r >> 4; g = g << 6 | g >> 4; b = b << 6 | b >> 4; - + return 0xffffULL << 48 | r << 32 | g << 16 | b; } -static FASTCALL uint32_t -fbFetchPixel_a8r8g8b8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a8r8g8b8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - return READ(pict, (uint32_t *)bits + offset); + uint32_t *bits = image->bits + line * image->rowstride; + return READ (image, (uint32_t *)bits + offset); } -static FASTCALL uint32_t -fbFetchPixel_x8r8g8b8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_x8r8g8b8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - return READ(pict, (uint32_t *)bits + offset) | 0xff000000; + uint32_t *bits = image->bits + line * image->rowstride; + + return READ (image, (uint32_t *)bits + offset) | 0xff000000; } -static FASTCALL uint32_t -fbFetchPixel_a8b8g8r8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a8b8g8r8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint32_t *)bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, 
(uint32_t *)bits + offset); + return ((pixel & 0xff000000) | ((pixel >> 16) & 0xff) | (pixel & 0x0000ff00) | ((pixel & 0xff) << 16)); } -static FASTCALL uint32_t -fbFetchPixel_x8b8g8r8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_x8b8g8r8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint32_t *)bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint32_t *)bits + offset); + return ((0xff000000) | ((pixel >> 16) & 0xff) | (pixel & 0x0000ff00) | ((pixel & 0xff) << 16)); } -static FASTCALL uint32_t -fbFetchPixel_b8g8r8a8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_b8g8r8a8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint32_t *)bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint32_t *)bits + offset); + return ((pixel & 0xff000000) >> 24 | (pixel & 0x00ff0000) >> 8 | (pixel & 0x0000ff00) << 8 | (pixel & 0x000000ff) << 24); } -static FASTCALL uint32_t -fbFetchPixel_b8g8r8x8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_b8g8r8x8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint32_t *)bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint32_t *)bits + offset); + return ((0xff000000) | (pixel & 0xff000000) >> 24 | (pixel & 0x00ff0000) >> 8 | (pixel & 0x0000ff00) << 8); } -static FASTCALL uint32_t -fbFetchPixel_r8g8b8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_r8g8b8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint8_t *pixel = ((uint8_t *) bits) + (offset*3); -#if IMAGE_BYTE_ORDER == MSBFirst + uint32_t *bits = image->bits + line * image->rowstride; + uint8_t *pixel = ((uint8_t *) bits) + (offset * 3); + +#ifdef WORDS_BIGENDIAN return (0xff000000 | - (READ(pict, pixel + 0) << 16) | - (READ(pict, pixel + 1) << 8) | - (READ(pict, pixel + 2))); + (READ (image, pixel + 0) << 16) | + (READ (image, pixel + 1) << 8) | + (READ (image, pixel + 2))); #else return (0xff000000 | - (READ(pict, pixel + 2) << 16) | - (READ(pict, pixel + 1) << 8) | - (READ(pict, pixel + 0))); + (READ (image, pixel + 2) << 16) | + (READ (image, pixel + 1) << 8) | + (READ (image, pixel + 0))); #endif } -static FASTCALL uint32_t -fbFetchPixel_b8g8r8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_b8g8r8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint8_t *pixel = ((uint8_t *) bits) + (offset*3); -#if IMAGE_BYTE_ORDER == MSBFirst + uint32_t *bits = image->bits + line * image->rowstride; + uint8_t *pixel = ((uint8_t *) bits) + (offset * 3); +#ifdef WORDS_BIGENDIAN return (0xff000000 | - (READ(pict, pixel + 2) << 16) | - (READ(pict, pixel + 1) << 8) | - (READ(pict, pixel + 0))); + (READ (image, pixel + 2) << 16) | + (READ (image, pixel + 1) << 8) | + (READ (image, pixel + 0))); #else return (0xff000000 | - (READ(pict, pixel + 0) << 16) | - (READ(pict, pixel + 1) << 8) | - (READ(pict, pixel + 2))); + (READ (image, pixel + 0) << 16) | + (READ (image, pixel + 1) << 8) | + (READ (image, pixel + 2))); #endif } -static FASTCALL uint32_t 
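
The 16 bpp pixel fetchers that follow, beginning with fbFetchPixel_r5g6b5 / fetch_pixel_r5g6b5, widen each 5- or 6-bit channel by OR-ing the channel with a shifted copy of its own top bits, so 0 maps to 0x00 and the channel maximum to 0xff. The masked form in the code is equivalent to plain bit replication; a minimal standalone check (names chosen here for illustration):

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
        uint32_t pixel, p5, r1, r2;

        for (p5 = 0; p5 < 32; p5++)
        {
            pixel = p5 << 11;    /* red channel of an r5g6b5 pixel */

            /* masked form used by fetch_pixel_r5g6b5, shifted back down */
            r1 = (((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) << 8) >> 16;

            /* plain bit replication: 5 bits -> 8 bits */
            r2 = (p5 << 3) | (p5 >> 2);

            assert (r1 == r2);   /* 0 -> 0x00, 31 -> 0xff */
        }
        return 0;
    }

The same identity, with different masks and shifts, underlies every 15 and 16 bpp fetcher in this file.
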
-fbFetchPixel_r5g6b5 (bits_image_t *pict, int offset, int line)
+static uint32_t
+fetch_pixel_r5g6b5 (bits_image_t *image,
+                    int offset,
+                    int line)
 {
-    uint32_t r,g,b;
-    uint32_t *bits = pict->bits + line*pict->rowstride;
-    uint32_t pixel = READ(pict, (uint16_t *) bits + offset);
-
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
+    uint32_t r, g, b;
+
     r = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) << 8;
     g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5;
     b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
+
     return (0xff000000 | r | g | b);
 }
 
-static FASTCALL uint32_t
-fbFetchPixel_b5g6r5 (bits_image_t *pict, int offset, int line)
+static uint32_t
+fetch_pixel_b5g6r5 (bits_image_t *image,
+                    int offset,
+                    int line)
 {
-    uint32_t r,g,b;
-    uint32_t *bits = pict->bits + line*pict->rowstride;
-    uint32_t pixel = READ(pict, (uint16_t *) bits + offset);
-
+    uint32_t r, g, b;
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
+
     b = ((pixel & 0xf800) | ((pixel & 0xe000) >> 5)) >> 8;
     g = ((pixel & 0x07e0) | ((pixel & 0x0600) >> 6)) << 5;
     r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14;
+
     return (0xff000000 | r | g | b);
 }
 
-static FASTCALL uint32_t
-fbFetchPixel_a1r5g5b5 (bits_image_t *pict, int offset, int line)
+static uint32_t
+fetch_pixel_a1r5g5b5 (bits_image_t *image,
+                      int offset,
+                      int line)
 {
-    uint32_t a,r,g,b;
-    uint32_t *bits = pict->bits + line*pict->rowstride;
-    uint32_t pixel = READ(pict, (uint16_t *) bits + offset);
-
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
+    uint32_t a, r, g, b;
+
     a = (uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24;
     r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9;
     g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
     b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
+
     return (a | r | g | b);
 }
 
-static FASTCALL uint32_t
-fbFetchPixel_x1r5g5b5 (bits_image_t *pict, int offset, int line)
+static uint32_t
+fetch_pixel_x1r5g5b5 (bits_image_t *image,
+                      int offset,
+                      int line)
 {
-    uint32_t r,g,b;
-    uint32_t *bits = pict->bits + line*pict->rowstride;
-    uint32_t pixel = READ(pict, (uint16_t *) bits + offset);
-
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
+    uint32_t r, g, b;
+
     r = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) << 9;
     g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
     b = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) >> 2;
+
     return (0xff000000 | r | g | b);
 }
 
-static FASTCALL uint32_t
-fbFetchPixel_a1b5g5r5 (bits_image_t *pict, int offset, int line)
+static uint32_t
+fetch_pixel_a1b5g5r5 (bits_image_t *image,
+                      int offset,
+                      int line)
 {
-    uint32_t a,r,g,b;
-    uint32_t *bits = pict->bits + line*pict->rowstride;
-    uint32_t pixel = READ(pict, (uint16_t *) bits + offset);
-
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t pixel = READ (image, (uint16_t *) bits + offset);
+    uint32_t a, r, g, b;
+
     a = (uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24;
     b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7;
     g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6;
     r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14;
+
     return (a | r | g | b);
 }
 
-static FASTCALL uint32_t
-fbFetchPixel_x1b5g5r5 (bits_image_t *pict, int offset, int line)
+static uint32_t
+fetch_pixel_x1b5g5r5 (bits_image_t *image,
+                      int offset,
+                      int line)
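
fetch_pixel_a1r5g5b5 and fetch_pixel_a1b5g5r5 above turn the single alpha bit into a full 0x00 or 0xff byte by negating it in 8-bit arithmetic: 0 - 1 wraps to 0xff in a uint8_t, while 0 - 0 stays 0. The x1 variants, whose x1b5g5r5 body continues below, instead substitute a constant 0xff000000. A standalone check of the negation trick:

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
        uint32_t pixel;

        /* alpha bit set: 0 - 1 wraps to 0xff as a uint8_t */
        pixel = 0x8000;
        assert (((uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24)
                == 0xff000000);

        /* alpha bit clear: 0 - 0 stays 0 */
        pixel = 0x0000;
        assert (((uint32_t) ((uint8_t) (0 - ((pixel & 0x8000) >> 15))) << 24)
                == 0x00000000);

        return 0;
    }
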
{ - uint32_t r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint16_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint16_t *) bits + offset); + uint32_t r, g, b; + b = ((pixel & 0x7c00) | ((pixel & 0x7000) >> 5)) >> 7; g = ((pixel & 0x03e0) | ((pixel & 0x0380) >> 5)) << 6; r = ((pixel & 0x001c) | ((pixel & 0x001f) << 5)) << 14; + return (0xff000000 | r | g | b); } -static FASTCALL uint32_t -fbFetchPixel_a4r4g4b4 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a4r4g4b4 (bits_image_t *image, + int offset, + int line) { - uint32_t a,r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint16_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint16_t *) bits + offset); + uint32_t a, r, g, b; + a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16; r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12; g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)); + return (a | r | g | b); } -static FASTCALL uint32_t -fbFetchPixel_x4r4g4b4 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_x4r4g4b4 (bits_image_t *image, + int offset, + int line) { - uint32_t r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint16_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint16_t *) bits + offset); + uint32_t r, g, b; + r = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) << 12; g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; b = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)); + return (0xff000000 | r | g | b); } -static FASTCALL uint32_t -fbFetchPixel_a4b4g4r4 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a4b4g4r4 (bits_image_t *image, + int offset, + int line) { - uint32_t a,r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint16_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint16_t *) bits + offset); + uint32_t a, r, g, b; + a = ((pixel & 0xf000) | ((pixel & 0xf000) >> 4)) << 16; b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) >> 4; g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)) << 16; + return (a | r | g | b); } -static FASTCALL uint32_t -fbFetchPixel_x4b4g4r4 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_x4b4g4r4 (bits_image_t *image, + int offset, + int line) { - uint32_t r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint16_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint16_t *) bits + offset); + uint32_t r, g, b; + b = ((pixel & 0x0f00) | ((pixel & 0x0f00) >> 4)) >> 4; g = ((pixel & 0x00f0) | ((pixel & 0x00f0) >> 4)) << 8; r = ((pixel & 0x000f) | ((pixel & 0x000f) << 4)) << 16; + return (0xff000000 | r | g | b); } -static FASTCALL uint32_t -fbFetchPixel_a8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint8_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + 
uint32_t pixel = READ (image, (uint8_t *) bits + offset); + return pixel << 24; } -static FASTCALL uint32_t -fbFetchPixel_r3g3b2 (bits_image_t *pict, int offset, int line) -{ - uint32_t r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint8_t *) bits + offset); - - r = ((pixel & 0xe0) | ((pixel & 0xe0) >> 3) | ((pixel & 0xc0) >> 6)) << 16; - g = ((pixel & 0x1c) | ((pixel & 0x18) >> 3) | ((pixel & 0x1c) << 3)) << 8; +static uint32_t +fetch_pixel_r3g3b2 (bits_image_t *image, + int offset, + int line) +{ + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint8_t *) bits + offset); + uint32_t r, g, b; + + r = ((pixel & 0xe0) | + ((pixel & 0xe0) >> 3) | + ((pixel & 0xc0) >> 6)) << 16; + + g = ((pixel & 0x1c) | + ((pixel & 0x18) >> 3) | + ((pixel & 0x1c) << 3)) << 8; + b = (((pixel & 0x03) ) | ((pixel & 0x03) << 2) | ((pixel & 0x03) << 4) | ((pixel & 0x03) << 6)); + return (0xff000000 | r | g | b); } -static FASTCALL uint32_t -fbFetchPixel_b2g3r3 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_b2g3r3 (bits_image_t *image, + int offset, + int line) { - uint32_t r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint8_t *) bits + offset); - - b = (((pixel & 0xc0) ) | - ((pixel & 0xc0) >> 2) | - ((pixel & 0xc0) >> 4) | + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint8_t *) bits + offset); + uint32_t r, g, b; + + b = ((pixel & 0xc0) | + ((pixel & 0xc0) >> 2) | + ((pixel & 0xc0) >> 4) | ((pixel & 0xc0) >> 6)); - g = ((pixel & 0x38) | ((pixel & 0x38) >> 3) | ((pixel & 0x30) << 2)) << 8; - r = (((pixel & 0x07) ) | - ((pixel & 0x07) << 3) | + + g = ((pixel & 0x38) | + ((pixel & 0x38) >> 3) | + ((pixel & 0x30) << 2)) << 8; + + r = ((pixel & 0x07) | + ((pixel & 0x07) << 3) | ((pixel & 0x06) << 6)) << 16; + return (0xff000000 | r | g | b); } -static FASTCALL uint32_t -fbFetchPixel_a2r2g2b2 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a2r2g2b2 (bits_image_t *image, + int offset, + int line) { - uint32_t a,r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint8_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint8_t *) bits + offset); + uint32_t a, r, g, b; + a = ((pixel & 0xc0) * 0x55) << 18; r = ((pixel & 0x30) * 0x55) << 12; g = ((pixel & 0x0c) * 0x55) << 6; b = ((pixel & 0x03) * 0x55); - return a|r|g|b; + + return a | r | g | b; } -static FASTCALL uint32_t -fbFetchPixel_a2b2g2r2 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a2b2g2r2 (bits_image_t *image, + int offset, + int line) { - uint32_t a,r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint8_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint8_t *) bits + offset); + uint32_t a, r, g, b; + a = ((pixel & 0xc0) * 0x55) << 18; b = ((pixel & 0x30) * 0x55) >> 6; g = ((pixel & 0x0c) * 0x55) << 6; r = ((pixel & 0x03) * 0x55) << 16; - return a|r|g|b; + + return a | r | g | b; } -static FASTCALL uint32_t -fbFetchPixel_c8 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_c8 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint8_t *) bits + offset); - const pixman_indexed_t * indexed = 
pict->indexed; + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint8_t *) bits + offset); + const pixman_indexed_t * indexed = image->indexed; + return indexed->rgba[pixel]; } -static FASTCALL uint32_t -fbFetchPixel_x4a4 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_x4a4 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, (uint8_t *) bits + offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, (uint8_t *) bits + offset); + return ((pixel & 0xf) | ((pixel & 0xf) << 4)) << 24; } -static FASTCALL uint32_t -fbFetchPixel_a4 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a4 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = Fetch4(pict, bits, offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = FETCH_4 (image, bits, offset); + pixel |= pixel << 4; return pixel << 24; } -static FASTCALL uint32_t -fbFetchPixel_r1g2b1 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_r1g2b1 (bits_image_t *image, + int offset, + int line) { - uint32_t r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = Fetch4(pict, bits, offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = FETCH_4 (image, bits, offset); + uint32_t r, g, b; + r = ((pixel & 0x8) * 0xff) << 13; g = ((pixel & 0x6) * 0x55) << 7; b = ((pixel & 0x1) * 0xff); - return 0xff000000|r|g|b; + + return 0xff000000 | r | g | b; } -static FASTCALL uint32_t -fbFetchPixel_b1g2r1 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_b1g2r1 (bits_image_t *image, + int offset, + int line) { - uint32_t r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = Fetch4(pict, bits, offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = FETCH_4 (image, bits, offset); + uint32_t r, g, b; + b = ((pixel & 0x8) * 0xff) >> 3; g = ((pixel & 0x6) * 0x55) << 7; r = ((pixel & 0x1) * 0xff) << 16; - return 0xff000000|r|g|b; + + return 0xff000000 | r | g | b; } -static FASTCALL uint32_t -fbFetchPixel_a1r1g1b1 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a1r1g1b1 (bits_image_t *image, + int offset, + int line) { - uint32_t a,r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = Fetch4(pict, bits, offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = FETCH_4 (image, bits, offset); + uint32_t a, r, g, b; + a = ((pixel & 0x8) * 0xff) << 21; r = ((pixel & 0x4) * 0xff) << 14; g = ((pixel & 0x2) * 0xff) << 7; b = ((pixel & 0x1) * 0xff); - return a|r|g|b; + + return a | r | g | b; } -static FASTCALL uint32_t -fbFetchPixel_a1b1g1r1 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a1b1g1r1 (bits_image_t *image, + int offset, + int line) { - uint32_t a,r,g,b; - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = Fetch4(pict, bits, offset); - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = FETCH_4 (image, bits, offset); + uint32_t a, r, g, b; + a = ((pixel & 0x8) * 0xff) << 21; r = ((pixel & 0x4) * 0xff) >> 3; g = ((pixel & 0x2) * 0xff) << 7; b = ((pixel & 0x1) * 0xff) << 16; - return a|r|g|b; + + return a | r | g | b; } -static FASTCALL uint32_t 
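
The 4 bpp fetchers above widen their sub-byte channels by multiplication: a 1-bit channel times 0xff gives 0 or 255 (e.g. ((pixel & 0x8) * 0xff) << 21 for alpha in a1r1g1b1), and a 2-bit channel times 0x55 replicates its two bits across the byte, as in a2r2g2b2 earlier. A minimal check of both identities (the indexed c4 fetcher continues below):

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
        uint32_t v;

        /* 1-bit channel: multiplying by 0xff maps 0 -> 0x00, 1 -> 0xff;
         * the masked forms above fold the bit's position into the shift. */
        for (v = 0; v < 2; v++)
            assert (v * 0xff == (v ? 0xffu : 0x00u));

        /* 2-bit channel: multiplying by 0x55 replicates the two bits
         * across the byte: 0 -> 0x00, 1 -> 0x55, 2 -> 0xaa, 3 -> 0xff. */
        for (v = 0; v < 4; v++)
            assert (v * 0x55 == ((v << 6) | (v << 4) | (v << 2) | v));

        return 0;
    }
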
-fbFetchPixel_c4 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_c4 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = Fetch4(pict, bits, offset); - const pixman_indexed_t * indexed = pict->indexed; - + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = FETCH_4 (image, bits, offset); + const pixman_indexed_t * indexed = image->indexed; + return indexed->rgba[pixel]; } - -static FASTCALL uint32_t -fbFetchPixel_a1 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_a1 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, bits + (offset >> 5)); - uint32_t a; -#if BITMAP_BIT_ORDER == MSBFirst + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, bits + (offset >> 5)); + uint32_t a; + +#ifdef WORDS_BIGENDIAN a = pixel >> (0x1f - (offset & 0x1f)); #else a = pixel >> (offset & 0x1f); @@ -1248,826 +1730,1147 @@ fbFetchPixel_a1 (bits_image_t *pict, int offset, int line) a |= a << 1; a |= a << 2; a |= a << 4; + return a << 24; } -static FASTCALL uint32_t -fbFetchPixel_g1 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_g1 (bits_image_t *image, + int offset, + int line) { - uint32_t *bits = pict->bits + line*pict->rowstride; - uint32_t pixel = READ(pict, bits + (offset >> 5)); - const pixman_indexed_t * indexed = pict->indexed; + uint32_t *bits = image->bits + line * image->rowstride; + uint32_t pixel = READ (image, bits + (offset >> 5)); + const pixman_indexed_t * indexed = image->indexed; uint32_t a; -#if BITMAP_BIT_ORDER == MSBFirst + +#ifdef WORDS_BIGENDIAN a = pixel >> (0x1f - (offset & 0x1f)); #else a = pixel >> (offset & 0x1f); #endif a = a & 1; + return indexed->rgba[a]; } -static FASTCALL uint32_t -fbFetchPixel_yuy2 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_yuy2 (bits_image_t *image, + int offset, + int line) { + const uint32_t *bits = image->bits + image->rowstride * line; + int16_t y, u, v; int32_t r, g, b; - - const uint32_t *bits = pict->bits + pict->rowstride * line; - + y = ((uint8_t *) bits)[offset << 1] - 16; - u = ((uint8_t *) bits)[((offset << 1) & -4) + 1] - 128; - v = ((uint8_t *) bits)[((offset << 1) & -4) + 3] - 128; - + u = ((uint8_t *) bits)[((offset << 1) & - 4) + 1] - 128; + v = ((uint8_t *) bits)[((offset << 1) & - 4) + 3] - 128; + /* R = 1.164(Y - 16) + 1.596(V - 128) */ r = 0x012b27 * y + 0x019a2e * v; + /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */ g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u; + /* B = 1.164(Y - 16) + 2.018(U - 128) */ b = 0x012b27 * y + 0x0206a2 * u; - + return 0xff000000 | (r >= 0 ? r < 0x1000000 ? r & 0xff0000 : 0xff0000 : 0) | (g >= 0 ? g < 0x1000000 ? (g >> 8) & 0x00ff00 : 0x00ff00 : 0) | (b >= 0 ? b < 0x1000000 ? 
(b >> 16) & 0x0000ff : 0x0000ff : 0); } -static FASTCALL uint32_t -fbFetchPixel_yv12 (bits_image_t *pict, int offset, int line) +static uint32_t +fetch_pixel_yv12 (bits_image_t *image, + int offset, + int line) { - YV12_SETUP(pict); + YV12_SETUP (image); int16_t y = YV12_Y (line)[offset] - 16; int16_t u = YV12_U (line)[offset >> 1] - 128; int16_t v = YV12_V (line)[offset >> 1] - 128; int32_t r, g, b; - + /* R = 1.164(Y - 16) + 1.596(V - 128) */ r = 0x012b27 * y + 0x019a2e * v; + /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */ g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u; + /* B = 1.164(Y - 16) + 2.018(U - 128) */ b = 0x012b27 * y + 0x0206a2 * u; - + return 0xff000000 | (r >= 0 ? r < 0x1000000 ? r & 0xff0000 : 0xff0000 : 0) | (g >= 0 ? g < 0x1000000 ? (g >> 8) & 0x00ff00 : 0x00ff00 : 0) | (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0); } -/* - * XXX: The transformed fetch path only works at 32-bpp so far. When all paths - * have wide versions, this can be removed. - * - * WARNING: This function loses precision! - */ -static FASTCALL uint32_t -fbFetchPixel32_generic_lossy (bits_image_t *pict, int offset, int line) -{ - fetchPixelProc64 fetchPixel64 = ACCESS(pixman_fetchPixelProcForPicture64) (pict); - const uint64_t argb16Pixel = fetchPixel64(pict, offset, line); - uint32_t argb8Pixel; - - pixman_contract(&argb8Pixel, &argb16Pixel, 1); - - return argb8Pixel; -} - -fetchPixelProc32 ACCESS(pixman_fetchPixelProcForPicture32) (bits_image_t * pict) -{ - switch(pict->format) { - case PIXMAN_a8r8g8b8: return fbFetchPixel_a8r8g8b8; - case PIXMAN_x8r8g8b8: return fbFetchPixel_x8r8g8b8; - case PIXMAN_a8b8g8r8: return fbFetchPixel_a8b8g8r8; - case PIXMAN_x8b8g8r8: return fbFetchPixel_x8b8g8r8; - case PIXMAN_b8g8r8a8: return fbFetchPixel_b8g8r8a8; - case PIXMAN_b8g8r8x8: return fbFetchPixel_b8g8r8x8; - /* These two require wide compositing */ - case PIXMAN_a2b10g10r10: return fbFetchPixel32_generic_lossy; - case PIXMAN_x2b10g10r10: return fbFetchPixel32_generic_lossy; - - /* 24bpp formats */ - case PIXMAN_r8g8b8: return fbFetchPixel_r8g8b8; - case PIXMAN_b8g8r8: return fbFetchPixel_b8g8r8; - - /* 16bpp formats */ - case PIXMAN_r5g6b5: return fbFetchPixel_r5g6b5; - case PIXMAN_b5g6r5: return fbFetchPixel_b5g6r5; - - case PIXMAN_a1r5g5b5: return fbFetchPixel_a1r5g5b5; - case PIXMAN_x1r5g5b5: return fbFetchPixel_x1r5g5b5; - case PIXMAN_a1b5g5r5: return fbFetchPixel_a1b5g5r5; - case PIXMAN_x1b5g5r5: return fbFetchPixel_x1b5g5r5; - case PIXMAN_a4r4g4b4: return fbFetchPixel_a4r4g4b4; - case PIXMAN_x4r4g4b4: return fbFetchPixel_x4r4g4b4; - case PIXMAN_a4b4g4r4: return fbFetchPixel_a4b4g4r4; - case PIXMAN_x4b4g4r4: return fbFetchPixel_x4b4g4r4; - - /* 8bpp formats */ - case PIXMAN_a8: return fbFetchPixel_a8; - case PIXMAN_r3g3b2: return fbFetchPixel_r3g3b2; - case PIXMAN_b2g3r3: return fbFetchPixel_b2g3r3; - case PIXMAN_a2r2g2b2: return fbFetchPixel_a2r2g2b2; - case PIXMAN_a2b2g2r2: return fbFetchPixel_a2b2g2r2; - case PIXMAN_c8: return fbFetchPixel_c8; - case PIXMAN_g8: return fbFetchPixel_c8; - case PIXMAN_x4a4: return fbFetchPixel_x4a4; - - /* 4bpp formats */ - case PIXMAN_a4: return fbFetchPixel_a4; - case PIXMAN_r1g2b1: return fbFetchPixel_r1g2b1; - case PIXMAN_b1g2r1: return fbFetchPixel_b1g2r1; - case PIXMAN_a1r1g1b1: return fbFetchPixel_a1r1g1b1; - case PIXMAN_a1b1g1r1: return fbFetchPixel_a1b1g1r1; - case PIXMAN_c4: return fbFetchPixel_c4; - case PIXMAN_g4: return fbFetchPixel_c4; - - /* 1bpp formats */ - case PIXMAN_a1: return fbFetchPixel_a1; - case PIXMAN_g1: return 
fbFetchPixel_g1; +/*********************************** Store ************************************/ - /* YUV formats */ - case PIXMAN_yuy2: return fbFetchPixel_yuy2; - case PIXMAN_yv12: return fbFetchPixel_yv12; +#define SPLIT_A(v) \ + uint32_t a = ((v) >> 24), \ + r = ((v) >> 16) & 0xff, \ + g = ((v) >> 8) & 0xff, \ + b = (v) & 0xff + +#define SPLIT(v) \ + uint32_t r = ((v) >> 16) & 0xff, \ + g = ((v) >> 8) & 0xff, \ + b = (v) & 0xff + +static void +store_scanline_a2r10g10b10 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) +{ + uint32_t *bits = image->bits + image->rowstride * y; + uint32_t *pixel = bits + x; + uint64_t *values = (uint64_t *)v; + int i; + + for (i = 0; i < width; ++i) + { + WRITE (image, pixel++, + ((values[i] >> 32) & 0xc0000000) | + ((values[i] >> 18) & 0x3ff00000) | + ((values[i] >> 12) & 0xffc00) | + ((values[i] >> 6) & 0x3ff)); } - - return NULL; } -static FASTCALL uint64_t -fbFetchPixel64_generic (bits_image_t *pict, int offset, int line) +static void +store_scanline_x2r10g10b10 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) { - fetchPixelProc32 fetchPixel32 = ACCESS(pixman_fetchPixelProcForPicture32) (pict); - uint32_t argb8Pixel = fetchPixel32(pict, offset, line); - uint64_t argb16Pixel; - - pixman_expand(&argb16Pixel, &argb8Pixel, pict->format, 1); - - return argb16Pixel; -} - -fetchPixelProc64 ACCESS(pixman_fetchPixelProcForPicture64) (bits_image_t * pict) -{ - switch(pict->format) { - case PIXMAN_a2b10g10r10: return fbFetchPixel_a2b10g10r10; - case PIXMAN_x2b10g10r10: return fbFetchPixel_x2b10g10r10; - default: return fbFetchPixel64_generic; + uint32_t *bits = image->bits + image->rowstride * y; + uint64_t *values = (uint64_t *)v; + uint32_t *pixel = bits + x; + int i; + + for (i = 0; i < width; ++i) + { + WRITE (image, pixel++, + ((values[i] >> 18) & 0x3ff00000) | + ((values[i] >> 12) & 0xffc00) | + ((values[i] >> 6) & 0x3ff)); } } -/*********************************** Store ************************************/ - -#define Splita(v) uint32_t a = ((v) >> 24), r = ((v) >> 16) & 0xff, g = ((v) >> 8) & 0xff, b = (v) & 0xff -#define Split(v) uint32_t r = ((v) >> 16) & 0xff, g = ((v) >> 8) & 0xff, b = (v) & 0xff - -static FASTCALL void -fbStore_a2b10g10r10 (pixman_image_t *image, - uint32_t *bits, const uint64_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a2b10g10r10 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint32_t *pixel = bits + x; - for (i = 0; i < width; ++i) { - WRITE(image, pixel++, - ((values[i] >> 32) & 0xc0000000) | // A - ((values[i] >> 38) & 0x3ff) | // R - ((values[i] >> 12) & 0xffc00) | // G - ((values[i] << 14) & 0x3ff00000)); // B + uint64_t *values = (uint64_t *)v; + int i; + + for (i = 0; i < width; ++i) + { + WRITE (image, pixel++, + ((values[i] >> 32) & 0xc0000000) | + ((values[i] >> 38) & 0x3ff) | + ((values[i] >> 12) & 0xffc00) | + ((values[i] << 14) & 0x3ff00000)); } } -static FASTCALL void -fbStore_x2b10g10r10 (pixman_image_t *image, - uint32_t *bits, const uint64_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_x2b10g10r10 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *v) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; + uint64_t *values = (uint64_t *)v; uint32_t *pixel = bits + x; - for (i = 0; i < width; ++i) { - WRITE(image, pixel++, - ((values[i] >> 38) & 
0x3ff) | // R - ((values[i] >> 12) & 0xffc00) | // G - ((values[i] << 14) & 0x3ff00000)); // B + int i; + + for (i = 0; i < width; ++i) + { + WRITE (image, pixel++, + ((values[i] >> 38) & 0x3ff) | + ((values[i] >> 12) & 0xffc00) | + ((values[i] << 14) & 0x3ff00000)); } } -static FASTCALL void -fbStore_a8r8g8b8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a8r8g8b8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - MEMCPY_WRAPPED(image, ((uint32_t *)bits) + x, values, width*sizeof(uint32_t)); + uint32_t *bits = image->bits + image->rowstride * y; + + MEMCPY_WRAPPED (image, ((uint32_t *)bits) + x, values, + width * sizeof(uint32_t)); } -static FASTCALL void -fbStore_x8r8g8b8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_x8r8g8b8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint32_t *pixel = (uint32_t *)bits + x; + int i; + for (i = 0; i < width; ++i) - WRITE(image, pixel++, values[i] & 0xffffff); + WRITE (image, pixel++, values[i] & 0xffffff); } -static FASTCALL void -fbStore_a8b8g8r8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a8b8g8r8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint32_t *pixel = (uint32_t *)bits + x; + int i; + for (i = 0; i < width; ++i) - WRITE(image, pixel++, (values[i] & 0xff00ff00) | ((values[i] >> 16) & 0xff) | ((values[i] & 0xff) << 16)); + { + WRITE (image, pixel++, + (values[i] & 0xff00ff00) | + ((values[i] >> 16) & 0xff) | + ((values[i] & 0xff) << 16)); + } } -static FASTCALL void -fbStore_x8b8g8r8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_x8b8g8r8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint32_t *pixel = (uint32_t *)bits + x; + int i; + for (i = 0; i < width; ++i) - WRITE(image, pixel++, (values[i] & 0x0000ff00) | ((values[i] >> 16) & 0xff) | ((values[i] & 0xff) << 16)); + { + WRITE (image, pixel++, + (values[i] & 0x0000ff00) | + ((values[i] >> 16) & 0xff) | + ((values[i] & 0xff) << 16)); + } } -static FASTCALL void -fbStore_b8g8r8a8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_b8g8r8a8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint32_t *pixel = (uint32_t *)bits + x; + int i; + for (i = 0; i < width; ++i) - WRITE(image, pixel++, - ((values[i] >> 24) & 0x000000ff) | - ((values[i] >> 8) & 0x0000ff00) | - ((values[i] << 8) & 0x00ff0000) | - ((values[i] << 24) & 0xff000000)); + { + WRITE (image, pixel++, + ((values[i] >> 24) & 0x000000ff) | + ((values[i] >> 8) & 0x0000ff00) | + ((values[i] << 8) & 0x00ff0000) | + ((values[i] << 24) & 0xff000000)); + } } -static FASTCALL void -fbStore_b8g8r8x8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void 
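
store_scanline_b8g8r8a8 above is a full byte reversal of the canonical a8r8g8b8 value, and fetch_pixel_b8g8r8a8 earlier in the diff applies the inverse shifts, so a store followed by a fetch round-trips exactly; the b8g8r8x8 variant continues below. A standalone sketch of the two expressions, with helper names invented here:

    #include <assert.h>
    #include <stdint.h>

    /* Pack a canonical a8r8g8b8 value into b8g8r8a8 memory order,
     * as in store_scanline_b8g8r8a8. */
    static uint32_t
    pack_b8g8r8a8 (uint32_t v)
    {
        return ((v >> 24) & 0x000000ff) |
               ((v >>  8) & 0x0000ff00) |
               ((v <<  8) & 0x00ff0000) |
               ((v << 24) & 0xff000000);
    }

    /* Unpack back to a8r8g8b8, as in fetch_pixel_b8g8r8a8. */
    static uint32_t
    unpack_b8g8r8a8 (uint32_t p)
    {
        return ((p & 0xff000000) >> 24 |
                (p & 0x00ff0000) >>  8 |
                (p & 0x0000ff00) <<  8 |
                (p & 0x000000ff) << 24);
    }

    int
    main (void)
    {
        uint32_t argb = 0x80ff40c0;    /* a=0x80 r=0xff g=0x40 b=0xc0 */

        assert (pack_b8g8r8a8 (argb) == 0xc040ff80);   /* bytes reversed */
        assert (unpack_b8g8r8a8 (pack_b8g8r8a8 (argb)) == argb);

        return 0;
    }
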
+store_scanline_b8g8r8x8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint32_t *pixel = (uint32_t *)bits + x; + int i; + for (i = 0; i < width; ++i) - WRITE(image, pixel++, - ((values[i] >> 8) & 0x0000ff00) | - ((values[i] << 8) & 0x00ff0000) | - ((values[i] << 24) & 0xff000000)); + { + WRITE (image, pixel++, + ((values[i] >> 8) & 0x0000ff00) | + ((values[i] << 8) & 0x00ff0000) | + ((values[i] << 24) & 0xff000000)); + } } -static FASTCALL void -fbStore_r8g8b8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, - const pixman_indexed_t * indexed) +static void +store_scanline_r8g8b8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; + uint8_t *pixel = ((uint8_t *) bits) + 3 * x; int i; - uint8_t *pixel = ((uint8_t *) bits) + 3*x; - for (i = 0; i < width; ++i) { - Store24(image, pixel, values[i]); - pixel += 3; + + for (i = 0; i < width; ++i) + { + uint32_t val = values[i]; + +#ifdef WORDS_BIGENDIAN + WRITE (image, pixel++, (val & 0x00ff0000) >> 16); + WRITE (image, pixel++, (val & 0x0000ff00) >> 8); + WRITE (image, pixel++, (val & 0x000000ff) >> 0); +#else + WRITE (image, pixel++, (val & 0x000000ff) >> 0); + WRITE (image, pixel++, (val & 0x0000ff00) >> 8); + WRITE (image, pixel++, (val & 0x00ff0000) >> 16); +#endif } } -static FASTCALL void -fbStore_b8g8r8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_b8g8r8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; + uint8_t *pixel = ((uint8_t *) bits) + 3 * x; int i; - uint8_t *pixel = ((uint8_t *) bits) + 3*x; - for (i = 0; i < width; ++i) { + + for (i = 0; i < width; ++i) + { uint32_t val = values[i]; -#if IMAGE_BYTE_ORDER == MSBFirst - WRITE(image, pixel++, Blue(val)); - WRITE(image, pixel++, Green(val)); - WRITE(image, pixel++, Red(val)); + +#ifdef WORDS_BIGENDIAN + WRITE (image, pixel++, (val & 0x000000ff) >> 0); + WRITE (image, pixel++, (val & 0x0000ff00) >> 8); + WRITE (image, pixel++, (val & 0x00ff0000) >> 16); #else - WRITE(image, pixel++, Red(val)); - WRITE(image, pixel++, Green(val)); - WRITE(image, pixel++, Blue(val)); + WRITE (image, pixel++, (val & 0x00ff0000) >> 16); + WRITE (image, pixel++, (val & 0x0000ff00) >> 8); + WRITE (image, pixel++, (val & 0x000000ff) >> 0); #endif } } -static FASTCALL void -fbStore_r5g6b5 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_r5g6b5 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { + int i; + + for (i = 0; i < width; ++i) + { uint32_t s = values[i]; - WRITE(image, pixel++, ((s >> 3) & 0x001f) | - ((s >> 5) & 0x07e0) | - ((s >> 8) & 0xf800)); + + WRITE (image, pixel++, + ((s >> 3) & 0x001f) | + ((s >> 5) & 0x07e0) | + ((s >> 8) & 0xf800)); } } -static FASTCALL void -fbStore_b5g6r5 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_b5g6r5 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t 
*bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Split(values[i]); - WRITE(image, pixel++, ((b << 8) & 0xf800) | - ((g << 3) & 0x07e0) | - ((r >> 3) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT (values[i]); + + WRITE (image, pixel++, + ((b << 8) & 0xf800) | + ((g << 3) & 0x07e0) | + ((r >> 3) )); } } -static FASTCALL void -fbStore_a1r5g5b5 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a1r5g5b5 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Splita(values[i]); - WRITE(image, pixel++, ((a << 8) & 0x8000) | - ((r << 7) & 0x7c00) | - ((g << 2) & 0x03e0) | - ((b >> 3) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT_A (values[i]); + + WRITE (image, pixel++, + ((a << 8) & 0x8000) | + ((r << 7) & 0x7c00) | + ((g << 2) & 0x03e0) | + ((b >> 3) )); } } -static FASTCALL void -fbStore_x1r5g5b5 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_x1r5g5b5 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Split(values[i]); - WRITE(image, pixel++, ((r << 7) & 0x7c00) | - ((g << 2) & 0x03e0) | - ((b >> 3) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT (values[i]); + + WRITE (image, pixel++, + ((r << 7) & 0x7c00) | + ((g << 2) & 0x03e0) | + ((b >> 3) )); } } -static FASTCALL void -fbStore_a1b5g5r5 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a1b5g5r5 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Splita(values[i]); - WRITE(image, pixel++, ((a << 8) & 0x8000) | - ((b << 7) & 0x7c00) | - ((g << 2) & 0x03e0) | - ((r >> 3) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT_A (values[i]); + + WRITE (image, pixel++, + ((a << 8) & 0x8000) | + ((b << 7) & 0x7c00) | + ((g << 2) & 0x03e0) | + ((r >> 3) )); } } -static FASTCALL void -fbStore_x1b5g5r5 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_x1b5g5r5 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Split(values[i]); - WRITE(image, pixel++, ((b << 7) & 0x7c00) | - ((g << 2) & 0x03e0) | - ((r >> 3) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT (values[i]); + + WRITE (image, pixel++, ((b << 7) & 0x7c00) | + ((g << 2) & 0x03e0) | + ((r >> 3) )); } } -static FASTCALL void -fbStore_a4r4g4b4 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a4r4g4b4 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + 
image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Splita(values[i]); - WRITE(image, pixel++, ((a << 8) & 0xf000) | - ((r << 4) & 0x0f00) | - ((g ) & 0x00f0) | - ((b >> 4) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT_A (values[i]); + + WRITE (image, pixel++, + ((a << 8) & 0xf000) | + ((r << 4) & 0x0f00) | + ((g ) & 0x00f0) | + ((b >> 4) )); } } -static FASTCALL void -fbStore_x4r4g4b4 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_x4r4g4b4 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Split(values[i]); - WRITE(image, pixel++, ((r << 4) & 0x0f00) | - ((g ) & 0x00f0) | - ((b >> 4) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT (values[i]); + + WRITE (image, pixel++, + ((r << 4) & 0x0f00) | + ((g ) & 0x00f0) | + ((b >> 4) )); } } -static FASTCALL void -fbStore_a4b4g4r4 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a4b4g4r4 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Splita(values[i]); - WRITE(image, pixel++, ((a << 8) & 0xf000) | - ((b << 4) & 0x0f00) | - ((g ) & 0x00f0) | - ((r >> 4) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT_A (values[i]); + WRITE (image, pixel++, ((a << 8) & 0xf000) | + ((b << 4) & 0x0f00) | + ((g ) & 0x00f0) | + ((r >> 4) )); } } -static FASTCALL void -fbStore_x4b4g4r4 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_x4b4g4r4 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint16_t *pixel = ((uint16_t *) bits) + x; - for (i = 0; i < width; ++i) { - Split(values[i]); - WRITE(image, pixel++, ((b << 4) & 0x0f00) | - ((g ) & 0x00f0) | - ((r >> 4) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT (values[i]); + + WRITE (image, pixel++, + ((b << 4) & 0x0f00) | + ((g ) & 0x00f0) | + ((r >> 4) )); } } -static FASTCALL void -fbStore_a8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint8_t *pixel = ((uint8_t *) bits) + x; - for (i = 0; i < width; ++i) { - WRITE(image, pixel++, values[i] >> 24); + int i; + + for (i = 0; i < width; ++i) + { + WRITE (image, pixel++, values[i] >> 24); } } -static FASTCALL void -fbStore_r3g3b2 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_r3g3b2 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint8_t *pixel = ((uint8_t *) bits) + x; - for (i = 0; i < width; ++i) { - Split(values[i]); - WRITE(image, pixel++, - ((r ) & 0xe0) | - ((g >> 3) & 0x1c) | - ((b >> 6) )); + int i; + + for (i = 0; 
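The SPLIT and SPLIT_A macros used throughout these stores are not defined in this hunk; judging from the expressions above they declare locals r, g, b (and a for SPLIT_A) holding the 8-bit channels of an a8r8g8b8 pixel. A sketch under that assumption, reproducing the a4r4g4b4 packing (macro bodies and names here are an assumption, not pixman's definitions):

#include <stdint.h>
#include <assert.h>

/* Assumed expansion of SPLIT / SPLIT_A, consistent with the usage above. */
#define SPLIT(v)                          \
    uint32_t r = ((v) >> 16) & 0xff,      \
             g = ((v) >>  8) & 0xff,      \
             b =  (v)        & 0xff

#define SPLIT_A(v)                        \
    uint32_t a = ((v) >> 24) & 0xff;      \
    SPLIT (v)

static uint16_t pack_a4r4g4b4 (uint32_t v)   /* illustrative name */
{
    SPLIT_A (v);

    return ((a << 8) & 0xf000) |
           ((r << 4) & 0x0f00) |
           ((g     ) & 0x00f0) |
           ((b >> 4)         );
}

int main (void)
{
    assert (pack_a4r4g4b4 (0xffffffff) == 0xffff);
    assert (pack_a4r4g4b4 (0xf0102030) == 0xf123);
    return 0;
}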
i < width; ++i) + { + SPLIT (values[i]); + + WRITE (image, pixel++, + ((r ) & 0xe0) | + ((g >> 3) & 0x1c) | + ((b >> 6) )); } } -static FASTCALL void -fbStore_b2g3r3 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_b2g3r3 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint8_t *pixel = ((uint8_t *) bits) + x; - for (i = 0; i < width; ++i) { - Split(values[i]); - WRITE(image, pixel++, - ((b ) & 0xc0) | - ((g >> 2) & 0x38) | - ((r >> 5) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT (values[i]); + + WRITE (image, pixel++, + ((b ) & 0xc0) | + ((g >> 2) & 0x38) | + ((r >> 5) )); } } -static FASTCALL void -fbStore_a2r2g2b2 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a2r2g2b2 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint8_t *pixel = ((uint8_t *) bits) + x; - for (i = 0; i < width; ++i) { - Splita(values[i]); - WRITE(image, pixel++, ((a ) & 0xc0) | - ((r >> 2) & 0x30) | - ((g >> 4) & 0x0c) | - ((b >> 6) )); + int i; + + for (i = 0; i < width; ++i) + { + SPLIT_A (values[i]); + + WRITE (image, pixel++, + ((a ) & 0xc0) | + ((r >> 2) & 0x30) | + ((g >> 4) & 0x0c) | + ((b >> 6) )); } } -static FASTCALL void -fbStore_c8 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a2b2g2r2 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint8_t *pixel = ((uint8_t *) bits) + x; - for (i = 0; i < width; ++i) { - WRITE(image, pixel++, miIndexToEnt24(indexed,values[i])); - } + int i; + + for (i = 0; i < width; ++i) + { + SPLIT_A (values[i]); + + *(pixel++) = + ((a ) & 0xc0) | + ((b >> 2) & 0x30) | + ((g >> 4) & 0x0c) | + ((r >> 6) ); + } +} + +static void +store_scanline_c8 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) +{ + uint32_t *bits = image->bits + image->rowstride * y; + uint8_t *pixel = ((uint8_t *) bits) + x; + const pixman_indexed_t *indexed = image->indexed; + int i; + + for (i = 0; i < width; ++i) + WRITE (image, pixel++, RGB24_TO_ENTRY (indexed,values[i])); } -static FASTCALL void -fbStore_x4a4 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_x4a4 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { - int i; + uint32_t *bits = image->bits + image->rowstride * y; uint8_t *pixel = ((uint8_t *) bits) + x; - for (i = 0; i < width; ++i) { - WRITE(image, pixel++, values[i] >> 28); - } + int i; + + for (i = 0; i < width; ++i) + WRITE (image, pixel++, values[i] >> 28); } -#define Store8(img,l,o,v) (WRITE(img, (uint8_t *)(l) + ((o) >> 3), (v))) -#if IMAGE_BYTE_ORDER == MSBFirst -#define Store4(img,l,o,v) Store8(img,l,o,((o) & 4 ? \ - (Fetch8(img,l,o) & 0xf0) | (v) : \ - (Fetch8(img,l,o) & 0x0f) | ((v) << 4))) +#define STORE_8(img,l,o,v) (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v))) +#ifdef WORDS_BIGENDIAN +#define STORE_4(img,l,o,v) \ + STORE_8 (img,l,o,((o) & 4 ? 
\ + (FETCH_8 (img,l,o) & 0xf0) | (v) : \ + (FETCH_8 (img,l,o) & 0x0f) | ((v) << 4))) #else -#define Store4(img,l,o,v) Store8(img,l,o,((o) & 4 ? \ - (Fetch8(img,l,o) & 0x0f) | ((v) << 4) : \ - (Fetch8(img,l,o) & 0xf0) | (v))) +#define STORE_4(img,l,o,v) \ + STORE_8 (img,l,o,((o) & 4 ? \ + (FETCH_8 (img,l,o) & 0x0f) | ((v) << 4) : \ + (FETCH_8 (img,l,o) & 0xf0) | (v))) #endif -static FASTCALL void -fbStore_a4 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a4 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; int i; - for (i = 0; i < width; ++i) { - Store4(image, bits, i + x, values[i]>>28); - } + + for (i = 0; i < width; ++i) + STORE_4 (image, bits, i + x, values[i] >> 28); } -static FASTCALL void -fbStore_r1g2b1 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_r1g2b1 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; int i; - for (i = 0; i < width; ++i) { - uint32_t pixel; - - Split(values[i]); + + for (i = 0; i < width; ++i) + { + uint32_t pixel; + + SPLIT (values[i]); pixel = (((r >> 4) & 0x8) | - ((g >> 5) & 0x6) | - ((b >> 7) )); - Store4(image, bits, i + x, pixel); + ((g >> 5) & 0x6) | + ((b >> 7) )); + STORE_4 (image, bits, i + x, pixel); } } -static FASTCALL void -fbStore_b1g2r1 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_b1g2r1 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; int i; - for (i = 0; i < width; ++i) { - uint32_t pixel; - - Split(values[i]); + + for (i = 0; i < width; ++i) + { + uint32_t pixel; + + SPLIT (values[i]); pixel = (((b >> 4) & 0x8) | - ((g >> 5) & 0x6) | - ((r >> 7) )); - Store4(image, bits, i + x, pixel); + ((g >> 5) & 0x6) | + ((r >> 7) )); + STORE_4 (image, bits, i + x, pixel); } } -static FASTCALL void -fbStore_a1r1g1b1 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a1r1g1b1 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; int i; - for (i = 0; i < width; ++i) { - uint32_t pixel; - Splita(values[i]); + + for (i = 0; i < width; ++i) + { + uint32_t pixel; + + SPLIT_A (values[i]); pixel = (((a >> 4) & 0x8) | - ((r >> 5) & 0x4) | - ((g >> 6) & 0x2) | - ((b >> 7) )); - Store4(image, bits, i + x, pixel); + ((r >> 5) & 0x4) | + ((g >> 6) & 0x2) | + ((b >> 7) )); + STORE_4 (image, bits, i + x, pixel); } } -static FASTCALL void -fbStore_a1b1g1r1 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a1b1g1r1 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; int i; - for (i = 0; i < width; ++i) { - uint32_t pixel; - Splita(values[i]); + + for (i = 0; i < width; ++i) + { + uint32_t pixel; + + SPLIT_A (values[i]); pixel = (((a >> 4) & 0x8) | - ((b >> 5) & 0x4) | - ((g >> 6) & 0x2) | - ((r >> 7) )); - Store4(image, bits, i + x, pixel); + ((b >> 5) & 0x4) | + 
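STORE_4 above picks the byte with o >> 3 and the nibble with o & 4, so its offset arithmetic is in bit units; on little-endian hosts the even nibble of a byte is the low one. A standalone little-endian sketch of that addressing, scaling a pixel index to a bit offset explicitly (names are illustrative, and the caller-side offset convention is an assumption):

#include <stdint.h>
#include <assert.h>

static void store4 (uint8_t *line, int pixel, uint8_t v)
{
    int o = pixel * 4;                /* bit offset of the nibble */
    uint8_t *p = line + (o >> 3);

    if (o & 4)                        /* odd pixel: high nibble */
        *p = (*p & 0x0f) | (uint8_t)(v << 4);
    else                              /* even pixel: low nibble */
        *p = (*p & 0xf0) | v;
}

int main (void)
{
    uint8_t line[2] = { 0, 0 };

    store4 (line, 0, 0xa);
    store4 (line, 1, 0xb);
    store4 (line, 2, 0xc);
    assert (line[0] == 0xba && line[1] == 0x0c);
    return 0;
}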
((g >> 6) & 0x2) | + ((r >> 7) )); + STORE_4 (image, bits, i + x, pixel); } } -static FASTCALL void -fbStore_c4 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_c4 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; + const pixman_indexed_t *indexed = image->indexed; int i; - for (i = 0; i < width; ++i) { - uint32_t pixel; - - pixel = miIndexToEnt24(indexed, values[i]); - Store4(image, bits, i + x, pixel); + + for (i = 0; i < width; ++i) + { + uint32_t pixel; + + pixel = RGB24_TO_ENTRY (indexed, values[i]); + STORE_4 (image, bits, i + x, pixel); } } -static FASTCALL void -fbStore_a1 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_a1 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; int i; - for (i = 0; i < width; ++i) { - uint32_t *pixel = ((uint32_t *) bits) + ((i+x) >> 5); - uint32_t mask = FbStipMask((i+x) & 0x1f, 1); - - uint32_t v = values[i] & 0x80000000 ? mask : 0; - WRITE(image, pixel, (READ(image, pixel) & ~mask) | v); + + for (i = 0; i < width; ++i) + { + uint32_t *pixel = ((uint32_t *) bits) + ((i + x) >> 5); + uint32_t mask, v; + +#ifdef WORDS_BIGENDIAN + mask = 1 << (0x1f - ((i + x) & 0x1f)); +#else + mask = 1 << ((i + x) & 0x1f); +#endif + v = values[i] & 0x80000000 ? mask : 0; + + WRITE (image, pixel, (READ (image, pixel) & ~mask) | v); } } -static FASTCALL void -fbStore_g1 (pixman_image_t *image, - uint32_t *bits, const uint32_t *values, int x, int width, const pixman_indexed_t * indexed) +static void +store_scanline_g1 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) { + uint32_t *bits = image->bits + image->rowstride * y; + const pixman_indexed_t *indexed = image->indexed; int i; - for (i = 0; i < width; ++i) { - uint32_t *pixel = ((uint32_t *) bits) + ((i+x) >> 5); - uint32_t mask = FbStipMask((i+x) & 0x1f, 1); - - uint32_t v = miIndexToEntY24(indexed,values[i]) ? 
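The a1 store above addresses pixels inside 32-bit words: pixel p lives in word p >> 5, and within the word little-endian hosts use bit p & 31 while WORDS_BIGENDIAN hosts use the mirrored bit 31 - (p & 31). A minimal scalar version of the little-endian case (names illustrative):

#include <stdint.h>
#include <assert.h>

static void store_a1 (uint32_t *bits, int p, int on)
{
    uint32_t *word = bits + (p >> 5);
    uint32_t mask = 1u << (p & 0x1f);    /* little-endian variant */

    *word = (*word & ~mask) | (on ? mask : 0);
}

int main (void)
{
    uint32_t row[2] = { 0, 0 };

    store_a1 (row, 0, 1);     /* bit 0 of word 0 */
    store_a1 (row, 33, 1);    /* bit 1 of word 1 */
    assert (row[0] == 1 && row[1] == 2);
    return 0;
}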
mask : 0; - WRITE(image, pixel, (READ(image, pixel) & ~mask) | v); - } -} - - -storeProc32 ACCESS(pixman_storeProcForPicture32) (bits_image_t * pict) -{ - switch(pict->format) { - case PIXMAN_a8r8g8b8: return fbStore_a8r8g8b8; - case PIXMAN_x8r8g8b8: return fbStore_x8r8g8b8; - case PIXMAN_a8b8g8r8: return fbStore_a8b8g8r8; - case PIXMAN_x8b8g8r8: return fbStore_x8b8g8r8; - case PIXMAN_b8g8r8a8: return fbStore_b8g8r8a8; - case PIXMAN_b8g8r8x8: return fbStore_b8g8r8x8; - - /* 24bpp formats */ - case PIXMAN_r8g8b8: return fbStore_r8g8b8; - case PIXMAN_b8g8r8: return fbStore_b8g8r8; - - /* 16bpp formats */ - case PIXMAN_r5g6b5: return fbStore_r5g6b5; - case PIXMAN_b5g6r5: return fbStore_b5g6r5; - - case PIXMAN_a1r5g5b5: return fbStore_a1r5g5b5; - case PIXMAN_x1r5g5b5: return fbStore_x1r5g5b5; - case PIXMAN_a1b5g5r5: return fbStore_a1b5g5r5; - case PIXMAN_x1b5g5r5: return fbStore_x1b5g5r5; - case PIXMAN_a4r4g4b4: return fbStore_a4r4g4b4; - case PIXMAN_x4r4g4b4: return fbStore_x4r4g4b4; - case PIXMAN_a4b4g4r4: return fbStore_a4b4g4r4; - case PIXMAN_x4b4g4r4: return fbStore_x4b4g4r4; - - /* 8bpp formats */ - case PIXMAN_a8: return fbStore_a8; - case PIXMAN_r3g3b2: return fbStore_r3g3b2; - case PIXMAN_b2g3r3: return fbStore_b2g3r3; - case PIXMAN_a2r2g2b2: return fbStore_a2r2g2b2; - case PIXMAN_c8: return fbStore_c8; - case PIXMAN_g8: return fbStore_c8; - case PIXMAN_x4a4: return fbStore_x4a4; - - /* 4bpp formats */ - case PIXMAN_a4: return fbStore_a4; - case PIXMAN_r1g2b1: return fbStore_r1g2b1; - case PIXMAN_b1g2r1: return fbStore_b1g2r1; - case PIXMAN_a1r1g1b1: return fbStore_a1r1g1b1; - case PIXMAN_a1b1g1r1: return fbStore_a1b1g1r1; - case PIXMAN_c4: return fbStore_c4; - case PIXMAN_g4: return fbStore_c4; - - /* 1bpp formats */ - case PIXMAN_a1: return fbStore_a1; - case PIXMAN_g1: return fbStore_g1; - default: - return NULL; + + for (i = 0; i < width; ++i) + { + uint32_t *pixel = ((uint32_t *) bits) + ((i + x) >> 5); + uint32_t mask, v; + +#ifdef WORDS_BIGENDIAN + mask = 1 << (0x1f - ((i + x) & 0x1f)); +#else + mask = 1 << ((i + x) & 0x1f); +#endif + v = RGB24_TO_ENTRY_Y (indexed, values[i]) ? mask : 0; + + WRITE (image, pixel, (READ (image, pixel) & ~mask) | v); } } /* * Contracts a 64bpp image to 32bpp and then stores it using a regular 32-bit - * store proc. + * store proc. Despite the type, this function expects a uint64_t buffer. */ -static FASTCALL void -fbStore64_generic (pixman_image_t *image, - uint32_t *bits, const uint64_t *values, int x, int width, const pixman_indexed_t * indexed) -{ - bits_image_t *pict = (bits_image_t*)image; - storeProc32 store32 = ACCESS(pixman_storeProcForPicture32) (pict); - uint32_t *argb8Pixels; - - assert(image->common.type == BITS); - assert(store32); - - argb8Pixels = pixman_malloc_ab (width, sizeof(uint32_t)); - if (!argb8Pixels) return; - - // Contract the scanline. We could do this in place if values weren't - // const. - pixman_contract(argb8Pixels, values, width); - store32(image, bits, argb8Pixels, x, width, indexed); - - free(argb8Pixels); +static void +store_scanline_generic_64 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values) +{ + uint32_t *argb8_pixels; + + assert (image->common.type == BITS); + + argb8_pixels = pixman_malloc_ab (width, sizeof(uint32_t)); + if (!argb8_pixels) + return; + + /* Contract the scanline. We could do this in place if values weren't + * const. 
+ */ + pixman_contract (argb8_pixels, (uint64_t *)values, width); + + image->store_scanline_raw_32 (image, x, y, width, argb8_pixels); + + free (argb8_pixels); } -storeProc64 ACCESS(pixman_storeProcForPicture64) (bits_image_t * pict) -{ - switch(pict->format) { - case PIXMAN_a2b10g10r10: return fbStore_a2b10g10r10; - case PIXMAN_x2b10g10r10: return fbStore_x2b10g10r10; - default: return fbStore64_generic; - } +/* Despite the type, this function expects both buffer + * and mask to be uint64_t + */ +static void +fetch_scanline_generic_64 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) +{ + /* Fetch the pixels into the first half of buffer and then expand them in + * place. + */ + image->bits.fetch_scanline_raw_32 (image, x, y, width, buffer, NULL, 0); + + pixman_expand ((uint64_t *)buffer, buffer, image->bits.format, width); } -#ifndef PIXMAN_FB_ACCESSORS -/* - * Helper routine to expand a color component from 0 < n <= 8 bits to 16 bits by - * replication. - */ -static inline uint64_t expand16(const uint8_t val, int nbits) +/* Despite the type, this function expects a uint64_t *buffer */ +static uint64_t +fetch_pixel_generic_64 (bits_image_t *image, + int offset, + int line) { - // Start out with the high bit of val in the high bit of result. - uint16_t result = (uint16_t)val << (16 - nbits); - - if (nbits == 0) - return 0; - - // Copy the bits in result, doubling the number of bits each time, until we - // fill all 16 bits. - while (nbits < 16) { - result |= result >> nbits; - nbits *= 2; - } + uint32_t pixel32 = image->fetch_pixel_raw_32 (image, offset, line); + uint64_t result; + + pixman_expand ((uint64_t *)&result, &pixel32, image->format, 1); return result; } /* - * This function expands images from ARGB8 format to ARGB16. To preserve - * precision, it needs to know the original source format. For example, if the - * source was PIXMAN_x1r5g5b5 and the red component contained bits 12345, then - * the expanded value is 12345123. To correctly expand this to 16 bits, it - * should be 1234512345123451 and not 1234512312345123. + * XXX: The transformed fetch path only works at 32-bpp so far. When all + * paths have wide versions, this can be removed. + * + * WARNING: This function loses precision! */ -void pixman_expand(uint64_t *dst, const uint32_t *src, - pixman_format_code_t format, int width) +static uint32_t +fetch_pixel_generic_lossy_32 (bits_image_t *image, + int offset, + int line) { - /* - * Determine the sizes of each component and the masks and shifts required - * to extract them from the source pixel. - */ - const int a_size = PIXMAN_FORMAT_A(format), - r_size = PIXMAN_FORMAT_R(format), - g_size = PIXMAN_FORMAT_G(format), - b_size = PIXMAN_FORMAT_B(format); - const int a_shift = 32 - a_size, - r_shift = 24 - r_size, - g_shift = 16 - g_size, - b_shift = 8 - b_size; - const uint8_t a_mask = ~(~0 << a_size), - r_mask = ~(~0 << r_size), - g_mask = ~(~0 << g_size), - b_mask = ~(~0 << b_size); - int i; + uint64_t pixel64 = image->fetch_pixel_raw_64 (image, offset, line); + uint32_t result; + + pixman_contract (&result, &pixel64, 1); - /* Start at the end so that we can do the expansion in place when src == dst */ - for (i = width - 1; i >= 0; i--) - { - const uint32_t pixel = src[i]; - // Extract the components. - const uint8_t a = (pixel >> a_shift) & a_mask, - r = (pixel >> r_shift) & r_mask, - g = (pixel >> g_shift) & g_mask, - b = (pixel >> b_shift) & b_mask; - const uint64_t a16 = a_size ? 
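The expand16 helper being deleted here (the expansion logic now lives elsewhere in the library) implements widening by bit replication, as its comment describes: an n-bit channel pattern is repeated until all 16 bits are filled, so the maximum n-bit value maps exactly to 0xffff. A standalone copy with a worked 5-bit case:

#include <stdint.h>
#include <assert.h>

/* Widen an n-bit channel (0 < n <= 8) to 16 bits by replication. */
static uint16_t expand16 (uint8_t val, int nbits)
{
    /* Start with the high bit of val in the high bit of result. */
    uint16_t result = (uint16_t)val << (16 - nbits);

    /* Double the replicated width each pass until 16 bits are covered. */
    while (nbits < 16)
    {
        result |= result >> nbits;
        nbits *= 2;
    }

    return result;
}

int main (void)
{
    assert (expand16 (0x1f, 5) == 0xffff);  /* max 5-bit value -> max 16-bit   */
    assert (expand16 (0x10, 5) == 0x8421);  /* 10000 -> 1000010000100001       */
    return 0;
}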
expand16(a, a_size) : 0xffff, - r16 = expand16(r, r_size), - g16 = expand16(g, g_size), - b16 = expand16(b, b_size); + return result; +} - dst[i] = a16 << 48 | r16 << 32 | g16 << 16 | b16; +typedef struct +{ + pixman_format_code_t format; + fetch_scanline_t fetch_scanline_raw_32; + fetch_scanline_t fetch_scanline_raw_64; + fetch_pixel_32_t fetch_pixel_raw_32; + fetch_pixel_64_t fetch_pixel_raw_64; + store_scanline_t store_scanline_raw_32; + store_scanline_t store_scanline_raw_64; +} format_info_t; + +#define FORMAT_INFO(format) \ + { \ + PIXMAN_ ## format, \ + fetch_scanline_ ## format, \ + fetch_scanline_generic_64, \ + fetch_pixel_ ## format, fetch_pixel_generic_64, \ + store_scanline_ ## format, store_scanline_generic_64 \ + } + +static const format_info_t accessors[] = +{ +/* 32 bpp formats */ + FORMAT_INFO (a8r8g8b8), + FORMAT_INFO (x8r8g8b8), + FORMAT_INFO (a8b8g8r8), + FORMAT_INFO (x8b8g8r8), + FORMAT_INFO (b8g8r8a8), + FORMAT_INFO (b8g8r8x8), + +/* 24bpp formats */ + FORMAT_INFO (r8g8b8), + FORMAT_INFO (b8g8r8), + +/* 16bpp formats */ + FORMAT_INFO (r5g6b5), + FORMAT_INFO (b5g6r5), + + FORMAT_INFO (a1r5g5b5), + FORMAT_INFO (x1r5g5b5), + FORMAT_INFO (a1b5g5r5), + FORMAT_INFO (x1b5g5r5), + FORMAT_INFO (a4r4g4b4), + FORMAT_INFO (x4r4g4b4), + FORMAT_INFO (a4b4g4r4), + FORMAT_INFO (x4b4g4r4), + +/* 8bpp formats */ + FORMAT_INFO (a8), + FORMAT_INFO (r3g3b2), + FORMAT_INFO (b2g3r3), + FORMAT_INFO (a2r2g2b2), + FORMAT_INFO (a2b2g2r2), + + FORMAT_INFO (c8), + +#define fetch_scanline_g8 fetch_scanline_c8 +#define fetch_pixel_g8 fetch_pixel_c8 +#define store_scanline_g8 store_scanline_c8 + FORMAT_INFO (g8), + +#define fetch_scanline_x4c4 fetch_scanline_c8 +#define fetch_pixel_x4c4 fetch_pixel_c8 +#define store_scanline_x4c4 store_scanline_c8 + FORMAT_INFO (x4c4), + +#define fetch_scanline_x4g4 fetch_scanline_c8 +#define fetch_pixel_x4g4 fetch_pixel_c8 +#define store_scanline_x4g4 store_scanline_c8 + FORMAT_INFO (x4g4), + + FORMAT_INFO (x4a4), + +/* 4bpp formats */ + FORMAT_INFO (a4), + FORMAT_INFO (r1g2b1), + FORMAT_INFO (b1g2r1), + FORMAT_INFO (a1r1g1b1), + FORMAT_INFO (a1b1g1r1), + + FORMAT_INFO (c4), + +#define fetch_scanline_g4 fetch_scanline_c4 +#define fetch_pixel_g4 fetch_pixel_c4 +#define store_scanline_g4 store_scanline_c4 + FORMAT_INFO (g4), + +/* 1bpp formats */ + FORMAT_INFO (a1), + FORMAT_INFO (g1), + +/* Wide formats */ + + { PIXMAN_a2r10g10b10, + NULL, fetch_scanline_a2r10g10b10, + fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10, + NULL, store_scanline_a2r10g10b10 }, + + { PIXMAN_x2r10g10b10, + NULL, fetch_scanline_x2r10g10b10, + fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10, + NULL, store_scanline_x2r10g10b10 }, + + { PIXMAN_a2b10g10r10, + NULL, fetch_scanline_a2b10g10r10, + fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10, + NULL, store_scanline_a2b10g10r10 }, + + { PIXMAN_x2b10g10r10, + NULL, fetch_scanline_x2b10g10r10, + fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10, + NULL, store_scanline_x2b10g10r10 }, + +/* YUV formats */ + { PIXMAN_yuy2, + fetch_scanline_yuy2, fetch_scanline_generic_64, + fetch_pixel_yuy2, fetch_pixel_generic_64, + NULL, NULL }, + + { PIXMAN_yv12, + fetch_scanline_yv12, fetch_scanline_generic_64, + fetch_pixel_yv12, fetch_pixel_generic_64, + NULL, NULL }, + + { PIXMAN_null }, +}; + +static void +setup_accessors (bits_image_t *image) +{ + const format_info_t *info = accessors; + + while (info->format != PIXMAN_null) + { + if (info->format == image->format) + { + image->fetch_scanline_raw_32 = info->fetch_scanline_raw_32; + 
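The accessors[] table above replaces the old per-format switch: each entry carries the raw 32- and 64-bit fetch/store pointers for one format, and setup_accessors walks the table until it finds the requested format or hits the PIXMAN_null sentinel. A reduced sketch of that lookup pattern (types and names simplified, not pixman's):

#include <assert.h>
#include <stddef.h>

typedef enum { FMT_NULL, FMT_A8R8G8B8, FMT_R5G6B5 } fmt_t;

typedef struct
{
    fmt_t format;
    const char *name;    /* stand-in for the fetch/store pointers */
} info_t;

static const info_t table[] =
{
    { FMT_A8R8G8B8, "a8r8g8b8 accessors" },
    { FMT_R5G6B5,   "r5g6b5 accessors"   },
    { FMT_NULL }                           /* sentinel terminates the walk */
};

static const info_t *lookup (fmt_t f)
{
    const info_t *info;

    for (info = table; info->format != FMT_NULL; info++)
    {
        if (info->format == f)
            return info;
    }

    return NULL;    /* unknown format: accessors stay unset */
}

int main (void)
{
    assert (lookup (FMT_R5G6B5) != NULL);
    assert (lookup (FMT_NULL) == NULL);
    return 0;
}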
image->fetch_scanline_raw_64 = info->fetch_scanline_raw_64; + image->fetch_pixel_raw_32 = info->fetch_pixel_raw_32; + image->fetch_pixel_raw_64 = info->fetch_pixel_raw_64; + image->store_scanline_raw_32 = info->store_scanline_raw_32; + image->store_scanline_raw_64 = info->store_scanline_raw_64; + + return; + } + + info++; } } -/* - * Contracting is easier than expanding. We just need to truncate the - * components. - */ -void pixman_contract(uint32_t *dst, const uint64_t *src, int width) +#ifndef PIXMAN_FB_ACCESSORS +void +_pixman_bits_image_setup_raw_accessors_accessors (bits_image_t *image); + +void +_pixman_bits_image_setup_raw_accessors (bits_image_t *image) { - int i; + if (image->read_func || image->write_func) + _pixman_bits_image_setup_raw_accessors_accessors (image); + else + setup_accessors (image); +} - /* Start at the beginning so that we can do the contraction in place when - * src == dst */ - for (i = 0; i < width; i++) - { - const uint8_t a = src[i] >> 56, - r = src[i] >> 40, - g = src[i] >> 24, - b = src[i] >> 8; - dst[i] = a << 24 | r << 16 | g << 8 | b; - } +#else + +void +_pixman_bits_image_setup_raw_accessors_accessors (bits_image_t *image) +{ + setup_accessors (image); } -#endif // PIXMAN_FB_ACCESSORS + +#endif diff --git a/lib/pixman/pixman/pixman-accessor.h b/lib/pixman/pixman/pixman-accessor.h new file mode 100644 index 000000000..90c8ea7b7 --- /dev/null +++ b/lib/pixman/pixman/pixman-accessor.h @@ -0,0 +1,40 @@ +#ifdef PIXMAN_FB_ACCESSORS + +#define ACCESS(sym) sym##_accessors + +#define READ(img, ptr) \ + (((bits_image_t *)(img))->read_func ((ptr), sizeof(*(ptr)))) +#define WRITE(img, ptr,val) \ + (((bits_image_t *)(img))->write_func ((ptr), (val), sizeof (*(ptr)))) + +#define MEMCPY_WRAPPED(img, dst, src, size) \ + do { \ + size_t _i; \ + uint8_t *_dst = (uint8_t*)(dst), *_src = (uint8_t*)(src); \ + for(_i = 0; _i < size; _i++) { \ + WRITE((img), _dst +_i, READ((img), _src + _i)); \ + } \ + } while (0) + +#define MEMSET_WRAPPED(img, dst, val, size) \ + do { \ + size_t _i; \ + uint8_t *_dst = (uint8_t*)(dst); \ + for(_i = 0; _i < (size_t) size; _i++) { \ + WRITE((img), _dst +_i, (val)); \ + } \ + } while (0) + +#else + +#define ACCESS(sym) sym + +#define READ(img, ptr) (*(ptr)) +#define WRITE(img, ptr, val) (*(ptr) = (val)) +#define MEMCPY_WRAPPED(img, dst, src, size) \ + memcpy(dst, src, size) +#define MEMSET_WRAPPED(img, dst, val, size) \ + memset(dst, val, size) + +#endif + diff --git a/lib/pixman/pixman/pixman-arm-neon.c b/lib/pixman/pixman/pixman-arm-neon.c index 5453dbbaf..8a2d72ea3 100644 --- a/lib/pixman/pixman/pixman-arm-neon.c +++ b/lib/pixman/pixman/pixman-arm-neon.c @@ -1,5 +1,5 @@ /* - * Copyright © 2009 ARM Ltd + * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that @@ -20,7 +20,9 @@ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS * SOFTWARE. 
* - * Author: Ian Rickards (ian.rickards@arm.com) + * Author: Ian Rickards (ian.rickards@arm.com) + * Author: Jonathan Morton (jonathan.morton@movial.com) + * Author: Markku Vire (markku.vire@movial.com) * */ @@ -28,1523 +30,2752 @@ #include <config.h> #endif -#include "pixman-arm-neon.h" - #include <arm_neon.h> +#include <string.h> +#include "pixman-private.h" +/* Deal with an intrinsic that is defined differently in GCC */ +#if !defined(__ARMCC_VERSION) && !defined(__pld) +#define __pld(_x) __builtin_prefetch (_x) +#endif -static force_inline uint8x8x4_t unpack0565(uint16x8_t rgb) +static force_inline uint8x8x4_t +unpack0565 (uint16x8_t rgb) { uint16x8_t gb, b; uint8x8x4_t res; - res.val[3] = vdup_n_u8(0); - gb = vshrq_n_u16(rgb, 5); - b = vshrq_n_u16(rgb, 5+6); - res.val[0] = vmovn_u16(rgb); // get low 5 bits - res.val[1] = vmovn_u16(gb); // get mid 6 bits - res.val[2] = vmovn_u16(b); // get top 5 bits + res.val[3] = vdup_n_u8 (0); + gb = vshrq_n_u16 (rgb, 5); + b = vshrq_n_u16 (rgb, 5 + 6); - res.val[0] = vshl_n_u8(res.val[0], 3); // shift to top - res.val[1] = vshl_n_u8(res.val[1], 2); // shift to top - res.val[2] = vshl_n_u8(res.val[2], 3); // shift to top + res.val[0] = vmovn_u16 (rgb); /* get low 5 bits */ + res.val[1] = vmovn_u16 (gb); /* get mid 6 bits */ + res.val[2] = vmovn_u16 (b); /* get top 5 bits */ - res.val[0] = vsri_n_u8(res.val[0], res.val[0], 5); - res.val[1] = vsri_n_u8(res.val[1], res.val[1], 6); - res.val[2] = vsri_n_u8(res.val[2], res.val[2], 5); + res.val[0] = vshl_n_u8 (res.val[0], 3); /* shift to top */ + res.val[1] = vshl_n_u8 (res.val[1], 2); /* shift to top */ + res.val[2] = vshl_n_u8 (res.val[2], 3); /* shift to top */ + + res.val[0] = vsri_n_u8 (res.val[0], res.val[0], 5); + res.val[1] = vsri_n_u8 (res.val[1], res.val[1], 6); + res.val[2] = vsri_n_u8 (res.val[2], res.val[2], 5); return res; } -static force_inline uint16x8_t pack0565(uint8x8x4_t s) +#ifdef USE_GCC_INLINE_ASM +/* Some versions of gcc have problems with vshll_n_u8 intrinsic (Bug 23576) */ +#define vshll_n_u8(a, n) ({ uint16x8_t r; \ + asm ("vshll.u8 %q0, %P1, %2\n" : "=w" (r) : "w" (a), "i" (n)); r; }) +#endif + +static force_inline uint16x8_t +pack0565 (uint8x8x4_t s) { uint16x8_t rgb, val_g, val_r; - rgb = vshll_n_u8(s.val[2],8); - val_g = vshll_n_u8(s.val[1],8); - val_r = vshll_n_u8(s.val[0],8); - rgb = vsriq_n_u16(rgb, val_g, 5); - rgb = vsriq_n_u16(rgb, val_r, 5+6); + rgb = vshll_n_u8 (s.val[2], 8); + val_g = vshll_n_u8 (s.val[1], 8); + val_r = vshll_n_u8 (s.val[0], 8); + rgb = vsriq_n_u16 (rgb, val_g, 5); + rgb = vsriq_n_u16 (rgb, val_r, 5 + 6); return rgb; } -static force_inline uint8x8_t neon2mul(uint8x8_t x, uint8x8_t alpha) +static force_inline uint8x8_t +neon2mul (uint8x8_t x, + uint8x8_t alpha) { - uint16x8_t tmp,tmp2; + uint16x8_t tmp, tmp2; uint8x8_t res; - tmp = vmull_u8(x,alpha); - tmp2 = vrshrq_n_u16(tmp,8); - res = vraddhn_u16(tmp,tmp2); + tmp = vmull_u8 (x, alpha); + tmp2 = vrshrq_n_u16 (tmp, 8); + res = vraddhn_u16 (tmp, tmp2); return res; } -static force_inline uint8x8x4_t neon8mul(uint8x8x4_t x, uint8x8_t alpha) +static force_inline uint8x8x4_t +neon8mul (uint8x8x4_t x, + uint8x8_t alpha) { uint16x8x4_t tmp; uint8x8x4_t res; - uint16x8_t qtmp1,qtmp2; - - tmp.val[0] = vmull_u8(x.val[0],alpha); - tmp.val[1] = vmull_u8(x.val[1],alpha); - tmp.val[2] = vmull_u8(x.val[2],alpha); - tmp.val[3] = vmull_u8(x.val[3],alpha); - - qtmp1 = vrshrq_n_u16(tmp.val[0],8); - qtmp2 = vrshrq_n_u16(tmp.val[1],8); - res.val[0] = vraddhn_u16(tmp.val[0],qtmp1); - qtmp1 = vrshrq_n_u16(tmp.val[2],8); 
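neon2mul above is the usual exact divide-by-255 with rounding, vectorized: vmull.u8 widens x * alpha to 16 bits, and the rounding shift plus rounding add-high-narrow effectively compute (t + 128 + ((t + 128) >> 8)) >> 8 per lane. A scalar model that checks the identity exhaustively (function names are illustrative):

#include <stdint.h>
#include <assert.h>

/* Multiply an 8-bit channel by an 8-bit alpha and divide by 255,
 * rounding to nearest, without a real divide. */
static uint8_t mul_div_255 (uint8_t x, uint8_t a)
{
    uint16_t t = (uint16_t)x * a;

    return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
}

int main (void)
{
    int x, a;

    for (x = 0; x < 256; x++)
        for (a = 0; a < 256; a++)
            assert (mul_div_255 (x, a) == (x * a + 127) / 255);
    return 0;
}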
- res.val[1] = vraddhn_u16(tmp.val[1],qtmp2); - qtmp2 = vrshrq_n_u16(tmp.val[3],8); - res.val[2] = vraddhn_u16(tmp.val[2],qtmp1); - res.val[3] = vraddhn_u16(tmp.val[3],qtmp2); + uint16x8_t qtmp1, qtmp2; + + tmp.val[0] = vmull_u8 (x.val[0], alpha); + tmp.val[1] = vmull_u8 (x.val[1], alpha); + tmp.val[2] = vmull_u8 (x.val[2], alpha); + tmp.val[3] = vmull_u8 (x.val[3], alpha); + + qtmp1 = vrshrq_n_u16 (tmp.val[0], 8); + qtmp2 = vrshrq_n_u16 (tmp.val[1], 8); + res.val[0] = vraddhn_u16 (tmp.val[0], qtmp1); + qtmp1 = vrshrq_n_u16 (tmp.val[2], 8); + res.val[1] = vraddhn_u16 (tmp.val[1], qtmp2); + qtmp2 = vrshrq_n_u16 (tmp.val[3], 8); + res.val[2] = vraddhn_u16 (tmp.val[2], qtmp1); + res.val[3] = vraddhn_u16 (tmp.val[3], qtmp2); return res; } -static force_inline uint8x8x4_t neon8qadd(uint8x8x4_t x, uint8x8x4_t y) +static force_inline uint8x8x4_t +neon8qadd (uint8x8x4_t x, + uint8x8x4_t y) { uint8x8x4_t res; - res.val[0] = vqadd_u8(x.val[0],y.val[0]); - res.val[1] = vqadd_u8(x.val[1],y.val[1]); - res.val[2] = vqadd_u8(x.val[2],y.val[2]); - res.val[3] = vqadd_u8(x.val[3],y.val[3]); + res.val[0] = vqadd_u8 (x.val[0], y.val[0]); + res.val[1] = vqadd_u8 (x.val[1], y.val[1]); + res.val[2] = vqadd_u8 (x.val[2], y.val[2]); + res.val[3] = vqadd_u8 (x.val[3], y.val[3]); return res; } - -void -fbCompositeSrcAdd_8000x8000neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +neon_composite_add_8000_8000 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); - if (width>=8) + if (width >= 8) { - // Use overlapping 8-pixel method - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; + /* Use overlapping 8-pixel method */ + while (height--) + { + uint8_t *keep_dst = 0; + uint8x8_t sval, dval, temp; - uint8_t *keep_dst; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; #ifndef USE_GCC_INLINE_ASM - uint8x8_t sval,dval,temp; + sval = vld1_u8 ((void *)src); + dval = vld1_u8 ((void *)dst); + keep_dst = dst; - sval = vld1_u8((void*)src); - dval = vld1_u8((void*)dst); - keep_dst = dst; + temp = vqadd_u8 (dval, sval); - temp = vqadd_u8(dval,sval); + src += (w & 7); + dst += (w & 7); + w -= (w & 7); - src += (w & 7); - dst += (w & 7); - w -= (w & 7); + while (w) + { + sval = vld1_u8 ((void *)src); + dval = vld1_u8 ((void *)dst); - while (w) - { - sval = vld1_u8((void*)src); - dval = vld1_u8((void*)dst); + vst1_u8 ((void *)keep_dst, temp); + keep_dst = dst; - vst1_u8((void*)keep_dst,temp); - keep_dst = dst; + temp = vqadd_u8 (dval, sval); - temp = 
vqadd_u8(dval,sval); + src += 8; + dst += 8; + w -= 8; + } - src+=8; - dst+=8; - w-=8; - } - vst1_u8((void*)keep_dst,temp); + vst1_u8 ((void *)keep_dst, temp); #else - asm volatile ( -// avoid using d8-d15 (q4-q7) aapcs callee-save registers - "vld1.8 {d0}, [%[src]]\n\t" - "vld1.8 {d4}, [%[dst]]\n\t" - "mov %[keep_dst], %[dst]\n\t" - - "and ip, %[w], #7\n\t" - "add %[src], %[src], ip\n\t" - "add %[dst], %[dst], ip\n\t" - "subs %[w], %[w], ip\n\t" - "b 9f\n\t" -// LOOP - "2:\n\t" - "vld1.8 {d0}, [%[src]]!\n\t" - "vld1.8 {d4}, [%[dst]]!\n\t" - "vst1.8 {d20}, [%[keep_dst]]\n\t" - "sub %[keep_dst], %[dst], #8\n\t" - "subs %[w], %[w], #8\n\t" - "9:\n\t" - "vqadd.u8 d20, d0, d4\n\t" - - "bne 2b\n\t" - - "1:\n\t" - "vst1.8 {d20}, [%[keep_dst]]\n\t" - - : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst) - : - : "ip", "cc", "memory", "d0","d4", - "d20" - ); + asm volatile ( +/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */ + "vld1.8 {d0}, [%[src]]\n\t" + "vld1.8 {d4}, [%[dst]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[src], %[src], ip\n\t" + "add %[dst], %[dst], ip\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +/* LOOP */ + "2:\n\t" + "vld1.8 {d0}, [%[src]]!\n\t" + "vld1.8 {d4}, [%[dst]]!\n\t" + "vst1.8 {d20}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8\n\t" + "subs %[w], %[w], #8\n\t" + "9:\n\t" + "vqadd.u8 d20, d0, d4\n\t" + + "bne 2b\n\t" + + "1:\n\t" + "vst1.8 {d20}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst) + : + : "ip", "cc", "memory", "d0", "d4", + "d20" + ); #endif - } + } } else { - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - uint8x8_t sval, dval; - uint8_t *dst4, *dst2; - - if (w&4) - { - sval = vreinterpret_u8_u32(vld1_lane_u32((void*)src,vreinterpret_u32_u8(sval),1)); - dval = vreinterpret_u8_u32(vld1_lane_u32((void*)dst,vreinterpret_u32_u8(dval),1)); - dst4=dst; - src+=4; - dst+=4; - } - if (w&2) - { - sval = vreinterpret_u8_u16(vld1_lane_u16((void*)src,vreinterpret_u16_u8(sval),1)); - dval = vreinterpret_u8_u16(vld1_lane_u16((void*)dst,vreinterpret_u16_u8(dval),1)); - dst2=dst; - src+=2; - dst+=2; - } - if (w&1) - { - sval = vld1_lane_u8(src,sval,1); - dval = vld1_lane_u8(dst,dval,1); - } - - dval = vqadd_u8(dval,sval); - - if (w&1) - vst1_lane_u8(dst,dval,1); - if (w&2) - vst1_lane_u16((void*)dst2,vreinterpret_u16_u8(dval),1); - if (w&4) - vst1_lane_u32((void*)dst4,vreinterpret_u32_u8(dval),1); - } + const uint8_t nil = 0; + const uint8x8_t vnil = vld1_dup_u8 (&nil); + + while (height--) + { + uint8x8_t sval = vnil, dval = vnil; + uint8_t *dst4 = 0, *dst2 = 0; + + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + if (w & 4) + { + sval = vreinterpret_u8_u32 ( + vld1_lane_u32 ((void *)src, vreinterpret_u32_u8 (sval), 1)); + dval = vreinterpret_u8_u32 ( + vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1)); + + dst4 = dst; + src += 4; + dst += 4; + } + + if (w & 2) + { + sval = vreinterpret_u8_u16 ( + vld1_lane_u16 ((void *)src, vreinterpret_u16_u8 (sval), 1)); + dval = vreinterpret_u8_u16 ( + vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1)); + + dst2 = dst; + src += 2; + dst += 2; + } + + if (w & 1) + { + sval = vld1_lane_u8 (src, sval, 1); + dval = vld1_lane_u8 (dst, dval, 1); + } + + dval = vqadd_u8 (dval, sval); + + if (w & 1) + vst1_lane_u8 (dst, dval, 1); + + if (w & 2) + vst1_lane_u16 ((void *)dst2, 
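The "overlapping 8-pixel method" in neon_composite_add_8000_8000 above avoids a scalar tail loop: it processes pixels 0..7 once, re-aligns to w & 7, then walks full 8-pixel blocks, deferring each store by one iteration so the overlapped pixels are always loaded before the previous result is written back. A scalar model of that schedule (helper names are illustrative):

#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint8_t sat_add (int a, int b)
{
    return (a + b > 255) ? 255 : (uint8_t)(a + b);
}

static void add_row (uint8_t *dst, const uint8_t *src, int w)
{
    uint8_t temp[8], cur[8];
    int pos, keep = 0, j;

    assert (w >= 8);

    for (j = 0; j < 8; j++)                 /* head block: pixels 0..7 */
        temp[j] = sat_add (dst[j], src[j]);

    for (pos = w & 7; pos + 8 <= w; pos += 8)
    {
        for (j = 0; j < 8; j++)             /* load before the store below,      */
            cur[j] = sat_add (dst[pos + j], src[pos + j]); /* so the overlap reads
                                                            * the original dst    */
        memcpy (dst + keep, temp, 8);       /* flush the previous block */
        memcpy (temp, cur, 8);
        keep = pos;
    }

    memcpy (dst + keep, temp, 8);           /* flush the last block */
}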
vreinterpret_u16_u8 (dval), 1); + + if (w & 4) + vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (dval), 1); + } } } - -void -fbCompositeSrc_8888x8888neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +neon_composite_over_8888_8888 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint32_t w; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + uint32_t w; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - if (width>=8) + if (width >= 8) { - // Use overlapping 8-pixel method - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; + /* Use overlapping 8-pixel method */ + while (height--) + { + uint32_t *keep_dst = 0; + uint8x8x4_t sval, dval, temp; - uint32_t *keep_dst; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; #ifndef USE_GCC_INLINE_ASM - uint8x8x4_t sval,dval,temp; + sval = vld4_u8 ((void *)src); + dval = vld4_u8 ((void *)dst); + keep_dst = dst; - sval = vld4_u8((void*)src); - dval = vld4_u8((void*)dst); - keep_dst = dst; + temp = neon8mul (dval, vmvn_u8 (sval.val[3])); + temp = neon8qadd (sval, temp); - temp = neon8mul(dval,vmvn_u8(sval.val[3])); - temp = neon8qadd(sval,temp); + src += (w & 7); + dst += (w & 7); + w -= (w & 7); - src += (w & 7); - dst += (w & 7); - w -= (w & 7); + while (w) + { + sval = vld4_u8 ((void *)src); + dval = vld4_u8 ((void *)dst); - while (w) - { - sval = vld4_u8((void*)src); - dval = vld4_u8((void*)dst); + vst4_u8 ((void *)keep_dst, temp); + keep_dst = dst; - vst4_u8((void*)keep_dst,temp); - keep_dst = dst; + temp = neon8mul (dval, vmvn_u8 (sval.val[3])); + temp = neon8qadd (sval, temp); - temp = neon8mul(dval,vmvn_u8(sval.val[3])); - temp = neon8qadd(sval,temp); + src += 8; + dst += 8; + w -= 8; + } - src+=8; - dst+=8; - w-=8; - } - vst4_u8((void*)keep_dst,temp); + vst4_u8 ((void *)keep_dst, temp); #else - asm volatile ( -// avoid using d8-d15 (q4-q7) aapcs callee-save registers - "vld4.8 {d0-d3}, [%[src]]\n\t" - "vld4.8 {d4-d7}, [%[dst]]\n\t" - "mov %[keep_dst], %[dst]\n\t" - - "and ip, %[w], #7\n\t" - "add %[src], %[src], ip, LSL#2\n\t" - "add %[dst], %[dst], ip, LSL#2\n\t" - "subs %[w], %[w], ip\n\t" - "b 9f\n\t" -// LOOP - "2:\n\t" - "vld4.8 {d0-d3}, [%[src]]!\n\t" - "vld4.8 {d4-d7}, [%[dst]]!\n\t" - "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" - "sub %[keep_dst], %[dst], #8*4\n\t" - "subs %[w], %[w], #8\n\t" - "9:\n\t" - "vmvn.8 d31, d3\n\t" - "vmull.u8 q10, d31, d4\n\t" - "vmull.u8 q11, d31, d5\n\t" - "vmull.u8 q12, d31, d6\n\t" - "vmull.u8 q13, d31, d7\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q8, 
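The core of neon_composite_over_8888_8888 is Porter-Duff OVER on premultiplied pixels: dst' = src + dst * (255 - src.alpha) / 255 per channel, with a saturating add; that is what neon8mul of the inverted alpha lane followed by neon8qadd computes. A scalar model (names and the sample values are illustrative):

#include <stdint.h>
#include <stdio.h>

static uint8_t div255 (uint16_t t)    /* exact round(t / 255) */
{
    return (uint8_t)((t + 128 + ((t + 128) >> 8)) >> 8);
}

static uint32_t over (uint32_t src, uint32_t dst)
{
    uint32_t ia = 255 - (src >> 24);  /* inverted source alpha */
    uint32_t res = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t s = (src >> shift) & 0xff;
        uint32_t d = div255 ((uint16_t)(((dst >> shift) & 0xff) * ia));
        uint32_t c = s + d;

        res |= (c > 255 ? 255 : c) << shift;
    }
    return res;
}

int main (void)
{
    /* 50%-alpha premultiplied red over opaque green */
    printf ("%08x\n", over (0x80800000, 0xff00ff00));  /* prints ff807f00 */
    return 0;
}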
q12, #8\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" -// result in d20-d23 - "vqadd.u8 d20, d0, d20\n\t" - "vqadd.u8 d21, d1, d21\n\t" - "vqadd.u8 d22, d2, d22\n\t" - "vqadd.u8 d23, d3, d23\n\t" - - "bne 2b\n\t" - - "1:\n\t" - "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" - - : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst) - : - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23" - ); + asm volatile ( +/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */ + "vld4.8 {d0-d3}, [%[src]]\n\t" + "vld4.8 {d4-d7}, [%[dst]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[src], %[src], ip, LSL#2\n\t" + "add %[dst], %[dst], ip, LSL#2\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +/* LOOP */ + "2:\n\t" + "vld4.8 {d0-d3}, [%[src]]!\n\t" + "vld4.8 {d4-d7}, [%[dst]]!\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8*4\n\t" + "subs %[w], %[w], #8\n\t" + "9:\n\t" + "vmvn.8 d31, d3\n\t" + "vmull.u8 q10, d31, d4\n\t" + "vmull.u8 q11, d31, d5\n\t" + "vmull.u8 q12, d31, d6\n\t" + "vmull.u8 q13, d31, d7\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" +/* result in d20-d23 */ + "vqadd.u8 d20, d0, d20\n\t" + "vqadd.u8 d21, d1, d21\n\t" + "vqadd.u8 d22, d2, d22\n\t" + "vqadd.u8 d23, d3, d23\n\t" + + "bne 2b\n\t" + + "1:\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst) + : + : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23" + ); #endif - } + } } else { - uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL)); - - // Handle width<8 - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - while (w>=2) - { - uint8x8_t sval,dval; - - /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */ - sval = vreinterpret_u8_u32(vld1_u32((void*)src)); - dval = vreinterpret_u8_u32(vld1_u32((void*)dst)); - dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector)); - vst1_u8((void*)dst,vqadd_u8(sval,dval)); - - src+=2; - dst+=2; - w-=2; - } - - if (w) - { - uint8x8_t sval,dval; - - /* single 32-bit pixel in lane 0 */ - sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src)); // only interested in lane 0 - dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); // only interested in lane 0 - dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval),alpha_selector)); - vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0); - } - } - } -} + uint8x8_t alpha_selector = vreinterpret_u8_u64 ( + vcreate_u64 (0x0707070703030303ULL)); + /* Handle width < 8 */ + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + while (w >= 2) + { + uint8x8_t sval, dval; -void -fbCompositeSrc_x888x0565neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint16_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, 
srcStride; - uint32_t w; + /* two 32-bit pixels packed into D-reg; ad-hoc vectorization */ + sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src)); + dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst)); + dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector)); + vst1_u8 ((void *)dst, vqadd_u8 (sval, dval)); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + src += 2; + dst += 2; + w -= 2; + } - if (width>=8) - { - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - do { - while (w>=8) - { -#ifndef USE_GCC_INLINE_ASM - vst1q_u16(dst, pack0565(vld4_u8((void*)src))); -#else - asm volatile ( - "vld4.8 {d4-d7}, [%[src]]\n\t" - "vshll.u8 q0, d6, #8\n\t" - "vshll.u8 q1, d5, #8\n\t" - "vsriq.u16 q0, q1, #5\t\n" - "vshll.u8 q1, d4, #8\n\t" - "vsriq.u16 q0, q1, #11\t\n" - "vst1.16 {q0}, [%[dst]]\n\t" - : - : [dst] "r" (dst), [src] "r" (src) - : "memory", "d0","d1","d2","d3","d4","d5","d6","d7" - ); -#endif - src+=8; - dst+=8; - w-=8; - } - if (w != 0) - { - src -= (8-w); - dst -= (8-w); - w = 8; // do another vector - } - } while (w!=0); - } - } - else - { - // Handle width<8 - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - while (w>=2) + if (w) { - uint32x2_t sval, rgb, g, b; - sval = vld1_u32(src); - rgb = vshr_n_u32(sval,8-5); // r (5 bits) - g = vshr_n_u32(sval,8+8-6); // g to bottom byte - rgb = vsli_n_u32(rgb, g, 5); - b = vshr_n_u32(sval,8+8+8-5); // b to bottom byte - rgb = vsli_n_u32(rgb, b, 11); - vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0); - vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),2); - src+=2; - w-=2; + uint8x8_t sval, dval; + + /* single 32-bit pixel in lane 0 */ + sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src)); /* only interested in lane 0 */ + dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst)); /* only interested in lane 0 */ + dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector)); + vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0); } - if (w) - { - uint32x2_t sval, rgb, g, b; - sval = vld1_dup_u32(src); - rgb = vshr_n_u32(sval,8-5); // r (5 bits) - g = vshr_n_u32(sval,8+8-6); // g to bottom byte - rgb = vsli_n_u32(rgb, g, 5); - b = vshr_n_u32(sval,8+8+8-5); // b to bottom byte - rgb = vsli_n_u32(rgb, b, 11); - vst1_lane_u16(dst++,vreinterpret_u16_u32(rgb),0); - } } } } - -void -fbCompositeSrc_8888x8x8888neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +neon_composite_over_8888_n_8888 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - int dstStride, srcStride; - uint32_t w; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int dst_stride, src_stride; + uint32_t w; uint8x8_t mask_alpha; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 
1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - fbComposeGetSolid (pMask, mask, pDst->bits.format); - mask_alpha = vdup_n_u8((mask) >> 24); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + mask_alpha = vdup_n_u8 ((mask) >> 24); - if (width>=8) + if (width >= 8) { - // Use overlapping 8-pixel method - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; + /* Use overlapping 8-pixel method */ + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; - uint32_t *keep_dst; + uint32_t *keep_dst = 0; #ifndef USE_GCC_INLINE_ASM - uint8x8x4_t sval,dval,temp; + uint8x8x4_t sval, dval, temp; - sval = vld4_u8((void*)src); - dval = vld4_u8((void*)dst); - keep_dst = dst; + sval = vld4_u8 ((void *)src); + dval = vld4_u8 ((void *)dst); + keep_dst = dst; - sval = neon8mul(sval,mask_alpha); - temp = neon8mul(dval,vmvn_u8(sval.val[3])); - temp = neon8qadd(sval,temp); + sval = neon8mul (sval, mask_alpha); + temp = neon8mul (dval, vmvn_u8 (sval.val[3])); + temp = neon8qadd (sval, temp); - src += (w & 7); - dst += (w & 7); - w -= (w & 7); + src += (w & 7); + dst += (w & 7); + w -= (w & 7); - while (w) - { - sval = vld4_u8((void*)src); - dval = vld4_u8((void*)dst); + while (w) + { + sval = vld4_u8 ((void *)src); + dval = vld4_u8 ((void *)dst); - vst4_u8((void*)keep_dst,temp); - keep_dst = dst; + vst4_u8 ((void *)keep_dst, temp); + keep_dst = dst; - sval = neon8mul(sval,mask_alpha); - temp = neon8mul(dval,vmvn_u8(sval.val[3])); - temp = neon8qadd(sval,temp); + sval = neon8mul (sval, mask_alpha); + temp = neon8mul (dval, vmvn_u8 (sval.val[3])); + temp = neon8qadd (sval, temp); - src+=8; - dst+=8; - w-=8; - } - vst4_u8((void*)keep_dst,temp); + src += 8; + dst += 8; + w -= 8; + } + vst4_u8 ((void *)keep_dst, temp); #else - asm volatile ( -// avoid using d8-d15 (q4-q7) aapcs callee-save registers - "vdup.32 d30, %[mask]\n\t" - "vdup.8 d30, d30[3]\n\t" - - "vld4.8 {d0-d3}, [%[src]]\n\t" - "vld4.8 {d4-d7}, [%[dst]]\n\t" - "mov %[keep_dst], %[dst]\n\t" - - "and ip, %[w], #7\n\t" - "add %[src], %[src], ip, LSL#2\n\t" - "add %[dst], %[dst], ip, LSL#2\n\t" - "subs %[w], %[w], ip\n\t" - "b 9f\n\t" -// LOOP - "2:\n\t" - "vld4.8 {d0-d3}, [%[src]]!\n\t" - "vld4.8 {d4-d7}, [%[dst]]!\n\t" - "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" - "sub %[keep_dst], %[dst], #8*4\n\t" - "subs %[w], %[w], #8\n\t" - - "9:\n\t" - "vmull.u8 q10, d30, d0\n\t" - "vmull.u8 q11, d30, d1\n\t" - "vmull.u8 q12, d30, d2\n\t" - "vmull.u8 q13, d30, d3\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d0, q10, q8\n\t" - "vraddhn.u16 d1, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d3, q13, q9\n\t" - "vraddhn.u16 d2, q12, q8\n\t" - - "vmvn.8 d31, d3\n\t" - "vmull.u8 q10, d31, d4\n\t" - "vmull.u8 q11, d31, d5\n\t" - "vmull.u8 q12, d31, d6\n\t" - "vmull.u8 q13, d31, d7\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" -// result in d20-d23 - "vqadd.u8 d20, d0, d20\n\t" - "vqadd.u8 d21, d1, d21\n\t" - "vqadd.u8 d22, d2, d22\n\t" - "vqadd.u8 d23, d3, d23\n\t" - - "bne 2b\n\t" - - "1:\n\t" - "vst4.8 {d20-d23}, 
[%[keep_dst]]\n\t" - - : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst) - : [mask] "r" (mask) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27", - "d30","d31" - ); + asm volatile ( +/* avoid using d8-d15 (q4-q7) aapcs callee-save registers */ + "vdup.32 d30, %[mask]\n\t" + "vdup.8 d30, d30[3]\n\t" + + "vld4.8 {d0-d3}, [%[src]]\n\t" + "vld4.8 {d4-d7}, [%[dst]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[src], %[src], ip, LSL#2\n\t" + "add %[dst], %[dst], ip, LSL#2\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +/* LOOP */ + "2:\n\t" + "vld4.8 {d0-d3}, [%[src]]!\n\t" + "vld4.8 {d4-d7}, [%[dst]]!\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8*4\n\t" + "subs %[w], %[w], #8\n\t" + + "9:\n\t" + "vmull.u8 q10, d30, d0\n\t" + "vmull.u8 q11, d30, d1\n\t" + "vmull.u8 q12, d30, d2\n\t" + "vmull.u8 q13, d30, d3\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d0, q10, q8\n\t" + "vraddhn.u16 d1, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d3, q13, q9\n\t" + "vraddhn.u16 d2, q12, q8\n\t" + + "vmvn.8 d31, d3\n\t" + "vmull.u8 q10, d31, d4\n\t" + "vmull.u8 q11, d31, d5\n\t" + "vmull.u8 q12, d31, d6\n\t" + "vmull.u8 q13, d31, d7\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" + +/* result in d20-d23 */ + "vqadd.u8 d20, d0, d20\n\t" + "vqadd.u8 d21, d1, d21\n\t" + "vqadd.u8 d22, d2, d22\n\t" + "vqadd.u8 d23, d3, d23\n\t" + + "bne 2b\n\t" + + "1:\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [src] "+r" (src), [dst] "+r" (dst), [keep_dst] "=r" (keep_dst) + : [mask] "r" (mask) + : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", + "d30", "d31" + ); #endif - } + } } else { - uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL)); + uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL)); - // Handle width<8 - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; + /* Handle width < 8 */ + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; - while (w>=2) - { - uint8x8_t sval,dval; + while (w >= 2) + { + uint8x8_t sval, dval; - sval = vreinterpret_u8_u32(vld1_u32((void*)src)); - dval = vreinterpret_u8_u32(vld1_u32((void*)dst)); + sval = vreinterpret_u8_u32 (vld1_u32 ((void *)src)); + dval = vreinterpret_u8_u32 (vld1_u32 ((void *)dst)); - /* sval * const alpha_mul */ - sval = neon2mul(sval,mask_alpha); + /* sval * const alpha_mul */ + sval = neon2mul (sval, mask_alpha); - /* dval * 255-(src alpha) */ - dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector)); + /* dval * 255-(src alpha) */ + dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector)); - vst1_u8((void*)dst,vqadd_u8(sval,dval)); + vst1_u8 ((void *)dst, vqadd_u8 (sval, dval)); - src+=2; - dst+=2; - w-=2; - } + src += 2; + dst += 2; + w -= 2; + } - if (w) - { - uint8x8_t sval,dval; + if (w) + { + uint8x8_t sval, dval; - sval = vreinterpret_u8_u32(vld1_dup_u32((void*)src)); - dval 
= vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); + sval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)src)); + dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst)); - /* sval * const alpha_mul */ - sval = neon2mul(sval,mask_alpha); + /* sval * const alpha_mul */ + sval = neon2mul (sval, mask_alpha); - /* dval * 255-(src alpha) */ - dval = neon2mul(dval,vtbl1_u8(vmvn_u8(sval), alpha_selector)); + /* dval * 255-(src alpha) */ + dval = neon2mul (dval, vtbl1_u8 (vmvn_u8 (sval), alpha_selector)); - vst1_lane_u32((void*)dst,vreinterpret_u32_u8(vqadd_u8(sval,dval)),0); - } - } + vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (vqadd_u8 (sval, dval)), 0); + } + } } } - - -void -fbCompositeSolidMask_nx8x0565neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +neon_composite_over_n_8_0565 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { uint32_t src, srca; - uint16_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; + uint16_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; uint32_t w; uint8x8_t sval2; uint8x8x4_t sval8; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) - return; + return; - sval2=vreinterpret_u8_u32(vdup_n_u32(src)); - sval8.val[0]=vdup_lane_u8(sval2,0); - sval8.val[1]=vdup_lane_u8(sval2,1); - sval8.val[2]=vdup_lane_u8(sval2,2); - sval8.val[3]=vdup_lane_u8(sval2,3); + sval2=vreinterpret_u8_u32 (vdup_n_u32 (src)); + sval8.val[0]=vdup_lane_u8 (sval2,0); + sval8.val[1]=vdup_lane_u8 (sval2,1); + sval8.val[2]=vdup_lane_u8 (sval2,2); + sval8.val[3]=vdup_lane_u8 (sval2,3); - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); if (width>=8) { - // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused - while (height--) - { - uint16_t *keep_dst; + /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */ + while (height--) + { + uint16_t *keep_dst=0; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; #ifndef USE_GCC_INLINE_ASM - uint8x8_t alpha; - uint16x8_t dval, temp; - uint8x8x4_t sval8temp; + uint8x8_t alpha; + uint16x8_t dval, temp; + uint8x8x4_t sval8temp; - alpha = vld1_u8((void*)mask); - dval = vld1q_u16((void*)dst); - keep_dst = dst; + alpha = vld1_u8 ((void *)mask); + dval = vld1q_u16 ((void *)dst); + keep_dst = dst; - sval8temp = neon8mul(sval8,alpha); - temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); + sval8temp = neon8mul (sval8, alpha); + temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3])))); - mask += (w & 7); - 
dst += (w & 7); - w -= (w & 7); + mask += (w & 7); + dst += (w & 7); + w -= (w & 7); - while (w) - { - dval = vld1q_u16((void*)dst); - alpha = vld1_u8((void*)mask); + while (w) + { + dval = vld1q_u16 ((void *)dst); + alpha = vld1_u8 ((void *)mask); - vst1q_u16((void*)keep_dst,temp); - keep_dst = dst; + vst1q_u16 ((void *)keep_dst, temp); + keep_dst = dst; - sval8temp = neon8mul(sval8,alpha); - temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); + sval8temp = neon8mul (sval8, alpha); + temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3])))); - mask+=8; - dst+=8; - w-=8; - } - vst1q_u16((void*)keep_dst,temp); + mask+=8; + dst+=8; + w-=8; + } + vst1q_u16 ((void *)keep_dst, temp); #else - asm volatile ( - "vdup.32 d0, %[src]\n\t" - "vdup.8 d1, d0[1]\n\t" - "vdup.8 d2, d0[2]\n\t" - "vdup.8 d3, d0[3]\n\t" - "vdup.8 d0, d0[0]\n\t" - - "vld1.8 {q12}, [%[dst]]\n\t" - "vld1.8 {d31}, [%[mask]]\n\t" - "mov %[keep_dst], %[dst]\n\t" - - "and ip, %[w], #7\n\t" - "add %[mask], %[mask], ip\n\t" - "add %[dst], %[dst], ip, LSL#1\n\t" - "subs %[w], %[w], ip\n\t" - "b 9f\n\t" -// LOOP - "2:\n\t" - - "vld1.16 {q12}, [%[dst]]!\n\t" - "vld1.8 {d31}, [%[mask]]!\n\t" - "vst1.16 {q10}, [%[keep_dst]]\n\t" - "sub %[keep_dst], %[dst], #8*2\n\t" - "subs %[w], %[w], #8\n\t" - "9:\n\t" -// expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12\t\n" - "vshr.u16 q11, q12, #5\t\n" - "vshr.u16 q10, q12, #6+5\t\n" - "vmovn.u16 d5, q11\t\n" - "vmovn.u16 d6, q10\t\n" - "vshl.u8 d4, d4, #3\t\n" - "vshl.u8 d5, d5, #2\t\n" - "vshl.u8 d6, d6, #3\t\n" - "vsri.u8 d4, d4, #5\t\n" - "vsri.u8 d5, d5, #6\t\n" - "vsri.u8 d6, d6, #5\t\n" - - "vmull.u8 q10, d31, d0\n\t" - "vmull.u8 q11, d31, d1\n\t" - "vmull.u8 q12, d31, d2\n\t" - "vmull.u8 q13, d31, d3\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - -// duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d23\n\t" - "vmull.u8 q14, d30, d6\n\t" - "vmull.u8 q13, d30, d5\n\t" - "vmull.u8 q12, d30, d4\n\t" - "vrshr.u16 q8, q14, #8\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d6, q14, q8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d5, q13, q9\n\t" - "vqadd.u8 d6, d6, d22\n\t" // moved up - "vraddhn.u16 d4, q12, q8\n\t" -// intentionally don't calculate alpha -// result in d4-d6 - -// "vqadd.u8 d6, d6, d22\n\t" ** moved up - "vqadd.u8 d5, d5, d21\n\t" - "vqadd.u8 d4, d4, d20\n\t" - -// pack 8888 {d20-d23} to 0565 q10 - "vshll.u8 q10, d6, #8\n\t" - "vshll.u8 q3, d5, #8\n\t" - "vshll.u8 q2, d4, #8\n\t" - "vsri.u16 q10, q3, #5\t\n" - "vsri.u16 q10, q2, #11\t\n" - - "bne 2b\n\t" - - "1:\n\t" - "vst1.16 {q10}, [%[keep_dst]]\n\t" - - : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst) - : [src] "r" (src) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - "d30","d31" - ); + asm volatile ( + "vdup.32 d0, %[src]\n\t" + "vdup.8 d1, d0[1]\n\t" + "vdup.8 d2, d0[2]\n\t" + "vdup.8 d3, d0[3]\n\t" + "vdup.8 d0, d0[0]\n\t" + + "vld1.8 {q12}, [%[dst]]\n\t" + "vld1.8 {d31}, [%[mask]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[mask], %[mask], ip\n\t" + "add %[dst], %[dst], ip, LSL#1\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +/* LOOP */ + "2:\n\t" + + "vld1.16 
{q12}, [%[dst]]!\n\t" + "vld1.8 {d31}, [%[mask]]!\n\t" + "vst1.16 {q10}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8*2\n\t" + "subs %[w], %[w], #8\n\t" + "9:\n\t" +/* expand 0565 q12 to 8888 {d4-d7} */ + "vmovn.u16 d4, q12\t\n" + "vshr.u16 q11, q12, #5\t\n" + "vshr.u16 q10, q12, #6+5\t\n" + "vmovn.u16 d5, q11\t\n" + "vmovn.u16 d6, q10\t\n" + "vshl.u8 d4, d4, #3\t\n" + "vshl.u8 d5, d5, #2\t\n" + "vshl.u8 d6, d6, #3\t\n" + "vsri.u8 d4, d4, #5\t\n" + "vsri.u8 d5, d5, #6\t\n" + "vsri.u8 d6, d6, #5\t\n" + + "vmull.u8 q10, d31, d0\n\t" + "vmull.u8 q11, d31, d1\n\t" + "vmull.u8 q12, d31, d2\n\t" + "vmull.u8 q13, d31, d3\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + +/* duplicate in 4/2/1 & 8pix vsns */ + "vmvn.8 d30, d23\n\t" + "vmull.u8 q14, d30, d6\n\t" + "vmull.u8 q13, d30, d5\n\t" + "vmull.u8 q12, d30, d4\n\t" + "vrshr.u16 q8, q14, #8\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d6, q14, q8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d5, q13, q9\n\t" + "vqadd.u8 d6, d6, d22\n\t" /* moved up */ + "vraddhn.u16 d4, q12, q8\n\t" +/* intentionally don't calculate alpha */ +/* result in d4-d6 */ + +/* "vqadd.u8 d6, d6, d22\n\t" ** moved up */ + "vqadd.u8 d5, d5, d21\n\t" + "vqadd.u8 d4, d4, d20\n\t" + +/* pack 8888 {d20-d23} to 0565 q10 */ + "vshll.u8 q10, d6, #8\n\t" + "vshll.u8 q3, d5, #8\n\t" + "vshll.u8 q2, d4, #8\n\t" + "vsri.u16 q10, q3, #5\t\n" + "vsri.u16 q10, q2, #11\t\n" + + "bne 2b\n\t" + + "1:\n\t" + "vst1.16 {q10}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst) + : [src] "r" (src) + : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", + "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", + "d30","d31" + ); #endif - } + } } else { - while (height--) - { - void *dst4, *dst2; + while (height--) + { + void *dst4=0, *dst2=0; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; -#ifndef USE_GCC_INLINE_ASM - uint8x8_t alpha; - uint16x8_t dval, temp; - uint8x8x4_t sval8temp; - - if (w&4) - { - alpha = vreinterpret_u8_u32(vld1_lane_u32((void*)mask,vreinterpret_u32_u8(alpha),1)); - dval = vreinterpretq_u16_u64(vld1q_lane_u64((void*)dst,vreinterpretq_u64_u16(dval),1)); - dst4=dst; - mask+=4; - dst+=4; - } - if (w&2) - { - alpha = vreinterpret_u8_u16(vld1_lane_u16((void*)mask,vreinterpret_u16_u8(alpha),1)); - dval = vreinterpretq_u16_u32(vld1q_lane_u32((void*)dst,vreinterpretq_u32_u16(dval),1)); - dst2=dst; - mask+=2; - dst+=2; - } - if (w&1) - { - alpha = vld1_lane_u8((void*)mask,alpha,1); - dval = vld1q_lane_u16((void*)dst,dval,1); - } - - sval8temp = neon8mul(sval8,alpha); - temp = pack0565(neon8qadd(sval8temp,neon8mul(unpack0565(dval),vmvn_u8(sval8temp.val[3])))); - - if (w&1) - vst1q_lane_u16((void*)dst,temp,1); - if (w&2) - vst1q_lane_u32((void*)dst2,vreinterpretq_u32_u16(temp),1); - if (w&4) - vst1q_lane_u64((void*)dst4,vreinterpretq_u64_u16(temp),1); +#if 1 /* #ifndef USE_GCC_INLINE_ASM */ + uint8x8_t alpha; + uint16x8_t dval, temp; + uint8x8x4_t sval8temp; + + if (w&4) + { + alpha = vreinterpret_u8_u32 (vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (alpha),1)); + dval = vreinterpretq_u16_u64 (vld1q_lane_u64 
((void *)dst, vreinterpretq_u64_u16 (dval),1)); + dst4=dst; + mask+=4; + dst+=4; + } + if (w&2) + { + alpha = vreinterpret_u8_u16 (vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (alpha),1)); + dval = vreinterpretq_u16_u32 (vld1q_lane_u32 ((void *)dst, vreinterpretq_u32_u16 (dval),1)); + dst2=dst; + mask+=2; + dst+=2; + } + if (w&1) + { + alpha = vld1_lane_u8 ((void *)mask, alpha,1); + dval = vld1q_lane_u16 ((void *)dst, dval,1); + } + + sval8temp = neon8mul (sval8, alpha); + temp = pack0565 (neon8qadd (sval8temp, neon8mul (unpack0565 (dval), vmvn_u8 (sval8temp.val[3])))); + + if (w&1) + vst1q_lane_u16 ((void *)dst, temp,1); + if (w&2) + vst1q_lane_u32 ((void *)dst2, vreinterpretq_u32_u16 (temp),1); + if (w&4) + vst1q_lane_u64 ((void *)dst4, vreinterpretq_u64_u16 (temp),1); #else - asm volatile ( - "vdup.32 d0, %[src]\n\t" - "vdup.8 d1, d0[1]\n\t" - "vdup.8 d2, d0[2]\n\t" - "vdup.8 d3, d0[3]\n\t" - "vdup.8 d0, d0[0]\n\t" - - "tst %[w], #4\t\n" - "beq skip_load4\t\n" - - "vld1.64 {d25}, [%[dst]]\n\t" - "vld1.32 {d31[1]}, [%[mask]]\n\t" - "mov %[dst4], %[dst]\t\n" - "add %[mask], %[mask], #4\t\n" - "add %[dst], %[dst], #4*2\t\n" - - "skip_load4:\t\n" - "tst %[w], #2\t\n" - "beq skip_load2\t\n" - "vld1.32 {d24[1]}, [%[dst]]\n\t" - "vld1.16 {d31[1]}, [%[mask]]\n\t" - "mov %[dst2], %[dst]\t\n" - "add %[mask], %[mask], #2\t\n" - "add %[dst], %[dst], #2*2\t\n" - - "skip_load2:\t\n" - "tst %[w], #1\t\n" - "beq skip_load1\t\n" - "vld1.16 {d24[1]}, [%[dst]]\n\t" - "vld1.8 {d31[1]}, [%[mask]]\n\t" - - "skip_load1:\t\n" -// expand 0565 q12 to 8888 {d4-d7} - "vmovn.u16 d4, q12\t\n" - "vshr.u16 q11, q12, #5\t\n" - "vshr.u16 q10, q12, #6+5\t\n" - "vmovn.u16 d5, q11\t\n" - "vmovn.u16 d6, q10\t\n" - "vshl.u8 d4, d4, #3\t\n" - "vshl.u8 d5, d5, #2\t\n" - "vshl.u8 d6, d6, #3\t\n" - "vsri.u8 d4, d4, #5\t\n" - "vsri.u8 d5, d5, #6\t\n" - "vsri.u8 d6, d6, #5\t\n" - - "vmull.u8 q10, d31, d0\n\t" - "vmull.u8 q11, d31, d1\n\t" - "vmull.u8 q12, d31, d2\n\t" - "vmull.u8 q13, d31, d3\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - -// duplicate in 4/2/1 & 8pix vsns - "vmvn.8 d30, d23\n\t" - "vmull.u8 q14, d30, d6\n\t" - "vmull.u8 q13, d30, d5\n\t" - "vmull.u8 q12, d30, d4\n\t" - "vrshr.u16 q8, q14, #8\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d6, q14, q8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d5, q13, q9\n\t" - "vqadd.u8 d6, d6, d22\n\t" // moved up - "vraddhn.u16 d4, q12, q8\n\t" -// intentionally don't calculate alpha -// result in d4-d6 - -// "vqadd.u8 d6, d6, d22\n\t" ** moved up - "vqadd.u8 d5, d5, d21\n\t" - "vqadd.u8 d4, d4, d20\n\t" - -// pack 8888 {d20-d23} to 0565 q10 - "vshll.u8 q10, d6, #8\n\t" - "vshll.u8 q3, d5, #8\n\t" - "vshll.u8 q2, d4, #8\n\t" - "vsri.u16 q10, q3, #5\t\n" - "vsri.u16 q10, q2, #11\t\n" - - "tst %[w], #1\n\t" - "beq skip_store1\t\n" - "vst1.16 {d20[1]}, [%[dst]]\t\n" - "skip_store1:\t\n" - "tst %[w], #2\n\t" - "beq skip_store2\t\n" - "vst1.32 {d20[1]}, [%[dst2]]\t\n" - "skip_store2:\t\n" - "tst %[w], #4\n\t" - "beq skip_store4\t\n" - "vst1.16 {d21}, [%[dst4]]\t\n" - "skip_store4:\t\n" - - : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2) - : [src] "r" (src) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - 
"d30","d31" - ); + /* this code has some bug (does not pass blitters-test) */ + asm volatile ( + "vdup.32 d0, %[src]\n\t" + "vdup.8 d1, d0[1]\n\t" + "vdup.8 d2, d0[2]\n\t" + "vdup.8 d3, d0[3]\n\t" + "vdup.8 d0, d0[0]\n\t" + + "tst %[w], #4\t\n" + "beq skip_load4\t\n" + + "vld1.64 {d25}, [%[dst]]\n\t" + "vld1.32 {d31[1]}, [%[mask]]\n\t" + "mov %[dst4], %[dst]\t\n" + "add %[mask], %[mask], #4\t\n" + "add %[dst], %[dst], #4*2\t\n" + + "skip_load4:\t\n" + "tst %[w], #2\t\n" + "beq skip_load2\t\n" + "vld1.32 {d24[1]}, [%[dst]]\n\t" + "vld1.16 {d31[1]}, [%[mask]]\n\t" + "mov %[dst2], %[dst]\t\n" + "add %[mask], %[mask], #2\t\n" + "add %[dst], %[dst], #2*2\t\n" + + "skip_load2:\t\n" + "tst %[w], #1\t\n" + "beq skip_load1\t\n" + "vld1.16 {d24[1]}, [%[dst]]\n\t" + "vld1.8 {d31[1]}, [%[mask]]\n\t" + + "skip_load1:\t\n" +/* expand 0565 q12 to 8888 {d4-d7} */ + "vmovn.u16 d4, q12\t\n" + "vshr.u16 q11, q12, #5\t\n" + "vshr.u16 q10, q12, #6+5\t\n" + "vmovn.u16 d5, q11\t\n" + "vmovn.u16 d6, q10\t\n" + "vshl.u8 d4, d4, #3\t\n" + "vshl.u8 d5, d5, #2\t\n" + "vshl.u8 d6, d6, #3\t\n" + "vsri.u8 d4, d4, #5\t\n" + "vsri.u8 d5, d5, #6\t\n" + "vsri.u8 d6, d6, #5\t\n" + + "vmull.u8 q10, d31, d0\n\t" + "vmull.u8 q11, d31, d1\n\t" + "vmull.u8 q12, d31, d2\n\t" + "vmull.u8 q13, d31, d3\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + +/* duplicate in 4/2/1 & 8pix vsns */ + "vmvn.8 d30, d23\n\t" + "vmull.u8 q14, d30, d6\n\t" + "vmull.u8 q13, d30, d5\n\t" + "vmull.u8 q12, d30, d4\n\t" + "vrshr.u16 q8, q14, #8\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d6, q14, q8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d5, q13, q9\n\t" + "vqadd.u8 d6, d6, d22\n\t" /* moved up */ + "vraddhn.u16 d4, q12, q8\n\t" +/* intentionally don't calculate alpha */ +/* result in d4-d6 */ + +/* "vqadd.u8 d6, d6, d22\n\t" ** moved up */ + "vqadd.u8 d5, d5, d21\n\t" + "vqadd.u8 d4, d4, d20\n\t" + +/* pack 8888 {d20-d23} to 0565 q10 */ + "vshll.u8 q10, d6, #8\n\t" + "vshll.u8 q3, d5, #8\n\t" + "vshll.u8 q2, d4, #8\n\t" + "vsri.u16 q10, q3, #5\t\n" + "vsri.u16 q10, q2, #11\t\n" + + "tst %[w], #1\n\t" + "beq skip_store1\t\n" + "vst1.16 {d20[1]}, [%[dst]]\t\n" + "skip_store1:\t\n" + "tst %[w], #2\n\t" + "beq skip_store2\t\n" + "vst1.32 {d20[1]}, [%[dst2]]\t\n" + "skip_store2:\t\n" + "tst %[w], #4\n\t" + "beq skip_store4\t\n" + "vst1.16 {d21}, [%[dst4]]\t\n" + "skip_store4:\t\n" + + : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [dst4] "+r" (dst4), [dst2] "+r" (dst2) + : [src] "r" (src) + : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", + "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", + "d30","d31" + ); #endif - } + } } } - - -void -fbCompositeSolidMask_nx8x8888neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +neon_composite_over_n_8_8888 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src, srca; - 
uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint32_t w; - uint8x8_t sval2; - uint8x8x4_t sval8; - uint8x8_t mask_selector=vreinterpret_u8_u64(vcreate_u64(0x0101010100000000ULL)); - uint8x8_t alpha_selector=vreinterpret_u8_u64(vcreate_u64(0x0707070703030303ULL)); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); - + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint32_t w; + uint8x8_t sval2; + uint8x8x4_t sval8; + uint8x8_t mask_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0101010100000000ULL)); + uint8x8_t alpha_selector = vreinterpret_u8_u64 (vcreate_u64 (0x0707070703030303ULL)); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + /* bail out if fully transparent */ srca = src >> 24; if (src == 0) return; - sval2=vreinterpret_u8_u32(vdup_n_u32(src)); - sval8.val[0]=vdup_lane_u8(sval2,0); - sval8.val[1]=vdup_lane_u8(sval2,1); - sval8.val[2]=vdup_lane_u8(sval2,2); - sval8.val[3]=vdup_lane_u8(sval2,3); + sval2 = vreinterpret_u8_u32 (vdup_n_u32 (src)); + sval8.val[0] = vdup_lane_u8 (sval2, 0); + sval8.val[1] = vdup_lane_u8 (sval2, 1); + sval8.val[2] = vdup_lane_u8 (sval2, 2); + sval8.val[3] = vdup_lane_u8 (sval2, 3); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - if (width>=8) + if (width >= 8) { - // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused - while (height--) - { - uint32_t *keep_dst; + /* Use overlapping 8-pixel method, modified to avoid + * rewritten dest being reused + */ + while (height--) + { + uint32_t *keep_dst = 0; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; #ifndef USE_GCC_INLINE_ASM - uint8x8_t alpha; - uint8x8x4_t dval, temp; - - alpha = vld1_u8((void*)mask); - dval = vld4_u8((void*)dst); - keep_dst = dst; - - temp = neon8mul(sval8,alpha); - dval = neon8mul(dval,vmvn_u8(temp.val[3])); - temp = neon8qadd(temp,dval); - - mask += (w & 7); - dst += (w & 7); - w -= (w & 7); - - while (w) - { - alpha = vld1_u8((void*)mask); - dval = vld4_u8((void*)dst); - - vst4_u8((void*)keep_dst,temp); - keep_dst = dst; - - temp = neon8mul(sval8,alpha); - dval = neon8mul(dval,vmvn_u8(temp.val[3])); - temp = neon8qadd(temp,dval); - - mask+=8; - dst+=8; - w-=8; - } - vst4_u8((void*)keep_dst,temp); + uint8x8_t alpha; + uint8x8x4_t dval, temp; + + alpha = vld1_u8 ((void *)mask); + dval = vld4_u8 ((void *)dst); + keep_dst = dst; + + temp = neon8mul (sval8, alpha); + dval = neon8mul (dval, vmvn_u8 (temp.val[3])); + temp = neon8qadd (temp, dval); + + mask += (w & 7); + dst += (w & 7); + w -= (w & 7); + + while (w) + { + alpha = vld1_u8 ((void *)mask); + dval = vld4_u8 ((void *)dst); + + vst4_u8 ((void *)keep_dst, temp); + keep_dst = dst; + + temp = neon8mul (sval8, alpha); + dval = neon8mul (dval, vmvn_u8 (temp.val[3])); + temp = neon8qadd (temp, dval); + + mask += 8; + dst += 8; + w -= 8; + } + vst4_u8 ((void *)keep_dst, temp); #else - asm volatile ( - "vdup.32 d0, %[src]\n\t" - "vdup.8 d1, d0[1]\n\t" - "vdup.8 d2, d0[2]\n\t" - "vdup.8 d3, d0[3]\n\t" - "vdup.8 d0, d0[0]\n\t" - - "vld4.8 {d4-d7}, 
[%[dst]]\n\t" - "vld1.8 {d31}, [%[mask]]\n\t" - "mov %[keep_dst], %[dst]\n\t" - - "and ip, %[w], #7\n\t" - "add %[mask], %[mask], ip\n\t" - "add %[dst], %[dst], ip, LSL#2\n\t" - "subs %[w], %[w], ip\n\t" - "b 9f\n\t" -// LOOP - "2:\n\t" - "vld4.8 {d4-d7}, [%[dst]]!\n\t" - "vld1.8 {d31}, [%[mask]]!\n\t" - "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" - "sub %[keep_dst], %[dst], #8*4\n\t" - "subs %[w], %[w], #8\n\t" - "9:\n\t" - - "vmull.u8 q10, d31, d0\n\t" - "vmull.u8 q11, d31, d1\n\t" - "vmull.u8 q12, d31, d2\n\t" - "vmull.u8 q13, d31, d3\n\t" - "vrshr.u16 q8, q10, #8\n\t" - "vrshr.u16 q9, q11, #8\n\t" - "vraddhn.u16 d20, q10, q8\n\t" - "vraddhn.u16 d21, q11, q9\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vrshr.u16 q8, q12, #8\n\t" - "vraddhn.u16 d23, q13, q9\n\t" - "vraddhn.u16 d22, q12, q8\n\t" - - "vmvn.8 d30, d23\n\t" - "vmull.u8 q12, d30, d4\n\t" - "vmull.u8 q13, d30, d5\n\t" - "vmull.u8 q14, d30, d6\n\t" - "vmull.u8 q15, d30, d7\n\t" - - "vrshr.u16 q8, q12, #8\n\t" - "vrshr.u16 q9, q13, #8\n\t" - "vraddhn.u16 d4, q12, q8\n\t" - "vrshr.u16 q8, q14, #8\n\t" - "vraddhn.u16 d5, q13, q9\n\t" - "vrshr.u16 q9, q15, #8\n\t" - "vraddhn.u16 d6, q14, q8\n\t" - "vraddhn.u16 d7, q15, q9\n\t" -// result in d4-d7 - - "vqadd.u8 d20, d4, d20\n\t" - "vqadd.u8 d21, d5, d21\n\t" - "vqadd.u8 d22, d6, d22\n\t" - "vqadd.u8 d23, d7, d23\n\t" - - "bne 2b\n\t" - - "1:\n\t" - "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" - - : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst) - : [src] "r" (src) - : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", - "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", - "d30","d31" - ); + asm volatile ( + "vdup.32 d0, %[src]\n\t" + "vdup.8 d1, d0[1]\n\t" + "vdup.8 d2, d0[2]\n\t" + "vdup.8 d3, d0[3]\n\t" + "vdup.8 d0, d0[0]\n\t" + + "vld4.8 {d4-d7}, [%[dst]]\n\t" + "vld1.8 {d31}, [%[mask]]\n\t" + "mov %[keep_dst], %[dst]\n\t" + + "and ip, %[w], #7\n\t" + "add %[mask], %[mask], ip\n\t" + "add %[dst], %[dst], ip, LSL#2\n\t" + "subs %[w], %[w], ip\n\t" + "b 9f\n\t" +/* LOOP */ + "2:\n\t" + "vld4.8 {d4-d7}, [%[dst]]!\n\t" + "vld1.8 {d31}, [%[mask]]!\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + "sub %[keep_dst], %[dst], #8*4\n\t" + "subs %[w], %[w], #8\n\t" + "9:\n\t" + + "vmull.u8 q10, d31, d0\n\t" + "vmull.u8 q11, d31, d1\n\t" + "vmull.u8 q12, d31, d2\n\t" + "vmull.u8 q13, d31, d3\n\t" + "vrshr.u16 q8, q10, #8\n\t" + "vrshr.u16 q9, q11, #8\n\t" + "vraddhn.u16 d20, q10, q8\n\t" + "vraddhn.u16 d21, q11, q9\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vrshr.u16 q8, q12, #8\n\t" + "vraddhn.u16 d23, q13, q9\n\t" + "vraddhn.u16 d22, q12, q8\n\t" + + "vmvn.8 d30, d23\n\t" + "vmull.u8 q12, d30, d4\n\t" + "vmull.u8 q13, d30, d5\n\t" + "vmull.u8 q14, d30, d6\n\t" + "vmull.u8 q15, d30, d7\n\t" + + "vrshr.u16 q8, q12, #8\n\t" + "vrshr.u16 q9, q13, #8\n\t" + "vraddhn.u16 d4, q12, q8\n\t" + "vrshr.u16 q8, q14, #8\n\t" + "vraddhn.u16 d5, q13, q9\n\t" + "vrshr.u16 q9, q15, #8\n\t" + "vraddhn.u16 d6, q14, q8\n\t" + "vraddhn.u16 d7, q15, q9\n\t" +/* result in d4-d7 */ + + "vqadd.u8 d20, d4, d20\n\t" + "vqadd.u8 d21, d5, d21\n\t" + "vqadd.u8 d22, d6, d22\n\t" + "vqadd.u8 d23, d7, d23\n\t" + + "bne 2b\n\t" + + "1:\n\t" + "vst4.8 {d20-d23}, [%[keep_dst]]\n\t" + + : [w] "+r" (w), [dst] "+r" (dst), [mask] "+r" (mask), [keep_dst] "=r" (keep_dst) + : [src] "r" (src) + : "ip", "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", + "d30", "d31" + ); 
#endif - } + } + } + else + { + while (height--) + { + uint8x8_t alpha; + + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w >= 2) + { + uint8x8_t dval, temp, res; + + alpha = vtbl1_u8 ( + vreinterpret_u8_u16 (vld1_dup_u16 ((void *)mask)), mask_selector); + dval = vld1_u8 ((void *)dst); + + temp = neon2mul (sval2, alpha); + res = vqadd_u8 ( + temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector))); + + vst1_u8 ((void *)dst, res); + + mask += 2; + dst += 2; + w -= 2; + } + + if (w) + { + uint8x8_t dval, temp, res; + + alpha = vtbl1_u8 (vld1_dup_u8 ((void *)mask), mask_selector); + dval = vreinterpret_u8_u32 (vld1_dup_u32 ((void *)dst)); + + temp = neon2mul (sval2, alpha); + res = vqadd_u8 ( + temp, neon2mul (dval, vtbl1_u8 (vmvn_u8 (temp), alpha_selector))); + + vst1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (res), 0); + } + } + } +} + +static void +neon_composite_add_8888_8_8 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint32_t w; + uint32_t src; + uint8x8_t sa; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + sa = vdup_n_u8 ((src) >> 24); + + if (width >= 8) + { + /* Use overlapping 8-pixel method, modified to avoid rewritten dest being reused */ + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + uint8x8_t mval, dval, res; + uint8_t *keep_dst; + + mval = vld1_u8 ((void *)mask); + dval = vld1_u8 ((void *)dst); + keep_dst = dst; + + res = vqadd_u8 (neon2mul (mval, sa), dval); + + mask += (w & 7); + dst += (w & 7); + w -= w & 7; + + while (w) + { + mval = vld1_u8 ((void *)mask); + dval = vld1_u8 ((void *)dst); + vst1_u8 ((void *)keep_dst, res); + keep_dst = dst; + + res = vqadd_u8 (neon2mul (mval, sa), dval); + + mask += 8; + dst += 8; + w -= 8; + } + vst1_u8 ((void *)keep_dst, res); + } } else { - while (height--) - { - uint8x8_t alpha; + /* Use 4/2/1 load/store method to handle 1-7 pixels */ + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; + uint8x8_t mval = sa, dval = sa, res; + uint8_t *dst4 = 0, *dst2 = 0; - while (w>=2) - { - uint8x8_t dval, temp, res; + if (w & 4) + { + mval = vreinterpret_u8_u32 ( + vld1_lane_u32 ((void *)mask, vreinterpret_u32_u8 (mval), 1)); + dval = vreinterpret_u8_u32 ( + vld1_lane_u32 ((void *)dst, vreinterpret_u32_u8 (dval), 1)); + + dst4 = dst; + mask += 4; + dst += 4; + } - alpha = vtbl1_u8(vreinterpret_u8_u16(vld1_dup_u16((void*)mask)), mask_selector); - dval = vld1_u8((void*)dst); + if (w & 2) + { + mval = vreinterpret_u8_u16 ( + vld1_lane_u16 ((void *)mask, vreinterpret_u16_u8 (mval), 1)); + dval = vreinterpret_u8_u16 ( + vld1_lane_u16 ((void *)dst, vreinterpret_u16_u8 (dval), 1)); + dst2 = dst; + mask += 2; + dst += 2; + } - temp = neon2mul(sval2,alpha); - res = 
vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector))); + if (w & 1) + { + mval = vld1_lane_u8 (mask, mval, 1); + dval = vld1_lane_u8 (dst, dval, 1); + } - vst1_u8((void*)dst,res); + res = vqadd_u8 (neon2mul (mval, sa), dval); - mask+=2; - dst+=2; - w-=2; - } - if (w) - { - uint8x8_t dval, temp, res; + if (w & 1) + vst1_lane_u8 (dst, res, 1); + if (w & 2) + vst1_lane_u16 ((void *)dst2, vreinterpret_u16_u8 (res), 1); + if (w & 4) + vst1_lane_u32 ((void *)dst4, vreinterpret_u32_u8 (res), 1); + } + } +} - alpha = vtbl1_u8(vld1_dup_u8((void*)mask), mask_selector); - dval = vreinterpret_u8_u32(vld1_dup_u32((void*)dst)); +#ifdef USE_GCC_INLINE_ASM - temp = neon2mul(sval2,alpha); - res = vqadd_u8(temp,neon2mul(dval,vtbl1_u8(vmvn_u8(temp), alpha_selector))); +static void +neon_composite_src_16_16 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *src_line; + uint32_t dst_stride, src_stride; - vst1_lane_u32((void*)dst,vreinterpret_u32_u8(res),0); - } - } + if (!height || !width) + return; + + /* We simply copy 16-bit-aligned pixels from one place to another. */ + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + /* Preload the first input scanline */ + { + uint16_t *src_ptr = src_line; + uint32_t count = width; + + asm volatile ( + "0: @ loop \n" + " subs %[count], %[count], #32 \n" + " pld [%[src]] \n" + " add %[src], %[src], #64 \n" + " bgt 0b \n" + + /* Clobbered input registers marked as input/outputs */ + : [src] "+r" (src_ptr), [count] "+r" (count) + : /* no unclobbered inputs */ + : "cc" + ); + } + + while (height--) + { + uint16_t *dst_ptr = dst_line; + uint16_t *src_ptr = src_line; + uint32_t count = width; + uint32_t tmp = 0; + + /* Uses multi-register access and preloading to maximise bandwidth. + * Each pixel is one halfword, so a quadword contains 8px. + * Preload frequency assumed a 64-byte cacheline. + */ + asm volatile ( + " cmp %[count], #64 \n" + " blt 1f @ skip oversized fragments \n" + "0: @ start with eight quadwords at a time \n" + /* preload from next scanline */ + " pld [%[src], %[src_stride], LSL #1] \n" + " sub %[count], %[count], #64 \n" + " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n" + " vld1.16 {d20, d21, d22, d23}, [%[src]]! \n" + /* preload from next scanline */ + " pld [%[src], %[src_stride], LSL #1] \n" + " vld1.16 {d24, d25, d26, d27}, [%[src]]! \n" + " vld1.16 {d28, d29, d30, d31}, [%[src]]! \n" + " cmp %[count], #64 \n" + " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n" + " vst1.16 {d20, d21, d22, d23}, [%[dst]]! \n" + " vst1.16 {d24, d25, d26, d27}, [%[dst]]! \n" + " vst1.16 {d28, d29, d30, d31}, [%[dst]]! \n" + " bge 0b \n" + " cmp %[count], #0 \n" + " beq 7f @ aligned fastpath \n" + "1: @ four quadwords \n" + " tst %[count], #32 \n" + " beq 2f @ skip oversized fragment \n" + /* preload from next scanline */ + " pld [%[src], %[src_stride], LSL #1] \n" + " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n" + " vld1.16 {d20, d21, d22, d23}, [%[src]]! \n" + " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n" + " vst1.16 {d20, d21, d22, d23}, [%[dst]]! 
\n" + "2: @ two quadwords \n" + " tst %[count], #16 \n" + " beq 3f @ skip oversized fragment \n" + /* preload from next scanline */ + " pld [%[src], %[src_stride], LSL #1] \n" + " vld1.16 {d16, d17, d18, d19}, [%[src]]! \n" + " vst1.16 {d16, d17, d18, d19}, [%[dst]]! \n" + "3: @ one quadword \n" + " tst %[count], #8 \n" + " beq 4f @ skip oversized fragment \n" + " vld1.16 {d16, d17}, [%[src]]! \n" + " vst1.16 {d16, d17}, [%[dst]]! \n" + "4: @ one doubleword \n" + " tst %[count], #4 \n" + " beq 5f @ skip oversized fragment \n" + " vld1.16 {d16}, [%[src]]! \n" + " vst1.16 {d16}, [%[dst]]! \n" + "5: @ one word \n" + " tst %[count], #2 \n" + " beq 6f @ skip oversized fragment \n" + " ldr %[tmp], [%[src]], #4 \n" + " str %[tmp], [%[dst]], #4 \n" + "6: @ one halfword \n" + " tst %[count], #1 \n" + " beq 7f @ skip oversized fragment \n" + " ldrh %[tmp], [%[src]] \n" + " strh %[tmp], [%[dst]] \n" + "7: @ end \n" + + /* Clobbered input registers marked as input/outputs */ + : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), + [count] "+r" (count), [tmp] "+r" (tmp) + + /* Unclobbered input */ + : [src_stride] "r" (src_stride) + + /* Clobbered vector registers */ + : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", + "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory" + ); + + src_line += src_stride; + dst_line += dst_stride; } } +#endif /* USE_GCC_INLINE_ASM */ -void -fbCompositeSrcAdd_8888x8x8neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +neon_composite_src_24_16 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint32_t w; - uint32_t src; - uint8x8_t sa; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - fbComposeGetSolid (pSrc, src, pDst->bits.format); - sa = vdup_n_u8((src) >> 24); + uint16_t *dst_line; + uint32_t *src_line; + uint32_t dst_stride, src_stride; - if (width>=8) + if (!width || !height) + return; + + /* We simply copy pixels from one place to another, + * assuming that the source's alpha is opaque. 
+ */ + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + /* Preload the first input scanline */ { - // Use overlapping 8-pixel method, modified to avoid rewritten dest being reused - while (height--) - { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - uint8x8_t mval, dval, res; - uint8_t *keep_dst; - - mval = vld1_u8((void *)mask); - dval = vld1_u8((void *)dst); - keep_dst = dst; - - res = vqadd_u8(neon2mul(mval,sa),dval); - - mask += (w & 7); - dst += (w & 7); - w -= w & 7; - - while (w) - { - mval = vld1_u8((void *)mask); - dval = vld1_u8((void *)dst); - vst1_u8((void *)keep_dst, res); - keep_dst = dst; - - res = vqadd_u8(neon2mul(mval,sa),dval); - - mask += 8; - dst += 8; - w -= 8; - } - vst1_u8((void *)keep_dst, res); - } + uint8_t *src_ptr = (uint8_t*) src_line; + uint32_t count = (width + 15) / 16; + +#ifdef USE_GCC_INLINE_ASM + asm volatile ( + "0: @ loop \n" + " subs %[count], %[count], #1 \n" + " pld [%[src]] \n" + " add %[src], %[src], #64 \n" + " bgt 0b \n" + + /* Clobbered input registers marked as input/outputs */ + : [src] "+r" (src_ptr), [count] "+r" (count) + : /* no unclobbered inputs */ + : "cc" + ); +#else + do + { + __pld (src_ptr); + src_ptr += 64; + } + while (--count); +#endif + } + + while (height--) + { + uint16_t *dst_ptr = dst_line; + uint32_t *src_ptr = src_line; + uint32_t count = width; + const uint32_t rb_mask = 0x1F; + const uint32_t g_mask = 0x3F; + + /* If you're going to complain about a goto, take a long hard look + * at the massive blocks of assembler this skips over. ;-) + */ + if (count < 8) + goto small_stuff; + +#ifdef USE_GCC_INLINE_ASM + + /* This is not as aggressive as the RGB565-source case. + * Generally the source is in cached RAM when the formats are + * different, so we use preload. + * + * We don't need to blend, so we are not reading from the + * uncached framebuffer. + */ + asm volatile ( + " cmp %[count], #16 \n" + " blt 1f @ skip oversized fragments \n" + "0: @ start with sixteen pixels at a time \n" + " sub %[count], %[count], #16 \n" + " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n" + " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. \n" + " vld4.8 {d4, d5, d6, d7}, [%[src]]! @ d7 is alpha and ignored, d6-4 are rgb. \n" + " vshll.u8 q8, d2, #8 @ expand first red for repacking \n" + " vshll.u8 q10, d1, #8 @ expand first green for repacking \n" + " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n" + " vshll.u8 q9, d6, #8 @ expand second red for repacking \n" + " vsri.u16 q8, q10, #5 @ insert first green after red \n" + " vshll.u8 q10, d5, #8 @ expand second green for repacking \n" + " vsri.u16 q8, q11, #11 @ insert first blue after green \n" + " vshll.u8 q11, d4, #8 @ expand second blue for repacking \n" + " vsri.u16 q9, q10, #5 @ insert second green after red \n" + " vsri.u16 q9, q11, #11 @ insert second blue after green \n" + " cmp %[count], #16 \n" + " vst1.16 {d16, d17, d18, d19}, [%[dst]]! @ store 16 pixels \n" + " bge 0b \n" + "1: @ end of main loop \n" + " cmp %[count], #8 @ can we still do an 8-pixel block? \n" + " blt 2f \n" + " sub %[count], %[count], #8 \n" + " pld [%[src], %[src_stride], lsl #2] @ preload from next scanline \n" + " vld4.8 {d0, d1, d2, d3}, [%[src]]! @ d3 is alpha and ignored, d2-0 are rgb. 
\n" + " vshll.u8 q8, d2, #8 @ expand first red for repacking \n" + " vshll.u8 q10, d1, #8 @ expand first green for repacking \n" + " vshll.u8 q11, d0, #8 @ expand first blue for repacking \n" + " vsri.u16 q8, q10, #5 @ insert first green after red \n" + " vsri.u16 q8, q11, #11 @ insert first blue after green \n" + " vst1.16 {d16, d17}, [%[dst]]! @ store 8 pixels \n" + "2: @ end \n" + + /* Clobbered input and working registers marked as input/outputs */ + : [dst] "+r" (dst_ptr), [src] "+r" (src_ptr), [count] "+r" (count) + + /* Unclobbered input */ + : [src_stride] "r" (src_stride) + + /* Clobbered vector registers */ + + /* NB: these are the quad aliases of the + * double registers used in the asm + */ + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", + "d18", "d19", "d20", "d21", "d22", "d23", "cc", "memory" + ); +#else + /* A copy of the above code, in intrinsics-form. */ + while (count >= 16) + { + uint8x8x4_t pixel_set_a, pixel_set_b; + uint16x8_t red_a, green_a, blue_a; + uint16x8_t red_b, green_b, blue_b; + uint16x8_t dest_pixels_a, dest_pixels_b; + + count -= 16; + __pld (src_ptr + src_stride); + pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr)); + pixel_set_b = vld4_u8 ((uint8_t*)(src_ptr + 8)); + src_ptr += 16; + + red_a = vshll_n_u8 (pixel_set_a.val[2], 8); + green_a = vshll_n_u8 (pixel_set_a.val[1], 8); + blue_a = vshll_n_u8 (pixel_set_a.val[0], 8); + + red_b = vshll_n_u8 (pixel_set_b.val[2], 8); + green_b = vshll_n_u8 (pixel_set_b.val[1], 8); + blue_b = vshll_n_u8 (pixel_set_b.val[0], 8); + + dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5); + dest_pixels_b = vsriq_n_u16 (red_b, green_b, 5); + + dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11); + dest_pixels_b = vsriq_n_u16 (dest_pixels_b, blue_b, 11); + + /* There doesn't seem to be an intrinsic for the + * double-quadword variant + */ + vst1q_u16 (dst_ptr, dest_pixels_a); + vst1q_u16 (dst_ptr + 8, dest_pixels_b); + dst_ptr += 16; + } + + /* 8-pixel loop */ + if (count >= 8) + { + uint8x8x4_t pixel_set_a; + uint16x8_t red_a, green_a, blue_a; + uint16x8_t dest_pixels_a; + + __pld (src_ptr + src_stride); + count -= 8; + pixel_set_a = vld4_u8 ((uint8_t*)(src_ptr)); + src_ptr += 8; + + red_a = vshll_n_u8 (pixel_set_a.val[2], 8); + green_a = vshll_n_u8 (pixel_set_a.val[1], 8); + blue_a = vshll_n_u8 (pixel_set_a.val[0], 8); + + dest_pixels_a = vsriq_n_u16 (red_a, green_a, 5); + dest_pixels_a = vsriq_n_u16 (dest_pixels_a, blue_a, 11); + + vst1q_u16 (dst_ptr, dest_pixels_a); + dst_ptr += 8; + } + +#endif /* USE_GCC_INLINE_ASM */ + + small_stuff: + if (count) + __pld (src_ptr + src_stride); + + while (count >= 2) + { + uint32_t src_pixel_a = *src_ptr++; + uint32_t src_pixel_b = *src_ptr++; + + /* ARM is really good at shift-then-ALU ops. */ + /* This should be a total of six shift-ANDs and five shift-ORs. */ + uint32_t dst_pixels_a; + uint32_t dst_pixels_b; + + dst_pixels_a = ((src_pixel_a >> 3) & rb_mask); + dst_pixels_a |= ((src_pixel_a >> 10) & g_mask) << 5; + dst_pixels_a |= ((src_pixel_a >> 19) & rb_mask) << 11; + + dst_pixels_b = ((src_pixel_b >> 3) & rb_mask); + dst_pixels_b |= ((src_pixel_b >> 10) & g_mask) << 5; + dst_pixels_b |= ((src_pixel_b >> 19) & rb_mask) << 11; + + /* little-endian mode only */ + *((uint32_t*) dst_ptr) = dst_pixels_a | (dst_pixels_b << 16); + dst_ptr += 2; + count -= 2; + } + + if (count) + { + uint32_t src_pixel = *src_ptr++; + + /* ARM is really good at shift-then-ALU ops. + * This block should end up as three shift-ANDs + * and two shift-ORs. 
+ */ + uint32_t tmp_blue = (src_pixel >> 3) & rb_mask; + uint32_t tmp_green = (src_pixel >> 10) & g_mask; + uint32_t tmp_red = (src_pixel >> 19) & rb_mask; + uint16_t dst_pixel = (tmp_red << 11) | (tmp_green << 5) | tmp_blue; + + *dst_ptr++ = dst_pixel; + count--; + } + + src_line += src_stride; + dst_line += dst_stride; + } +} + +static pixman_bool_t +pixman_fill_neon (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t _xor) +{ + uint32_t byte_stride, color; + char *dst; + + /* stride is always multiple of 32bit units in pixman */ + byte_stride = stride * sizeof(uint32_t); + + switch (bpp) + { + case 8: + dst = ((char *) bits) + y * byte_stride + x; + _xor &= 0xff; + color = _xor << 24 | _xor << 16 | _xor << 8 | _xor; + break; + + case 16: + dst = ((char *) bits) + y * byte_stride + x * 2; + _xor &= 0xffff; + color = _xor << 16 | _xor; + width *= 2; /* width to bytes */ + break; + + case 32: + dst = ((char *) bits) + y * byte_stride + x * 4; + color = _xor; + width *= 4; /* width to bytes */ + break; + + default: + return FALSE; + } + +#ifdef USE_GCC_INLINE_ASM + if (width < 16) + { + /* We have a special case for such small widths that don't allow + * us to use wide 128-bit stores anyway. We don't waste time + * trying to align writes, since there are only very few of them anyway + */ + asm volatile ( + "cmp %[height], #0\n"/* Check if empty fill */ + "beq 3f\n" + "vdup.32 d0, %[color]\n"/* Fill the color to neon req */ + + /* Check if we have a such width that can easily be handled by single + * operation for each scanline. This significantly reduces the number + * of test/branch instructions for each scanline + */ + "cmp %[width], #8\n" + "beq 4f\n" + "cmp %[width], #4\n" + "beq 5f\n" + "cmp %[width], #2\n" + "beq 6f\n" + + /* Loop starts here for each scanline */ + "1:\n" + "mov r4, %[dst]\n" /* Starting address of the current line */ + "tst %[width], #8\n" + "beq 2f\n" + "vst1.8 {d0}, [r4]!\n" + "2:\n" + "tst %[width], #4\n" + "beq 2f\n" + "str %[color], [r4], #4\n" + "2:\n" + "tst %[width], #2\n" + "beq 2f\n" + "strh %[color], [r4], #2\n" + "2:\n" + "tst %[width], #1\n" + "beq 2f\n" + "strb %[color], [r4], #1\n" + "2:\n" + + "subs %[height], %[height], #1\n" + "add %[dst], %[dst], %[byte_stride]\n" + "bne 1b\n" + "b 3f\n" + + /* Special fillers for those widths that we can do with single operation */ + "4:\n" + "subs %[height], %[height], #1\n" + "vst1.8 {d0}, [%[dst]]\n" + "add %[dst], %[dst], %[byte_stride]\n" + "bne 4b\n" + "b 3f\n" + + "5:\n" + "subs %[height], %[height], #1\n" + "str %[color], [%[dst]]\n" + "add %[dst], %[dst], %[byte_stride]\n" + "bne 5b\n" + "b 3f\n" + + "6:\n" + "subs %[height], %[height], #1\n" + "strh %[color], [%[dst]]\n" + "add %[dst], %[dst], %[byte_stride]\n" + "bne 6b\n" + + "3:\n" + : [height] "+r" (height), [dst] "+r" (dst) + : [color] "r" (color), [width] "r" (width), + [byte_stride] "r" (byte_stride) + : "memory", "cc", "d0", "r4"); } else { - // Use 4/2/1 load/store method to handle 1-7 pixels - while (height--) - { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - uint8x8_t mval, dval, res; - uint8_t *dst4, *dst2; - - if (w&4) - { - mval = vreinterpret_u8_u32(vld1_lane_u32((void *)mask, vreinterpret_u32_u8(mval), 1)); - dval = vreinterpret_u8_u32(vld1_lane_u32((void *)dst, vreinterpret_u32_u8(dval), 1)); - - dst4 = dst; - mask += 4; - dst += 4; - } - if (w&2) - { - mval = vreinterpret_u8_u16(vld1_lane_u16((void *)mask, vreinterpret_u16_u8(mval), 
1));
-			dval = vreinterpret_u8_u16(vld1_lane_u16((void *)dst, vreinterpret_u16_u8(dval), 1));
-			dst2 = dst;
-			mask += 2;
-			dst += 2;
-		}
-		if (w&1)
-		{
-			mval = vld1_lane_u8(mask, mval, 1);
-			dval = vld1_lane_u8(dst, dval, 1);
-		}
-
-		res = vqadd_u8(neon2mul(mval,sa),dval);
-
-		if (w&1)
-			vst1_lane_u8(dst, res, 1);
-		if (w&2)
-			vst1_lane_u16((void *)dst2, vreinterpret_u16_u8(res), 1);
-		if (w&4)
-			vst1_lane_u32((void *)dst4, vreinterpret_u32_u8(res), 1);
-	}
+	asm volatile (
+	    "cmp %[height], #0\n" /* Check for an empty fill */
+	    "beq 5f\n"
+	    "vdup.32 q0, %[color]\n" /* Duplicate the fill color into a NEON register */
+
+	    /* Loop starts here for each scanline */
+	    "1:\n"
+	    "mov r4, %[dst]\n" /* Starting address of the current line */
+	    "mov r5, %[width]\n" /* We're going to write this many bytes */
+	    "ands r6, r4, #15\n" /* Are we at a 128-bit aligned address? */
+	    "beq 2f\n" /* Jump to the best case */
+
+	    /* We are not 128-bit aligned: however, we know that we can get to the
+	       next aligned location, since the fill is at least 16 bytes wide */
+	    "rsb r6, r6, #16\n" /* We would need to go forward this much */
+	    "sub r5, r5, r6\n" /* Update bytes left */
+	    "tst r6, #1\n"
+	    "beq 6f\n"
+	    "vst1.8 {d0[0]}, [r4]!\n" /* Store a byte; now we are halfword aligned */
+	    "6:\n"
+	    "tst r6, #2\n"
+	    "beq 6f\n"
+	    "vst1.16 {d0[0]}, [r4, :16]!\n" /* Store a halfword; now we are word aligned */
+	    "6:\n"
+	    "tst r6, #4\n"
+	    "beq 6f\n"
+	    "vst1.32 {d0[0]}, [r4, :32]!\n" /* Store a word; now we are doubleword aligned */
+	    "6:\n"
+	    "tst r6, #8\n"
+	    "beq 2f\n"
+	    "vst1.64 {d0}, [r4, :64]!\n" /* Store a doubleword; now we are quadword aligned */
+
+	    /* The good case: we are 128-bit aligned for this scanline */
+	    "2:\n"
+	    "and r6, r5, #15\n" /* Number of trailing bytes */
+	    "cmp r5, r6\n" /* Do we have at least one quadword to write? */
+	    "beq 6f\n" /* No, we just write the tail */
+	    "lsr r5, r5, #4\n" /* This many full quadwords to write */
+
+	    /* The main block: do 128-bit aligned writes */
+	    "3:\n"
+	    "subs r5, r5, #1\n"
+	    "vst1.64 {d0, d1}, [r4, :128]!\n"
+	    "bne 3b\n"
+
+	    /* Handle the trailing bytes: do 64-, 32-, 16- and 8-bit aligned writes
+	       as needed. We know that we are currently at a 128-bit aligned address,
+	       so we can simply pick the biggest operations that the remaining write
+	       width allows */
+	    "6:\n"
+	    "cmp r6, #0\n"
+	    "beq 4f\n"
+	    "tst r6, #8\n"
+	    "beq 6f\n"
+	    "vst1.64 {d0}, [r4, :64]!\n"
+	    "6:\n"
+	    "tst r6, #4\n"
+	    "beq 6f\n"
+	    "vst1.32 {d0[0]}, [r4, :32]!\n"
+	    "6:\n"
+	    "tst r6, #2\n"
+	    "beq 6f\n"
+	    "vst1.16 {d0[0]}, [r4, :16]!\n"
+	    "6:\n"
+	    "tst r6, #1\n"
+	    "beq 4f\n"
+	    "vst1.8 {d0[0]}, [r4]!\n"
+	    "4:\n"
+
+	    /* Handle the next scanline */
+	    "subs %[height], %[height], #1\n"
+	    "add %[dst], %[dst], %[byte_stride]\n"
+	    "bne 1b\n"
+	    "5:\n"
+	    : [height] "+r" (height), [dst] "+r" (dst)
+	    : [color] "r" (color), [width] "r" (width),
+	      [byte_stride] "r" (byte_stride)
+	    : "memory", "cc", "d0", "d1", "r4", "r5", "r6");
+    }
+    return TRUE;
+
+#else
+
+    /* TODO: intrinsic version for armcc */
+    return FALSE;
+
+#endif
+}

-static const FastPathInfo arm_neon_fast_path_array[] =
+/* TODO: is a more generic way of doing this being introduced?
*/ +#define NEON_SCANLINE_BUFFER_PIXELS (1024) + +static inline void +neon_quadword_copy (void * dst, + void * src, + uint32_t count, /* of quadwords */ + uint32_t trailer_count /* of bytes */) { - { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8neon, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000neon, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565neon, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565neon, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565neon, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888neon, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888neon, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888neon, 0 }, - { PIXMAN_OP_NONE }, -}; + uint8_t *t_dst = dst, *t_src = src; + + /* Uses aligned multi-register loads to maximise read bandwidth + * on uncached memory such as framebuffers + * The accesses do not have the aligned qualifiers, so that the copy + * may convert between aligned-uncached and unaligned-cached memory. + * It is assumed that the CPU can infer alignedness from the address. + */ + +#ifdef USE_GCC_INLINE_ASM + + asm volatile ( + " cmp %[count], #8 \n" + " blt 1f @ skip oversized fragments \n" + "0: @ start with eight quadwords at a time \n" + " sub %[count], %[count], #8 \n" + " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n" + " vld1.8 {d20, d21, d22, d23}, [%[src]]! \n" + " vld1.8 {d24, d25, d26, d27}, [%[src]]! \n" + " vld1.8 {d28, d29, d30, d31}, [%[src]]! \n" + " cmp %[count], #8 \n" + " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n" + " vst1.8 {d20, d21, d22, d23}, [%[dst]]! \n" + " vst1.8 {d24, d25, d26, d27}, [%[dst]]! \n" + " vst1.8 {d28, d29, d30, d31}, [%[dst]]! \n" + " bge 0b \n" + "1: @ four quadwords \n" + " tst %[count], #4 \n" + " beq 2f @ skip oversized fragment \n" + " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n" + " vld1.8 {d20, d21, d22, d23}, [%[src]]! \n" + " vst1.8 {d16, d17, d18, d19}, [%[dst]]! \n" + " vst1.8 {d20, d21, d22, d23}, [%[dst]]! \n" + "2: @ two quadwords \n" + " tst %[count], #2 \n" + " beq 3f @ skip oversized fragment \n" + " vld1.8 {d16, d17, d18, d19}, [%[src]]! \n" + " vst1.8 {d16, d17, d18, d19}, [%[dst]]! 
\n" + "3: @ one quadword \n" + " tst %[count], #1 \n" + " beq 4f @ skip oversized fragment \n" + " vld1.8 {d16, d17}, [%[src]]! \n" + " vst1.8 {d16, d17}, [%[dst]]! \n" + "4: @ end \n" + + /* Clobbered input registers marked as input/outputs */ + : [dst] "+r" (t_dst), [src] "+r" (t_src), [count] "+r" (count) + + /* No unclobbered inputs */ + : + + /* Clobbered vector registers */ + : "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", + "d26", "d27", "d28", "d29", "d30", "d31", "cc", "memory"); + +#else + + while (count >= 8) + { + uint8x16x4_t t1 = vld4q_u8 (t_src); + uint8x16x4_t t2 = vld4q_u8 (t_src + sizeof(uint8x16x4_t)); + + t_src += sizeof(uint8x16x4_t) * 2; + vst4q_u8 (t_dst, t1); + vst4q_u8 (t_dst + sizeof(uint8x16x4_t), t2); + t_dst += sizeof(uint8x16x4_t) * 2; + count -= 8; + } + + if (count & 4) + { + uint8x16x4_t t1 = vld4q_u8 (t_src); + + t_src += sizeof(uint8x16x4_t); + vst4q_u8 (t_dst, t1); + t_dst += sizeof(uint8x16x4_t); + } + + if (count & 2) + { + uint8x8x4_t t1 = vld4_u8 (t_src); + + t_src += sizeof(uint8x8x4_t); + vst4_u8 (t_dst, t1); + t_dst += sizeof(uint8x8x4_t); + } + + if (count & 1) + { + uint8x16_t t1 = vld1q_u8 (t_src); + + t_src += sizeof(uint8x16_t); + vst1q_u8 (t_dst, t1); + t_dst += sizeof(uint8x16_t); + } + +#endif /* !USE_GCC_INLINE_ASM */ + + if (trailer_count) + { + if (trailer_count & 8) + { + uint8x8_t t1 = vld1_u8 (t_src); + + t_src += sizeof(uint8x8_t); + vst1_u8 (t_dst, t1); + t_dst += sizeof(uint8x8_t); + } + + if (trailer_count & 4) + { + *((uint32_t*) t_dst) = *((uint32_t*) t_src); + + t_dst += 4; + t_src += 4; + } + + if (trailer_count & 2) + { + *((uint16_t*) t_dst) = *((uint16_t*) t_src); + + t_dst += 2; + t_src += 2; + } + + if (trailer_count & 1) + { + *t_dst++ = *t_src++; + } + } +} + +static inline void +solid_over_565_8_pix_neon (uint32_t glyph_colour, + uint16_t *dest, + uint8_t * in_mask, + uint32_t dest_stride, /* bytes, not elements */ + uint32_t mask_stride, + uint32_t count /* 8-pixel groups */) +{ + /* Inner loop of glyph blitter (solid colour, alpha mask) */ + +#ifdef USE_GCC_INLINE_ASM + + asm volatile ( + " vld4.8 {d20[], d21[], d22[], d23[]}, [%[glyph_colour]] @ splat solid colour components \n" + "0: @ loop \n" + " vld1.16 {d0, d1}, [%[dest]] @ load first pixels from framebuffer \n" + " vld1.8 {d17}, [%[in_mask]] @ load alpha mask of glyph \n" + " vmull.u8 q9, d17, d23 @ apply glyph colour alpha to mask \n" + " vshrn.u16 d17, q9, #8 @ reformat it to match original mask \n" + " vmvn d18, d17 @ we need the inverse mask for the background \n" + " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n" + " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n" + " vshrn.u16 d4, q0, #3 @ unpack green \n" + " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n" + " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n" + " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n" + " vmull.u8 q1, d2, d18 @ apply inverse mask to background red... \n" + " vmull.u8 q2, d4, d18 @ ...green... \n" + " vmull.u8 q3, d6, d18 @ ...blue \n" + " subs %[count], %[count], #1 @ decrement/test loop counter \n" + " vmlal.u8 q1, d17, d22 @ add masked foreground red... \n" + " vmlal.u8 q2, d17, d21 @ ...green... 
\n" + " vmlal.u8 q3, d17, d20 @ ...blue \n" + " add %[in_mask], %[in_mask], %[mask_stride] @ advance mask pointer, while we wait \n" + " vsri.16 q1, q2, #5 @ pack green behind red \n" + " vsri.16 q1, q3, #11 @ pack blue into pixels \n" + " vst1.16 {d2, d3}, [%[dest]] @ store composited pixels \n" + " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n" + " bne 0b @ next please \n" + + /* Clobbered registers marked as input/outputs */ + : [dest] "+r" (dest), [in_mask] "+r" (in_mask), [count] "+r" (count) + + /* Inputs */ + : [dest_stride] "r" (dest_stride), [mask_stride] "r" (mask_stride), [glyph_colour] "r" (&glyph_colour) + + /* Clobbers, including the inputs we modify, and potentially lots of memory */ + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d19", + "d20", "d21", "d22", "d23", "d24", "d25", "cc", "memory" + ); -const FastPathInfo *const arm_neon_fast_paths = arm_neon_fast_path_array; +#else + + uint8x8x4_t solid_colour = vld4_dup_u8 ((uint8_t*) &glyph_colour); + + while (count--) + { + uint16x8_t pixels = vld1q_u16 (dest); + uint8x8_t mask = vshrn_n_u16 (vmull_u8 (solid_colour.val[3], vld1_u8 (in_mask)), 8); + uint8x8_t mask_image = vmvn_u8 (mask); + + uint8x8_t t_red = vshrn_n_u16 (pixels, 8); + uint8x8_t t_green = vshrn_n_u16 (pixels, 3); + uint8x8_t t_blue = vshrn_n_u16 (vsli_n_u8 (pixels, pixels, 5), 2); + + uint16x8_t s_red = vmull_u8 (vsri_n_u8 (t_red, t_red, 5), mask_image); + uint16x8_t s_green = vmull_u8 (vsri_n_u8 (t_green, t_green, 6), mask_image); + uint16x8_t s_blue = vmull_u8 (t_blue, mask_image); + + s_red = vmlal (s_red, mask, solid_colour.val[2]); + s_green = vmlal (s_green, mask, solid_colour.val[1]); + s_blue = vmlal (s_blue, mask, solid_colour.val[0]); + + pixels = vsri_n_u16 (s_red, s_green, 5); + pixels = vsri_n_u16 (pixels, s_blue, 11); + vst1q_u16 (dest, pixels); + + dest += dest_stride; + mask += mask_stride; + } +#endif +} + +#if 0 /* this is broken currently */ static void -arm_neon_composite (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *src, - pixman_image_t *mask, - pixman_image_t *dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +neon_composite_over_n_8_0565 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *aligned_line; + uint8_t *mask_line; + uint32_t dst_stride, mask_stride; + uint32_t kernel_count, copy_count, copy_tail; + uint8_t kernel_offset, copy_offset; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + /* bail out if fully transparent or degenerate */ + srca = src >> 24; + if (src == 0) + return; + + if (width == 0 || height == 0) + return; + + if (width > NEON_SCANLINE_BUFFER_PIXELS) + { + /* split the blit, so we can use a fixed-size scanline buffer + * TODO: there must be a more elegant way of doing this. + */ + int x; + for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) + { + neon_composite_over_n_8_0565 ( + impl, op, + src_image, mask_image, dst_image, + src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y, + (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? 
width - x : NEON_SCANLINE_BUFFER_PIXELS, height); + } + + return; + } + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + /* keep within minimum number of aligned quadwords on width + * while also keeping the minimum number of columns to process + */ + { + unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF; + unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF; + unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF; + + /* the fast copy should be quadword aligned */ + copy_offset = dst_line - ((uint16_t*) aligned_left); + aligned_line = dst_line - copy_offset; + copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4); + copy_tail = 0; + + if (aligned_right - aligned_left > ceiling_length) + { + /* unaligned routine is tightest */ + kernel_count = (uint32_t) (ceiling_length >> 4); + kernel_offset = copy_offset; + } + else + { + /* aligned routine is equally tight, so it is safer to align */ + kernel_count = copy_count; + kernel_offset = 0; + } + + /* We should avoid reading beyond scanline ends for safety */ + if (aligned_line < (dst_line - dest_x) || + (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width)) + { + /* switch to precise read */ + copy_offset = kernel_offset = 0; + aligned_line = dst_line; + kernel_count = (uint32_t) (ceiling_length >> 4); + copy_count = (width * sizeof(*dst_line)) >> 4; + copy_tail = (width * sizeof(*dst_line)) & 0xF; + } + } + + { + uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */ + uint8_t glyph_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; + int y = height; + + /* row-major order */ + /* left edge, middle block, right edge */ + for ( ; y--; mask_line += mask_stride, aligned_line += dst_stride, dst_line += dst_stride) + { + /* We don't want to overrun the edges of the glyph, + * so realign the edge data into known buffers + */ + neon_quadword_copy (glyph_line + copy_offset, mask_line, width >> 4, width & 0xF); + + /* Uncached framebuffer access is really, really slow + * if we do it piecemeal. It should be much faster if we + * grab it all at once. One scanline should easily fit in + * L1 cache, so this should not waste RAM bandwidth. 
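/* Illustrative sketch (not from the patch) of the alignment bookkeeping
 * above: the destination window is rounded out to 16-byte boundaries so
 * the staging copy can run on whole quadwords, and copy_offset records
 * how many uint16_t pixels of slack precede the first real pixel.
 */
#include <stdint.h>

static void
quadword_window (const uint16_t *dst_line, int width,
                 int *copy_offset, uint32_t *copy_count)
{
    unsigned long aligned_left = (unsigned long) dst_line & ~0xFUL;
    unsigned long aligned_right =
        ((unsigned long) (dst_line + width) + 0xF) & ~0xFUL;

    /* pixels between the aligned start and the first pixel to composite */
    *copy_offset = (int) (dst_line - (const uint16_t *) aligned_left);

    /* 16-byte quadwords covering the rounded-out window */
    *copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4);
}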
+ */ + neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail); + + /* Apply the actual filter */ + solid_over_565_8_pix_neon ( + src, scan_line + kernel_offset, + glyph_line + kernel_offset, 8 * sizeof(*dst_line), + 8, kernel_count); + + /* Copy the modified scanline back */ + neon_quadword_copy (dst_line, scan_line + copy_offset, + width >> 3, (width & 7) * 2); + } + } +} +#endif + +#ifdef USE_GCC_INLINE_ASM + +static inline void +plain_over_565_8_pix_neon (uint32_t colour, + uint16_t *dest, + uint32_t dest_stride, /* bytes, not elements */ + uint32_t count /* 8-pixel groups */) { - if (_pixman_run_fast_path (arm_neon_fast_paths, imp, - op, src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height)) + /* Inner loop for plain translucent rects + * (solid colour without alpha mask) + */ + asm volatile ( + " vld4.8 {d20[], d21[], d22[], d23[]}, [%[colour]] @ solid colour load/splat \n" + " vmull.u8 q12, d23, d22 @ premultiply alpha red \n" + " vmull.u8 q13, d23, d21 @ premultiply alpha green \n" + " vmull.u8 q14, d23, d20 @ premultiply alpha blue \n" + " vmvn d18, d23 @ inverse alpha for background \n" + "0: @ loop\n" + " vld1.16 {d0, d1}, [%[dest]] @ load first pixels from framebuffer \n" + " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n" + " vshrn.u16 d4, q0, #3 @ unpack green \n" + " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n" + " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n" + " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n" + " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n" + " vmov q0, q12 @ retrieve foreground red \n" + " vmlal.u8 q0, d2, d18 @ blend red - my kingdom for a four-operand MLA \n" + " vmov q1, q13 @ retrieve foreground green \n" + " vmlal.u8 q1, d4, d18 @ blend green \n" + " vmov q2, q14 @ retrieve foreground blue \n" + " vmlal.u8 q2, d6, d18 @ blend blue \n" + " subs %[count], %[count], #1 @ decrement/test loop counter \n" + " vsri.16 q0, q1, #5 @ pack green behind red \n" + " vsri.16 q0, q2, #11 @ pack blue into pixels \n" + " vst1.16 {d0, d1}, [%[dest]] @ store composited pixels \n" + " add %[dest], %[dest], %[dest_stride] @ advance framebuffer pointer \n" + " bne 0b @ next please \n" + + /* Clobbered registers marked as input/outputs */ + : [dest] "+r" (dest), [count] "+r" (count) + + /* Inputs */ + : [dest_stride] "r" (dest_stride), [colour] "r" (&colour) + + /* Clobbers, including the inputs we modify, and + * potentially lots of memory + */ + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d18", "d19", + "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", + "cc", "memory" + ); +} + +static void +neon_composite_over_n_0565 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *aligned_line; + uint32_t dst_stride; + uint32_t kernel_count, copy_count, copy_tail; + uint8_t kernel_offset, copy_offset; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + /* bail out if fully transparent */ + srca = src >> 24; + if (src == 0) + return; + + if (width == 0 || height == 0) + return; + + if (width > NEON_SCANLINE_BUFFER_PIXELS) + { + /* split the blit, so we can use a fixed-size scanline buffer * + * TODO: there must be a more elegant way of doing 
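/* Illustrative sketch (not from the patch) of the per-pixel arithmetic
 * that plain_over_565_8_pix_neon above attempts (the path is registered
 * under #if 0 as buggy): OVER of a constant colour onto r5g6b5. The
 * 5->8 and 6->8 expansions replicate the top bits into the low ones,
 * exactly what the vsli/vsri/vshrn sequences do, and the blend keeps
 * the same >>8 approximation as the vmull/vmlal code rather than an
 * exact divide by 255. Note that it multiplies the colour by its own
 * alpha, i.e. it treats the colour as non-premultiplied.
 */
#include <stdint.h>

static uint16_t
over_solid_565 (uint32_t argb, uint16_t dst)
{
    uint32_t a = argb >> 24;
    uint32_t ia = 255 - a;

    /* unpack the framebuffer pixel, replicating high bits downwards */
    uint32_t r = (dst >> 8) & 0xf8;  r |= r >> 5;
    uint32_t g = (dst >> 3) & 0xfc;  g |= g >> 6;
    uint32_t b = (dst << 3) & 0xf8;  b |= b >> 5;

    /* foreground scaled by alpha plus background scaled by inverse alpha */
    r = (((argb >> 16) & 0xff) * a + r * ia) >> 8;
    g = (((argb >>  8) & 0xff) * a + g * ia) >> 8;
    b = (((argb >>  0) & 0xff) * a + b * ia) >> 8;

    /* repack to 565 */
    return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}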
this. + */ + int x; + + for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) + { + neon_composite_over_n_0565 ( + impl, op, + src_image, mask_image, dst_image, + src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y, + (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? width - x : NEON_SCANLINE_BUFFER_PIXELS, height); + } + return; + } + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + /* keep within minimum number of aligned quadwords on width + * while also keeping the minimum number of columns to process + */ + { + unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF; + unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF; + unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF; + + /* the fast copy should be quadword aligned */ + copy_offset = dst_line - ((uint16_t*) aligned_left); + aligned_line = dst_line - copy_offset; + copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4); + copy_tail = 0; + + if (aligned_right - aligned_left > ceiling_length) { - return; + /* unaligned routine is tightest */ + kernel_count = (uint32_t) (ceiling_length >> 4); + kernel_offset = copy_offset; + } + else + { + /* aligned routine is equally tight, so it is safer to align */ + kernel_count = copy_count; + kernel_offset = 0; } - _pixman_implementation_composite (imp->delegate, op, - src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height); + /* We should avoid reading beyond scanline ends for safety */ + if (aligned_line < (dst_line - dest_x) || + (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width)) + { + /* switch to precise read */ + copy_offset = kernel_offset = 0; + aligned_line = dst_line; + kernel_count = (uint32_t) (ceiling_length >> 4); + copy_count = (width * sizeof(*dst_line)) >> 4; + copy_tail = (width * sizeof(*dst_line)) & 0xF; + } + } + + { + uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */ + + /* row-major order */ + /* left edge, middle block, right edge */ + for ( ; height--; aligned_line += dst_stride, dst_line += dst_stride) + { + /* Uncached framebuffer access is really, really slow if we do it piecemeal. + * It should be much faster if we grab it all at once. + * One scanline should easily fit in L1 cache, so this should + * not waste RAM bandwidth. + */ + neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail); + + /* Apply the actual filter */ + plain_over_565_8_pix_neon ( + src, scan_line + kernel_offset, 8 * sizeof(*dst_line), kernel_count); + + /* Copy the modified scanline back */ + neon_quadword_copy ( + dst_line, scan_line + copy_offset, width >> 3, (width & 7) * 2); + } + } } -pixman_bool_t -pixman_blt_neon ( - void *src_bits, - void *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) +static inline void +ARGB8_over_565_8_pix_neon (uint32_t *src, + uint16_t *dest, + uint32_t src_stride, /* bytes, not elements */ + uint32_t count /* 8-pixel groups */) { + asm volatile ( + "0: @ loop\n" + " pld [%[src], %[src_stride]] @ preload from next scanline \n" + " vld1.16 {d0, d1}, [%[dest]] @ load pixels from framebuffer \n" + " vld4.8 {d20, d21, d22, d23},[%[src]]! 
@ load source image pixels \n" + " vsli.u16 q3, q0, #5 @ duplicate framebuffer blue bits \n" + " vshrn.u16 d2, q0, #8 @ unpack red from framebuffer pixels \n" + " vshrn.u16 d4, q0, #3 @ unpack green \n" + " vmvn d18, d23 @ we need the inverse alpha for the background \n" + " vsri.u8 d2, d2, #5 @ duplicate red bits (extend 5 to 8) \n" + " vshrn.u16 d6, q3, #2 @ unpack extended blue (truncate 10 to 8) \n" + " vsri.u8 d4, d4, #6 @ duplicate green bits (extend 6 to 8) \n" + " vmull.u8 q1, d2, d18 @ apply inverse alpha to background red... \n" + " vmull.u8 q2, d4, d18 @ ...green... \n" + " vmull.u8 q3, d6, d18 @ ...blue \n" + " subs %[count], %[count], #1 @ decrement/test loop counter \n" + " vmlal.u8 q1, d23, d22 @ add blended foreground red... \n" + " vmlal.u8 q2, d23, d21 @ ...green... \n" + " vmlal.u8 q3, d23, d20 @ ...blue \n" + " vsri.16 q1, q2, #5 @ pack green behind red \n" + " vsri.16 q1, q3, #11 @ pack blue into pixels \n" + " vst1.16 {d2, d3}, [%[dest]]! @ store composited pixels \n" + " bne 0b @ next please \n" + + /* Clobbered registers marked as input/outputs */ + : [dest] "+r" (dest), [src] "+r" (src), [count] "+r" (count) + + /* Inputs */ + : [src_stride] "r" (src_stride) + + /* Clobbers, including the inputs we modify, and potentially lots of memory */ + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d17", "d18", "d20", + "d21", "d22", "d23", "cc", "memory" + ); +} -#if 0 // Relies on code which isn't upstreamed yet +static void +neon_composite_over_8888_0565 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src_line; + uint16_t *dst_line, *aligned_line; + uint32_t dst_stride, src_stride; + uint32_t kernel_count, copy_count, copy_tail; + uint8_t kernel_offset, copy_offset; + + /* we assume mask is opaque + * so the only alpha to deal with is embedded in src + */ + if (width > NEON_SCANLINE_BUFFER_PIXELS) + { + /* split the blit, so we can use a fixed-size scanline buffer */ + int x; + for (x = 0; x < width; x += NEON_SCANLINE_BUFFER_PIXELS) + { + neon_composite_over_8888_0565 ( + impl, op, + src_image, mask_image, dst_image, + src_x + x, src_y, mask_x + x, mask_y, dest_x + x, dest_y, + (x + NEON_SCANLINE_BUFFER_PIXELS > width) ? 
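/* Illustrative sketch (not from the patch): the vld4.8 {d20-d23} load
 * above de-interleaves eight a8r8g8b8 pixels into four per-channel byte
 * planes (memory order b, g, r, a on a little-endian target), so each
 * channel can be blended with one 8x8->16 vector multiply. Scalar
 * equivalent:
 */
#include <stdint.h>

static void
deinterleave_argb (const uint32_t *src,
                   uint8_t b[8], uint8_t g[8], uint8_t r[8], uint8_t a[8])
{
    int i;

    for (i = 0; i < 8; i++)
    {
        b[i] = (uint8_t) (src[i] >>  0);
        g[i] = (uint8_t) (src[i] >>  8);
        r[i] = (uint8_t) (src[i] >> 16);
        a[i] = (uint8_t) (src[i] >> 24);
    }
}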
width - x : NEON_SCANLINE_BUFFER_PIXELS, height); + } + return; + } - // accelerate only straight copies - if(src_bpp != dst_bpp || (src_bpp & 7) || !width || !height) - return FALSE; + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + /* keep within minimum number of aligned quadwords on width + * while also keeping the minimum number of columns to process + */ + { + unsigned long aligned_left = (unsigned long)(dst_line) & ~0xF; + unsigned long aligned_right = (((unsigned long)(dst_line + width)) + 0xF) & ~0xF; + unsigned long ceiling_length = (((unsigned long) width) * sizeof(*dst_line) + 0xF) & ~0xF; + + /* the fast copy should be quadword aligned */ + copy_offset = dst_line - ((uint16_t*) aligned_left); + aligned_line = dst_line - copy_offset; + copy_count = (uint32_t) ((aligned_right - aligned_left) >> 4); + copy_tail = 0; + + if (aligned_right - aligned_left > ceiling_length) { - uint32_t bytes_per_pixel = src_bpp >> 3; - uint32_t byte_width = width * bytes_per_pixel; - int32_t src_stride_bytes = src_stride * 4; // parameter is in words for some reason - int32_t dst_stride_bytes = dst_stride * 4; - uint8_t *src_bytes = ((uint8_t*) src_bits) + src_y * src_stride_bytes + src_x * bytes_per_pixel; - uint8_t *dst_bytes = ((uint8_t*) dst_bits) + dst_y * dst_stride_bytes + dst_x * bytes_per_pixel; - uint32_t quadword_count = byte_width / 16; - uint32_t offset = byte_width % 16; - - while(height--) { - QuadwordCopy_neon(dst_bytes, src_bytes, quadword_count, offset); - src_bytes += src_stride_bytes; - dst_bytes += dst_stride_bytes; - } + /* unaligned routine is tightest */ + kernel_count = (uint32_t) (ceiling_length >> 4); + kernel_offset = copy_offset; + } + else + { + /* aligned routine is equally tight, so it is safer to align */ + kernel_count = copy_count; + kernel_offset = 0; } - return TRUE; + /* We should avoid reading beyond scanline ends for safety */ + if (aligned_line < (dst_line - dest_x) || + (aligned_line + (copy_count * 16 / sizeof(*dst_line))) > ((dst_line - dest_x) + dst_image->bits.width)) + { + /* switch to precise read */ + copy_offset = kernel_offset = 0; + aligned_line = dst_line; + kernel_count = (uint32_t) (ceiling_length >> 4); + copy_count = (width * sizeof(*dst_line)) >> 4; + copy_tail = (width * sizeof(*dst_line)) & 0xF; + } + } -#else /* USE_GCC_INLINE_ASM */ + /* Preload the first input scanline */ + { + uint8_t *src_ptr = (uint8_t*) src_line; + uint32_t count = (width + 15) / 16; + +#ifdef USE_GCC_INLINE_ASM + asm volatile ( + "0: @ loop \n" + " subs %[count], %[count], #1 \n" + " pld [%[src]] \n" + " add %[src], %[src], #64 \n" + " bgt 0b \n" + + /* Clobbered input registers marked as input/outputs */ + : [src] "+r" (src_ptr), [count] "+r" (count) + : /* no unclobbered inputs */ + : "cc" + ); +#else + do + { + __pld (src_ptr); + src_ptr += 64; + } + while (--count); +#endif + } - // TODO: intrinsic version for armcc - return FALSE; + { + uint16_t scan_line[NEON_SCANLINE_BUFFER_PIXELS + 8]; /* deliberately not initialised */ + /* row-major order */ + /* left edge, middle block, right edge */ + for ( ; height--; src_line += src_stride, aligned_line += dst_stride) + { + /* Uncached framebuffer access is really, really slow if we do + * it piecemeal. It should be much faster if we grab it all at + * once. One scanline should easily fit in L1 cache, so this + * should not waste RAM bandwidth. 
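/* Illustrative sketch (not from the patch) of the scanline preload
 * above: touch one cache line per step ahead of the blend loop. The
 * 64-byte stride is an assumption matching the asm; on GCC,
 * __builtin_prefetch emits pld on ARM, and __pld is the equivalent
 * armcc intrinsic used in the non-asm branch.
 */
static void
preload_scanline (const void *src, unsigned bytes)
{
    const char *p = (const char *) src;
    const char *end = p + bytes;

    while (p < end)
    {
        __builtin_prefetch (p); /* read prefetch, default locality */
        p += 64;
    }
}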
+ */ + neon_quadword_copy (scan_line, aligned_line, copy_count, copy_tail); + + /* Apply the actual filter */ + ARGB8_over_565_8_pix_neon ( + src_line, scan_line + kernel_offset, + src_stride * sizeof(*src_line), kernel_count); + + /* Copy the modified scanline back */ + neon_quadword_copy (dst_line, + scan_line + copy_offset, + width >> 3, (width & 7) * 2); + } + } +} + +#endif /* USE_GCC_INLINE_ASM */ + +static const pixman_fast_path_t arm_neon_fast_path_array[] = +{ + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, neon_composite_add_8888_8_8, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, neon_composite_add_8000_8000, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, neon_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, neon_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_24_16, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_24_16, 0 }, +#ifdef USE_GCC_INLINE_ASM + { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_src_16_16, 0 }, + { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_src_16_16, 0 }, +#if 0 /* this code has some bugs */ + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_n_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_n_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, neon_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, neon_composite_over_8888_0565, 0 }, +#endif #endif + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, neon_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, neon_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, neon_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, neon_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, neon_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, neon_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, neon_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, neon_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_NONE }, +}; + +const pixman_fast_path_t *const arm_neon_fast_paths = arm_neon_fast_path_array; + +static void +arm_neon_composite (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + if (_pixman_run_fast_path (arm_neon_fast_paths, imp, + op, src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height)) + { + return; + } + + _pixman_implementation_composite (imp->delegate, op, + src, 
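/* Illustrative sketch (not from the patch) of how a table like
 * arm_neon_fast_path_array above is consulted. The real matcher,
 * _pixman_run_fast_path, also checks image attributes (transforms,
 * filters, flags such as NEED_SOLID_MASK); the struct here is a
 * pared-down illustration, not the pixman type.
 */
typedef struct
{
    int op;          /* PIXMAN_OP_... */
    int src_format;  /* PIXMAN_a8r8g8b8, PIXMAN_solid, ... */
    int mask_format; /* PIXMAN_null when there is no mask */
    int dest_format;
    void (*func) (void);
} fast_path_entry_t;

static const fast_path_entry_t *
lookup_fast_path (const fast_path_entry_t *table,
                  int op, int src, int mask, int dest)
{
    const fast_path_entry_t *p;

    /* first match wins, so more specific entries come first */
    for (p = table; p->func != NULL; p++)
    {
        if (p->op == op && p->src_format == src &&
            p->mask_format == mask && p->dest_format == dest)
            return p;
    }

    return NULL; /* caller falls through to the delegate */
}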
mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height); +} + +static pixman_bool_t +pixman_blt_neon (void *src_bits, + void *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + if (!width || !height) + return TRUE; + + /* accelerate only straight copies involving complete bytes */ + if (src_bpp != dst_bpp || (src_bpp & 7)) + return FALSE; + + { + uint32_t bytes_per_pixel = src_bpp >> 3; + uint32_t byte_width = width * bytes_per_pixel; + /* parameter is in words for some reason */ + int32_t src_stride_bytes = src_stride * 4; + int32_t dst_stride_bytes = dst_stride * 4; + uint8_t *src_bytes = ((uint8_t*) src_bits) + + src_y * src_stride_bytes + src_x * bytes_per_pixel; + uint8_t *dst_bytes = ((uint8_t*) dst_bits) + + dst_y * dst_stride_bytes + dst_x * bytes_per_pixel; + uint32_t quadword_count = byte_width / 16; + uint32_t offset = byte_width % 16; + + while (height--) + { + neon_quadword_copy (dst_bytes, src_bytes, quadword_count, offset); + src_bytes += src_stride_bytes; + dst_bytes += dst_stride_bytes; + } + } + + return TRUE; } static pixman_bool_t arm_neon_blt (pixman_implementation_t *imp, - uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) { - if (pixman_blt_neon ( - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height)) - return TRUE; - - return _pixman_implementation_blt ( - imp->delegate, - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height); + if (pixman_blt_neon ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height)) + { + return TRUE; + } + + return _pixman_implementation_blt ( + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height); } static pixman_bool_t arm_neon_fill (pixman_implementation_t *imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) { - if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor)) - return TRUE; + if (pixman_fill_neon (bits, stride, bpp, x, y, width, height, xor)) + return TRUE; - return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); } pixman_implementation_t * -_pixman_implementation_create_arm_neon (pixman_implementation_t *toplevel) +_pixman_implementation_create_arm_neon (void) { - pixman_implementation_t *simd = _pixman_implementation_create_arm_simd (NULL); - pixman_implementation_t *imp = _pixman_implementation_create (toplevel, simd); + pixman_implementation_t *general = _pixman_implementation_create_fast_path (); + pixman_implementation_t *imp = _pixman_implementation_create (general); - imp->composite = arm_neon_composite; -// imp->blt = arm_neon_blt; -// imp->fill = arm_neon_fill; -- Relies on code which is not upstreamed yet + imp->composite = 
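/* Illustrative sketch (not from the patch) of the delegation pattern in
 * arm_neon_blt and arm_neon_fill above: each implementation tries its
 * own specialised routine and, if that reports failure, forwards the
 * same arguments down the delegate chain. The types here are toy
 * stand-ins for pixman_implementation_t.
 */
#include <stdint.h>

typedef struct toy_impl toy_impl_t;
struct toy_impl
{
    toy_impl_t *delegate;
    int (*fill) (toy_impl_t *imp, uint32_t *bits, int w, int h, uint32_t xor);
};

static int
chained_fill (toy_impl_t *imp, uint32_t *bits, int w, int h, uint32_t xor)
{
    if (imp->fill && imp->fill (imp, bits, w, h, xor))
        return 1;              /* handled at this level */

    if (imp->delegate)         /* otherwise punt down the chain */
        return chained_fill (imp->delegate, bits, w, h, xor);

    return 0;
}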
arm_neon_composite; +#if 0 /* this code has some bugs */ + imp->blt = arm_neon_blt; +#endif + imp->fill = arm_neon_fill; - return imp; + return imp; } + diff --git a/lib/pixman/pixman/pixman-arm-neon.h b/lib/pixman/pixman/pixman-arm-neon.h deleted file mode 100644 index aed7a4dfd..000000000 --- a/lib/pixman/pixman/pixman-arm-neon.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright © 2009 ARM Ltd - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of ARM Ltd not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. ARM Ltd makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Ian Rickards (ian.rickards@arm.com) - * - */ - -#include "pixman-private.h" - -#ifdef USE_ARM_NEON - -pixman_bool_t pixman_have_arm_neon(void); - -#else -#define pixman_have_arm_neon() FALSE -#endif - -#ifdef USE_ARM_NEON - -extern const FastPathInfo *const arm_neon_fast_paths; - -void -fbCompositeSrcAdd_8000x8000neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSrc_8888x8888neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSrc_8888x8x8888neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSolidMask_nx8x0565neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSolidMask_nx8x8888neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSrcAdd_8888x8x8neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - 
int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSrc_16x16neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSrc_24x16neon ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -pixman_bool_t -pixman_fill_neon (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t _xor); - -#endif /* USE_ARM_NEON */ diff --git a/lib/pixman/pixman/pixman-arm-simd.c b/lib/pixman/pixman/pixman-arm-simd.c index 42503fc85..fb7bf3da8 100644 --- a/lib/pixman/pixman/pixman-arm-simd.c +++ b/lib/pixman/pixman/pixman-arm-simd.c @@ -27,48 +27,48 @@ #include <config.h> #endif -#include "pixman-arm-simd.h" - -void -fbCompositeSrcAdd_8000x8000arm ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +#include "pixman-private.h" + +static void +arm_composite_add_8000_8000 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - uint8_t s, d; + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + uint8_t s, d; - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; - /* ensure both src and dst are properly aligned before doing 32 bit reads - * we'll stay in this loop if src and dst have differing alignments */ + /* ensure both src and dst are properly aligned before doing 32 bit reads + * we'll stay in this loop if src and dst have differing alignments + */ while (w && (((unsigned long)dst & 3) || ((unsigned long)src & 3))) { s = *src; d = *dst; - asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s)); + asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); *dst = d; dst++; @@ -78,7 +78,9 @@ fbCompositeSrcAdd_8000x8000arm ( while (w >= 4) { - asm("uqadd8 %0, %1, %2" : "=r"(*(uint32_t*)dst) : "r"(*(uint32_t*)src), "r"(*(uint32_t*)dst)); + asm ("uqadd8 %0, %1, %2" + : "=r" (*(uint32_t*)dst) + : "r" (*(uint32_t*)src), "r" (*(uint32_t*)dst)); dst += 4; src += 4; w -= 4; @@ -88,7 +90,7 @@ fbCompositeSrcAdd_8000x8000arm ( { s = *src; d = *dst; - asm("uqadd8 %0, %1, %2" : "+r"(d) : "r"(s)); + asm ("uqadd8 %0, %1, %2" : "+r" (d) : "r" (s)); *dst = d; dst++; @@ 
-99,232 +101,233 @@ fbCompositeSrcAdd_8000x8000arm ( } -void -fbCompositeSrc_8888x8888arm ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +arm_composite_over_8888_8888 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; uint32_t component_half = 0x800080; uint32_t upper_component_mask = 0xff00ff00; uint32_t alpha_mask = 0xff; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; -//#define inner_branch +/* #define inner_branch */ asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" + "cmp %[w], #0\n\t" + "beq 2f\n\t" + "1:\n\t" + /* load src */ + "ldr r5, [%[src]], #4\n\t" #ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that */ - "cmp r5, #0\n\t" - "beq 3f\n\t" + /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. 
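/* Illustrative sketch (not from the patch): uqadd8, used in
 * arm_composite_add_8000_8000 above, performs four independent unsigned
 * byte additions that saturate at 0xff instead of wrapping into the
 * neighbouring lane. Scalar equivalent:
 */
#include <stdint.h>

static uint32_t
saturating_add_4x8 (uint32_t a, uint32_t b)
{
    uint32_t result = 0;
    int i;

    for (i = 0; i < 32; i += 8)
    {
        uint32_t sum = ((a >> i) & 0xff) + ((b >> i) & 0xff);

        if (sum > 0xff)
            sum = 0xff; /* clamp the lane */

        result |= sum << i;
    }

    return result;
}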
+ * The 0x0 case also allows us to avoid doing an unecessary data + * write which is more valuable so we only check for that + */ + "cmp r5, #0\n\t" + "beq 3f\n\t" - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" + /* = 255 - alpha */ + "sub r8, %[alpha_mask], r5, lsr #24\n\t" - "ldr r4, [%[dest]] \n\t" + "ldr r4, [%[dest]] \n\t" #else - "ldr r4, [%[dest]] \n\t" + "ldr r4, [%[dest]] \n\t" - /* = 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" + /* = 255 - alpha */ + "sub r8, %[alpha_mask], r5, lsr #24\n\t" #endif - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" + "uxtb16 r6, r4\n\t" + "uxtb16 r7, r4, ror #8\n\t" - /* multiply by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" + /* multiply by 257 and divide by 65536 */ + "mla r6, r6, r8, %[component_half]\n\t" + "mla r7, r7, r8, %[component_half]\n\t" - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" - /* recombine the 0xff00ff00 bytes of r6 and r7 */ - "and r7, r7, %[upper_component_mask]\n\t" - "uxtab16 r6, r7, r6, ror #8\n\t" + /* recombine the 0xff00ff00 bytes of r6 and r7 */ + "and r7, r7, %[upper_component_mask]\n\t" + "uxtab16 r6, r7, r6, ror #8\n\t" - "uqadd8 r5, r6, r5\n\t" + "uqadd8 r5, r6, r5\n\t" #ifdef inner_branch - "3:\n\t" + "3:\n\t" #endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory" - ); + "str r5, [%[dest]], #4\n\t" + /* increment counter and jmp to top */ + "subs %[w], %[w], #1\n\t" + "bne 1b\n\t" + "2:\n\t" + : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) + : [component_half] "r" (component_half), [upper_component_mask] "r" (upper_component_mask), + [alpha_mask] "r" (alpha_mask) + : "r4", "r5", "r6", "r7", "r8", "cc", "memory" + ); } } -void -fbCompositeSrc_8888x8x8888arm ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +arm_composite_over_8888_n_8888 ( + pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - int dstStride, srcStride; - uint16_t w; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int dst_stride, src_stride; + uint16_t w; uint32_t component_half = 0x800080; uint32_t alpha_mask = 0xff; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - fbComposeGetSolid (pMask, mask, pDst->bits.format); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); mask = (mask) >> 24; while (height--) { - dst = 
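/* Illustrative sketch (not from the patch) of the mla/uxtab16/uxtb16
 * sequence above: an exact, rounded x*a/255 computed without a divide,
 * namely t = x*a + 0x80; result = (t + (t >> 8)) >> 8. The SIMD code
 * runs this on two channels at once by spacing them 16 bits apart;
 * 0x800080 is the per-lane +0x80 rounding term.
 */
#include <stdint.h>

static uint8_t
mul_div_255 (uint8_t x, uint8_t a)
{
    uint32_t t = (uint32_t) x * a + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}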
dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; -//#define inner_branch +/* #define inner_branch */ asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load src */ - "ldr r5, [%[src]], #4\n\t" + "cmp %[w], #0\n\t" + "beq 2f\n\t" + "1:\n\t" + /* load src */ + "ldr r5, [%[src]], #4\n\t" #ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that */ - "cmp r5, #0\n\t" - "beq 3f\n\t" + /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. + * The 0x0 case also allows us to avoid doing an unecessary data + * write which is more valuable so we only check for that + */ + "cmp r5, #0\n\t" + "beq 3f\n\t" #endif - "ldr r4, [%[dest]] \n\t" + "ldr r4, [%[dest]] \n\t" - "uxtb16 r6, r5\n\t" - "uxtb16 r7, r5, ror #8\n\t" + "uxtb16 r6, r5\n\t" + "uxtb16 r7, r5, ror #8\n\t" - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, %[mask_alpha], %[component_half]\n\t" - "mla r7, r7, %[mask_alpha], %[component_half]\n\t" + /* multiply by alpha (r8) then by 257 and divide by 65536 */ + "mla r6, r6, %[mask_alpha], %[component_half]\n\t" + "mla r7, r7, %[mask_alpha], %[component_half]\n\t" - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" + "uxtb16 r6, r6, ror #8\n\t" + "uxtb16 r7, r7, ror #8\n\t" - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" + /* recombine */ + "orr r5, r6, r7, lsl #8\n\t" - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" + "uxtb16 r6, r4\n\t" + "uxtb16 r7, r4, ror #8\n\t" - /* 255 - alpha */ - "sub r8, %[alpha_mask], r5, lsr #24\n\t" + /* 255 - alpha */ + "sub r8, %[alpha_mask], r5, lsr #24\n\t" - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" + /* multiply by alpha (r8) then by 257 and divide by 65536 */ + "mla r6, r6, r8, %[component_half]\n\t" + "mla r7, r7, r8, %[component_half]\n\t" - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" + "uxtb16 r6, r6, ror #8\n\t" + "uxtb16 r7, r7, ror #8\n\t" - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" + /* recombine */ + "orr r6, r6, r7, lsl #8\n\t" - "uqadd8 r5, r6, r5\n\t" + "uqadd8 r5, r6, r5\n\t" #ifdef inner_branch - "3:\n\t" + "3:\n\t" #endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) - : [component_half] "r" (component_half), [mask_alpha] "r" (mask), - [alpha_mask] "r" (alpha_mask) - : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" - ); + "str r5, [%[dest]], #4\n\t" + /* increment counter and jmp to top */ + "subs %[w], %[w], #1\n\t" + "bne 1b\n\t" + "2:\n\t" + : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src) + : [component_half] "r" (component_half), [mask_alpha] "r" (mask), + [alpha_mask] "r" (alpha_mask) + : "r4", "r5", "r6", "r7", "r8", "r9", "cc", "memory" + ); } } -void -fbCompositeSolidMask_nx8x8888arm ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * 
pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +static void +arm_composite_over_n_8_8888 (pixman_implementation_t * impl, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + /* bail out if fully transparent */ srca = src >> 24; if (src == 0) return; @@ -335,148 +338,149 @@ fbCompositeSolidMask_nx8x8888arm ( uint32_t src_hi = (src >> 8) & component_mask; uint32_t src_lo = src & component_mask; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; -//#define inner_branch +/* #define inner_branch */ asm volatile ( - "cmp %[w], #0\n\t" - "beq 2f\n\t" - "1:\n\t" - /* load mask */ - "ldrb r5, [%[mask]], #1\n\t" + "cmp %[w], #0\n\t" + "beq 2f\n\t" + "1:\n\t" + /* load mask */ + "ldrb r5, [%[mask]], #1\n\t" #ifdef inner_branch - /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. - * The 0x0 case also allows us to avoid doing an unecessary data - * write which is more valuable so we only check for that */ - "cmp r5, #0\n\t" - "beq 3f\n\t" + /* We can avoid doing the multiplication in two cases: 0x0 or 0xff. 
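/* Illustrative sketch (not from the patch): the src_hi/src_lo split
 * above and the uxtb16 instructions in these loops place a packed
 * a8r8g8b8 pixel into two halfword-spaced lane pairs, so two channels
 * can be scaled with one 32-bit mla. Bytes 0 and 2 (b, r) land in one
 * word, bytes 1 and 3 (g, a) in the other.
 */
#include <stdint.h>

static void
split_even_odd (uint32_t pixel, uint32_t *even, uint32_t *odd)
{
    *even = pixel & 0x00ff00ff;        /* uxtb16 rd, rm         */
    *odd  = (pixel >> 8) & 0x00ff00ff; /* uxtb16 rd, rm, ror #8 */
}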
+ * The 0x0 case also allows us to avoid doing an unecessary data + * write which is more valuable so we only check for that + */ + "cmp r5, #0\n\t" + "beq 3f\n\t" #endif - "ldr r4, [%[dest]] \n\t" + "ldr r4, [%[dest]] \n\t" - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, %[src_lo], r5, %[component_half]\n\t" - "mla r7, %[src_hi], r5, %[component_half]\n\t" + /* multiply by alpha (r8) then by 257 and divide by 65536 */ + "mla r6, %[src_lo], r5, %[component_half]\n\t" + "mla r7, %[src_hi], r5, %[component_half]\n\t" - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" + "uxtb16 r6, r6, ror #8\n\t" + "uxtb16 r7, r7, ror #8\n\t" - /* recombine */ - "orr r5, r6, r7, lsl #8\n\t" + /* recombine */ + "orr r5, r6, r7, lsl #8\n\t" - "uxtb16 r6, r4\n\t" - "uxtb16 r7, r4, ror #8\n\t" + "uxtb16 r6, r4\n\t" + "uxtb16 r7, r4, ror #8\n\t" - /* we could simplify this to use 'sub' if we were - * willing to give up a register for alpha_mask */ - "mvn r8, r5\n\t" - "mov r8, r8, lsr #24\n\t" + /* we could simplify this to use 'sub' if we were + * willing to give up a register for alpha_mask */ + "mvn r8, r5\n\t" + "mov r8, r8, lsr #24\n\t" - /* multiply by alpha (r8) then by 257 and divide by 65536 */ - "mla r6, r6, r8, %[component_half]\n\t" - "mla r7, r7, r8, %[component_half]\n\t" + /* multiply by alpha (r8) then by 257 and divide by 65536 */ + "mla r6, r6, r8, %[component_half]\n\t" + "mla r7, r7, r8, %[component_half]\n\t" - "uxtab16 r6, r6, r6, ror #8\n\t" - "uxtab16 r7, r7, r7, ror #8\n\t" + "uxtab16 r6, r6, r6, ror #8\n\t" + "uxtab16 r7, r7, r7, ror #8\n\t" - "uxtb16 r6, r6, ror #8\n\t" - "uxtb16 r7, r7, ror #8\n\t" + "uxtb16 r6, r6, ror #8\n\t" + "uxtb16 r7, r7, ror #8\n\t" - /* recombine */ - "orr r6, r6, r7, lsl #8\n\t" + /* recombine */ + "orr r6, r6, r7, lsl #8\n\t" - "uqadd8 r5, r6, r5\n\t" + "uqadd8 r5, r6, r5\n\t" #ifdef inner_branch - "3:\n\t" + "3:\n\t" #endif - "str r5, [%[dest]], #4\n\t" - /* increment counter and jmp to top */ - "subs %[w], %[w], #1\n\t" - "bne 1b\n\t" - "2:\n\t" - : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) - : [component_half] "r" (component_half), - [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) - : "r4", "r5", "r6", "r7", "r8", "cc", "memory" - ); + "str r5, [%[dest]], #4\n\t" + /* increment counter and jmp to top */ + "subs %[w], %[w], #1\n\t" + "bne 1b\n\t" + "2:\n\t" + : [w] "+r" (w), [dest] "+r" (dst), [src] "+r" (src), [mask] "+r" (mask) + : [component_half] "r" (component_half), + [src_hi] "r" (src_hi), [src_lo] "r" (src_lo) + : "r4", "r5", "r6", "r7", "r8", "cc", "memory"); } } -static const FastPathInfo arm_simd_fast_path_array[] = +static const pixman_fast_path_t arm_simd_fast_path_array[] = { - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888arm, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888arm, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888arm, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888arm, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888arm, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, 
PIXMAN_a8r8g8b8, arm_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, arm_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, arm_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, arm_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, arm_composite_over_8888_n_8888, NEED_SOLID_MASK }, - { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000arm, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, arm_composite_add_8000_8000, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888arm, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888arm, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, arm_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, arm_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, arm_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, arm_composite_over_n_8_8888, 0 }, { PIXMAN_OP_NONE }, }; -const FastPathInfo *const arm_simd_fast_paths = arm_simd_fast_path_array; +const pixman_fast_path_t *const arm_simd_fast_paths = arm_simd_fast_path_array; static void arm_simd_composite (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *src, - pixman_image_t *mask, - pixman_image_t *dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { if (_pixman_run_fast_path (arm_simd_fast_paths, imp, - op, src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height)) + op, src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height)) { return; } _pixman_implementation_composite (imp->delegate, op, - src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height); + src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height); } pixman_implementation_t * -_pixman_implementation_create_arm_simd (pixman_implementation_t *toplevel) +_pixman_implementation_create_arm_simd (void) { - pixman_implementation_t *general = _pixman_implementation_create_fast_path (NULL); - pixman_implementation_t *imp = _pixman_implementation_create (toplevel, general); + pixman_implementation_t *general = _pixman_implementation_create_fast_path (); + pixman_implementation_t *imp = _pixman_implementation_create (general); imp->composite = arm_simd_composite; return imp; } + diff --git a/lib/pixman/pixman/pixman-arm-simd.h b/lib/pixman/pixman/pixman-arm-simd.h deleted file mode 100644 index 8c1f88342..000000000 --- a/lib/pixman/pixman/pixman-arm-simd.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright © 2008 Mozilla Corporation - * - * Permission to 
use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Mozilla Corporation not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Mozilla Corporation makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - * - * Author: Jeff Muizelaar (jeff@infidigm.net) - * - */ - -#include "pixman-private.h" - -#ifdef USE_ARM_SIMD - -pixman_bool_t pixman_have_arm_simd(void); - -#else -#define pixman_have_arm_simd() FALSE -#endif - -#ifdef USE_ARM_SIMD - -extern const FastPathInfo *const arm_simd_fast_paths; - -void -fbCompositeSrcAdd_8000x8000arm ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSrc_8888x8888arm ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSrc_8888x8x8888arm ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - -void -fbCompositeSolidMask_nx8x8888arm ( - pixman_implementation_t * impl, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height); - - -#endif /* USE_ARM */ diff --git a/lib/pixman/pixman/pixman-bits-image.c b/lib/pixman/pixman/pixman-bits-image.c index 888e487e9..7a1910935 100644 --- a/lib/pixman/pixman/pixman-bits-image.c +++ b/lib/pixman/pixman/pixman-bits-image.c @@ -1,346 +1,773 @@ /* + * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. + * 2005 Lars Knoll & Zack Rusin, Trolltech + * 2008 Aaron Plattner, NVIDIA Corporation * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. + * Copyright © 2007, 2009 Red Hat, Inc. 
* * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that * copyright notice and this permission notice appear in supporting - * documentation, and that the name of SuSE not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. SuSE makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. + * documentation, and that the name of Keith Packard not be used in + * advertising or publicity pertaining to distribution of the software without + * specific, written prior permission. Keith Packard makes no + * representations about the suitability of this software for any purpose. It + * is provided "as is" without express or implied warranty. * - * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. */ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif #include <stdlib.h> +#include <string.h> #include "pixman-private.h" +#include "pixman-combine32.h" - -#define READ_ACCESS(f) ((image->common.read_func)? f##_accessors : f) -#define WRITE_ACCESS(f) ((image->common.write_func)? 
f##_accessors : f) +/* Store functions */ static void -fbFetchSolid(bits_image_t * image, - int x, int y, int width, - uint32_t *buffer, - uint32_t *mask, uint32_t maskBits) +bits_image_store_scanline_32 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *buffer) { - uint32_t color; - uint32_t *end; - fetchPixelProc32 fetch = - READ_ACCESS(pixman_fetchPixelProcForPicture32)(image); - - color = fetch(image, 0, 0); - - end = buffer + width; - while (buffer < end) - *(buffer++) = color; + image->store_scanline_raw_32 (image, x, y, width, buffer); + + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + bits_image_store_scanline_32 (image->common.alpha_map, x, y, width, buffer); + } } static void -fbFetchSolid64(bits_image_t * image, - int x, int y, int width, - uint64_t *buffer, void *unused, uint32_t unused2) +bits_image_store_scanline_64 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *buffer) { - uint64_t color; - uint64_t *end; - fetchPixelProc64 fetch = - READ_ACCESS(pixman_fetchPixelProcForPicture64)(image); - - color = fetch(image, 0, 0); - - end = buffer + width; - while (buffer < end) - *(buffer++) = color; + image->store_scanline_raw_64 (image, x, y, width, buffer); + + if (image->common.alpha_map) + { + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + bits_image_store_scanline_64 (image->common.alpha_map, x, y, width, buffer); + } } -static void -fbFetch(bits_image_t * image, - int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits) +void +_pixman_image_store_scanline_32 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *buffer) +{ + image->store_scanline_32 (image, x, y, width, buffer); +} + +void +_pixman_image_store_scanline_64 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *buffer) { - fetchProc32 fetch = READ_ACCESS(pixman_fetchProcForPicture32)(image); - - fetch(image, x, y, width, buffer); + image->store_scanline_64 (image, x, y, width, buffer); +} + +/* Fetch functions */ + +static uint32_t +bits_image_fetch_pixel_alpha (bits_image_t *image, int x, int y) +{ + uint32_t pixel; + uint32_t pixel_a; + + pixel = image->fetch_pixel_raw_32 (image, x, y); + + assert (image->common.alpha_map); + + x -= image->common.alpha_origin_x; + y -= image->common.alpha_origin_y; + + if (x < 0 || x >= image->common.alpha_map->width || + y < 0 || y >= image->common.alpha_map->height) + { + pixel_a = 0; + } + else + { + pixel_a = image->common.alpha_map->fetch_pixel_raw_32 ( + image->common.alpha_map, x, y); + pixel_a = ALPHA_8 (pixel_a); + } + + pixel &= 0x00ffffff; + pixel |= (pixel_a << 24); + + return pixel; +} + +static force_inline uint32_t +get_pixel (bits_image_t *image, int x, int y, pixman_bool_t check_bounds) +{ + if (check_bounds && + (x < 0 || x >= image->width || y < 0 || y >= image->height)) + { + return 0; + } + + return image->fetch_pixel_32 (image, x, y); +} + +static force_inline void +repeat (pixman_repeat_t repeat, int size, int *coord) +{ + switch (repeat) + { + case PIXMAN_REPEAT_NORMAL: + *coord = MOD (*coord, size); + break; + + case PIXMAN_REPEAT_PAD: + *coord = CLIP (*coord, 0, size - 1); + break; + + case PIXMAN_REPEAT_REFLECT: + *coord = MOD (*coord, size * 2); + + if (*coord >= size) + *coord = size * 2 - *coord - 1; + break; + + case PIXMAN_REPEAT_NONE: + break; + } +} + +static force_inline uint32_t +bits_image_fetch_pixel_nearest (bits_image_t *image, + pixman_fixed_t x, 
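/* Illustrative aside (not from the patch): worked examples of the
 * repeat() helper above for size = 4 (valid coordinates 0..3):
 *
 *   coord   NORMAL   PAD   REFLECT
 *     -1       3      0       0
 *      4       0      3       3
 *      6       2      3       1
 *
 * NORMAL tiles modulo the size, PAD clamps to the nearest edge, REFLECT
 * bounces back and forth, and NONE leaves the coordinate alone so the
 * caller bounds-checks instead (get_pixel with check_bounds = TRUE).
 * The REFLECT case in plain C, using a euclidean modulo like MOD:
 */
static int
reflect_coord (int coord, int size)
{
    int c = coord % (2 * size);

    if (c < 0)
        c += 2 * size; /* euclidean modulo, matching pixman's MOD */

    return (c >= size) ? 2 * size - c - 1 : c;
}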
+ pixman_fixed_t y) +{ + int x0 = pixman_fixed_to_int (x - pixman_fixed_e); + int y0 = pixman_fixed_to_int (y - pixman_fixed_e); + + if (image->common.repeat != PIXMAN_REPEAT_NONE) + { + repeat (image->common.repeat, image->width, &x0); + repeat (image->common.repeat, image->height, &y0); + + return get_pixel (image, x0, y0, FALSE); + } + else + { + return get_pixel (image, x0, y0, TRUE); + } +} + +static force_inline uint32_t +bits_image_fetch_pixel_bilinear (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y) +{ + pixman_repeat_t repeat_mode = image->common.repeat; + int width = image->width; + int height = image->height; + int x1, y1, x2, y2; + uint32_t tl, tr, bl, br, r; + int32_t distx, disty, idistx, idisty; + uint32_t ft, fb; + + x1 = x - pixman_fixed_1 / 2; + y1 = y - pixman_fixed_1 / 2; + + distx = (x1 >> 8) & 0xff; + disty = (y1 >> 8) & 0xff; + + x1 = pixman_fixed_to_int (x1); + y1 = pixman_fixed_to_int (y1); + x2 = x1 + 1; + y2 = y1 + 1; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, width, &x1); + repeat (repeat_mode, height, &y1); + repeat (repeat_mode, width, &x2); + repeat (repeat_mode, height, &y2); + + tl = get_pixel (image, x1, y1, FALSE); + bl = get_pixel (image, x1, y2, FALSE); + tr = get_pixel (image, x2, y1, FALSE); + br = get_pixel (image, x2, y2, FALSE); + } + else + { + tl = get_pixel (image, x1, y1, TRUE); + tr = get_pixel (image, x2, y1, TRUE); + bl = get_pixel (image, x1, y2, TRUE); + br = get_pixel (image, x2, y2, TRUE); + } + + idistx = 256 - distx; + idisty = 256 - disty; + +#define GET8(v, i) ((uint16_t) (uint8_t) ((v) >> i)) + ft = GET8 (tl, 0) * idistx + GET8 (tr, 0) * distx; + fb = GET8 (bl, 0) * idistx + GET8 (br, 0) * distx; + r = (((ft * idisty + fb * disty) >> 16) & 0xff); + ft = GET8 (tl, 8) * idistx + GET8 (tr, 8) * distx; + fb = GET8 (bl, 8) * idistx + GET8 (br, 8) * distx; + r |= (((ft * idisty + fb * disty) >> 8) & 0xff00); + ft = GET8 (tl, 16) * idistx + GET8 (tr, 16) * distx; + fb = GET8 (bl, 16) * idistx + GET8 (br, 16) * distx; + r |= (((ft * idisty + fb * disty)) & 0xff0000); + ft = GET8 (tl, 24) * idistx + GET8 (tr, 24) * distx; + fb = GET8 (bl, 24) * idistx + GET8 (br, 24) * distx; + r |= (((ft * idisty + fb * disty) << 8) & 0xff000000); + + return r; +} + +static force_inline uint32_t +bits_image_fetch_pixel_convolution (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y) +{ + pixman_fixed_t *params = image->common.filter_params; + int x_off = (params[0] - pixman_fixed_1) >> 1; + int y_off = (params[1] - pixman_fixed_1) >> 1; + int32_t cwidth = pixman_fixed_to_int (params[0]); + int32_t cheight = pixman_fixed_to_int (params[1]); + int32_t srtot, sgtot, sbtot, satot; + int32_t i, j, x1, x2, y1, y2; + pixman_repeat_t repeat_mode = image->common.repeat; + int width = image->width; + int height = image->height; + + params += 2; + + x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off); + y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off); + x2 = x1 + cwidth; + y2 = y1 + cheight; + + srtot = sgtot = sbtot = satot = 0; + + for (i = y1; i < y2; ++i) + { + for (j = x1; j < x2; ++j) + { + int rx = j; + int ry = i; + + pixman_fixed_t f = *params; + + if (f) + { + uint32_t pixel; + + if (repeat_mode != PIXMAN_REPEAT_NONE) + { + repeat (repeat_mode, width, &rx); + repeat (repeat_mode, height, &ry); + + pixel = get_pixel (image, rx, ry, FALSE); + } + else + { + pixel = get_pixel (image, rx, ry, TRUE); + } + + srtot += RED_8 (pixel) * f; + sgtot += GREEN_8 (pixel) * f; + sbtot += BLUE_8 (pixel) * f; + satot 
+= ALPHA_8 (pixel) * f; + } + + params++; + } + } + + satot >>= 16; + srtot >>= 16; + sgtot >>= 16; + sbtot >>= 16; + + satot = CLIP (satot, 0, 0xff); + srtot = CLIP (srtot, 0, 0xff); + sgtot = CLIP (sgtot, 0, 0xff); + sbtot = CLIP (sbtot, 0, 0xff); + + return ((satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot)); +} + +static force_inline uint32_t +bits_image_fetch_pixel_filtered (bits_image_t *image, + pixman_fixed_t x, + pixman_fixed_t y) +{ + switch (image->common.filter) + { + case PIXMAN_FILTER_NEAREST: + case PIXMAN_FILTER_FAST: + return bits_image_fetch_pixel_nearest (image, x, y); + break; + + case PIXMAN_FILTER_BILINEAR: + case PIXMAN_FILTER_GOOD: + case PIXMAN_FILTER_BEST: + return bits_image_fetch_pixel_bilinear (image, x, y); + break; + + case PIXMAN_FILTER_CONVOLUTION: + return bits_image_fetch_pixel_convolution (image, x, y); + break; + } + + return 0; } static void -fbFetch64(bits_image_t * image, - int x, int y, int width, - uint64_t *buffer, void *unused, uint32_t unused2) +bits_image_fetch_transformed (pixman_image_t * image, + int offset, + int line, + int width, + uint32_t * buffer, + const uint32_t * mask, + uint32_t mask_bits) { - fetchProc64 fetch = READ_ACCESS(pixman_fetchProcForPicture64)(image); - - fetch(image, x, y, width, buffer); + pixman_fixed_t x, y, w; + pixman_fixed_t ux, uy, uw; + pixman_vector_t v; + int i; + + /* reference point is the center of the pixel */ + v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2; + v.vector[2] = pixman_fixed_1; + + /* when using convolution filters or PIXMAN_REPEAT_PAD one + * might get here without a transform */ + if (image->common.transform) + { + if (!pixman_transform_point_3d (image->common.transform, &v)) + return; + + ux = image->common.transform->matrix[0][0]; + uy = image->common.transform->matrix[1][0]; + uw = image->common.transform->matrix[2][0]; + } + else + { + ux = pixman_fixed_1; + uy = 0; + uw = 0; + } + + x = v.vector[0]; + y = v.vector[1]; + w = v.vector[2]; + + if (w == pixman_fixed_1 && uw == 0) /* Affine */ + { + for (i = 0; i < width; ++i) + { + if (!mask || (mask[i] & mask_bits)) + { + buffer[i] = + bits_image_fetch_pixel_filtered (&image->bits, x, y); + } + + x += ux; + y += uy; + } + } + else + { + for (i = 0; i < width; ++i) + { + pixman_fixed_t x0, y0; + + if (!mask || (mask[i] & mask_bits)) + { + x0 = ((pixman_fixed_48_16_t)x << 16) / w; + y0 = ((pixman_fixed_48_16_t)y << 16) / w; + + buffer[i] = + bits_image_fetch_pixel_filtered (&image->bits, x0, y0); + } + + x += ux; + y += uy; + w += uw; + } + } } static void -fbStore(bits_image_t * image, int x, int y, int width, uint32_t *buffer) +bits_image_fetch_solid_32 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * mask, + uint32_t mask_bits) { - uint32_t *bits; - int32_t stride; - storeProc32 store = WRITE_ACCESS(pixman_storeProcForPicture32)(image); - const pixman_indexed_t * indexed = image->indexed; - - bits = image->bits; - stride = image->rowstride; - bits += y*stride; - store((pixman_image_t *)image, bits, buffer, x, width, indexed); + uint32_t color; + uint32_t *end; + + color = image->bits.fetch_pixel_raw_32 (&image->bits, 0, 0); + + end = buffer + width; + while (buffer < end) + *(buffer++) = color; } static void -fbStore64 (bits_image_t * image, int x, int y, int width, uint64_t *buffer) +bits_image_fetch_solid_64 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * b, + const uint32_t * unused, 
+ uint32_t unused2) { - uint32_t *bits; - int32_t stride; - storeProc64 store = WRITE_ACCESS(pixman_storeProcForPicture64)(image); - const pixman_indexed_t * indexed = image->indexed; - - bits = image->bits; - stride = image->rowstride; - bits += y*stride; - store((pixman_image_t *)image, bits, buffer, x, width, indexed); + uint64_t color; + uint64_t *buffer = (uint64_t *)b; + uint64_t *end; + + color = image->bits.fetch_pixel_raw_64 (&image->bits, 0, 0); + + end = buffer + width; + while (buffer < end) + *(buffer++) = color; } static void -fbStoreExternalAlpha (bits_image_t * image, int x, int y, int width, - uint32_t *buffer) +bits_image_fetch_untransformed_repeat_none (bits_image_t *image, + pixman_bool_t wide, + int x, + int y, + int width, + uint32_t * buffer) { - uint32_t *bits, *alpha_bits; - int32_t stride, astride; - int ax, ay; - storeProc32 store; - storeProc32 astore; - const pixman_indexed_t * indexed = image->indexed; - const pixman_indexed_t * aindexed; - - if (!image->common.alpha_map) { - // XXX[AGP]: This should never happen! - // fbStore(image, x, y, width, buffer); - abort(); + uint32_t w; + + if (y < 0 || y >= image->height) + { + memset (buffer, 0, width * (wide? 8 : 4)); return; } - store = WRITE_ACCESS(pixman_storeProcForPicture32)(image); - astore = WRITE_ACCESS(pixman_storeProcForPicture32)(image->common.alpha_map); - aindexed = image->common.alpha_map->indexed; + if (x < 0) + { + w = MIN (width, -x); - ax = x; - ay = y; + memset (buffer, 0, w * (wide ? 8 : 4)); - bits = image->bits; - stride = image->rowstride; + width -= w; + buffer += w * (wide? 2 : 1); + x += w; + } - alpha_bits = image->common.alpha_map->bits; - astride = image->common.alpha_map->rowstride; + if (x < image->width) + { + w = MIN (width, image->width - x); - bits += y*stride; - alpha_bits += (ay - image->common.alpha_origin.y)*astride; + if (wide) + image->fetch_scanline_raw_64 ((pixman_image_t *)image, x, y, w, buffer, NULL, 0); + else + image->fetch_scanline_raw_32 ((pixman_image_t *)image, x, y, w, buffer, NULL, 0); + width -= w; + buffer += w * (wide? 2 : 1); + x += w; + } - store((pixman_image_t *)image, bits, buffer, x, width, indexed); - astore((pixman_image_t *)image->common.alpha_map, - alpha_bits, buffer, ax - image->common.alpha_origin.x, width, aindexed); + memset (buffer, 0, width * (wide ? 8 : 4)); } static void -fbStoreExternalAlpha64 (bits_image_t * image, int x, int y, int width, - uint64_t *buffer) +bits_image_fetch_untransformed_repeat_normal (bits_image_t *image, + pixman_bool_t wide, + int x, + int y, + int width, + uint32_t * buffer) { - uint32_t *bits, *alpha_bits; - int32_t stride, astride; - int ax, ay; - storeProc64 store; - storeProc64 astore; - const pixman_indexed_t * indexed = image->indexed; - const pixman_indexed_t * aindexed; + uint32_t w; - store = ACCESS(pixman_storeProcForPicture64)(image); - astore = ACCESS(pixman_storeProcForPicture64)(image->common.alpha_map); - aindexed = image->common.alpha_map->indexed; + while (y < 0) + y += image->height; - ax = x; - ay = y; + while (y >= image->height) + y -= image->height; - bits = image->bits; - stride = image->rowstride; + while (width) + { + while (x < 0) + x += image->width; + while (x >= image->width) + x -= image->width; + + w = MIN (width, image->width - x); + + if (wide) + image->fetch_scanline_raw_64 ((pixman_image_t *)image, x, y, w, buffer, NULL, 0); + else + image->fetch_scanline_raw_32 ((pixman_image_t *)image, x, y, w, buffer, NULL, 0); + + buffer += w * (wide? 
2 : 1); + x += w; + width -= w; + } +} + +static void +bits_image_fetch_untransformed_32 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * mask, + uint32_t mask_bits) +{ + if (image->common.repeat == PIXMAN_REPEAT_NONE) + { + bits_image_fetch_untransformed_repeat_none ( + &image->bits, FALSE, x, y, width, buffer); + } + else + { + bits_image_fetch_untransformed_repeat_normal ( + &image->bits, FALSE, x, y, width, buffer); + } +} + +static void +bits_image_fetch_untransformed_64 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * unused, + uint32_t unused2) +{ + if (image->common.repeat == PIXMAN_REPEAT_NONE) + { + bits_image_fetch_untransformed_repeat_none ( + &image->bits, TRUE, x, y, width, buffer); + } + else + { + bits_image_fetch_untransformed_repeat_normal ( + &image->bits, TRUE, x, y, width, buffer); + } +} - alpha_bits = image->common.alpha_map->bits; - astride = image->common.alpha_map->rowstride; +static pixman_bool_t out_of_bounds_workaround = TRUE; - bits += y*stride; - alpha_bits += (ay - image->common.alpha_origin.y)*astride; +/* Old X servers rely on out-of-bounds accesses when they are asked + * to composite with a window as the source. They create a pixman image + * pointing to some bogus position in memory, but then they set a clip + * region to the position where the actual bits are. + * + * Due to a bug in old versions of pixman, where it would not clip + * against the image bounds when a clip region was set, this would + * actually work. So by default we allow certain out-of-bound access + * to happen unless explicitly disabled. + * + * Fixed X servers should call this function to disable the workaround. + */ +PIXMAN_EXPORT void +pixman_disable_out_of_bounds_workaround (void) +{ + out_of_bounds_workaround = FALSE; +} +static pixman_bool_t +source_image_needs_out_of_bounds_workaround (bits_image_t *image) +{ + if (image->common.clip_sources && + image->common.repeat == PIXMAN_REPEAT_NONE && + image->common.have_clip_region && + out_of_bounds_workaround) + { + if (!image->common.client_clip) + { + /* There is no client clip, so if the clip region extends beyond the + * drawable geometry, it must be because the X server generated the + * bogus clip region. 
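+	     * If instead the clip region stays within the drawable
+	     * geometry, clipping to the image bounds is safe, and the
+	     * extents check below detects that case.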
+ */ + const pixman_box32_t *extents = pixman_region32_extents (&image->common.clip_region); + + if (extents->x1 >= 0 && extents->x2 <= image->width && + extents->y1 >= 0 && extents->y2 <= image->height) + { + return FALSE; + } + } + + return TRUE; + } - store((pixman_image_t *)image, bits, buffer, x, width, indexed); - astore((pixman_image_t *)image->common.alpha_map, - alpha_bits, buffer, ax - image->common.alpha_origin.x, width, aindexed); + return FALSE; } static void bits_image_property_changed (pixman_image_t *image) { bits_image_t *bits = (bits_image_t *)image; - + + _pixman_bits_image_setup_raw_accessors (bits); + + image->bits.fetch_pixel_32 = image->bits.fetch_pixel_raw_32; + if (bits->common.alpha_map) { image->common.get_scanline_64 = - (scanFetchProc)_pixman_image_get_scanline_64_generic; + _pixman_image_get_scanline_generic_64; image->common.get_scanline_32 = - (scanFetchProc)READ_ACCESS(fbFetchExternalAlpha); + bits_image_fetch_transformed; + + image->bits.fetch_pixel_32 = bits_image_fetch_pixel_alpha; } else if ((bits->common.repeat != PIXMAN_REPEAT_NONE) && - bits->width == 1 && - bits->height == 1) + bits->width == 1 && + bits->height == 1) { - image->common.get_scanline_64 = (scanFetchProc)fbFetchSolid64; - image->common.get_scanline_32 = (scanFetchProc)fbFetchSolid; + image->common.get_scanline_64 = bits_image_fetch_solid_64; + image->common.get_scanline_32 = bits_image_fetch_solid_32; } else if (!bits->common.transform && - bits->common.filter != PIXMAN_FILTER_CONVOLUTION && - bits->common.repeat != PIXMAN_REPEAT_PAD && - bits->common.repeat != PIXMAN_REPEAT_REFLECT) + bits->common.filter != PIXMAN_FILTER_CONVOLUTION && + (bits->common.repeat == PIXMAN_REPEAT_NONE || + bits->common.repeat == PIXMAN_REPEAT_NORMAL)) { - image->common.get_scanline_64 = (scanFetchProc)fbFetch64; - image->common.get_scanline_32 = (scanFetchProc)fbFetch; + image->common.get_scanline_64 = bits_image_fetch_untransformed_64; + image->common.get_scanline_32 = bits_image_fetch_untransformed_32; } else { image->common.get_scanline_64 = - (scanFetchProc)_pixman_image_get_scanline_64_generic; + _pixman_image_get_scanline_generic_64; image->common.get_scanline_32 = - (scanFetchProc)READ_ACCESS(fbFetchTransformed); + bits_image_fetch_transformed; } - - if (bits->common.alpha_map) - { - bits->store_scanline_64 = (scanStoreProc)fbStoreExternalAlpha64; - bits->store_scanline_32 = fbStoreExternalAlpha; - } - else - { - bits->store_scanline_64 = (scanStoreProc)fbStore64; - bits->store_scanline_32 = fbStore; - } -} -void -_pixman_image_store_scanline_32 (bits_image_t *image, int x, int y, int width, - uint32_t *buffer) -{ - image->store_scanline_32 (image, x, y, width, buffer); -} + bits->store_scanline_64 = bits_image_store_scanline_64; + bits->store_scanline_32 = bits_image_store_scanline_32; -void -_pixman_image_store_scanline_64 (bits_image_t *image, int x, int y, int width, - uint32_t *buffer) -{ - image->store_scanline_64 (image, x, y, width, buffer); + bits->common.need_workaround = + source_image_needs_out_of_bounds_workaround (bits); } static uint32_t * create_bits (pixman_format_code_t format, - int width, - int height, - int *rowstride_bytes) + int width, + int height, + int * rowstride_bytes) { int stride; int buf_size; int bpp; - + /* what follows is a long-winded way, avoiding any possibility of integer * overflows, of saying: - * stride = ((width * bpp + FB_MASK) >> FB_SHIFT) * sizeof (uint32_t); + * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t); */ - + bpp = PIXMAN_FORMAT_BPP 
(format); if (pixman_multiply_overflows_int (width, bpp)) return NULL; - + stride = width * bpp; - if (pixman_addition_overflows_int (stride, FB_MASK)) - return NULL; - - stride += FB_MASK; - stride >>= FB_SHIFT; - -#if FB_SHIFT < 2 - if (pixman_multiply_overflows_int (stride, sizeof (uint32_t))) + if (pixman_addition_overflows_int (stride, 0x1f)) return NULL; -#endif + + stride += 0x1f; + stride >>= 5; + stride *= sizeof (uint32_t); - + if (pixman_multiply_overflows_int (height, stride)) return NULL; - + buf_size = height * stride; - + if (rowstride_bytes) *rowstride_bytes = stride; - + return calloc (buf_size, 1); } PIXMAN_EXPORT pixman_image_t * -pixman_image_create_bits (pixman_format_code_t format, - int width, - int height, - uint32_t *bits, - int rowstride_bytes) +pixman_image_create_bits (pixman_format_code_t format, + int width, + int height, + uint32_t * bits, + int rowstride_bytes) { pixman_image_t *image; uint32_t *free_me = NULL; - + /* must be a whole number of uint32_t's */ return_val_if_fail (bits == NULL || - (rowstride_bytes % sizeof (uint32_t)) == 0, NULL); - + (rowstride_bytes % sizeof (uint32_t)) == 0, NULL); + if (!bits && width && height) { free_me = bits = create_bits (format, width, height, &rowstride_bytes); if (!bits) return NULL; } - - image = _pixman_image_allocate(); - - if (!image) { + + image = _pixman_image_allocate (); + + if (!image) + { if (free_me) free (free_me); + return NULL; } - + image->type = BITS; image->bits.format = format; image->bits.width = width; image->bits.height = height; image->bits.bits = bits; image->bits.free_me = free_me; - - image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t); /* we store it in number - * of uint32_t's - */ + image->bits.read_func = NULL; + image->bits.write_func = NULL; + + /* The rowstride is stored in number of uint32_t */ + image->bits.rowstride = rowstride_bytes / (int) sizeof (uint32_t); + image->bits.indexed = NULL; - - pixman_region32_fini (&image->common.full_region); - pixman_region32_init_rect (&image->common.full_region, 0, 0, - image->bits.width, image->bits.height); - + image->common.property_changed = bits_image_property_changed; - - bits_image_property_changed (image); - + _pixman_image_reset_clip_region (image); - + return image; } diff --git a/lib/pixman/pixman/pixman-combine.c.template b/lib/pixman/pixman/pixman-combine.c.template new file mode 100644 index 000000000..c129980a8 --- /dev/null +++ b/lib/pixman/pixman/pixman-combine.c.template @@ -0,0 +1,2436 @@ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <math.h> +#include <string.h> + +#include "pixman-private.h" + +#include "pixman-combine.h" + +/*** per channel helper functions ***/ + +static void +combine_mask_ca (comp4_t *src, comp4_t *mask) +{ + comp4_t a = *mask; + + comp4_t x; + comp2_t xa; + + if (!a) + { + *(src) = 0; + return; + } + + x = *(src); + if (a == ~0) + { + x = x >> A_SHIFT; + x |= x << G_SHIFT; + x |= x << R_SHIFT; + *(mask) = x; + return; + } + + xa = x >> A_SHIFT; + UNcx4_MUL_UNcx4 (x, a); + *(src) = x; + + UNcx4_MUL_UNc (a, xa); + *(mask) = a; +} + +static void +combine_mask_value_ca (comp4_t *src, const comp4_t *mask) +{ + comp4_t a = *mask; + comp4_t x; + + if (!a) + { + *(src) = 0; + return; + } + + if (a == ~0) + return; + + x = *(src); + UNcx4_MUL_UNcx4 (x, a); + *(src) = x; +} + +static void +combine_mask_alpha_ca (const comp4_t *src, comp4_t *mask) +{ + comp4_t a = *(mask); + comp4_t x; + + if (!a) + return; + + x = *(src) >> A_SHIFT; + if (x == MASK) + return; + + if (a == ~0) + { + 
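+	/* fully opaque mask: the combined alpha is the source alpha
+	 * replicated into all four channels */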
x |= x << G_SHIFT; + x |= x << R_SHIFT; + *(mask) = x; + return; + } + + UNcx4_MUL_UNc (a, x); + *(mask) = a; +} + +/* + * There are two ways of handling alpha -- either as a single unified value or + * a separate value for each component, hence each macro must have two + * versions. The unified alpha version has a 'U' at the end of the name, + * the component version has a 'C'. Similarly, functions which deal with + * this difference will have two versions using the same convention. + */ + +/* + * All of the composing functions + */ + +static force_inline comp4_t +combine_mask (const comp4_t *src, const comp4_t *mask, int i) +{ + comp4_t s, m; + + if (mask) + { + m = *(mask + i) >> A_SHIFT; + + if (!m) + return 0; + } + + s = *(src + i); + + if (mask) + UNcx4_MUL_UNc (s, m); + + return s; +} + +static void +combine_clear (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + memset (dest, 0, width * sizeof(comp4_t)); +} + +static void +combine_src_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + if (!mask) + memcpy (dest, src, width * sizeof (comp4_t)); + else + { + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + + *(dest + i) = s; + } + } +} + +/* if the Src is opaque, call combine_src_u */ +static void +combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t ia = ALPHA_c (~s); + + UNcx4_MUL_UNc_ADD_UNcx4 (d, ia, s); + *(dest + i) = d; + } +} + +/* if the Dst is opaque, this is a noop */ +static void +combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t ia = ALPHA_c (~*(dest + i)); + UNcx4_MUL_UNc_ADD_UNcx4 (s, ia, d); + *(dest + i) = s; + } +} + +/* if the Dst is opaque, call combine_src_u */ +static void +combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t a = ALPHA_c (*(dest + i)); + UNcx4_MUL_UNc (s, a); + *(dest + i) = s; + } +} + +/* if the Src is opaque, this is a noop */ +static void +combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t a = ALPHA_c (s); + UNcx4_MUL_UNc (d, a); + *(dest + i) = d; + } +} + +/* if the Dst is opaque, call combine_clear */ +static void +combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t a = ALPHA_c (~*(dest + i)); + UNcx4_MUL_UNc (s, a); + *(dest + i) = s; + } +} + +/* if the Src is opaque, call combine_clear */ +static void +combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * 
src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t a = ALPHA_c (~s); + UNcx4_MUL_UNc (d, a); + *(dest + i) = d; + } +} + +/* if the Src is opaque, call combine_in_u */ +/* if the Dst is opaque, call combine_over_u */ +/* if both the Src and Dst are opaque, call combine_src_u */ +static void +combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t dest_a = ALPHA_c (d); + comp4_t src_ia = ALPHA_c (~s); + + UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_a, d, src_ia); + *(dest + i) = s; + } +} + +/* if the Src is opaque, call combine_over_reverse_u */ +/* if the Dst is opaque, call combine_in_reverse_u */ +/* if both the Src and Dst are opaque, call combine_dst_u */ +static void +combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t src_a = ALPHA_c (s); + comp4_t dest_ia = ALPHA_c (~d); + + UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_a); + *(dest + i) = s; + } +} + +/* if the Src is opaque, call combine_over_u */ +/* if the Dst is opaque, call combine_over_reverse_u */ +/* if both the Src and Dst are opaque, call combine_clear */ +static void +combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t src_ia = ALPHA_c (~s); + comp4_t dest_ia = ALPHA_c (~d); + + UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (s, dest_ia, d, src_ia); + *(dest + i) = s; + } +} + +static void +combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + UNcx4_ADD_UNcx4 (d, s); + *(dest + i) = d; + } +} + +/* if the Src is opaque, call combine_add_u */ +/* if the Dst is opaque, call combine_add_u */ +/* if both the Src and Dst are opaque, call combine_add_u */ +static void +combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp2_t sa, da; + + sa = s >> A_SHIFT; + da = ~d >> A_SHIFT; + if (sa > da) + { + sa = DIV_UNc (da, sa); + UNcx4_MUL_UNc (s, sa); + } + ; + UNcx4_ADD_UNcx4 (d, s); + *(dest + i) = d; + } +} + +/* + * PDF blend modes: + * The following blend modes have been taken from the PDF ISO 32000 + * specification, which at this point in time is available from + * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf + * The relevant chapters are 11.3.5 and 11.3.6. + * The formula for computing the final pixel color given in 11.3.6 is: + * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs) + * with B() being the blend function. 
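+ * Only the B() term depends on the blend mode; the first two terms are
+ * simply the destination and source contributions in the areas covered by
+ * only one of the two.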
+ * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs + * + * These blend modes should match the SVG filter draft specification, as + * it has been designed to mirror ISO 32000. Note that at the current point + * no released draft exists that shows this, as the formulas have not been + * updated yet after the release of ISO 32000. + * + * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and + * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an + * argument. Note that this implementation operates on premultiplied colors, + * while the PDF specification does not. Therefore the code uses the formula + * ar.Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as) + */ + +/* + * Multiply + * B(Dca, ad, Sca, as) = Dca.Sca + */ + +static void +combine_multiply_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t ss = s; + comp4_t src_ia = ALPHA_c (~s); + comp4_t dest_ia = ALPHA_c (~d); + + UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (ss, dest_ia, d, src_ia); + UNcx4_MUL_UNcx4 (d, s); + UNcx4_ADD_UNcx4 (d, ss); + + *(dest + i) = d; + } +} + +static void +combine_multiply_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t m = *(mask + i); + comp4_t s = *(src + i); + comp4_t d = *(dest + i); + comp4_t r = d; + comp4_t dest_ia = ALPHA_c (~d); + + combine_mask_value_ca (&s, &m); + + UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (r, ~m, s, dest_ia); + UNcx4_MUL_UNcx4 (d, s); + UNcx4_ADD_UNcx4 (r, d); + + *(dest + i) = r; + } +} + +#define PDF_SEPARABLE_BLEND_MODE(name) \ + static void \ + combine_ ## name ## _u (pixman_implementation_t *imp, \ + pixman_op_t op, \ + comp4_t * dest, \ + const comp4_t * src, \ + const comp4_t * mask, \ + int width) \ + { \ + int i; \ + for (i = 0; i < width; ++i) { \ + comp4_t s = combine_mask (src, mask, i); \ + comp4_t d = *(dest + i); \ + comp1_t sa = ALPHA_c (s); \ + comp1_t isa = ~sa; \ + comp1_t da = ALPHA_c (d); \ + comp1_t ida = ~da; \ + comp4_t result; \ + \ + result = d; \ + UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida); \ + \ + *(dest + i) = result + \ + (DIV_ONE_UNc (sa * da) << A_SHIFT) + \ + (blend_ ## name (RED_c (d), da, RED_c (s), sa) << R_SHIFT) + \ + (blend_ ## name (GREEN_c (d), da, GREEN_c (s), sa) << G_SHIFT) + \ + (blend_ ## name (BLUE_c (d), da, BLUE_c (s), sa)); \ + } \ + } \ + \ + static void \ + combine_ ## name ## _ca (pixman_implementation_t *imp, \ + pixman_op_t op, \ + comp4_t * dest, \ + const comp4_t * src, \ + const comp4_t * mask, \ + int width) \ + { \ + int i; \ + for (i = 0; i < width; ++i) { \ + comp4_t m = *(mask + i); \ + comp4_t s = *(src + i); \ + comp4_t d = *(dest + i); \ + comp1_t da = ALPHA_c (d); \ + comp1_t ida = ~da; \ + comp4_t result; \ + \ + combine_mask_value_ca (&s, &m); \ + \ + result = d; \ + UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (result, ~m, s, ida); \ + \ + result += \ + (DIV_ONE_UNc (ALPHA_c (m) * da) << A_SHIFT) + \ + (blend_ ## name (RED_c (d), da, RED_c (s), RED_c (m)) << R_SHIFT) + \ + (blend_ ## name (GREEN_c (d), da, GREEN_c (s), GREEN_c (m)) << G_SHIFT) + \ + (blend_ ## name (BLUE_c (d), da, BLUE_c (s), BLUE_c (m))); \ + \ + *(dest + i) = result; \ + } \ + } + +/* + * Screen + * B(Dca, ad, Sca, as) = Dca.sa + Sca.da - Dca.Sca 
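+ *
+ * (This is the non-premultiplied screen function B(Cb, Cs) = Cb + Cs - Cb.Cs
+ * rewritten as described above: substitute Cb = Dca/da, Cs = Sca/sa and
+ * multiply through by sa.da.)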
+ */ +static inline comp4_t +blend_screen (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa) +{ + return DIV_ONE_UNc (sca * da + dca * sa - sca * dca); +} + +PDF_SEPARABLE_BLEND_MODE (screen) + +/* + * Overlay + * B(Dca, Da, Sca, Sa) = + * if 2.Dca < Da + * 2.Sca.Dca + * otherwise + * Sa.Da - 2.(Da - Dca).(Sa - Sca) + */ +static inline comp4_t +blend_overlay (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa) +{ + comp4_t rca; + + if (2 * dca < da) + rca = 2 * sca * dca; + else + rca = sa * da - 2 * (da - dca) * (sa - sca); + return DIV_ONE_UNc (rca); +} + +PDF_SEPARABLE_BLEND_MODE (overlay) + +/* + * Darken + * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa) + */ +static inline comp4_t +blend_darken (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa) +{ + comp4_t s, d; + + s = sca * da; + d = dca * sa; + return DIV_ONE_UNc (s > d ? d : s); +} + +PDF_SEPARABLE_BLEND_MODE (darken) + +/* + * Lighten + * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa) + */ +static inline comp4_t +blend_lighten (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa) +{ + comp4_t s, d; + + s = sca * da; + d = dca * sa; + return DIV_ONE_UNc (s > d ? s : d); +} + +PDF_SEPARABLE_BLEND_MODE (lighten) + +/* + * Color dodge + * B(Dca, Da, Sca, Sa) = + * if Dca == 0 + * 0 + * if Sca == Sa + * Sa.Da + * otherwise + * Sa.Da. min (1, Dca / Da / (1 - Sca/Sa)) + */ +static inline comp4_t +blend_color_dodge (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa) +{ + if (sca >= sa) + { + return dca == 0 ? 0 : DIV_ONE_UNc (sa * da); + } + else + { + comp4_t rca = dca * sa / (sa - sca); + return DIV_ONE_UNc (sa * MIN (rca, da)); + } +} + +PDF_SEPARABLE_BLEND_MODE (color_dodge) + +/* + * Color burn + * B(Dca, Da, Sca, Sa) = + * if Dca == Da + * Sa.Da + * if Sca == 0 + * 0 + * otherwise + * Sa.Da.(1 - min (1, (1 - Dca/Da).Sa / Sca)) + */ +static inline comp4_t +blend_color_burn (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa) +{ + if (sca == 0) + { + return dca < da ? 
0 : DIV_ONE_UNc (sa * da);
+    }
+    else
+    {
+	comp4_t rca = (da - dca) * sa / sca;
+	return DIV_ONE_UNc (sa * (MAX (rca, da) - rca));
+    }
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_burn)
+
+/*
+ * Hard light
+ * B(Dca, Da, Sca, Sa) =
+ *   if 2.Sca < Sa
+ *      2.Sca.Dca
+ *   otherwise
+ *      Sa.Da - 2.(Da - Dca).(Sa - Sca)
+ */
+static inline comp4_t
+blend_hard_light (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    if (2 * sca < sa)
+	return DIV_ONE_UNc (2 * sca * dca);
+    else
+	return DIV_ONE_UNc (sa * da - 2 * (da - dca) * (sa - sca));
+}
+
+PDF_SEPARABLE_BLEND_MODE (hard_light)
+
+/*
+ * Soft light
+ * B(Dca, Da, Sca, Sa) =
+ *   if (2.Sca <= Sa)
+ *      Dca.(Sa - (1 - Dca/Da).(2.Sca - Sa))
+ *   otherwise if 4.Dca <= Da
+ *      Dca.(Sa + (2.Sca - Sa).((16.Dca/Da - 12).Dca/Da + 3))
+ *   otherwise
+ *      (Dca.Sa + (SQRT (Dca/Da).Da - Dca).(2.Sca - Sa))
+ */
+static inline comp4_t
+blend_soft_light (comp4_t dca_org,
+		  comp4_t da_org,
+		  comp4_t sca_org,
+		  comp4_t sa_org)
+{
+    double dca = dca_org * (1.0 / MASK);
+    double da = da_org * (1.0 / MASK);
+    double sca = sca_org * (1.0 / MASK);
+    double sa = sa_org * (1.0 / MASK);
+    double rca;
+
+    if (2 * sca < sa)
+    {
+	if (da == 0)
+	    rca = dca * sa;
+	else
+	    rca = dca * sa - dca * (da - dca) * (sa - 2 * sca) / da;
+    }
+    else if (da == 0)
+    {
+	rca = 0;
+    }
+    else if (4 * dca <= da)
+    {
+	rca = dca * sa +
+	    (2 * sca - sa) * dca * ((16 * dca / da - 12) * dca / da + 3);
+    }
+    else
+    {
+	rca = dca * sa + (sqrt (dca * da) - dca) * (2 * sca - sa);
+    }
+    return rca * MASK + 0.5;
+}
+
+PDF_SEPARABLE_BLEND_MODE (soft_light)
+
+/*
+ * Difference
+ * B(Dca, Da, Sca, Sa) = abs (Dca.Sa - Sca.Da)
+ */
+static inline comp4_t
+blend_difference (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    comp4_t dcasa = dca * sa;
+    comp4_t scada = sca * da;
+
+    if (scada < dcasa)
+	return DIV_ONE_UNc (dcasa - scada);
+    else
+	return DIV_ONE_UNc (scada - dcasa);
+}
+
+PDF_SEPARABLE_BLEND_MODE (difference)
+
+/*
+ * Exclusion
+ * B(Dca, Da, Sca, Sa) = (Sca.Da + Dca.Sa - 2.Sca.Dca)
+ */
+
+/* This can be made faster by writing it directly and not using
+ * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */
+
+static inline comp4_t
+blend_exclusion (comp4_t dca, comp4_t da, comp4_t sca, comp4_t sa)
+{
+    return DIV_ONE_UNc (sca * da + dca * sa - 2 * dca * sca);
+}
+
+PDF_SEPARABLE_BLEND_MODE (exclusion)
+
+#undef PDF_SEPARABLE_BLEND_MODE
+
+/*
+ * PDF non-separable blend modes are implemented using the following functions
+ * to operate in HSL space, with Cmax, Cmid, Cmin referring to the max, mid
+ * and min value of the red, green and blue components.
+ *
+ * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
+ *
+ * clip_color (C):
+ *     l = LUM (C)
+ *     min = Cmin
+ *     max = Cmax
+ *     if min < 0.0
+ *         C = l + ( ( ( C – l ) × l ) ⁄ ( l – min ) )
+ *     if max > 1.0
+ *         C = l + ( ( ( C – l ) × ( 1 – l ) ) ⁄ ( max – l ) )
+ *     return C
+ *
+ * set_lum (C, l):
+ *     d = l – LUM (C)
+ *     C += d
+ *     return clip_color (C)
+ *
+ * SAT (C) = CH_MAX (C) - CH_MIN (C)
+ *
+ * set_sat (C, s):
+ *     if Cmax > Cmin
+ *         Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
+ *         Cmax = s
+ *     else
+ *         Cmid = Cmax = 0.0
+ *     Cmin = 0.0
+ *     return C
+ */
+
+/* For premultiplied colors, we need to know what happens when C is
+ * multiplied by a real number. LUM and SAT are linear:
+ *
+ *     LUM (r × C) = r × LUM (C)           SAT (r * C) = r * SAT (C)
+ *
+ * If we extend clip_color with an extra argument a and change
+ *
+ *     if max > 1.0
+ *
+ * into
+ *
+ *     if max > a
+ *
+ * then clip_color is also linear:
+ *
+ *     r * clip_color (C, a) = clip_color (r * C, r * a);
+ *
+ * for positive r.
+ *
+ * Similarly, we can extend set_lum with an extra argument that is just passed
+ * on to clip_color:
+ *
+ *     r * set_lum ( C, l, a)
+ *
+ *         = r × clip_color ( C + l - LUM (C), a)
+ *
+ *         = clip_color ( r * C + r × l - r * LUM (C), r * a)
+ *
+ *         = set_lum ( r * C, r * l, r * a)
+ *
+ * Finally, set_sat:
+ *
+ *     r * set_sat (C, s) = set_sat (x * C, r * s)
+ *
+ * The above holds for all non-zero x, because the x'es in the fraction for
+ * C_mid cancel out. Specifically, it holds for x = r:
+ *
+ *     r * set_sat (C, s) = set_sat (r * C, r * s)
+ *
+ */
+
+/* So, for the non-separable PDF blend modes, we have (using s, d for
+ * non-premultiplied colors, and S, D for premultiplied):
+ *
+ * Color:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
+ *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
+ *
+ *
+ * Luminosity:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (D/a_d, LUM (S/a_s), 1)
+ *   = set_lum (a_s * D, a_d * LUM (S), a_s * a_d)
+ *
+ *
+ * Saturation:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
+ *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
+ *              a_s * LUM (D), a_s * a_d)
+ *   = set_lum (set_sat (a_s * D, a_d * SAT (S)), a_s * LUM (D), a_s * a_d)
+ *
+ * Hue:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
+ *   = set_lum (set_sat (a_d * S, a_s * SAT (D)),
+ *              a_s * LUM (D), a_s * a_d)
+ *
+ */
+
+#define CH_MIN(c) (c[0] < c[1] ? (c[0] < c[2] ? c[0] : c[2]) : (c[1] < c[2] ? c[1] : c[2]))
+#define CH_MAX(c) (c[0] > c[1] ? (c[0] > c[2] ? c[0] : c[2]) : (c[1] > c[2] ?
c[1] : c[2])) +#define LUM(c) ((c[0] * 30 + c[1] * 59 + c[2] * 11) / 100) +#define SAT(c) (CH_MAX (c) - CH_MIN (c)) + +#define PDF_NON_SEPARABLE_BLEND_MODE(name) \ + static void \ + combine_ ## name ## _u (pixman_implementation_t *imp, \ + pixman_op_t op, \ + comp4_t *dest, \ + const comp4_t *src, \ + const comp4_t *mask, \ + int width) \ + { \ + int i; \ + for (i = 0; i < width; ++i) \ + { \ + comp4_t s = combine_mask (src, mask, i); \ + comp4_t d = *(dest + i); \ + comp1_t sa = ALPHA_c (s); \ + comp1_t isa = ~sa; \ + comp1_t da = ALPHA_c (d); \ + comp1_t ida = ~da; \ + comp4_t result; \ + comp4_t sc[3], dc[3], c[3]; \ + \ + result = d; \ + UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc (result, isa, s, ida); \ + dc[0] = RED_c (d); \ + sc[0] = RED_c (s); \ + dc[1] = GREEN_c (d); \ + sc[1] = GREEN_c (s); \ + dc[2] = BLUE_c (d); \ + sc[2] = BLUE_c (s); \ + blend_ ## name (c, dc, da, sc, sa); \ + \ + *(dest + i) = result + \ + (DIV_ONE_UNc (sa * da) << A_SHIFT) + \ + (DIV_ONE_UNc (c[0]) << R_SHIFT) + \ + (DIV_ONE_UNc (c[1]) << G_SHIFT) + \ + (DIV_ONE_UNc (c[2])); \ + } \ + } + +static void +set_lum (comp4_t dest[3], comp4_t src[3], comp4_t sa, comp4_t lum) +{ + double a, l, min, max; + double tmp[3]; + + a = sa * (1.0 / MASK); + + l = lum * (1.0 / MASK); + tmp[0] = src[0] * (1.0 / MASK); + tmp[1] = src[1] * (1.0 / MASK); + tmp[2] = src[2] * (1.0 / MASK); + + l = l - LUM (tmp); + tmp[0] += l; + tmp[1] += l; + tmp[2] += l; + + /* clip_color */ + l = LUM (tmp); + min = CH_MIN (tmp); + max = CH_MAX (tmp); + + if (min < 0) + { + tmp[0] = l + (tmp[0] - l) * l / (l - min); + tmp[1] = l + (tmp[1] - l) * l / (l - min); + tmp[2] = l + (tmp[2] - l) * l / (l - min); + } + if (max > a) + { + tmp[0] = l + (tmp[0] - l) * (a - l) / (max - l); + tmp[1] = l + (tmp[1] - l) * (a - l) / (max - l); + tmp[2] = l + (tmp[2] - l) * (a - l) / (max - l); + } + + dest[0] = tmp[0] * MASK + 0.5; + dest[1] = tmp[1] * MASK + 0.5; + dest[2] = tmp[2] * MASK + 0.5; +} + +static void +set_sat (comp4_t dest[3], comp4_t src[3], comp4_t sat) +{ + int id[3]; + comp4_t min, max; + + if (src[0] > src[1]) + { + if (src[0] > src[2]) + { + id[0] = 0; + if (src[1] > src[2]) + { + id[1] = 1; + id[2] = 2; + } + else + { + id[1] = 2; + id[2] = 1; + } + } + else + { + id[0] = 2; + id[1] = 0; + id[2] = 1; + } + } + else + { + if (src[0] > src[2]) + { + id[0] = 1; + id[1] = 0; + id[2] = 2; + } + else + { + id[2] = 0; + if (src[1] > src[2]) + { + id[0] = 1; + id[1] = 2; + } + else + { + id[0] = 2; + id[1] = 1; + } + } + } + + max = dest[id[0]]; + min = dest[id[2]]; + if (max > min) + { + dest[id[1]] = (dest[id[1]] - min) * sat / (max - min); + dest[id[0]] = sat; + dest[id[2]] = 0; + } + else + { + dest[0] = dest[1] = dest[2] = 0; + } +} + +/* + * Hue: + * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb)) + */ +static inline void +blend_hsl_hue (comp4_t c[3], + comp4_t dc[3], + comp4_t da, + comp4_t sc[3], + comp4_t sa) +{ + c[0] = sc[0] * da; + c[1] = sc[1] * da; + c[2] = sc[2] * da; + set_sat (c, c, SAT (dc) * sa); + set_lum (c, c, sa * da, LUM (dc) * sa); +} + +PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue) + +/* + * Saturation: + * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb)) + */ +static inline void +blend_hsl_saturation (comp4_t c[3], + comp4_t dc[3], + comp4_t da, + comp4_t sc[3], + comp4_t sa) +{ + c[0] = dc[0] * sa; + c[1] = dc[1] * sa; + c[2] = dc[2] * sa; + set_sat (c, c, SAT (sc) * da); + set_lum (c, c, sa * da, LUM (dc) * sa); +} + +PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation) + +/* + * Color: + * B(Cb, Cs) = set_lum (Cs, LUM (Cb)) + */ 
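+/* Premultiplied form, from the derivation further up:
+ * set_lum (S * a_d, a_s * LUM (D), a_s * a_d) */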
+static inline void +blend_hsl_color (comp4_t c[3], + comp4_t dc[3], + comp4_t da, + comp4_t sc[3], + comp4_t sa) +{ + c[0] = sc[0] * da; + c[1] = sc[1] * da; + c[2] = sc[2] * da; + set_lum (c, c, sa * da, LUM (dc) * sa); +} + +PDF_NON_SEPARABLE_BLEND_MODE (hsl_color) + +/* + * Luminosity: + * B(Cb, Cs) = set_lum (Cb, LUM (Cs)) + */ +static inline void +blend_hsl_luminosity (comp4_t c[3], + comp4_t dc[3], + comp4_t da, + comp4_t sc[3], + comp4_t sa) +{ + c[0] = dc[0] * sa; + c[1] = dc[1] * sa; + c[2] = dc[2] * sa; + set_lum (c, c, sa * da, LUM (sc) * da); +} + +PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity) + +#undef SAT +#undef LUM +#undef CH_MAX +#undef CH_MIN +#undef PDF_NON_SEPARABLE_BLEND_MODE + +/* Overlay + * + * All of the disjoint composing functions + * + * The four entries in the first column indicate what source contributions + * come from each of the four areas of the picture -- areas covered by neither + * A nor B, areas covered only by A, areas covered only by B and finally + * areas covered by both A and B. + * + * Disjoint Conjoint + * Fa Fb Fa Fb + * (0,0,0,0) 0 0 0 0 + * (0,A,0,A) 1 0 1 0 + * (0,0,B,B) 0 1 0 1 + * (0,A,B,A) 1 min((1-a)/b,1) 1 max(1-a/b,0) + * (0,A,B,B) min((1-b)/a,1) 1 max(1-b/a,0) 1 + * (0,0,0,A) max(1-(1-b)/a,0) 0 min(1,b/a) 0 + * (0,0,0,B) 0 max(1-(1-a)/b,0) 0 min(a/b,1) + * (0,A,0,0) min(1,(1-b)/a) 0 max(1-b/a,0) 0 + * (0,0,B,0) 0 min(1,(1-a)/b) 0 max(1-a/b,0) + * (0,0,B,A) max(1-(1-b)/a,0) min(1,(1-a)/b) min(1,b/a) max(1-a/b,0) + * (0,A,0,B) min(1,(1-b)/a) max(1-(1-a)/b,0) max(1-b/a,0) min(1,a/b) + * (0,A,B,0) min(1,(1-b)/a) min(1,(1-a)/b) max(1-b/a,0) max(1-a/b,0) + */ + +#define COMBINE_A_OUT 1 +#define COMBINE_A_IN 2 +#define COMBINE_B_OUT 4 +#define COMBINE_B_IN 8 + +#define COMBINE_CLEAR 0 +#define COMBINE_A (COMBINE_A_OUT | COMBINE_A_IN) +#define COMBINE_B (COMBINE_B_OUT | COMBINE_B_IN) +#define COMBINE_A_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN) +#define COMBINE_B_OVER (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN) +#define COMBINE_A_ATOP (COMBINE_B_OUT | COMBINE_A_IN) +#define COMBINE_B_ATOP (COMBINE_A_OUT | COMBINE_B_IN) +#define COMBINE_XOR (COMBINE_A_OUT | COMBINE_B_OUT) + +/* portion covered by a but not b */ +static comp1_t +combine_disjoint_out_part (comp1_t a, comp1_t b) +{ + /* min (1, (1-b) / a) */ + + b = ~b; /* 1 - b */ + if (b >= a) /* 1 - b >= a -> (1-b)/a >= 1 */ + return MASK; /* 1 */ + return DIV_UNc (b, a); /* (1-b) / a */ +} + +/* portion covered by both a and b */ +static comp1_t +combine_disjoint_in_part (comp1_t a, comp1_t b) +{ + /* max (1-(1-b)/a,0) */ + /* = - min ((1-b)/a - 1, 0) */ + /* = 1 - min (1, (1-b)/a) */ + + b = ~b; /* 1 - b */ + if (b >= a) /* 1 - b >= a -> (1-b)/a >= 1 */ + return 0; /* 1 - 1 */ + return ~DIV_UNc(b, a); /* 1 - (1-b) / a */ +} + +/* portion covered by a but not b */ +static comp1_t +combine_conjoint_out_part (comp1_t a, comp1_t b) +{ + /* max (1-b/a,0) */ + /* = 1-min(b/a,1) */ + + /* min (1, (1-b) / a) */ + + if (b >= a) /* b >= a -> b/a >= 1 */ + return 0x00; /* 0 */ + return ~DIV_UNc(b, a); /* 1 - b/a */ +} + +/* portion covered by both a and b */ +static comp1_t +combine_conjoint_in_part (comp1_t a, comp1_t b) +{ + /* min (1,b/a) */ + + if (b >= a) /* b >= a -> b/a >= 1 */ + return MASK; /* 1 */ + return DIV_UNc (b, a); /* b/a */ +} + +#define GET_COMP(v, i) ((comp2_t) (comp1_t) ((v) >> i)) + +#define ADD(x, y, i, t) \ + ((t) = GET_COMP (x, i) + GET_COMP (y, i), \ + (comp4_t) ((comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i)) + +#define GENERIC(x, y, i, ax, ay, t, u, v) \ + 
((t) = (MUL_UNc (GET_COMP (y, i), ay, (u)) + \ + MUL_UNc (GET_COMP (x, i), ax, (v))), \ + (comp4_t) ((comp1_t) ((t) | \ + (0 - ((t) >> G_SHIFT)))) << (i)) + +static void +combine_disjoint_general_u (comp4_t * dest, + const comp4_t *src, + const comp4_t *mask, + int width, + comp1_t combine) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t m, n, o, p; + comp2_t Fa, Fb, t, u, v; + comp1_t sa = s >> A_SHIFT; + comp1_t da = d >> A_SHIFT; + + switch (combine & COMBINE_A) + { + default: + Fa = 0; + break; + + case COMBINE_A_OUT: + Fa = combine_disjoint_out_part (sa, da); + break; + + case COMBINE_A_IN: + Fa = combine_disjoint_in_part (sa, da); + break; + + case COMBINE_A: + Fa = MASK; + break; + } + + switch (combine & COMBINE_B) + { + default: + Fb = 0; + break; + + case COMBINE_B_OUT: + Fb = combine_disjoint_out_part (da, sa); + break; + + case COMBINE_B_IN: + Fb = combine_disjoint_in_part (da, sa); + break; + + case COMBINE_B: + Fb = MASK; + break; + } + m = GENERIC (s, d, 0, Fa, Fb, t, u, v); + n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v); + o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v); + p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v); + s = m | n | o | p; + *(dest + i) = s; + } +} + +static void +combine_disjoint_over_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp2_t a = s >> A_SHIFT; + + if (a != 0x00) + { + if (a != MASK) + { + comp4_t d = *(dest + i); + a = combine_disjoint_out_part (d >> A_SHIFT, a); + UNcx4_MUL_UNc_ADD_UNcx4 (d, a, s); + s = d; + } + + *(dest + i) = s; + } + } +} + +static void +combine_disjoint_in_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN); +} + +static void +combine_disjoint_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN); +} + +static void +combine_disjoint_out_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT); +} + +static void +combine_disjoint_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT); +} + +static void +combine_disjoint_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP); +} + +static void +combine_disjoint_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP); +} + +static void +combine_disjoint_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR); +} + +static void +combine_conjoint_general_u (comp4_t * dest, + const 
comp4_t *src, + const comp4_t *mask, + int width, + comp1_t combine) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = combine_mask (src, mask, i); + comp4_t d = *(dest + i); + comp4_t m, n, o, p; + comp2_t Fa, Fb, t, u, v; + comp1_t sa = s >> A_SHIFT; + comp1_t da = d >> A_SHIFT; + + switch (combine & COMBINE_A) + { + default: + Fa = 0; + break; + + case COMBINE_A_OUT: + Fa = combine_conjoint_out_part (sa, da); + break; + + case COMBINE_A_IN: + Fa = combine_conjoint_in_part (sa, da); + break; + + case COMBINE_A: + Fa = MASK; + break; + } + + switch (combine & COMBINE_B) + { + default: + Fb = 0; + break; + + case COMBINE_B_OUT: + Fb = combine_conjoint_out_part (da, sa); + break; + + case COMBINE_B_IN: + Fb = combine_conjoint_in_part (da, sa); + break; + + case COMBINE_B: + Fb = MASK; + break; + } + + m = GENERIC (s, d, 0, Fa, Fb, t, u, v); + n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v); + o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v); + p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v); + + s = m | n | o | p; + + *(dest + i) = s; + } +} + +static void +combine_conjoint_over_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER); +} + +static void +combine_conjoint_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER); +} + +static void +combine_conjoint_in_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN); +} + +static void +combine_conjoint_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN); +} + +static void +combine_conjoint_out_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT); +} + +static void +combine_conjoint_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT); +} + +static void +combine_conjoint_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP); +} + +static void +combine_conjoint_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP); +} + +static void +combine_conjoint_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR); +} + +/************************************************************************/ +/*********************** Per Channel functions **************************/ +/************************************************************************/ + +static void +combine_clear_ca 
(pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + memset (dest, 0, width * sizeof(comp4_t)); +} + +static void +combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + + combine_mask_value_ca (&s, &m); + + *(dest + i) = s; + } +} + +static void +combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + comp4_t a; + + combine_mask_ca (&s, &m); + + a = ~m; + if (a) + { + comp4_t d = *(dest + i); + UNcx4_MUL_UNcx4_ADD_UNcx4 (d, a, s); + s = d; + } + + *(dest + i) = s; + } +} + +static void +combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t d = *(dest + i); + comp4_t a = ~d >> A_SHIFT; + + if (a) + { + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + + UNcx4_MUL_UNcx4 (s, m); + UNcx4_MUL_UNc_ADD_UNcx4 (s, a, d); + + *(dest + i) = s; + } + } +} + +static void +combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t d = *(dest + i); + comp2_t a = d >> A_SHIFT; + comp4_t s = 0; + + if (a) + { + comp4_t m = *(mask + i); + + s = *(src + i); + combine_mask_value_ca (&s, &m); + + if (a != MASK) + UNcx4_MUL_UNc (s, a); + } + + *(dest + i) = s; + } +} + +static void +combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + comp4_t a; + + combine_mask_alpha_ca (&s, &m); + + a = m; + if (a != ~0) + { + comp4_t d = 0; + + if (a) + { + d = *(dest + i); + UNcx4_MUL_UNcx4 (d, a); + } + + *(dest + i) = d; + } + } +} + +static void +combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t d = *(dest + i); + comp2_t a = ~d >> A_SHIFT; + comp4_t s = 0; + + if (a) + { + comp4_t m = *(mask + i); + + s = *(src + i); + combine_mask_value_ca (&s, &m); + + if (a != MASK) + UNcx4_MUL_UNc (s, a); + } + + *(dest + i) = s; + } +} + +static void +combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + comp4_t a; + + combine_mask_alpha_ca (&s, &m); + + a = ~m; + if (a != ~0) + { + comp4_t d = 0; + + if (a) + { + d = *(dest + i); + UNcx4_MUL_UNcx4 (d, a); + } + + *(dest + i) = d; + } + } +} + +static void +combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t d = *(dest + i); + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + comp4_t ad; + comp2_t as = d >> A_SHIFT; + + combine_mask_ca (&s, &m); + + ad 
= ~m; + + UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as); + + *(dest + i) = d; + } +} + +static void +combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t d = *(dest + i); + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + comp4_t ad; + comp2_t as = ~d >> A_SHIFT; + + combine_mask_ca (&s, &m); + + ad = m; + + UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as); + + *(dest + i) = d; + } +} + +static void +combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t d = *(dest + i); + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + comp4_t ad; + comp2_t as = ~d >> A_SHIFT; + + combine_mask_ca (&s, &m); + + ad = ~m; + + UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc (d, ad, s, as); + + *(dest + i) = d; + } +} + +static void +combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s = *(src + i); + comp4_t m = *(mask + i); + comp4_t d = *(dest + i); + + combine_mask_value_ca (&s, &m); + + UNcx4_ADD_UNcx4 (d, s); + + *(dest + i) = d; + } +} + +static void +combine_saturate_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s, d; + comp2_t sa, sr, sg, sb, da; + comp2_t t, u, v; + comp4_t m, n, o, p; + + d = *(dest + i); + s = *(src + i); + m = *(mask + i); + + combine_mask_ca (&s, &m); + + sa = (m >> A_SHIFT); + sr = (m >> R_SHIFT) & MASK; + sg = (m >> G_SHIFT) & MASK; + sb = m & MASK; + da = ~d >> A_SHIFT; + + if (sb <= da) + m = ADD (s, d, 0, t); + else + m = GENERIC (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v); + + if (sg <= da) + n = ADD (s, d, G_SHIFT, t); + else + n = GENERIC (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v); + + if (sr <= da) + o = ADD (s, d, R_SHIFT, t); + else + o = GENERIC (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v); + + if (sa <= da) + p = ADD (s, d, A_SHIFT, t); + else + p = GENERIC (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v); + + *(dest + i) = m | n | o | p; + } +} + +static void +combine_disjoint_general_ca (comp4_t * dest, + const comp4_t *src, + const comp4_t *mask, + int width, + comp1_t combine) +{ + int i; + + for (i = 0; i < width; ++i) + { + comp4_t s, d; + comp4_t m, n, o, p; + comp4_t Fa, Fb; + comp2_t t, u, v; + comp4_t sa; + comp1_t da; + + s = *(src + i); + m = *(mask + i); + d = *(dest + i); + da = d >> A_SHIFT; + + combine_mask_ca (&s, &m); + + sa = m; + + switch (combine & COMBINE_A) + { + default: + Fa = 0; + break; + + case COMBINE_A_OUT: + m = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> 0), da); + n = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT; + o = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT; + p = (comp4_t)combine_disjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT; + Fa = m | n | o | p; + break; + + case COMBINE_A_IN: + m = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> 0), da); + n = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT; + o = (comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT; + p = 
(comp4_t)combine_disjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT; + Fa = m | n | o | p; + break; + + case COMBINE_A: + Fa = ~0; + break; + } + + switch (combine & COMBINE_B) + { + default: + Fb = 0; + break; + + case COMBINE_B_OUT: + m = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> 0)); + n = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT; + o = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT; + p = (comp4_t)combine_disjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT; + Fb = m | n | o | p; + break; + + case COMBINE_B_IN: + m = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> 0)); + n = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT; + o = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT; + p = (comp4_t)combine_disjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT; + Fb = m | n | o | p; + break; + + case COMBINE_B: + Fb = ~0; + break; + } + m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v); + n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v); + o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v); + p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v); + + s = m | n | o | p; + + *(dest + i) = s; + } +} + +static void +combine_disjoint_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER); +} + +static void +combine_disjoint_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN); +} + +static void +combine_disjoint_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN); +} + +static void +combine_disjoint_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT); +} + +static void +combine_disjoint_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT); +} + +static void +combine_disjoint_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP); +} + +static void +combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP); +} + +static void +combine_disjoint_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR); +} + +static void +combine_conjoint_general_ca (comp4_t * dest, + const comp4_t *src, + const comp4_t *mask, + int width, + comp1_t combine) +{ + int i; + + for (i = 0; i < width; 
++i) + { + comp4_t s, d; + comp4_t m, n, o, p; + comp4_t Fa, Fb; + comp2_t t, u, v; + comp4_t sa; + comp1_t da; + + s = *(src + i); + m = *(mask + i); + d = *(dest + i); + da = d >> A_SHIFT; + + combine_mask_ca (&s, &m); + + sa = m; + + switch (combine & COMBINE_A) + { + default: + Fa = 0; + break; + + case COMBINE_A_OUT: + m = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> 0), da); + n = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT; + o = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT; + p = (comp4_t)combine_conjoint_out_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT; + Fa = m | n | o | p; + break; + + case COMBINE_A_IN: + m = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> 0), da); + n = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> G_SHIFT), da) << G_SHIFT; + o = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> R_SHIFT), da) << R_SHIFT; + p = (comp4_t)combine_conjoint_in_part ((comp1_t) (sa >> A_SHIFT), da) << A_SHIFT; + Fa = m | n | o | p; + break; + + case COMBINE_A: + Fa = ~0; + break; + } + + switch (combine & COMBINE_B) + { + default: + Fb = 0; + break; + + case COMBINE_B_OUT: + m = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> 0)); + n = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT; + o = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT; + p = (comp4_t)combine_conjoint_out_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT; + Fb = m | n | o | p; + break; + + case COMBINE_B_IN: + m = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> 0)); + n = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> G_SHIFT)) << G_SHIFT; + o = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> R_SHIFT)) << R_SHIFT; + p = (comp4_t)combine_conjoint_in_part (da, (comp1_t) (sa >> A_SHIFT)) << A_SHIFT; + Fb = m | n | o | p; + break; + + case COMBINE_B: + Fb = ~0; + break; + } + m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v); + n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v); + o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v); + p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v); + + s = m | n | o | p; + + *(dest + i) = s; + } +} + +static void +combine_conjoint_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER); +} + +static void +combine_conjoint_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER); +} + +static void +combine_conjoint_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN); +} + +static void +combine_conjoint_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN); +} + +static void +combine_conjoint_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, 
width, COMBINE_A_OUT); +} + +static void +combine_conjoint_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT); +} + +static void +combine_conjoint_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP); +} + +static void +combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP); +} + +static void +combine_conjoint_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + comp4_t * dest, + const comp4_t * src, + const comp4_t * mask, + int width) +{ + combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR); +} + +void +_pixman_setup_combiner_functions_width (pixman_implementation_t *imp) +{ + /* Unified alpha */ + imp->combine_width[PIXMAN_OP_CLEAR] = combine_clear; + imp->combine_width[PIXMAN_OP_SRC] = combine_src_u; + /* dest */ + imp->combine_width[PIXMAN_OP_OVER] = combine_over_u; + imp->combine_width[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u; + imp->combine_width[PIXMAN_OP_IN] = combine_in_u; + imp->combine_width[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u; + imp->combine_width[PIXMAN_OP_OUT] = combine_out_u; + imp->combine_width[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u; + imp->combine_width[PIXMAN_OP_ATOP] = combine_atop_u; + imp->combine_width[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u; + imp->combine_width[PIXMAN_OP_XOR] = combine_xor_u; + imp->combine_width[PIXMAN_OP_ADD] = combine_add_u; + imp->combine_width[PIXMAN_OP_SATURATE] = combine_saturate_u; + + /* Disjoint, unified */ + imp->combine_width[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear; + imp->combine_width[PIXMAN_OP_DISJOINT_SRC] = combine_src_u; + /* dest */ + imp->combine_width[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u; + imp->combine_width[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u; + imp->combine_width[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u; + imp->combine_width[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u; + imp->combine_width[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u; + imp->combine_width[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u; + imp->combine_width[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u; + imp->combine_width[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u; + imp->combine_width[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u; + + /* Conjoint, unified */ + imp->combine_width[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear; + imp->combine_width[PIXMAN_OP_CONJOINT_SRC] = combine_src_u; + /* dest */ + imp->combine_width[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u; + imp->combine_width[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u; + imp->combine_width[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u; + imp->combine_width[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u; + imp->combine_width[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u; + imp->combine_width[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u; + imp->combine_width[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u; + imp->combine_width[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = 
combine_conjoint_atop_reverse_u; + imp->combine_width[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u; + + imp->combine_width[PIXMAN_OP_MULTIPLY] = combine_multiply_u; + imp->combine_width[PIXMAN_OP_SCREEN] = combine_screen_u; + imp->combine_width[PIXMAN_OP_OVERLAY] = combine_overlay_u; + imp->combine_width[PIXMAN_OP_DARKEN] = combine_darken_u; + imp->combine_width[PIXMAN_OP_LIGHTEN] = combine_lighten_u; + imp->combine_width[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u; + imp->combine_width[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u; + imp->combine_width[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u; + imp->combine_width[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u; + imp->combine_width[PIXMAN_OP_DIFFERENCE] = combine_difference_u; + imp->combine_width[PIXMAN_OP_EXCLUSION] = combine_exclusion_u; + imp->combine_width[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u; + imp->combine_width[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u; + imp->combine_width[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u; + imp->combine_width[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u; + + /* Component alpha combiners */ + imp->combine_width_ca[PIXMAN_OP_CLEAR] = combine_clear_ca; + imp->combine_width_ca[PIXMAN_OP_SRC] = combine_src_ca; + /* dest */ + imp->combine_width_ca[PIXMAN_OP_OVER] = combine_over_ca; + imp->combine_width_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_IN] = combine_in_ca; + imp->combine_width_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_OUT] = combine_out_ca; + imp->combine_width_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_ATOP] = combine_atop_ca; + imp->combine_width_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_XOR] = combine_xor_ca; + imp->combine_width_ca[PIXMAN_OP_ADD] = combine_add_ca; + imp->combine_width_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca; + + /* Disjoint CA */ + imp->combine_width_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca; + /* dest */ + imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca; + + /* Conjoint CA */ + imp->combine_width_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca; + imp->combine_width_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca; + /* dest */ + imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca; + imp->combine_width_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca; + imp->combine_width_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca; + imp->combine_width_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca; + 
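The *_ca entries being registered in these tables are the component-alpha variants: each evaluates its Porter-Duff operator with a separate mask alpha per channel rather than a single scalar alpha. As a readability sketch of what combine_over_ca above computes, written per channel in floating point (illustrative code and channel layout, not pixman's; source is premultiplied):

    /* Component-alpha OVER: each channel c uses its own mask value, so
     * dest_c = src_c * mask_c + dest_c * (1 - src_alpha * mask_c).
     * combine_mask_ca performs the two multiplications; the loop in
     * combine_over_ca then does the multiply-add against ~m. */
    static void
    over_ca_sketch (float dest[4], const float src[4], const float mask[4])
    {
        float src_alpha = src[0];          /* channel 0 holds alpha here */
        int c;

        for (c = 0; c < 4; ++c)
        {
            float s = src[c] * mask[c];    /* masked source */
            float a = src_alpha * mask[c]; /* per-channel source alpha */

            dest[c] = s + dest[c] * (1.0f - a);
        }
    }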
imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca; + imp->combine_width_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca; + imp->combine_width_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca; + + imp->combine_width_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca; + imp->combine_width_ca[PIXMAN_OP_SCREEN] = combine_screen_ca; + imp->combine_width_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca; + imp->combine_width_ca[PIXMAN_OP_DARKEN] = combine_darken_ca; + imp->combine_width_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca; + imp->combine_width_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca; + imp->combine_width_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca; + imp->combine_width_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca; + imp->combine_width_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca; + imp->combine_width_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca; + imp->combine_width_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca; + + /* It is not clear that these make sense, so leave them out for now */ + imp->combine_width_ca[PIXMAN_OP_HSL_HUE] = NULL; + imp->combine_width_ca[PIXMAN_OP_HSL_SATURATION] = NULL; + imp->combine_width_ca[PIXMAN_OP_HSL_COLOR] = NULL; + imp->combine_width_ca[PIXMAN_OP_HSL_LUMINOSITY] = NULL; +} + diff --git a/lib/pixman/pixman/pixman-combine.h.template b/lib/pixman/pixman/pixman-combine.h.template new file mode 100644 index 000000000..2f6392f96 --- /dev/null +++ b/lib/pixman/pixman/pixman-combine.h.template @@ -0,0 +1,226 @@ + +#define COMPONENT_SIZE +#define MASK +#define ONE_HALF + +#define A_SHIFT +#define R_SHIFT +#define G_SHIFT +#define A_MASK +#define R_MASK +#define G_MASK + +#define RB_MASK +#define AG_MASK +#define RB_ONE_HALF +#define RB_MASK_PLUS_ONE + +#define ALPHA_c(x) ((x) >> A_SHIFT) +#define RED_c(x) (((x) >> R_SHIFT) & MASK) +#define GREEN_c(x) (((x) >> G_SHIFT) & MASK) +#define BLUE_c(x) ((x) & MASK) + +/* + * Helper macros. + */ + +#define MUL_UNc(a, b, t) \ + ((t) = (a) * (b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT )) + +#define DIV_UNc(a, b) \ + (((comp2_t) (a) * MASK) / (b)) + +#define ADD_UNc(x, y, t) \ + ((t) = x + y, \ + (comp4_t) (comp1_t) ((t) | (0 - ((t) >> G_SHIFT)))) + +#define DIV_ONE_UNc(x) \ + (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT) + +/* + * The methods below use some tricks to be able to do two color + * components at the same time. 
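Concretely, the trick the template's comment refers to is SWAR arithmetic: the red and blue channels of a 32-bit ARGB pixel are kept in one register, separated by the zeroed green/alpha bytes, so each channel has a 16-bit lane to overflow into and one multiply processes both at once. A sketch with the 8-bit constants filled in (0x00ff00ff, 0x00800080 and 0x10000100 are the values pixman instantiates for 32-bit pixels in pixman-combine32.h; the function names are illustrative):

    #include <stdint.h>

    /* Multiply the red and blue channels of an ARGB pixel by an 8-bit
     * value a in one pass: 0x00800080 adds the rounding half to both
     * lanes, and (t + ((t >> 8) & 0x00ff00ff)) >> 8 approximates t / 255
     * as (t + t/256) / 256. */
    static uint32_t
    mul_rb (uint32_t x, uint32_t a)
    {
        uint32_t t = (x & 0x00ff00ff) * a;

        t += 0x00800080;
        t = (t + ((t >> 8) & 0x00ff00ff)) >> 8;

        return t & 0x00ff00ff;
    }

    /* Saturating add of the same two lanes: subtracting each lane's
     * overflow bit from 0x10000100 produces an all-ones byte exactly in
     * the lanes that overflowed, clamping them to 0xff. */
    static uint32_t
    add_rb (uint32_t x, uint32_t y)
    {
        uint32_t t = (x & 0x00ff00ff) + (y & 0x00ff00ff);

        t |= 0x10000100 - ((t >> 8) & 0x00ff00ff);

        return t & 0x00ff00ff;
    }

    /* All four channels: red/blue in place, then alpha/green shifted
     * down into the same lane positions, and recombine. */
    static uint32_t
    mul_un8x4 (uint32_t pixel, uint32_t a)
    {
        uint32_t rb = mul_rb (pixel, a);
        uint32_t ag = mul_rb (pixel >> 8, a);

        return rb | (ag << 8);
    }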
+ */ + +/* + * x_rb = (x_rb * a) / 255 + */ +#define UNc_rb_MUL_UNc(x, a, t) \ + do \ + { \ + t = ((x) & RB_MASK) * (a); \ + t += RB_ONE_HALF; \ + x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \ + x &= RB_MASK; \ + } while (0) + +/* + * x_rb = min (x_rb + y_rb, 255) + */ +#define UNc_rb_ADD_UNc_rb(x, y, t) \ + do \ + { \ + t = ((x) + (y)); \ + t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK); \ + x = (t & RB_MASK); \ + } while (0) + +/* + * x_rb = (x_rb * a_rb) / 255 + */ +#define UNc_rb_MUL_UNc_rb(x, a, t) \ + do \ + { \ + t = (x & MASK) * (a & MASK); \ + t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK); \ + t += RB_ONE_HALF; \ + t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT; \ + x = t & RB_MASK; \ + } while (0) + +/* + * x_c = (x_c * a) / 255 + */ +#define UNcx4_MUL_UNc(x, a) \ + do \ + { \ + comp4_t r1, r2, t; \ + \ + r1 = (x); \ + UNc_rb_MUL_UNc (r1, a, t); \ + \ + r2 = (x) >> G_SHIFT; \ + UNc_rb_MUL_UNc (r2, a, t); \ + \ + x = r1 | (r2 << G_SHIFT); \ + } while (0) + +/* + * x_c = (x_c * a) / 255 + y_c + */ +#define UNcx4_MUL_UNc_ADD_UNcx4(x, a, y) \ + do \ + { \ + comp4_t r1, r2, r3, t; \ + \ + r1 = (x); \ + r2 = (y) & RB_MASK; \ + UNc_rb_MUL_UNc (r1, a, t); \ + UNc_rb_ADD_UNc_rb (r1, r2, t); \ + \ + r2 = (x) >> G_SHIFT; \ + r3 = ((y) >> G_SHIFT) & RB_MASK; \ + UNc_rb_MUL_UNc (r2, a, t); \ + UNc_rb_ADD_UNc_rb (r2, r3, t); \ + \ + x = r1 | (r2 << G_SHIFT); \ + } while (0) + +/* + * x_c = (x_c * a + y_c * b) / 255 + */ +#define UNcx4_MUL_UNc_ADD_UNcx4_MUL_UNc(x, a, y, b) \ + do \ + { \ + comp4_t r1, r2, r3, t; \ + \ + r1 = x; \ + r2 = y; \ + UNc_rb_MUL_UNc (r1, a, t); \ + UNc_rb_MUL_UNc (r2, b, t); \ + UNc_rb_ADD_UNc_rb (r1, r2, t); \ + \ + r2 = (x >> G_SHIFT); \ + r3 = (y >> G_SHIFT); \ + UNc_rb_MUL_UNc (r2, a, t); \ + UNc_rb_MUL_UNc (r3, b, t); \ + UNc_rb_ADD_UNc_rb (r2, r3, t); \ + \ + x = r1 | (r2 << G_SHIFT); \ + } while (0) + +/* + * x_c = (x_c * a_c) / 255 + */ +#define UNcx4_MUL_UNcx4(x, a) \ + do \ + { \ + comp4_t r1, r2, r3, t; \ + \ + r1 = x; \ + r2 = a; \ + UNc_rb_MUL_UNc_rb (r1, r2, t); \ + \ + r2 = x >> G_SHIFT; \ + r3 = a >> G_SHIFT; \ + UNc_rb_MUL_UNc_rb (r2, r3, t); \ + \ + x = r1 | (r2 << G_SHIFT); \ + } while (0) + +/* + * x_c = (x_c * a_c) / 255 + y_c + */ +#define UNcx4_MUL_UNcx4_ADD_UNcx4(x, a, y) \ + do \ + { \ + comp4_t r1, r2, r3, t; \ + \ + r1 = x; \ + r2 = a; \ + UNc_rb_MUL_UNc_rb (r1, r2, t); \ + r2 = y & RB_MASK; \ + UNc_rb_ADD_UNc_rb (r1, r2, t); \ + \ + r2 = (x >> G_SHIFT); \ + r3 = (a >> G_SHIFT); \ + UNc_rb_MUL_UNc_rb (r2, r3, t); \ + r3 = (y >> G_SHIFT) & RB_MASK; \ + UNc_rb_ADD_UNc_rb (r2, r3, t); \ + \ + x = r1 | (r2 << G_SHIFT); \ + } while (0) + +/* + * x_c = (x_c * a_c + y_c * b) / 255 + */ +#define UNcx4_MUL_UNcx4_ADD_UNcx4_MUL_UNc(x, a, y, b) \ + do \ + { \ + comp4_t r1, r2, r3, t; \ + \ + r1 = x; \ + r2 = a; \ + UNc_rb_MUL_UNc_rb (r1, r2, t); \ + r2 = y; \ + UNc_rb_MUL_UNc (r2, b, t); \ + UNc_rb_ADD_UNc_rb (r1, r2, t); \ + \ + r2 = x >> G_SHIFT; \ + r3 = a >> G_SHIFT; \ + UNc_rb_MUL_UNc_rb (r2, r3, t); \ + r3 = y >> G_SHIFT; \ + UNc_rb_MUL_UNc (r3, b, t); \ + UNc_rb_ADD_UNc_rb (r2, r3, t); \ + \ + x = r1 | (r2 << G_SHIFT); \ + } while (0) + +/* + x_c = min(x_c + y_c, 255) + */ +#define UNcx4_ADD_UNcx4(x, y) \ + do \ + { \ + comp4_t r1, r2, r3, t; \ + \ + r1 = x & RB_MASK; \ + r2 = y & RB_MASK; \ + UNc_rb_ADD_UNc_rb (r1, r2, t); \ + \ + r2 = (x >> G_SHIFT) & RB_MASK; \ + r3 = (y >> G_SHIFT) & RB_MASK; \ + UNc_rb_ADD_UNc_rb (r2, r3, t); \ + \ + x = r1 | (r2 << G_SHIFT); \ + } while (0) diff --git a/lib/pixman/pixman/pixman-compiler.h 
b/lib/pixman/pixman/pixman-compiler.h new file mode 100644 index 000000000..9647dbb48 --- /dev/null +++ b/lib/pixman/pixman/pixman-compiler.h @@ -0,0 +1,71 @@ +/* Pixman uses some non-standard compiler features. This file ensures + * they exist + * + * The features are: + * + * FUNC must be defined to expand to the current function + * PIXMAN_EXPORT should be defined to whatever is required to + * export functions from a shared library + * limits limits for various types must be defined + * inline must be defined + * force_inline must be defined + */ +#if defined (__GNUC__) +# define FUNC ((const char*) (__PRETTY_FUNCTION__)) +#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) +# define FUNC ((const char*) (__func__)) +#else +# define FUNC ((const char*) ("???")) +#endif + +#ifndef INT16_MIN +# define INT16_MIN (-32767-1) +#endif + +#ifndef INT16_MAX +# define INT16_MAX (32767) +#endif + +#ifndef INT32_MIN +# define INT32_MIN (-2147483647-1) +#endif + +#ifndef INT32_MAX +# define INT32_MAX (2147483647) +#endif + +#ifndef UINT32_MIN +# define UINT32_MIN (0) +#endif + +#ifndef UINT32_MAX +# define UINT32_MAX (4294967295U) +#endif + +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + +#ifdef _MSC_VER +/* 'inline' is available only in C++ in MSVC */ +# define inline __inline +# define force_inline __forceinline +#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)) +# define inline __inline__ +# define force_inline __inline__ __attribute__ ((__always_inline__)) +#else +# ifndef force_inline +# define force_inline inline +# endif +#endif + +/* GCC visibility */ +#if defined(__GNUC__) && __GNUC__ >= 4 +# define PIXMAN_EXPORT __attribute__ ((visibility("default"))) +/* Sun Studio 8 visibility */ +#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550) +# define PIXMAN_EXPORT __global +#else +# define PIXMAN_EXPORT +#endif + diff --git a/lib/pixman/pixman/pixman-compute-region.c b/lib/pixman/pixman/pixman-compute-region.c deleted file mode 100644 index 31eaee8e3..000000000 --- a/lib/pixman/pixman/pixman-compute-region.c +++ /dev/null @@ -1,242 +0,0 @@ -/* - * - * Copyright © 1999 Keith Packard - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Keith Packard not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Keith Packard makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, - * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER - * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - */ - -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include <stdlib.h> -#include <stdio.h> -#include "pixman-private.h" - -#define BOUND(v) (int16_t) ((v) < INT16_MIN ? INT16_MIN : (v) > INT16_MAX ? 
INT16_MAX : (v)) - -static inline pixman_bool_t -miClipPictureReg (pixman_region32_t * pRegion, - pixman_region32_t * pClip, - int dx, - int dy) -{ - if (pixman_region32_n_rects(pRegion) == 1 && - pixman_region32_n_rects(pClip) == 1) - { - pixman_box32_t * pRbox = pixman_region32_rectangles(pRegion, NULL); - pixman_box32_t * pCbox = pixman_region32_rectangles(pClip, NULL); - int v; - - if (pRbox->x1 < (v = pCbox->x1 + dx)) - pRbox->x1 = BOUND(v); - if (pRbox->x2 > (v = pCbox->x2 + dx)) - pRbox->x2 = BOUND(v); - if (pRbox->y1 < (v = pCbox->y1 + dy)) - pRbox->y1 = BOUND(v); - if (pRbox->y2 > (v = pCbox->y2 + dy)) - pRbox->y2 = BOUND(v); - if (pRbox->x1 >= pRbox->x2 || - pRbox->y1 >= pRbox->y2) - { - pixman_region32_init (pRegion); - } - } - else if (!pixman_region32_not_empty (pClip)) - return FALSE; - else - { - if (dx || dy) - pixman_region32_translate (pRegion, -dx, -dy); - if (!pixman_region32_intersect (pRegion, pRegion, pClip)) - return FALSE; - if (dx || dy) - pixman_region32_translate(pRegion, dx, dy); - } - return pixman_region32_not_empty(pRegion); -} - - -static inline pixman_bool_t -miClipPictureSrc (pixman_region32_t * pRegion, - pixman_image_t * pPicture, - int dx, - int dy) -{ - /* XXX what to do with clipping from transformed pictures? */ - if (pPicture->common.transform || pPicture->type != BITS) - return TRUE; - - if (pPicture->common.repeat) - { - /* If the clip region was set by a client, then it should be intersected - * with the composite region since it's interpreted as happening - * after the repeat algorithm. - * - * If the clip region was not set by a client, then it was imposed by - * boundaries of the pixmap, or by sibling or child windows, which means - * it should in theory be repeated along. FIXME: we ignore that case. - * It is only relevant for windows that are (a) clipped by siblings/children - * and (b) used as source. However this case is not useful anyway due - * to lack of GraphicsExpose events. - */ - if (pPicture->common.has_client_clip) - { - pixman_region32_translate (pRegion, dx, dy); - - if (!pixman_region32_intersect (pRegion, pRegion, - pPicture->common.src_clip)) - return FALSE; - - pixman_region32_translate ( pRegion, -dx, -dy); - } - - return TRUE; - } - else - { - return miClipPictureReg (pRegion, - pPicture->common.src_clip, - dx, - dy); - } -} - -/* - * returns FALSE if the final region is empty. Indistinguishable from - * an allocation failure, but rendering ignores those anyways. 
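The BOUND macro and the single-box fast path of the (now deleted) miClipPictureReg reduce, in the common case, to clamping one rectangle against a translated clip rectangle in 16-bit coordinates. A self-contained sketch of that fast path (box16_t, bound and clip_box are illustrative names, not pixman's):

    #include <stdint.h>

    typedef struct { int16_t x1, y1, x2, y2; } box16_t;

    /* Clamp a coordinate into the range a 16-bit region can represent. */
    static int16_t
    bound (int v)
    {
        if (v < INT16_MIN)
            return INT16_MIN;
        if (v > INT16_MAX)
            return INT16_MAX;
        return (int16_t) v;
    }

    /* Intersect one region box with one clip box translated by (dx, dy);
     * returns nonzero if the result is nonempty. */
    static int
    clip_box (box16_t *b, const box16_t *clip, int dx, int dy)
    {
        if (b->x1 < clip->x1 + dx) b->x1 = bound (clip->x1 + dx);
        if (b->x2 > clip->x2 + dx) b->x2 = bound (clip->x2 + dx);
        if (b->y1 < clip->y1 + dy) b->y1 = bound (clip->y1 + dy);
        if (b->y2 > clip->y2 + dy) b->y2 = bound (clip->y2 + dy);

        return b->x1 < b->x2 && b->y1 < b->y2;
    }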
- */ - -pixman_bool_t -pixman_compute_composite_region32 (pixman_region32_t * pRegion, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - int v; - - pRegion->extents.x1 = xDst; - v = xDst + width; - pRegion->extents.x2 = BOUND(v); - pRegion->extents.y1 = yDst; - v = yDst + height; - pRegion->extents.y2 = BOUND(v); - pRegion->data = 0; - /* Check for empty operation */ - if (pRegion->extents.x1 >= pRegion->extents.x2 || - pRegion->extents.y1 >= pRegion->extents.y2) - { - pixman_region32_init (pRegion); - return FALSE; - } - /* clip against dst */ - if (!miClipPictureReg (pRegion, &pDst->common.clip_region, 0, 0)) - { - pixman_region32_fini (pRegion); - return FALSE; - } - if (pDst->common.alpha_map) - { - if (!miClipPictureReg (pRegion, &pDst->common.alpha_map->common.clip_region, - -pDst->common.alpha_origin.x, - -pDst->common.alpha_origin.y)) - { - pixman_region32_fini (pRegion); - return FALSE; - } - } - /* clip against src */ - if (!miClipPictureSrc (pRegion, pSrc, xDst - xSrc, yDst - ySrc)) - { - pixman_region32_fini (pRegion); - return FALSE; - } - if (pSrc->common.alpha_map) - { - if (!miClipPictureSrc (pRegion, (pixman_image_t *)pSrc->common.alpha_map, - xDst - (xSrc - pSrc->common.alpha_origin.x), - yDst - (ySrc - pSrc->common.alpha_origin.y))) - { - pixman_region32_fini (pRegion); - return FALSE; - } - } - /* clip against mask */ - if (pMask) - { - if (!miClipPictureSrc (pRegion, pMask, xDst - xMask, yDst - yMask)) - { - pixman_region32_fini (pRegion); - return FALSE; - } - if (pMask->common.alpha_map) - { - if (!miClipPictureSrc (pRegion, (pixman_image_t *)pMask->common.alpha_map, - xDst - (xMask - pMask->common.alpha_origin.x), - yDst - (yMask - pMask->common.alpha_origin.y))) - { - pixman_region32_fini (pRegion); - return FALSE; - } - } - } - - return TRUE; -} - -PIXMAN_EXPORT pixman_bool_t -pixman_compute_composite_region (pixman_region16_t * pRegion, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - pixman_region32_t r32; - pixman_bool_t retval; - - pixman_region32_init (&r32); - - retval = pixman_compute_composite_region32 (&r32, pSrc, pMask, pDst, - xSrc, ySrc, xMask, yMask, xDst, yDst, - width, height); - - if (retval) - { - if (!pixman_region16_copy_from_region32 (pRegion, &r32)) - retval = FALSE; - } - - pixman_region32_fini (&r32); - return retval; -} diff --git a/lib/pixman/pixman/pixman-conical-gradient.c b/lib/pixman/pixman/pixman-conical-gradient.c index 023256aae..d720db3d4 100644 --- a/lib/pixman/pixman/pixman-conical-gradient.c +++ b/lib/pixman/pixman/pixman-conical-gradient.c @@ -29,14 +29,19 @@ #include "pixman-private.h" static void -conical_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits) +conical_gradient_get_scanline_32 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { source_image_t *source = (source_image_t *)image; gradient_t *gradient = (gradient_t *)source; conical_gradient_t *conical = (conical_gradient_t *)image; uint32_t *end = buffer + width; - GradientWalker walker; + pixman_gradient_walker_t walker; pixman_bool_t affine = TRUE; double cx = 1.; double cy = 0.; @@ -44,73 
+49,92 @@ conical_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width double rx = x + 0.5; double ry = y + 0.5; double rz = 1.; - double a = conical->angle/(180.*65536); + double a = conical->angle / (180. * 65536); _pixman_gradient_walker_init (&walker, gradient, source->common.repeat); - - if (source->common.transform) { + + if (source->common.transform) + { pixman_vector_t v; + /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed(x) + pixman_fixed_1/2; - v.vector[1] = pixman_int_to_fixed(y) + pixman_fixed_1/2; + v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; v.vector[2] = pixman_fixed_1; + if (!pixman_transform_point_3d (source->common.transform, &v)) return; + + cx = source->common.transform->matrix[0][0] / 65536.; + cy = source->common.transform->matrix[1][0] / 65536.; + cz = source->common.transform->matrix[2][0] / 65536.; - cx = source->common.transform->matrix[0][0]/65536.; - cy = source->common.transform->matrix[1][0]/65536.; - cz = source->common.transform->matrix[2][0]/65536.; - rx = v.vector[0]/65536.; - ry = v.vector[1]/65536.; - rz = v.vector[2]/65536.; - affine = source->common.transform->matrix[2][0] == 0 && v.vector[2] == pixman_fixed_1; - } - - if (affine) { - rx -= conical->center.x/65536.; - ry -= conical->center.y/65536.; + rx = v.vector[0] / 65536.; + ry = v.vector[1] / 65536.; + rz = v.vector[2] / 65536.; - while (buffer < end) { + affine = + source->common.transform->matrix[2][0] == 0 && + v.vector[2] == pixman_fixed_1; + } + + if (affine) + { + rx -= conical->center.x / 65536.; + ry -= conical->center.y / 65536.; + + while (buffer < end) + { double angle; - - if (!mask || *mask++ & maskBits) + + if (!mask || *mask++ & mask_bits) { - pixman_fixed_48_16_t t; - - angle = atan2(ry, rx) + a; - t = (pixman_fixed_48_16_t) (angle * (65536. / (2*M_PI))); - - *(buffer) = _pixman_gradient_walker_pixel (&walker, t); + pixman_fixed_48_16_t t; + + angle = atan2 (ry, rx) + a; + t = (pixman_fixed_48_16_t) (angle * (65536. / (2 * M_PI))); + + *buffer = _pixman_gradient_walker_pixel (&walker, t); } - + ++buffer; + rx += cx; ry += cy; } - } else { - while (buffer < end) { + } + else + { + while (buffer < end) + { double x, y; double angle; - - if (!mask || *mask++ & maskBits) + + if (!mask || *mask++ & mask_bits) { - pixman_fixed_48_16_t t; - - if (rz != 0) { - x = rx/rz; - y = ry/rz; - } else { + pixman_fixed_48_16_t t; + + if (rz != 0) + { + x = rx / rz; + y = ry / rz; + } + else + { x = y = 0.; } - x -= conical->center.x/65536.; - y -= conical->center.y/65536.; - angle = atan2(y, x) + a; - t = (pixman_fixed_48_16_t) (angle * (65536. / (2*M_PI))); + + x -= conical->center.x / 65536.; + y -= conical->center.y / 65536.; - *(buffer) = _pixman_gradient_walker_pixel (&walker, t); + angle = atan2 (y, x) + a; + t = (pixman_fixed_48_16_t) (angle * (65536. 
/ (2 * M_PI))); + + *buffer = _pixman_gradient_walker_pixel (&walker, t); } - + ++buffer; + rx += cx; ry += cy; rz += cz; @@ -121,37 +145,36 @@ conical_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width static void conical_gradient_property_changed (pixman_image_t *image) { - image->common.get_scanline_32 = (scanFetchProc)conical_gradient_get_scanline_32; - image->common.get_scanline_64 = (scanFetchProc)_pixman_image_get_scanline_64_generic; + image->common.get_scanline_32 = conical_gradient_get_scanline_32; + image->common.get_scanline_64 = _pixman_image_get_scanline_generic_64; } PIXMAN_EXPORT pixman_image_t * -pixman_image_create_conical_gradient (pixman_point_fixed_t *center, - pixman_fixed_t angle, - const pixman_gradient_stop_t *stops, - int n_stops) +pixman_image_create_conical_gradient (pixman_point_fixed_t * center, + pixman_fixed_t angle, + const pixman_gradient_stop_t *stops, + int n_stops) { - pixman_image_t *image = _pixman_image_allocate(); + pixman_image_t *image = _pixman_image_allocate (); conical_gradient_t *conical; - + if (!image) return NULL; - + conical = &image->conical; - + if (!_pixman_init_gradient (&conical->common, stops, n_stops)) { free (image); return NULL; } - + image->type = CONICAL; conical->center = *center; conical->angle = angle; - + image->common.property_changed = conical_gradient_property_changed; - - conical_gradient_property_changed (image); - + return image; } + diff --git a/lib/pixman/pixman/pixman-cpu.c b/lib/pixman/pixman/pixman-cpu.c index 057c13418..5d5469bb8 100644 --- a/lib/pixman/pixman/pixman-cpu.c +++ b/lib/pixman/pixman/pixman-cpu.c @@ -47,12 +47,16 @@ static volatile pixman_bool_t have_vmx = TRUE; static pixman_bool_t pixman_have_vmx (void) { - if(!initialized) { - size_t length = sizeof(have_vmx); - int error = - sysctlbyname("hw.optional.altivec", &have_vmx, &length, NULL, 0); - if(error) have_vmx = FALSE; - initialized = TRUE; + if (!initialized) + { + size_t length = sizeof(have_vmx); + int error = + sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0); + + if (error) + have_vmx = FALSE; + + initialized = TRUE; } return have_vmx; } @@ -69,39 +73,47 @@ pixman_have_vmx (void) static pixman_bool_t pixman_have_vmx (void) { - if (!initialized) { + if (!initialized) + { char fname[64]; unsigned long buf[64]; ssize_t count = 0; pid_t pid; int fd, i; - pid = getpid(); - snprintf(fname, sizeof(fname)-1, "/proc/%d/auxv", pid); + pid = getpid (); + snprintf (fname, sizeof(fname) - 1, "/proc/%d/auxv", pid); - fd = open(fname, O_RDONLY); - if (fd >= 0) { - for (i = 0; i <= (count / sizeof(unsigned long)); i += 2) { + fd = open (fname, O_RDONLY); + if (fd >= 0) + { + for (i = 0; i <= (count / sizeof(unsigned long)); i += 2) + { /* Read more if buf is empty... */ - if (i == (count / sizeof(unsigned long))) { - count = read(fd, buf, sizeof(buf)); + if (i == (count / sizeof(unsigned long))) + { + count = read (fd, buf, sizeof(buf)); if (count <= 0) break; i = 0; } - if (buf[i] == AT_HWCAP) { - have_vmx = !!(buf[i+1] & PPC_FEATURE_HAS_ALTIVEC); + if (buf[i] == AT_HWCAP) + { + have_vmx = !!(buf[i + 1] & PPC_FEATURE_HAS_ALTIVEC); initialized = TRUE; break; - } else if (buf[i] == AT_NULL) { + } + else if (buf[i] == AT_NULL) + { break; } } - close(fd); + close (fd); } } - if (!initialized) { + if (!initialized) + { /* Something went wrong. Assume 'no' rather than playing fragile tricks with catching SIGILL. 
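Reading /proc/<pid>/auxv by hand, as the Linux branch above does, predates a simpler interface: newer C libraries (glibc 2.16 and later) expose the same AT_HWCAP word through getauxval(3). A sketch of the equivalent AltiVec check; the PPC_FEATURE_HAS_ALTIVEC value is the kernel's, from asm/cputable.h:

    #include <sys/auxv.h>   /* getauxval, AT_HWCAP; glibc >= 2.16 */

    #ifndef PPC_FEATURE_HAS_ALTIVEC
    #define PPC_FEATURE_HAS_ALTIVEC 0x10000000
    #endif

    static int
    have_altivec (void)
    {
        return (getauxval (AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC) != 0;
    }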
*/ have_vmx = FALSE; @@ -110,35 +122,45 @@ pixman_have_vmx (void) return have_vmx; } + #else /* !__APPLE__ && !__linux__ */ #include <signal.h> #include <setjmp.h> static jmp_buf jump_env; -static void vmx_test(int sig, siginfo_t *si, void *unused) { +static void +vmx_test (int sig, + siginfo_t *si, + void * unused) +{ longjmp (jump_env, 1); } static pixman_bool_t -pixman_have_vmx (void) { +pixman_have_vmx (void) +{ struct sigaction sa, osa; int jmp_result; - if (!initialized) { - sa.sa_flags = SA_SIGINFO; - sigemptyset(&sa.sa_mask); - sa.sa_sigaction = vmx_test; - sigaction(SIGILL, &sa, &osa); + + if (!initialized) + { + sa.sa_flags = SA_SIGINFO; + sigemptyset (&sa.sa_mask); + sa.sa_sigaction = vmx_test; + sigaction (SIGILL, &sa, &osa); jmp_result = setjmp (jump_env); - if (jmp_result == 0) { + if (jmp_result == 0) + { asm volatile ( "vor 0, 0, 0" ); } - sigaction(SIGILL, &osa, NULL); + sigaction (SIGILL, &osa, NULL); have_vmx = (jmp_result == 0); - initialized = TRUE; + initialized = TRUE; } return have_vmx; } + #endif /* __APPLE__ */ #endif /* USE_VMX */ @@ -147,7 +169,7 @@ pixman_have_vmx (void) { #if defined(_MSC_VER) #if defined(USE_ARM_SIMD) -extern int pixman_msvc_try_arm_simd_op(); +extern int pixman_msvc_try_arm_simd_op (); pixman_bool_t pixman_have_arm_simd (void) @@ -155,22 +177,24 @@ pixman_have_arm_simd (void) static pixman_bool_t initialized = FALSE; static pixman_bool_t have_arm_simd = FALSE; - if (!initialized) { - __try { - pixman_msvc_try_arm_simd_op(); - have_arm_simd = TRUE; - } __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - have_arm_simd = FALSE; - } + if (!initialized) + { + __try { + pixman_msvc_try_arm_simd_op (); + have_arm_simd = TRUE; + } __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) { + have_arm_simd = FALSE; + } initialized = TRUE; } return have_arm_simd; } + #endif /* USE_ARM_SIMD */ #if defined(USE_ARM_NEON) -extern int pixman_msvc_try_arm_neon_op(); +extern int pixman_msvc_try_arm_neon_op (); pixman_bool_t pixman_have_arm_neon (void) @@ -178,18 +202,23 @@ pixman_have_arm_neon (void) static pixman_bool_t initialized = FALSE; static pixman_bool_t have_arm_neon = FALSE; - if (!initialized) { - __try { - pixman_msvc_try_arm_neon_op(); - have_arm_neon = TRUE; - } __except(GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - have_arm_neon = FALSE; - } + if (!initialized) + { + __try + { + pixman_msvc_try_arm_neon_op (); + have_arm_neon = TRUE; + } + __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION) + { + have_arm_neon = FALSE; + } initialized = TRUE; } return have_arm_neon; } + #endif /* USE_ARM_NEON */ #else /* linux ELF */ @@ -211,40 +240,51 @@ static pixman_bool_t arm_has_iwmmxt = FALSE; static pixman_bool_t arm_tests_initialized = FALSE; static void -pixman_arm_read_auxv() { +pixman_arm_read_auxv () +{ int fd; Elf32_auxv_t aux; - fd = open("/proc/self/auxv", O_RDONLY); - if (fd >= 0) { - while (read(fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) { - if (aux.a_type == AT_HWCAP) { + fd = open ("/proc/self/auxv", O_RDONLY); + if (fd >= 0) + { + while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) + { + if (aux.a_type == AT_HWCAP) + { uint32_t hwcap = aux.a_un.a_val; - if (getenv("ARM_FORCE_HWCAP")) - hwcap = strtoul(getenv("ARM_FORCE_HWCAP"), NULL, 0); - // hardcode these values to avoid depending on specific versions - // of the hwcap header, e.g. 
HWCAP_NEON + if (getenv ("ARM_FORCE_HWCAP")) + hwcap = strtoul (getenv ("ARM_FORCE_HWCAP"), NULL, 0); + /* hardcode these values to avoid depending on specific + * versions of the hwcap header, e.g. HWCAP_NEON + */ arm_has_vfp = (hwcap & 64) != 0; arm_has_iwmmxt = (hwcap & 512) != 0; - // this flag is only present on kernel 2.6.29 + /* this flag is only present on kernel 2.6.29 */ arm_has_neon = (hwcap & 4096) != 0; - } else if (aux.a_type == AT_PLATFORM) { + } + else if (aux.a_type == AT_PLATFORM) + { const char *plat = (const char*) aux.a_un.a_val; - if (getenv("ARM_FORCE_PLATFORM")) - plat = getenv("ARM_FORCE_PLATFORM"); - if (strncmp(plat, "v7l", 3) == 0) { + if (getenv ("ARM_FORCE_PLATFORM")) + plat = getenv ("ARM_FORCE_PLATFORM"); + if (strncmp (plat, "v7l", 3) == 0) + { arm_has_v7 = TRUE; arm_has_v6 = TRUE; - } else if (strncmp(plat, "v6l", 3) == 0) { + } + else if (strncmp (plat, "v6l", 3) == 0) + { arm_has_v6 = TRUE; } - } - } - close (fd); + } + } + close (fd); - // if we don't have 2.6.29, we have to do this hack; set - // the env var to trust HWCAP. - if (!getenv("ARM_TRUST_HWCAP") && arm_has_v7) + /* if we don't have 2.6.29, we have to do this hack; set + * the env var to trust HWCAP. + */ + if (!getenv ("ARM_TRUST_HWCAP") && arm_has_v7) arm_has_neon = TRUE; } @@ -256,10 +296,11 @@ pixman_bool_t pixman_have_arm_simd (void) { if (!arm_tests_initialized) - pixman_arm_read_auxv(); + pixman_arm_read_auxv (); return arm_has_v6; } + #endif /* USE_ARM_SIMD */ #if defined(USE_ARM_NEON) @@ -267,10 +308,11 @@ pixman_bool_t pixman_have_arm_neon (void) { if (!arm_tests_initialized) - pixman_arm_read_auxv(); + pixman_arm_read_auxv (); return arm_has_neon; } + #endif /* USE_ARM_NEON */ #endif /* linux */ @@ -283,37 +325,42 @@ pixman_have_arm_neon (void) * that would lead to SIGILL instructions on old CPUs that don't have * it. */ -#if !defined(__amd64__) && !defined(__x86_64__) +#if !defined(__amd64__) && !defined(__x86_64__) && !defined(_M_AMD64) #ifdef HAVE_GETISAX #include <sys/auxv.h> #endif -enum CPUFeatures { - NoFeatures = 0, +typedef enum +{ + NO_FEATURES = 0, MMX = 0x1, - MMX_Extensions = 0x2, + MMX_EXTENSIONS = 0x2, SSE = 0x6, SSE2 = 0x8, CMOV = 0x10 -}; +} cpu_features_t; -static unsigned int detectCPUFeatures(void) { + +static unsigned int +detect_cpu_features (void) +{ unsigned int features = 0; unsigned int result = 0; #ifdef HAVE_GETISAX - if (getisax(&result, 1)) { - if (result & AV_386_CMOV) - features |= CMOV; - if (result & AV_386_MMX) - features |= MMX; - if (result & AV_386_AMD_MMX) - features |= MMX_Extensions; - if (result & AV_386_SSE) - features |= SSE; - if (result & AV_386_SSE2) - features |= SSE2; + if (getisax (&result, 1)) + { + if (result & AV_386_CMOV) + features |= CMOV; + if (result & AV_386_MMX) + features |= MMX; + if (result & AV_386_AMD_MMX) + features |= MMX_EXTENSIONS; + if (result & AV_386_SSE) + features |= SSE; + if (result & AV_386_SSE2) + features |= SSE2; } #else char vendor[13]; @@ -333,128 +380,130 @@ static unsigned int detectCPUFeatures(void) { * original values when we access the output operands. 
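On compilers that ship <cpuid.h> (GCC and clang), the same leaf-1 feature query can be written without inline assembly; __get_cpuid also performs the CPUID-availability check that the EFLAGS bit-21 toggling prologue above does by hand. A sketch using the same CPUID.1:EDX bits the code tests:

    #include <cpuid.h>      /* __get_cpuid; GCC and clang only */

    enum { HAS_CMOV = 1, HAS_MMX = 2, HAS_SSE = 4, HAS_SSE2 = 8 };

    static unsigned
    x86_features (void)
    {
        unsigned eax, ebx, ecx, edx;
        unsigned features = 0;

        /* leaf 1, EDX: bit 15 = CMOV, 23 = MMX, 25 = SSE, 26 = SSE2 */
        if (__get_cpuid (1, &eax, &ebx, &ecx, &edx))
        {
            if (edx & (1u << 15)) features |= HAS_CMOV;
            if (edx & (1u << 23)) features |= HAS_MMX;
            if (edx & (1u << 25)) features |= HAS_SSE;
            if (edx & (1u << 26)) features |= HAS_SSE2;
        }
        return features;
    }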
*/ __asm__ ( - "pushf\n" - "pop %%eax\n" - "mov %%eax, %%ecx\n" - "xor $0x00200000, %%eax\n" - "push %%eax\n" - "popf\n" - "pushf\n" - "pop %%eax\n" - "mov $0x0, %%edx\n" - "xor %%ecx, %%eax\n" - "jz 1f\n" - - "mov $0x00000000, %%eax\n" - "push %%ebx\n" - "cpuid\n" - "mov %%ebx, %%eax\n" - "pop %%ebx\n" - "mov %%eax, %1\n" - "mov %%edx, %2\n" - "mov %%ecx, %3\n" - "mov $0x00000001, %%eax\n" - "push %%ebx\n" - "cpuid\n" - "pop %%ebx\n" - "1:\n" - "mov %%edx, %0\n" + "pushf\n" + "pop %%eax\n" + "mov %%eax, %%ecx\n" + "xor $0x00200000, %%eax\n" + "push %%eax\n" + "popf\n" + "pushf\n" + "pop %%eax\n" + "mov $0x0, %%edx\n" + "xor %%ecx, %%eax\n" + "jz 1f\n" + + "mov $0x00000000, %%eax\n" + "push %%ebx\n" + "cpuid\n" + "mov %%ebx, %%eax\n" + "pop %%ebx\n" + "mov %%eax, %1\n" + "mov %%edx, %2\n" + "mov %%ecx, %3\n" + "mov $0x00000001, %%eax\n" + "push %%ebx\n" + "cpuid\n" + "pop %%ebx\n" + "1:\n" + "mov %%edx, %0\n" : "=r" (result), - "=m" (vendor[0]), - "=m" (vendor[4]), - "=m" (vendor[8]) + "=m" (vendor[0]), + "=m" (vendor[4]), + "=m" (vendor[8]) : : "%eax", "%ecx", "%edx" ); - + #elif defined (_MSC_VER) _asm { - pushfd - pop eax - mov ecx, eax - xor eax, 00200000h - push eax - popfd - pushfd - pop eax - mov edx, 0 - xor eax, ecx - jz nocpuid - - mov eax, 0 - push ebx - cpuid - mov eax, ebx - pop ebx - mov vendor0, eax - mov vendor1, edx - mov vendor2, ecx - mov eax, 1 - push ebx - cpuid - pop ebx + pushfd + pop eax + mov ecx, eax + xor eax, 00200000h + push eax + popfd + pushfd + pop eax + mov edx, 0 + xor eax, ecx + jz nocpuid + + mov eax, 0 + push ebx + cpuid + mov eax, ebx + pop ebx + mov vendor0, eax + mov vendor1, edx + mov vendor2, ecx + mov eax, 1 + push ebx + cpuid + pop ebx nocpuid: - mov result, edx + mov result, edx } - memmove (vendor+0, &vendor0, 4); - memmove (vendor+4, &vendor1, 4); - memmove (vendor+8, &vendor2, 4); + memmove (vendor + 0, &vendor0, 4); + memmove (vendor + 4, &vendor1, 4); + memmove (vendor + 8, &vendor2, 4); #else # error unsupported compiler #endif features = 0; - if (result) { - /* result now contains the standard feature bits */ - if (result & (1 << 15)) - features |= CMOV; - if (result & (1 << 23)) - features |= MMX; - if (result & (1 << 25)) - features |= SSE; - if (result & (1 << 26)) - features |= SSE2; - if ((features & MMX) && !(features & SSE) && - (strcmp(vendor, "AuthenticAMD") == 0 || - strcmp(vendor, "Geode by NSC") == 0)) { - /* check for AMD MMX extensions */ + if (result) + { + /* result now contains the standard feature bits */ + if (result & (1 << 15)) + features |= CMOV; + if (result & (1 << 23)) + features |= MMX; + if (result & (1 << 25)) + features |= SSE; + if (result & (1 << 26)) + features |= SSE2; + if ((features & MMX) && !(features & SSE) && + (strcmp (vendor, "AuthenticAMD") == 0 || + strcmp (vendor, "Geode by NSC") == 0)) + { + /* check for AMD MMX extensions */ #ifdef __GNUC__ - __asm__( - " push %%ebx\n" - " mov $0x80000000, %%eax\n" - " cpuid\n" - " xor %%edx, %%edx\n" - " cmp $0x1, %%eax\n" - " jge 2f\n" - " mov $0x80000001, %%eax\n" - " cpuid\n" - "2:\n" - " pop %%ebx\n" - " mov %%edx, %0\n" + __asm__ ( + " push %%ebx\n" + " mov $0x80000000, %%eax\n" + " cpuid\n" + " xor %%edx, %%edx\n" + " cmp $0x1, %%eax\n" + " jge 2f\n" + " mov $0x80000001, %%eax\n" + " cpuid\n" + "2:\n" + " pop %%ebx\n" + " mov %%edx, %0\n" : "=r" (result) : : "%eax", "%ecx", "%edx" - ); + ); #elif defined _MSC_VER - _asm { - push ebx - mov eax, 80000000h - cpuid - xor edx, edx - cmp eax, 1 - jge notamd - mov eax, 80000001h - cpuid - notamd: - pop ebx - 
mov result, edx - } + _asm { + push ebx + mov eax, 80000000h + cpuid + xor edx, edx + cmp eax, 1 + jge notamd + mov eax, 80000001h + cpuid + notamd: + pop ebx + mov result, edx + } #endif - if (result & (1<<22)) - features |= MMX_Extensions; - } + if (result & (1 << 22)) + features |= MMX_EXTENSIONS; + } } #endif /* HAVE_GETISAX */ @@ -469,9 +518,9 @@ pixman_have_mmx (void) if (!initialized) { - unsigned int features = detectCPUFeatures(); - mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions); - initialized = TRUE; + unsigned int features = detect_cpu_features (); + mmx_present = (features & (MMX | MMX_EXTENSIONS)) == (MMX | MMX_EXTENSIONS); + initialized = TRUE; } return mmx_present; @@ -486,13 +535,14 @@ pixman_have_sse2 (void) if (!initialized) { - unsigned int features = detectCPUFeatures(); - sse2_present = (features & (MMX|MMX_Extensions|SSE|SSE2)) == (MMX|MMX_Extensions|SSE|SSE2); - initialized = TRUE; + unsigned int features = detect_cpu_features (); + sse2_present = (features & (MMX | MMX_EXTENSIONS | SSE | SSE2)) == (MMX | MMX_EXTENSIONS | SSE | SSE2); + initialized = TRUE; } return sse2_present; } + #endif #else /* __amd64__ */ @@ -510,25 +560,26 @@ _pixman_choose_implementation (void) { #ifdef USE_SSE2 if (pixman_have_sse2 ()) - return _pixman_implementation_create_sse2 (NULL); + return _pixman_implementation_create_sse2 (); #endif #ifdef USE_MMX - if (pixman_have_mmx()) - return _pixman_implementation_create_mmx (NULL); + if (pixman_have_mmx ()) + return _pixman_implementation_create_mmx (); #endif #ifdef USE_ARM_NEON - if (pixman_have_arm_neon()) - return _pixman_implementation_create_arm_neon (NULL); + if (pixman_have_arm_neon ()) + return _pixman_implementation_create_arm_neon (); #endif #ifdef USE_ARM_SIMD - if (pixman_have_arm_simd()) - return _pixman_implementation_create_arm_simd (NULL); + if (pixman_have_arm_simd ()) + return _pixman_implementation_create_arm_simd (); #endif #ifdef USE_VMX - if (pixman_have_vmx()) - return _pixman_implementation_create_vmx (NULL); + if (pixman_have_vmx ()) + return _pixman_implementation_create_vmx (); #endif - - return _pixman_implementation_create_fast_path (NULL); + + return _pixman_implementation_create_fast_path (); } + diff --git a/lib/pixman/pixman/pixman-edge-imp.h b/lib/pixman/pixman/pixman-edge-imp.h index 016bfaba7..a30f82108 100644 --- a/lib/pixman/pixman/pixman-edge-imp.h +++ b/lib/pixman/pixman/pixman-edge-imp.h @@ -20,11 +20,11 @@ * PERFORMANCE OF THIS SOFTWARE. */ -#ifndef rasterizeSpan +#ifndef rasterize_span #endif static void -rasterizeEdges (pixman_image_t *image, +RASTERIZE_EDGES (pixman_image_t *image, pixman_edge_t *l, pixman_edge_t *r, pixman_fixed_t t, @@ -50,7 +50,7 @@ rasterizeEdges (pixman_image_t *image, #if N_BITS == 1 /* For the non-antialiased case, round the coordinates up, in effect * sampling the center of the pixel. (The AA case does a similar - * adjustment in RenderSamplesX) */ + * adjustment in RENDER_SAMPLES_X) */ lx += X_FRAC_FIRST(1); rx += X_FRAC_FIRST(1); #endif @@ -78,53 +78,85 @@ rasterizeEdges (pixman_image_t *image, #if N_BITS == 1 { + +#ifdef WORDS_BIGENDIAN +# define SCREEN_SHIFT_LEFT(x,n) ((x) << (n)) +# define SCREEN_SHIFT_RIGHT(x,n) ((x) >> (n)) +#else +# define SCREEN_SHIFT_LEFT(x,n) ((x) >> (n)) +# define SCREEN_SHIFT_RIGHT(x,n) ((x) << (n)) +#endif + +#define LEFT_MASK(x) \ + (((x) & 0x1f) ? \ + SCREEN_SHIFT_RIGHT (0xffffffff, (x) & 0x1f) : 0) +#define RIGHT_MASK(x) \ + (((32 - (x)) & 0x1f) ? 
\ + SCREEN_SHIFT_LEFT (0xffffffff, (32 - (x)) & 0x1f) : 0) + +#define MASK_BITS(x,w,l,n,r) { \ + n = (w); \ + r = RIGHT_MASK ((x) + n); \ + l = LEFT_MASK (x); \ + if (l) { \ + n -= 32 - ((x) & 0x1f); \ + if (n < 0) { \ + n = 0; \ + l &= r; \ + r = 0; \ + } \ + } \ + n >>= 5; \ + } + uint32_t *a = line; uint32_t startmask; uint32_t endmask; int nmiddle; int width = rxi - lxi; int x = lxi; - - a += x >> FB_SHIFT; - x &= FB_MASK; - - FbMaskBits (x, width, startmask, nmiddle, endmask); - if (startmask) { - WRITE(image, a, READ(image, a) | startmask); - a++; - } - while (nmiddle--) - WRITE(image, a++, FB_ALLONES); - if (endmask) - WRITE(image, a, READ(image, a) | endmask); + + a += x >> 5; + x &= 0x1f; + + MASK_BITS (x, width, startmask, nmiddle, endmask); + + if (startmask) { + WRITE(image, a, READ(image, a) | startmask); + a++; + } + while (nmiddle--) + WRITE(image, a++, 0xffffffff); + if (endmask) + WRITE(image, a, READ(image, a) | endmask); } #else { - DefineAlpha(line,lxi); + DEFINE_ALPHA(line,lxi); int lxs; int rxs; /* Sample coverage for edge pixels */ - lxs = RenderSamplesX (lx, N_BITS); - rxs = RenderSamplesX (rx, N_BITS); + lxs = RENDER_SAMPLES_X (lx, N_BITS); + rxs = RENDER_SAMPLES_X (rx, N_BITS); /* Add coverage across row */ if (lxi == rxi) { - AddAlpha (rxs - lxs); + ADD_ALPHA (rxs - lxs); } else { int xi; - AddAlpha (N_X_FRAC(N_BITS) - lxs); - StepAlpha; + ADD_ALPHA (N_X_FRAC(N_BITS) - lxs); + STEP_ALPHA; for (xi = lxi + 1; xi < rxi; xi++) { - AddAlpha (N_X_FRAC(N_BITS)); - StepAlpha; + ADD_ALPHA (N_X_FRAC(N_BITS)); + STEP_ALPHA; } - AddAlpha (rxs); + ADD_ALPHA (rxs); } } #endif @@ -136,19 +168,19 @@ rasterizeEdges (pixman_image_t *image, #if N_BITS > 1 if (pixman_fixed_frac (y) != Y_FRAC_LAST(N_BITS)) { - RenderEdgeStepSmall (l); - RenderEdgeStepSmall (r); + RENDER_EDGE_STEP_SMALL (l); + RENDER_EDGE_STEP_SMALL (r); y += STEP_Y_SMALL(N_BITS); } else #endif { - RenderEdgeStepBig (l); - RenderEdgeStepBig (r); + RENDER_EDGE_STEP_BIG (l); + RENDER_EDGE_STEP_BIG (r); y += STEP_Y_BIG(N_BITS); line += stride; } } } -#undef rasterizeSpan +#undef rasterize_span diff --git a/lib/pixman/pixman/pixman-edge.c b/lib/pixman/pixman/pixman-edge.c index b9246af5f..81a2e960a 100644 --- a/lib/pixman/pixman/pixman-edge.c +++ b/lib/pixman/pixman/pixman-edge.c @@ -27,6 +27,35 @@ #include <string.h> #include "pixman-private.h" +#include "pixman-accessor.h" + +/* + * Step across a small sample grid gap + */ +#define RENDER_EDGE_STEP_SMALL(edge) \ + { \ + edge->x += edge->stepx_small; \ + edge->e += edge->dx_small; \ + if (edge->e > 0) \ + { \ + edge->e -= edge->dy; \ + edge->x += edge->signdx; \ + } \ + } + +/* + * Step across a large sample grid gap + */ +#define RENDER_EDGE_STEP_BIG(edge) \ + { \ + edge->x += edge->stepx_big; \ + edge->e += edge->dx_big; \ + if (edge->e > 0) \ + { \ + edge->e -= edge->dy; \ + edge->x += edge->signdx; \ + } \ + } #ifdef PIXMAN_FB_ACCESSORS #define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_accessors @@ -38,36 +67,38 @@ * 4 bit alpha */ -#define N_BITS 4 -#define rasterizeEdges fbRasterizeEdges4 +#define N_BITS 4 +#define RASTERIZE_EDGES rasterize_edges_4 -#if BITMAP_BIT_ORDER == LSBFirst -#define Shift4(o) ((o) << 2) +#ifndef WORDS_BIG_ENDIAN +#define SHIFT_4(o) ((o) << 2) #else -#define Shift4(o) ((1-(o)) << 2) +#define SHIFT_4(o) ((1 - (o)) << 2) #endif -#define Get4(x,o) (((x) >> Shift4(o)) & 0xf) -#define Put4(x,o,v) (((x) & ~(0xf << Shift4(o))) | (((v) & 0xf) << Shift4(o))) +#define GET_4(x, o) (((x) >> SHIFT_4 (o)) & 0xf) +#define PUT_4(x, o, v) \ + (((x) & 
~(0xf << SHIFT_4 (o))) | (((v) & 0xf) << SHIFT_4 (o))) -#define DefineAlpha(line,x) \ - uint8_t *__ap = (uint8_t *) line + ((x) >> 1); \ - int __ao = (x) & 1 +#define DEFINE_ALPHA(line, x) \ + uint8_t *__ap = (uint8_t *) line + ((x) >> 1); \ + int __ao = (x) & 1 -#define StepAlpha ((__ap += __ao), (__ao ^= 1)) +#define STEP_ALPHA ((__ap += __ao), (__ao ^= 1)) -#define AddAlpha(a) { \ - uint8_t __o = READ(image, __ap); \ - uint8_t __a = (a) + Get4(__o, __ao); \ - WRITE(image, __ap, Put4 (__o, __ao, __a | (0 - ((__a) >> 4)))); \ +#define ADD_ALPHA(a) \ + { \ + uint8_t __o = READ (image, __ap); \ + uint8_t __a = (a) + GET_4 (__o, __ao); \ + WRITE (image, __ap, PUT_4 (__o, __ao, __a | (0 - ((__a) >> 4)))); \ } #include "pixman-edge-imp.h" -#undef AddAlpha -#undef StepAlpha -#undef DefineAlpha -#undef rasterizeEdges +#undef ADD_ALPHA +#undef STEP_ALPHA +#undef DEFINE_ALPHA +#undef RASTERIZE_EDGES #undef N_BITS @@ -76,35 +107,38 @@ */ #define N_BITS 1 -#define rasterizeEdges fbRasterizeEdges1 +#define RASTERIZE_EDGES rasterize_edges_1 #include "pixman-edge-imp.h" -#undef rasterizeEdges +#undef RASTERIZE_EDGES #undef N_BITS /* * 8 bit alpha */ -static inline uint8_t +static force_inline uint8_t clip255 (int x) { - if (x > 255) return 255; + if (x > 255) + return 255; + return x; } -#define add_saturate_8(buf,val,length) \ - do { \ - int i__ = (length); \ - uint8_t *buf__ = (buf); \ - int val__ = (val); \ - \ - while (i__--) \ - { \ - WRITE(image, (buf__), clip255 (READ(image, (buf__)) + (val__))); \ - (buf__)++; \ - } \ +#define ADD_SATURATE_8(buf, val, length) \ + do \ + { \ + int i__ = (length); \ + uint8_t *buf__ = (buf); \ + int val__ = (val); \ + \ + while (i__--) \ + { \ + WRITE (image, (buf__), clip255 (READ (image, (buf__)) + (val__))); \ + (buf__)++; \ + } \ } while (0) /* @@ -119,13 +153,13 @@ clip255 (int x) * fill_start fill_end */ static void -fbRasterizeEdges8 (pixman_image_t *image, - pixman_edge_t *l, - pixman_edge_t *r, - pixman_fixed_t t, - pixman_fixed_t b) +rasterize_edges_8 (pixman_image_t *image, + pixman_edge_t * l, + pixman_edge_t * r, + pixman_fixed_t t, + pixman_fixed_t b) { - pixman_fixed_t y = t; + pixman_fixed_t y = t; uint32_t *line; int fill_start = -1, fill_end = -1; int fill_size = 0; @@ -138,153 +172,165 @@ fbRasterizeEdges8 (pixman_image_t *image, for (;;) { uint8_t *ap = (uint8_t *) line; - pixman_fixed_t lx, rx; - int lxi, rxi; + pixman_fixed_t lx, rx; + int lxi, rxi; - /* clip X */ - lx = l->x; - if (lx < 0) + /* clip X */ + lx = l->x; + if (lx < 0) lx = 0; - rx = r->x; - if (pixman_fixed_to_int (rx) >= width) + + rx = r->x; + + if (pixman_fixed_to_int (rx) >= width) + { /* Use the last pixel of the scanline, covered 100%. * We can't use the first pixel following the scanline, * because accessing it could result in a buffer overrun. */ rx = pixman_int_to_fixed (width) - 1; + } - /* Skip empty (or backwards) sections */ - if (rx > lx) - { + /* Skip empty (or backwards) sections */ + if (rx > lx) + { int lxs, rxs; - /* Find pixel bounds for span. */ - lxi = pixman_fixed_to_int (lx); - rxi = pixman_fixed_to_int (rx); + /* Find pixel bounds for span. 
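The RENDER_EDGE_STEP_SMALL/BIG macros defined at the top of pixman-edge.c advance a polygon edge with a fixed-point Bresenham step: x moves by a precomputed amount per sample row while an error term accumulates the remainder and triggers one extra signdx correction when it goes positive. A stripped-down sketch (the _SMALL and _BIG variants differ only in which precomputed step and increment they apply; field names loosely mirror pixman_edge_t):

    typedef struct
    {
        int x;        /* current intercept, fixed point */
        int e;        /* running error term */
        int stepx;    /* whole step per sample row */
        int signdx;   /* +1 or -1 correction */
        int dx, dy;   /* error increment and wrap */
    } edge_sketch_t;

    static void
    edge_step (edge_sketch_t *edge)
    {
        edge->x += edge->stepx;       /* integer part of the slope */
        edge->e += edge->dx;          /* accumulate the remainder */
        if (edge->e > 0)
        {
            edge->e -= edge->dy;      /* wrap the error term */
            edge->x += edge->signdx;  /* ...and take the extra step */
        }
    }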
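The expression __a | (0 - ((__a) >> 4)) in ADD_ALPHA is a branch-free clamp: the sum of two 4-bit coverage values fits in five bits, so the shifted-out carry, negated, yields an all-ones mask exactly when the addition overflowed. As a standalone function (illustrative):

    /* x and y are 4-bit coverage values in [0, 15]. */
    static unsigned
    add_4bit_saturate (unsigned x, unsigned y)
    {
        unsigned sum = x + y;        /* at most 0x1e */

        sum |= 0u - (sum >> 4);      /* all ones on overflow */

        return sum & 0xf;            /* 0xf if overflowed, else sum */
    }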
*/ + lxi = pixman_fixed_to_int (lx); + rxi = pixman_fixed_to_int (rx); /* Sample coverage for edge pixels */ - lxs = RenderSamplesX (lx, 8); - rxs = RenderSamplesX (rx, 8); + lxs = RENDER_SAMPLES_X (lx, 8); + rxs = RENDER_SAMPLES_X (rx, 8); /* Add coverage across row */ - if (lxi == rxi) - { - WRITE(image, ap +lxi, clip255 (READ(image, ap + lxi) + rxs - lxs)); + if (lxi == rxi) + { + WRITE (image, ap + lxi, + clip255 (READ (image, ap + lxi) + rxs - lxs)); } - else - { - WRITE(image, ap + lxi, clip255 (READ(image, ap + lxi) + N_X_FRAC(8) - lxs)); + else + { + WRITE (image, ap + lxi, + clip255 (READ (image, ap + lxi) + N_X_FRAC (8) - lxs)); - /* Move forward so that lxi/rxi is the pixel span */ - lxi++; + /* Move forward so that lxi/rxi is the pixel span */ + lxi++; - /* Don't bother trying to optimize the fill unless + /* Don't bother trying to optimize the fill unless * the span is longer than 4 pixels. */ - if (rxi - lxi > 4) - { - if (fill_start < 0) - { - fill_start = lxi; - fill_end = rxi; - fill_size++; + if (rxi - lxi > 4) + { + if (fill_start < 0) + { + fill_start = lxi; + fill_end = rxi; + fill_size++; } - else - { - if (lxi >= fill_end || rxi < fill_start) - { - /* We're beyond what we saved, just fill it */ - add_saturate_8 (ap + fill_start, - fill_size * N_X_FRAC(8), - fill_end - fill_start); - fill_start = lxi; - fill_end = rxi; - fill_size = 1; + else + { + if (lxi >= fill_end || rxi < fill_start) + { + /* We're beyond what we saved, just fill it */ + ADD_SATURATE_8 (ap + fill_start, + fill_size * N_X_FRAC (8), + fill_end - fill_start); + fill_start = lxi; + fill_end = rxi; + fill_size = 1; } - else - { - /* Update fill_start */ - if (lxi > fill_start) - { - add_saturate_8 (ap + fill_start, - fill_size * N_X_FRAC(8), - lxi - fill_start); - fill_start = lxi; + else + { + /* Update fill_start */ + if (lxi > fill_start) + { + ADD_SATURATE_8 (ap + fill_start, + fill_size * N_X_FRAC (8), + lxi - fill_start); + fill_start = lxi; } - else if (lxi < fill_start) - { - add_saturate_8 (ap + lxi, N_X_FRAC(8), - fill_start - lxi); + else if (lxi < fill_start) + { + ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), + fill_start - lxi); } - /* Update fill_end */ - if (rxi < fill_end) - { - add_saturate_8 (ap + rxi, - fill_size * N_X_FRAC(8), - fill_end - rxi); - fill_end = rxi; + /* Update fill_end */ + if (rxi < fill_end) + { + ADD_SATURATE_8 (ap + rxi, + fill_size * N_X_FRAC (8), + fill_end - rxi); + fill_end = rxi; } - else if (fill_end < rxi) - { - add_saturate_8 (ap + fill_end, - N_X_FRAC(8), - rxi - fill_end); + else if (fill_end < rxi) + { + ADD_SATURATE_8 (ap + fill_end, + N_X_FRAC (8), + rxi - fill_end); } - fill_size++; + fill_size++; } } } - else - { - add_saturate_8 (ap + lxi, N_X_FRAC(8), rxi - lxi); + else + { + ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), rxi - lxi); } - WRITE(image, ap + rxi, clip255 (READ(image, ap + rxi) + rxs)); + WRITE (image, ap + rxi, clip255 (READ (image, ap + rxi) + rxs)); } } - if (y == b) { + if (y == b) + { /* We're done, make sure we clean up any remaining fill. 
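The fill_start/fill_end bookkeeping in this function is a batching optimization: interior pixels of a span all receive the same N_X_FRAC(8) coverage per subsample row, so the rasterizer remembers one pending span plus a row count and only touches memory when the next span no longer lines up (or uses MEMSET_WRAPPED once the accumulated coverage is already full). The flush step amounts to a deferred saturating add, roughly (flush_span is an illustrative name):

    #include <stdint.h>

    /* Add `count` rows' worth of interior coverage to a pending span,
     * clamping each byte at 255 -- the deferred equivalent of calling
     * ADD_SATURATE_8 once per subsample row. */
    static void
    flush_span (uint8_t *row, int start, int end, int count, int frac)
    {
        int i;

        for (i = start; i < end; ++i)
        {
            int v = row[i] + count * frac;

            row[i] = v > 255 ? 255 : (uint8_t) v;
        }
    }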
*/ - if (fill_start != fill_end) { - if (fill_size == N_Y_FRAC(8)) - { - MEMSET_WRAPPED (image, ap + fill_start, 0xff, fill_end - fill_start); + if (fill_start != fill_end) + { + if (fill_size == N_Y_FRAC (8)) + { + MEMSET_WRAPPED (image, ap + fill_start, + 0xff, fill_end - fill_start); } - else - { - add_saturate_8 (ap + fill_start, fill_size * N_X_FRAC(8), - fill_end - fill_start); + else + { + ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8), + fill_end - fill_start); } - } - break; - } + } + break; + } - if (pixman_fixed_frac (y) != Y_FRAC_LAST(8)) - { - RenderEdgeStepSmall (l); - RenderEdgeStepSmall (r); - y += STEP_Y_SMALL(8); + if (pixman_fixed_frac (y) != Y_FRAC_LAST (8)) + { + RENDER_EDGE_STEP_SMALL (l); + RENDER_EDGE_STEP_SMALL (r); + y += STEP_Y_SMALL (8); } - else - { - RenderEdgeStepBig (l); - RenderEdgeStepBig (r); - y += STEP_Y_BIG(8); + else + { + RENDER_EDGE_STEP_BIG (l); + RENDER_EDGE_STEP_BIG (r); + y += STEP_Y_BIG (8); if (fill_start != fill_end) { - if (fill_size == N_Y_FRAC(8)) - { - MEMSET_WRAPPED (image, ap + fill_start, 0xff, fill_end - fill_start); + if (fill_size == N_Y_FRAC (8)) + { + MEMSET_WRAPPED (image, ap + fill_start, + 0xff, fill_end - fill_start); } - else - { - add_saturate_8 (ap + fill_start, fill_size * N_X_FRAC(8), - fill_end - fill_start); + else + { + ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8), + fill_end - fill_start); } + fill_start = fill_end = -1; fill_size = 0; - } - line += stride; + } + + line += stride; } } } @@ -294,21 +340,23 @@ static #endif void PIXMAN_RASTERIZE_EDGES (pixman_image_t *image, - pixman_edge_t *l, - pixman_edge_t *r, - pixman_fixed_t t, - pixman_fixed_t b) + pixman_edge_t * l, + pixman_edge_t * r, + pixman_fixed_t t, + pixman_fixed_t b) { switch (PIXMAN_FORMAT_BPP (image->bits.format)) { case 1: - fbRasterizeEdges1 (image, l, r, t, b); + rasterize_edges_1 (image, l, r, t, b); break; + case 4: - fbRasterizeEdges4 (image, l, r, t, b); + rasterize_edges_4 (image, l, r, t, b); break; + case 8: - fbRasterizeEdges8 (image, l, r, t, b); + rasterize_edges_8 (image, l, r, t, b); break; } } @@ -317,12 +365,14 @@ PIXMAN_RASTERIZE_EDGES (pixman_image_t *image, PIXMAN_EXPORT void pixman_rasterize_edges (pixman_image_t *image, - pixman_edge_t *l, - pixman_edge_t *r, - pixman_fixed_t t, - pixman_fixed_t b) + pixman_edge_t * l, + pixman_edge_t * r, + pixman_fixed_t t, + pixman_fixed_t b) { - if (image->common.read_func || image->common.write_func) + return_if_fail (image->type == BITS); + + if (image->bits.read_func || image->bits.write_func) pixman_rasterize_edges_accessors (image, l, r, t, b); else pixman_rasterize_edges_no_accessors (image, l, r, t, b); diff --git a/lib/pixman/pixman/pixman-fast-path.c b/lib/pixman/pixman/pixman-fast-path.c index 5f78bc335..5ab8d8c99 100644 --- a/lib/pixman/pixman/pixman-fast-path.c +++ b/lib/pixman/pixman/pixman-fast-path.c @@ -23,110 +23,139 @@ * Author: Keith Packard, SuSE, Inc. */ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif #include <string.h> #include "pixman-private.h" #include "pixman-combine32.h" -#define FbFullMask(n) ((n) == 32 ? 
(uint32_t)-1 : ((((uint32_t) 1) << n) - 1)) - -#undef READ -#undef WRITE -#define READ(img,x) (*(x)) -#define WRITE(img,ptr,v) ((*(ptr)) = (v)) static force_inline uint32_t -fbOver (uint32_t src, uint32_t dest) +fetch_24 (uint8_t *a) { - // dest = (dest * (255 - alpha)) / 255 + src - uint32_t a = ~src >> 24; // 255 - alpha == 255 + (~alpha + 1) == ~alpha - FbByteMulAdd(dest, a, src); + if (((unsigned long)a) & 1) + { +#ifdef WORDS_BIGENDIAN + return (*a << 16) | (*(uint16_t *)(a + 1)); +#else + return *a | (*(uint16_t *)(a + 1) << 8); +#endif + } + else + { +#ifdef WORDS_BIGENDIAN + return (*(uint16_t *)a << 8) | *(a + 2); +#else + return *(uint16_t *)a | (*(a + 2) << 16); +#endif + } +} - return dest; +static force_inline void +store_24 (uint8_t *a, + uint32_t v) +{ + if (((unsigned long)a) & 1) + { +#ifdef WORDS_BIGENDIAN + *a = (uint8_t) (v >> 16); + *(uint16_t *)(a + 1) = (uint16_t) (v); +#else + *a = (uint8_t) (v); + *(uint16_t *)(a + 1) = (uint16_t) (v >> 8); +#endif + } + else + { +#ifdef WORDS_BIGENDIAN + *(uint16_t *)a = (uint16_t)(v >> 8); + *(a + 2) = (uint8_t)v; +#else + *(uint16_t *)a = (uint16_t)v; + *(a + 2) = (uint8_t)(v >> 16); +#endif + } } -static uint32_t -fbOver24 (uint32_t x, uint32_t y) +static force_inline uint32_t +over (uint32_t src, + uint32_t dest) { - uint16_t a = ~x >> 24; - uint16_t t; - uint32_t m,n,o; - - m = FbOverU(x,y,0,a,t); - n = FbOverU(x,y,8,a,t); - o = FbOverU(x,y,16,a,t); - return m|n|o; + uint32_t a = ~src >> 24; + + UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src); + + return dest; } static uint32_t -fbIn (uint32_t x, uint8_t y) +in (uint32_t x, + uint8_t y) { - uint16_t a = y; - uint16_t t; - uint32_t m,n,o,p; - - m = FbInU(x,0,a,t); - n = FbInU(x,8,a,t); - o = FbInU(x,16,a,t); - p = FbInU(x,24,a,t); - return m|n|o|p; + uint16_t a = y; + + UN8x4_MUL_UN8 (x, a); + + return x; } /* * Naming convention: * - * opSRCxMASKxDST + * op_src_mask_dest */ - static void -fbCompositeOver_x888x8x8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t *src, *srcLine; - uint32_t *dst, *dstLine; - uint8_t *mask, *maskLine; - int srcStride, maskStride, dstStride; + uint32_t *src, *src_line; + uint32_t *dst, *dst_line; + uint8_t *mask, *mask_line; + int src_stride, mask_stride, dst_stride; uint8_t m; uint32_t s, d; uint16_t w; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - src = srcLine; - srcLine += srcStride; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = 
mask_line; + mask_line += mask_stride; w = width; while (w--) { - m = READ(pMask, mask++); + m = *mask++; if (m) { - s = READ(pSrc, src) | 0xff000000; + s = *src | 0xff000000; if (m == 0xff) - WRITE(pDst, dst, s); + { + *dst = s; + } else { - d = fbIn (s, m); - WRITE(pDst, dst, fbOver (d, READ(pDst, dst))); + d = in (s, m); + *dst = over (d, *dst); } } src++; @@ -136,55 +165,53 @@ fbCompositeOver_x888x8x8888 (pixman_implementation_t *imp, } static void -fbCompositeSolidMaskIn_nx8x8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *iSrc, - pixman_image_t *iMask, - pixman_image_t *iDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src, srca; - uint8_t *dstLine, *dst, dstMask; - uint8_t *maskLine, *mask, m; - int dstStride, maskStride; - uint16_t w; - uint16_t t; + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + uint16_t w; + uint16_t t; - fbComposeGetSolid(iSrc, src, iDst->bits.format); + src = _pixman_image_get_solid (src_image, dest_image->bits.format); - dstMask = FbFullMask (PIXMAN_FORMAT_DEPTH (iDst->bits.format)); srca = src >> 24; - fbComposeGetStart (iDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (iMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - if (srca == 0xff) { + if (srca == 0xff) + { while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; while (w--) { m = *mask++; + if (m == 0) - { *dst = 0; - } else if (m != 0xff) - { - *dst = FbIntMult(m, *dst, t); - } + *dst = MUL_UN8 (m, *dst, t); + dst++; } } @@ -193,133 +220,127 @@ fbCompositeSolidMaskIn_nx8x8 (pixman_implementation_t *imp, { while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; while (w--) { m = *mask++; - m = FbIntMult(m, srca, t); + m = MUL_UN8 (m, srca, t); + if (m == 0) - { *dst = 0; - } else if (m != 0xff) - { - *dst = FbIntMult(m, *dst, t); - } + *dst = MUL_UN8 (m, *dst, t); + dst++; } } } } - static void -fbCompositeSrcIn_8x8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *iSrc, - pixman_image_t *iMask, - pixman_image_t *iDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dest_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - uint8_t s; - uint16_t t; + uint8_t *dst_line, *dst; + 
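    /* A sketch (not the actual macro text) of the divisionless exact
     * a * b / 255 that MUL_UN8 from pixman-combine32.h computes, as
     * used by the IN loops below:
     *
     *     t = a * b + 0x80;
     *     result = (t + (t >> 8)) >> 8;
     *
     * For a, b in 0..255 this equals (a * b + 127) / 255 exactly.
     */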
uint8_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + uint8_t s; + uint16_t t; - fbComposeGetStart (iSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); - fbComposeGetStart (iDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { s = *src++; + if (s == 0) - { *dst = 0; - } else if (s != 0xff) - { - *dst = FbIntMult(s, *dst, t); - } + *dst = MUL_UN8 (s, *dst, t); + dst++; } } } static void -fbCompositeSolidMask_nx8x8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src, srca; - uint32_t *dstLine, *dst, d, dstMask; - uint8_t *maskLine, *mask, m; - int dstStride, maskStride; - uint16_t w; + uint32_t src, srca; + uint32_t *dst_line, *dst, d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + uint16_t w; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); - dstMask = FbFullMask (PIXMAN_FORMAT_DEPTH (pDst->bits.format)); srca = src >> 24; if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; while (w--) { - m = READ(pMask, mask++); + m = *mask++; if (m == 0xff) { if (srca == 0xff) - WRITE(pDst, dst, src & dstMask); + *dst = src; else - WRITE(pDst, dst, fbOver (src, READ(pDst, dst)) & dstMask); + *dst = over (src, *dst); } else if (m) { - d = fbIn (src, m); - WRITE(pDst, dst, fbOver (d, READ(pDst, dst)) & dstMask); + d = in (src, m); + *dst = over (d, *dst); } dst++; } @@ -327,136 +348,187 @@ fbCompositeSolidMask_nx8x8888 (pixman_implementation_t *imp, } static void -fbCompositeSolidMask_nx8888x8888C (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src, srca; - uint32_t *dstLine, 
*dst, d, dstMask; - uint32_t *maskLine, *mask, ma; - int dstStride, maskStride; - uint16_t w; - uint32_t m, n, o, p; + uint32_t src, srca, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + uint16_t w; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); - dstMask = FbFullMask (PIXMAN_FORMAT_DEPTH (pDst->bits.format)); srca = src >> 24; if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; while (w--) { - ma = READ(pMask, mask++); + ma = *mask++; + + if (ma) + { + d = *dst; + s = src; + + UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d); + + *dst = s; + } + + dst++; + } + } +} + +static void +fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, s; + uint32_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + uint16_t w; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w--) + { + ma = *mask++; if (ma == 0xffffffff) { if (srca == 0xff) - WRITE(pDst, dst, src & dstMask); + *dst = src; else - WRITE(pDst, dst, fbOver (src, READ(pDst, dst)) & dstMask); + *dst = over (src, *dst); } else if (ma) { - d = READ(pDst, dst); -#define FbInOverC(src,srca,msk,dst,i,result) { \ - uint16_t __a = FbGet8(msk,i); \ - uint32_t __t, __ta; \ - uint32_t __i; \ - __t = FbIntMult (FbGet8(src,i), __a,__i); \ - __ta = (uint8_t) ~FbIntMult (srca, __a,__i); \ - __t = __t + FbIntMult(FbGet8(dst,i),__ta,__i); \ - __t = (uint32_t) (uint8_t) (__t | (-(__t >> 8))); \ - result = __t << (i); \ -} - FbInOverC (src, srca, ma, d, 0, m); - FbInOverC (src, srca, ma, d, 8, n); - FbInOverC (src, srca, ma, d, 16, o); - FbInOverC (src, srca, ma, d, 24, p); - WRITE(pDst, dst, m|n|o|p); + d = *dst; + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = d; } + dst++; } } } static void -fbCompositeSolidMask_nx8x0888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_over_n_8_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t 
src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src, srca; - uint8_t *dstLine, *dst; - uint32_t d; - uint8_t *maskLine, *mask, m; - int dstStride, maskStride; - uint16_t w; + uint32_t src, srca; + uint8_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + uint16_t w; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 3); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; while (w--) { - m = READ(pMask, mask++); + m = *mask++; if (m == 0xff) { if (srca == 0xff) + { d = src; + } else { - d = Fetch24(pDst, dst); - d = fbOver24 (src, d); + d = fetch_24 (dst); + d = over (src, d); } - Store24(pDst, dst,d); + store_24 (dst, d); } else if (m) { - d = fbOver24 (fbIn(src,m), Fetch24(pDst, dst)); - Store24(pDst, dst, d); + d = over (in (src, m), fetch_24 (dst)); + store_24 (dst, d); } dst += 3; } @@ -464,63 +536,65 @@ fbCompositeSolidMask_nx8x0888 (pixman_implementation_t *imp, } static void -fbCompositeSolidMask_nx8x0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src, srca; - uint16_t *dstLine, *dst; - uint32_t d; - uint8_t *maskLine, *mask, m; - int dstStride, maskStride; - uint16_t w; + uint32_t src, srca; + uint16_t *dst_line, *dst; + uint32_t d; + uint8_t *mask_line, *mask, m; + int dst_stride, mask_stride; + uint16_t w; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; while (w--) { - m = READ(pMask, mask++); + m = *mask++; if (m == 0xff) { if (srca == 0xff) + { d = src; + } else { - d = READ(pDst, dst); - d = fbOver24 (src, cvt0565to0888(d)); + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); } - WRITE(pDst, dst, cvt8888to0565(d)); + *dst = CONVERT_8888_TO_0565 (d); } else if (m) { - d = READ(pDst, dst); - d = fbOver24 
(fbIn(src,m), cvt0565to0888(d)); - WRITE(pDst, dst, cvt8888to0565(d)); + d = *dst; + d = over (in (src, m), CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); } dst++; } @@ -528,73 +602,76 @@ fbCompositeSolidMask_nx8x0565 (pixman_implementation_t *imp, } static void -fbCompositeSolidMask_nx8888x0565C (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src, srca; - uint16_t src16; - uint16_t *dstLine, *dst; - uint32_t d; - uint32_t *maskLine, *mask, ma; - int dstStride, maskStride; - uint16_t w; - uint32_t m, n, o; + uint32_t src, srca, s; + uint16_t src16; + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *mask_line, *mask, ma; + int dst_stride, mask_stride; + uint16_t w; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) return; - src16 = cvt8888to0565(src); + src16 = CONVERT_8888_TO_0565 (src); - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; while (w--) { - ma = READ(pMask, mask++); + ma = *mask++; if (ma == 0xffffffff) { if (srca == 0xff) { - WRITE(pDst, dst, src16); + *dst = src16; } else { - d = READ(pDst, dst); - d = fbOver24 (src, cvt0565to0888(d)); - WRITE(pDst, dst, cvt8888to0565(d)); + d = *dst; + d = over (src, CONVERT_0565_TO_0888 (d)); + *dst = CONVERT_8888_TO_0565 (d); } } else if (ma) { - d = READ(pDst, dst); - d = cvt0565to0888(d); - FbInOverC (src, srca, ma, d, 0, m); - FbInOverC (src, srca, ma, d, 8, n); - FbInOverC (src, srca, ma, d, 16, o); - d = m|n|o; - WRITE(pDst, dst, cvt8888to0565(d)); + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + s = src; + + UN8x4_MUL_UN8x4 (s, ma); + UN8x4_MUL_UN8 (ma, srca); + ma = ~ma; + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s); + + *dst = CONVERT_8888_TO_0565 (d); } dst++; } @@ -602,96 +679,95 @@ fbCompositeSolidMask_nx8888x0565C (pixman_implementation_t *imp, } static void -fbCompositeSrc_8888x8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t *dstLine, *dst, dstMask; - uint32_t *srcLine, *src, s; 
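    /* The over () helper used by the rewritten function implements the
     * usual OVER identity; per channel, conceptually:
     *
     *     dest_c = src_c + dest_c * (255 - src_a) / 255
     *
     * (division by 255 rounded).  UN8x4_MUL_UN8_ADD_UN8x4 evaluates
     * this for all four 8-bit channels at once by splitting each pixel
     * into two 16-bit-lane halves with 0x00ff00ff-style masks; see
     * pixman-combine32.h.
     */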
- int dstStride, srcStride; - uint8_t a; - uint16_t w; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + uint32_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint8_t a; + uint16_t w; - dstMask = FbFullMask (PIXMAN_FORMAT_DEPTH (pDst->bits.format)); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { - s = READ(pSrc, src++); + s = *src++; a = s >> 24; if (a == 0xff) - WRITE(pDst, dst, s & dstMask); + *dst = s; else if (s) - WRITE(pDst, dst, fbOver (s, READ(pDst, dst)) & dstMask); + *dst = over (s, *dst); dst++; } } } static void -fbCompositeSrc_8888x0888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_over_8888_0888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint8_t *dstLine, *dst; - uint32_t d; - uint32_t *srcLine, *src, s; - uint8_t a; - int dstStride, srcStride; - uint16_t w; + uint8_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + uint16_t w; - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 3); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { - s = READ(pSrc, src++); + s = *src++; a = s >> 24; if (a) { if (a == 0xff) d = s; else - d = fbOver24 (s, Fetch24(pDst, dst)); - Store24(pDst, dst, d); + d = over (s, fetch_24 (dst)); + + store_24 (dst, d); } dst += 3; } @@ -699,52 +775,54 @@ fbCompositeSrc_8888x0888 (pixman_implementation_t *imp, } static void -fbCompositeSrc_8888x0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint16_t *dstLine, *dst; - uint32_t d; - uint32_t *srcLine, *src, s; - uint8_t a; - int dstStride, srcStride; - uint16_t w; + uint16_t *dst_line, *dst; + uint32_t d; + uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + uint16_t 
w; - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { - s = READ(pSrc, src++); + s = *src++; a = s >> 24; if (s) { if (a == 0xff) + { d = s; + } else { - d = READ(pDst, dst); - d = fbOver24 (s, cvt0565to0888(d)); + d = *dst; + d = over (s, CONVERT_0565_TO_0888 (d)); } - WRITE(pDst, dst, cvt8888to0565(d)); + *dst = CONVERT_8888_TO_0565 (d); } dst++; } @@ -752,90 +830,90 @@ fbCompositeSrc_8888x0565 (pixman_implementation_t *imp, } static void -fbCompositeSrc_x888x0565 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_src_x888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint16_t *dstLine, *dst; - uint32_t *srcLine, *src, s; - int dstStride, srcStride; - uint16_t w; + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint16_t w; - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { - s = READ(pSrc, src++); - WRITE(pDst, dst, cvt8888to0565(s)); + s = *src++; + *dst = CONVERT_8888_TO_0565 (s); dst++; } } } static void -fbCompositeSrcAdd_8000x8000 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_add_8000_8000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - uint8_t s, d; - uint16_t t; + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + uint8_t s, d; + uint16_t t; - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, 
dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { - s = READ(pSrc, src++); + s = *src++; if (s) { if (s != 0xff) { - d = READ(pDst, dst); + d = *dst; t = d + s; s = t | (0 - (t >> 8)); } - WRITE(pDst, dst, s); + *dst = s; } dst++; } @@ -843,57 +921,49 @@ fbCompositeSrcAdd_8000x8000 (pixman_implementation_t *imp, } static void -fbCompositeSrcAdd_8888x8888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - uint32_t s, d; - uint16_t t; - uint32_t m,n,o,p; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + uint32_t s, d; - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { - s = READ(pSrc, src++); + s = *src++; if (s) { if (s != 0xffffffff) { - d = READ(pDst, dst); + d = *dst; if (d) - { - m = FbAdd(s,d,0,t); - n = FbAdd(s,d,8,t); - o = FbAdd(s,d,16,t); - p = FbAdd(s,d,24,t); - s = m|n|o|p; - } + UN8x4_ADD_UN8x4 (s, d); } - WRITE(pDst, dst, s); + *dst = s; } dst++; } @@ -901,54 +971,54 @@ fbCompositeSrcAdd_8888x8888 (pixman_implementation_t *imp, } static void -fbCompositeSrcAdd_8888x8x8 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_add_8888_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t src; - uint8_t sa; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - fbComposeGetSolid (pSrc, src, pDst->bits.format); + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + uint32_t src; + uint8_t sa; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + src = 
_pixman_image_get_solid (src_image, dst_image->bits.format); sa = (src >> 24); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; while (w--) { - uint16_t tmp; - uint16_t a; - uint32_t m, d; - uint32_t r; + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; - a = READ(pMask, mask++); - d = READ(pDst, dst); + a = *mask++; + d = *dst; - m = FbInU (sa, 0, a, tmp); - r = FbAdd (m, d, 0, tmp); + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); - WRITE(pDst, dst++, r); + *dst++ = r; } } } @@ -958,229 +1028,249 @@ fbCompositeSrcAdd_8888x8x8 (pixman_implementation_t *imp, */ static void -fbCompositeSolidFill (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_solid_fill (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t src; + uint32_t src; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); - if (pDst->bits.format == PIXMAN_a8) + if (dst_image->bits.format == PIXMAN_a8) + { src = src >> 24; - else if (pDst->bits.format == PIXMAN_r5g6b5 || - pDst->bits.format == PIXMAN_b5g6r5) - src = cvt8888to0565 (src); - - pixman_fill (pDst->bits.bits, pDst->bits.rowstride, - PIXMAN_FORMAT_BPP (pDst->bits.format), - xDst, yDst, - width, height, - src); + } + else if (dst_image->bits.format == PIXMAN_r5g6b5 || + dst_image->bits.format == PIXMAN_b5g6r5) + { + src = CONVERT_8888_TO_0565 (src); + } + + pixman_fill (dst_image->bits.bits, dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, + width, height, + src); } static void -fbCompositeSrc_8888xx888 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_src_8888_x888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - uint32_t *dst; + uint32_t *dst; uint32_t *src; - int dstStride, srcStride; - uint32_t n_bytes = width * sizeof (uint32_t); + int dst_stride, src_stride; + uint32_t n_bytes = width * sizeof (uint32_t); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, src, 1); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dst, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst, 1); while (height--) { memcpy (dst, src, n_bytes); - dst += dstStride; - src += srcStride; + dst += dst_stride; + src += src_stride; } } -static const FastPathInfo c_fast_paths[] = +static const pixman_fast_path_t c_fast_paths[] = { - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, 
PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r8g8b8, fbCompositeSolidMask_nx8x0888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b8g8r8, fbCompositeSolidMask_nx8x0888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8888x8888C, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8888x8888C, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8888x0565C, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8888x8888C, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8888x8888C, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8888x0565C, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeOver_x888x8x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeOver_x888x8x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeOver_x888x8x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_8888x0565, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_8888x0565, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrcAdd_8888x8888, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrcAdd_8888x8888, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000, 0 }, - { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSolidFill, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSolidFill, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSolidFill, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSolidFill, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_a8, fbCompositeSolidFill, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSolidFill, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888xx888, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, 
fbCompositeSrc_8888xx888, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888xx888, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888xx888, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_x888x0565, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_x888x0565, 0 }, - { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcIn_8x8, 0 }, - { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSolidMaskIn_nx8x8, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fast_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fast_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r8g8b8, fast_composite_over_n_8_0888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b8g8r8, fast_composite_over_n_8_0888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fast_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fast_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fast_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fast_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fast_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fast_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fast_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fast_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fast_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fast_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fast_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fast_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fast_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fast_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fast_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fast_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fast_composite_over_8888_0565, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fast_composite_add_8888_8888, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fast_composite_add_8888_8888, 0 }, + { PIXMAN_OP_ADD, 
PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fast_composite_add_8000_8000, 0 }, + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fast_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fast_composite_add_8888_8_8, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, fast_composite_solid_fill, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, fast_composite_solid_fill, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_a8b8g8r8, fast_composite_solid_fill, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_x8b8g8r8, fast_composite_solid_fill, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_a8, fast_composite_solid_fill, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, fast_composite_solid_fill, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fast_composite_src_8888_x888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fast_composite_src_8888_x888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fast_composite_src_8888_x888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fast_composite_src_8888_x888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fast_composite_src_x888_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fast_composite_src_x888_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fast_composite_src_x888_0565, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fast_composite_src_x888_0565, 0 }, + { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fast_composite_in_8_8, 0 }, + { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fast_composite_in_n_8_8, 0 }, { PIXMAN_OP_NONE }, }; static void -fbCompositeSrcScaleNearest (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *pSrc, - pixman_image_t *pMask, - pixman_image_t *pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +fast_composite_src_scale_nearest (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { uint32_t *dst; uint32_t *src; - int dstStride, srcStride; - int i, j; + int dst_stride, src_stride; + int i, j; pixman_vector_t v; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dst, 1); - /* pass in 0 instead of xSrc and ySrc because xSrc and ySrc need to be + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst, 1); + /* pass in 0 instead of src_x and src_y because src_x and src_y need to be * transformed from destination space to source space */ - fbComposeGetStart (pSrc, 0, 0, uint32_t, srcStride, src, 1); - + PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src, 1); + /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed(xSrc) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed(ySrc) + pixman_fixed_1 / 2; + v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2; v.vector[2] = pixman_fixed_1; - - if (!pixman_transform_point_3d (pSrc->common.transform, &v)) - return; - + + if (!pixman_transform_point_3d 
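    /* A worked example of the pixman_fixed_e adjustment made below
     * (assuming src_x = 0 and a pure 2x downscale, i.e. matrix[0][0]
     * == 2 * pixman_fixed_1): destination pixel centers 0.5 and 1.5
     * map to source coordinates 1.0 and 3.0, each exactly halfway
     * between two source pixel centers; subtracting pixman_fixed_e
     * before the '>> 16' truncation resolves them to source pixels
     * 0 and 2 rather than 1 and 3, so ties round down, never up.
     */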
(src_image->common.transform, &v)) + return; + /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ v.vector[0] -= pixman_fixed_e; v.vector[1] -= pixman_fixed_e; - - for (j = 0; j < height; j++) { - pixman_fixed_t vx = v.vector[0]; - pixman_fixed_t vy = v.vector[1]; - for (i = 0; i < width; ++i) { - pixman_bool_t inside_bounds; - uint32_t result; - int x, y; - x = vx >> 16; - y = vy >> 16; - - /* apply the repeat function */ - switch (pSrc->common.repeat) { + + for (j = 0; j < height; j++) + { + pixman_fixed_t vx = v.vector[0]; + pixman_fixed_t vy = v.vector[1]; + + for (i = 0; i < width; ++i) + { + pixman_bool_t inside_bounds; + uint32_t result; + int x, y; + x = vx >> 16; + y = vy >> 16; + + /* apply the repeat function */ + switch (src_image->common.repeat) + { case PIXMAN_REPEAT_NORMAL: - x = MOD (x, pSrc->bits.width); - y = MOD (y, pSrc->bits.height); + x = MOD (x, src_image->bits.width); + y = MOD (y, src_image->bits.height); inside_bounds = TRUE; break; - + case PIXMAN_REPEAT_PAD: - x = CLIP (x, 0, pSrc->bits.width-1); - y = CLIP (y, 0, pSrc->bits.height-1); + x = CLIP (x, 0, src_image->bits.width - 1); + y = CLIP (y, 0, src_image->bits.height - 1); inside_bounds = TRUE; break; - + case PIXMAN_REPEAT_REFLECT: - x = MOD (x, pSrc->bits.width * 2); - if (x >= pSrc->bits.width) - x = pSrc->bits.width * 2 - x - 1; - y = MOD (y, pSrc->bits.height * 2); - if (y >= pSrc->bits.height) - y = pSrc->bits.height * 2 - y - 1; + x = MOD (x, src_image->bits.width * 2); + if (x >= src_image->bits.width) + x = src_image->bits.width * 2 - x - 1; + y = MOD (y, src_image->bits.height * 2); + if (y >= src_image->bits.height) + y = src_image->bits.height * 2 - y - 1; inside_bounds = TRUE; break; - + case PIXMAN_REPEAT_NONE: default: - inside_bounds = (x >= 0 && x < pSrc->bits.width && y >= 0 && y < pSrc->bits.height); + inside_bounds = + (x >= 0 && + x < src_image->bits.width && + y >= 0 && + y < src_image->bits.height); break; - } - - if (inside_bounds) { - //XXX: we should move this multiplication out of the loop - result = READ(pSrc, src + y * srcStride + x); - } else { - result = 0; - } - WRITE(pDst, dst + i, result); - - /* adjust the x location by a unit vector in the x direction: - * this is equivalent to transforming x+1 of the destination point to source space */ - vx += pSrc->common.transform->matrix[0][0]; - } - /* adjust the y location by a unit vector in the y direction - * this is equivalent to transforming y+1 of the destination point to source space */ - v.vector[1] += pSrc->common.transform->matrix[1][1]; - dst += dstStride; + } + + if (inside_bounds) + { + /* XXX: we should move this multiplication out of the loop */ + result = *(src + y * src_stride + x); + } + else + { + result = 0; + } + *(dst + i) = result; + + /* adjust the x location by a unit vector in the x direction: + * this is equivalent to transforming x+1 of the destination + * point to source space + */ + vx += src_image->common.transform->matrix[0][0]; + } + /* adjust the y location by a unit vector in the y direction + * this is equivalent to transforming y+1 of the destination point + * to source space + */ + v.vector[1] += src_image->common.transform->matrix[1][1]; + dst += dst_stride; } } static void fast_path_composite (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *src, - pixman_image_t *mask, - pixman_image_t *dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_op_t op, + 
pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { if (src->type == BITS && src->common.transform @@ -1188,57 +1278,55 @@ fast_path_composite (pixman_implementation_t *imp, && op == PIXMAN_OP_SRC && !src->common.alpha_map && !dest->common.alpha_map && (src->common.filter == PIXMAN_FILTER_NEAREST) - && PIXMAN_FORMAT_BPP(dest->bits.format) == 32 + && PIXMAN_FORMAT_BPP (dest->bits.format) == 32 && src->bits.format == dest->bits.format - && src->common.src_clip == &(src->common.full_region) - && !src->common.read_func && !src->common.write_func - && !dest->common.read_func && !dest->common.write_func) + && !src->bits.read_func && !src->bits.write_func + && !dest->bits.read_func && !dest->bits.write_func) { - /* ensure that the transform matrix only has a scale */ - if (src->common.transform->matrix[0][1] == 0 && - src->common.transform->matrix[1][0] == 0 && - src->common.transform->matrix[2][0] == 0 && - src->common.transform->matrix[2][1] == 0 && - src->common.transform->matrix[2][2] == pixman_fixed_1) + /* ensure that the transform matrix only has a scale */ + if (src->common.transform->matrix[0][1] == 0 && + src->common.transform->matrix[1][0] == 0 && + src->common.transform->matrix[2][0] == 0 && + src->common.transform->matrix[2][1] == 0 && + src->common.transform->matrix[2][2] == pixman_fixed_1) { _pixman_walk_composite_region (imp, op, - src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height, - FALSE, FALSE, - fbCompositeSrcScaleNearest); + src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height, + fast_composite_src_scale_nearest); return; } } if (_pixman_run_fast_path (c_fast_paths, imp, - op, src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height)) + op, src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height)) { return; } _pixman_implementation_composite (imp->delegate, op, - src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height); + src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height); } static void -pixman_fill8 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) +pixman_fill8 (uint32_t *bits, + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) { int byte_stride = stride * (int) sizeof (uint32_t); uint8_t *dst = (uint8_t *) bits; @@ -1258,14 +1346,15 @@ pixman_fill8 (uint32_t *bits, static void pixman_fill16 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) { - int short_stride = (stride * (int) sizeof (uint32_t)) / (int) sizeof (uint16_t); + int short_stride = + (stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t); uint16_t *dst = (uint16_t *)bits; uint16_t v = xor & 0xffff; int i; @@ -1283,12 +1372,12 @@ pixman_fill16 (uint32_t *bits, static void pixman_fill32 (uint32_t *bits, - int stride, - int x, - int y, - int width, - int height, - uint32_t xor) + int stride, + int x, + int y, + int width, + int height, + uint32_t xor) { int i; @@ -1305,46 +1394,47 @@ pixman_fill32 (uint32_t *bits, static pixman_bool_t fast_path_fill (pixman_implementation_t *imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) + 
uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) { switch (bpp) { case 8: pixman_fill8 (bits, stride, x, y, width, height, xor); break; - + case 16: pixman_fill16 (bits, stride, x, y, width, height, xor); break; - + case 32: pixman_fill32 (bits, stride, x, y, width, height, xor); break; - + default: return _pixman_implementation_fill ( imp->delegate, bits, stride, bpp, x, y, width, height, xor); break; } - + return TRUE; } pixman_implementation_t * -_pixman_implementation_create_fast_path (pixman_implementation_t *toplevel) +_pixman_implementation_create_fast_path (void) { - pixman_implementation_t *general = _pixman_implementation_create_general (NULL); - pixman_implementation_t *imp = _pixman_implementation_create (toplevel, general); + pixman_implementation_t *general = _pixman_implementation_create_general (); + pixman_implementation_t *imp = _pixman_implementation_create (general); imp->composite = fast_path_composite; imp->fill = fast_path_fill; - + return imp; } + diff --git a/lib/pixman/pixman/pixman-general.c b/lib/pixman/pixman/pixman-general.c index 1d0e10963..3ead3dac7 100644 --- a/lib/pixman/pixman/pixman-general.c +++ b/lib/pixman/pixman/pixman-general.c @@ -25,11 +25,12 @@ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS * SOFTWARE. */ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif #include <stdlib.h> #include <string.h> #include <math.h> -#include <assert.h> #include <limits.h> #include <stdio.h> #include <stdlib.h> @@ -42,86 +43,87 @@ static void general_composite_rect (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *src, - pixman_image_t *mask, - pixman_image_t *dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - return_if_fail (src != NULL); - return_if_fail (dest != NULL); - { uint8_t stack_scanline_buffer[SCANLINE_BUFFER_LENGTH * 3]; - const pixman_format_code_t srcFormat = src->type == BITS ? src->bits.format : 0; - const pixman_format_code_t maskFormat = mask && mask->type == BITS ? mask->bits.format : 0; - const pixman_format_code_t destFormat = dest->type == BITS ? dest->bits.format : 0; - const int srcWide = PIXMAN_FORMAT_16BPC(srcFormat); - const int maskWide = mask && PIXMAN_FORMAT_16BPC(maskFormat); - const int destWide = PIXMAN_FORMAT_16BPC(destFormat); - const int wide = srcWide || maskWide || destWide; + const pixman_format_code_t src_format = + src->type == BITS ? src->bits.format : 0; + const pixman_format_code_t mask_format = + mask && mask->type == BITS ? mask->bits.format : 0; + const pixman_format_code_t dest_format = + dest->type == BITS ? dest->bits.format : 0; + const int src_wide = PIXMAN_FORMAT_IS_WIDE (src_format); + const int mask_wide = mask && PIXMAN_FORMAT_IS_WIDE (mask_format); + const int dest_wide = PIXMAN_FORMAT_IS_WIDE (dest_format); + const int wide = src_wide || mask_wide || dest_wide; const int Bpp = wide ? 
8 : 4; uint8_t *scanline_buffer = stack_scanline_buffer; uint8_t *src_buffer, *mask_buffer, *dest_buffer; - scanFetchProc fetchSrc = NULL, fetchMask = NULL, fetchDest = NULL; + fetch_scanline_t fetch_src = NULL, fetch_mask = NULL, fetch_dest = NULL; pixman_combine_32_func_t compose; - scanStoreProc store; - source_pict_class_t srcClass, maskClass; + store_scanline_t store; + source_image_class_t src_class, mask_class; pixman_bool_t component_alpha; uint32_t *bits; int32_t stride; int i; - + if (width * Bpp > SCANLINE_BUFFER_LENGTH) { scanline_buffer = pixman_malloc_abc (width, 3, Bpp); - + if (!scanline_buffer) return; } - + src_buffer = scanline_buffer; mask_buffer = src_buffer + width * Bpp; dest_buffer = mask_buffer + width * Bpp; - - srcClass = _pixman_image_classify (src, - src_x, src_y, - width, height); - - maskClass = SOURCE_IMAGE_CLASS_UNKNOWN; + + src_class = _pixman_image_classify (src, + src_x, src_y, + width, height); + + mask_class = SOURCE_IMAGE_CLASS_UNKNOWN; + if (mask) { - maskClass = _pixman_image_classify (mask, - src_x, src_y, - width, height); + mask_class = _pixman_image_classify (mask, + src_x, src_y, + width, height); } - + if (op == PIXMAN_OP_CLEAR) - fetchSrc = NULL; + fetch_src = NULL; else if (wide) - fetchSrc = _pixman_image_get_scanline_64; + fetch_src = _pixman_image_get_scanline_64; else - fetchSrc = _pixman_image_get_scanline_32; - + fetch_src = _pixman_image_get_scanline_32; + if (!mask || op == PIXMAN_OP_CLEAR) - fetchMask = NULL; + fetch_mask = NULL; else if (wide) - fetchMask = _pixman_image_get_scanline_64; + fetch_mask = _pixman_image_get_scanline_64; else - fetchMask = _pixman_image_get_scanline_32; - + fetch_mask = _pixman_image_get_scanline_32; + if (op == PIXMAN_OP_CLEAR || op == PIXMAN_OP_SRC) - fetchDest = NULL; + fetch_dest = NULL; else if (wide) - fetchDest = _pixman_image_get_scanline_64; + fetch_dest = _pixman_image_get_scanline_64; else - fetchDest = _pixman_image_get_scanline_32; + fetch_dest = _pixman_image_get_scanline_32; if (wide) store = _pixman_image_store_scanline_64; @@ -133,15 +135,15 @@ general_composite_rect (pixman_implementation_t *imp, * the destination format. 
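general_composite_rect above stages everything through one scratch allocation: three width * Bpp slices for the source, mask and destination scanlines, where Bpp is 8 whenever any operand is wide (16 bits per channel) and 4 otherwise. It uses the on-stack buffer when the slices fit and falls back to pixman_malloc_abc, which guards the size multiplication against overflow. A minimal sketch of that layout, with hypothetical names and a made-up stack size:

    #include <stdint.h>
    #include <stdlib.h>

    #define STACK_LEN 2048   /* hypothetical stand-in for SCANLINE_BUFFER_LENGTH */

    /* Carve one allocation into src/mask/dest scanline slices. */
    static uint8_t *
    get_scanline_buffers (int width, int wide, uint8_t *stack_buf,
                          uint8_t **src, uint8_t **mask, uint8_t **dest)
    {
        int      Bpp = wide ? 8 : 4;   /* 16bpc pixels need 8 bytes */
        uint8_t *buf = stack_buf;

        if (width * Bpp > STACK_LEN)
            buf = malloc ((size_t)width * 3 * Bpp); /* like pixman_malloc_abc */

        if (buf)
        {
            *src  = buf;
            *mask = buf + width * Bpp;
            *dest = buf + 2 * width * Bpp;
        }

        return buf;   /* caller frees it iff it differs from stack_buf */
    }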
*/ if (!wide && - !dest->common.alpha_map && - !dest->common.write_func && - (op == PIXMAN_OP_ADD || op == PIXMAN_OP_OVER) && - (dest->bits.format == PIXMAN_a8r8g8b8 || - dest->bits.format == PIXMAN_x8r8g8b8)) + !dest->common.alpha_map && + !dest->bits.write_func && + (op == PIXMAN_OP_ADD || op == PIXMAN_OP_OVER) && + (dest->bits.format == PIXMAN_a8r8g8b8 || + dest->bits.format == PIXMAN_x8r8g8b8)) { store = NULL; } - + if (!store) { bits = dest->bits.bits; @@ -152,15 +154,15 @@ general_composite_rect (pixman_implementation_t *imp, bits = NULL; stride = 0; } - + component_alpha = - fetchSrc && - fetchMask && - mask && - mask->common.type == BITS && - mask->common.component_alpha && - PIXMAN_FORMAT_RGB (mask->bits.format); - + fetch_src && + fetch_mask && + mask && + mask->common.type == BITS && + mask->common.component_alpha && + PIXMAN_FORMAT_RGB (mask->bits.format); + if (wide) { if (component_alpha) @@ -175,186 +177,148 @@ general_composite_rect (pixman_implementation_t *imp, else compose = _pixman_implementation_combine_32; } - + if (!compose) return; - - if (!fetchMask) + + if (!fetch_mask) mask_buffer = NULL; - + for (i = 0; i < height; ++i) { /* fill first half of scanline with source */ - if (fetchSrc) + if (fetch_src) { - if (fetchMask) + if (fetch_mask) { /* fetch mask before source so that fetching of source can be optimized */ - fetchMask (mask, mask_x, mask_y + i, - width, (void *)mask_buffer, 0, 0); - - if (maskClass == SOURCE_IMAGE_CLASS_HORIZONTAL) - fetchMask = NULL; + fetch_mask (mask, mask_x, mask_y + i, + width, (void *)mask_buffer, 0, 0); + + if (mask_class == SOURCE_IMAGE_CLASS_HORIZONTAL) + fetch_mask = NULL; } - - if (srcClass == SOURCE_IMAGE_CLASS_HORIZONTAL) + + if (src_class == SOURCE_IMAGE_CLASS_HORIZONTAL) { - fetchSrc (src, src_x, src_y + i, - width, (void *)src_buffer, 0, 0); - fetchSrc = NULL; + fetch_src (src, src_x, src_y + i, + width, (void *)src_buffer, 0, 0); + fetch_src = NULL; } else { - fetchSrc (src, src_x, src_y + i, - width, (void *)src_buffer, (void *)mask_buffer, - 0xffffffff); + fetch_src (src, src_x, src_y + i, + width, (void *)src_buffer, (void *)mask_buffer, + 0xffffffff); } } - else if (fetchMask) + else if (fetch_mask) { - fetchMask (mask, mask_x, mask_y + i, - width, (void *)mask_buffer, 0, 0); + fetch_mask (mask, mask_x, mask_y + i, + width, (void *)mask_buffer, 0, 0); } - + if (store) { /* fill dest into second half of scanline */ - if (fetchDest) - fetchDest (dest, dest_x, dest_y + i, - width, (void *)dest_buffer, 0, 0); - + if (fetch_dest) + { + fetch_dest (dest, dest_x, dest_y + i, + width, (void *)dest_buffer, 0, 0); + } + /* blend */ - compose (imp->toplevel, op, (void *)dest_buffer, (void *)src_buffer, (void *)mask_buffer, width); - + compose (imp->toplevel, op, + (void *)dest_buffer, + (void *)src_buffer, + (void *)mask_buffer, + width); + /* write back */ store (&(dest->bits), dest_x, dest_y + i, width, - (void *)dest_buffer); + (void *)dest_buffer); } else { /* blend */ - compose (imp->toplevel, op, bits + (dest_y + i) * stride + - dest_x, - (void *)src_buffer, (void *)mask_buffer, width); + compose (imp->toplevel, op, + bits + (dest_y + i) * stride + dest_x, + (void *)src_buffer, (void *)mask_buffer, width); } } - + if (scanline_buffer != stack_scanline_buffer) free (scanline_buffer); - } } static void -general_composite (pixman_implementation_t * imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t 
dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +general_composite (pixman_implementation_t * imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - pixman_bool_t srcRepeat = src->type == BITS && src->common.repeat == PIXMAN_REPEAT_NORMAL; - pixman_bool_t maskRepeat = FALSE; - pixman_bool_t srcTransform = src->common.transform != NULL; - pixman_bool_t maskTransform = FALSE; - - if (srcRepeat && srcTransform && - src->bits.width == 1 && - src->bits.height == 1) - { - srcTransform = FALSE; - } - - if (mask && mask->type == BITS) - { - maskRepeat = mask->common.repeat == PIXMAN_REPEAT_NORMAL; - - maskTransform = mask->common.transform != 0; - if (mask->common.filter == PIXMAN_FILTER_CONVOLUTION) - maskTransform = TRUE; - - if (maskRepeat && maskTransform && - mask->bits.width == 1 && - mask->bits.height == 1) - { - maskTransform = FALSE; - } - } - - /* CompositeGeneral optimizes 1x1 repeating images itself */ - if (src->type == BITS && - src->bits.width == 1 && src->bits.height == 1) - { - srcRepeat = FALSE; - } - - if (mask && mask->type == BITS && - mask->bits.width == 1 && mask->bits.height == 1) - { - maskRepeat = FALSE; - } - - /* if we are transforming, repeats are handled in fbFetchTransformed */ - if (srcTransform) - srcRepeat = FALSE; - - if (maskTransform) - maskRepeat = FALSE; - _pixman_walk_composite_region (imp, op, src, mask, dest, src_x, src_y, - mask_x, mask_y, dest_x, dest_y, width, height, - srcRepeat, maskRepeat, general_composite_rect); + mask_x, mask_y, dest_x, dest_y, + width, height, + general_composite_rect); } static pixman_bool_t general_blt (pixman_implementation_t *imp, - uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) { /* We can't blit unless we have sse2 or mmx */ - + return FALSE; } static pixman_bool_t general_fill (pixman_implementation_t *imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) { return FALSE; } pixman_implementation_t * -_pixman_implementation_create_general (pixman_implementation_t *toplevel) +_pixman_implementation_create_general (void) { - pixman_implementation_t *imp = _pixman_implementation_create (toplevel, NULL); + pixman_implementation_t *imp = _pixman_implementation_create (NULL); _pixman_setup_combiner_functions_32 (imp); _pixman_setup_combiner_functions_64 (imp); - + imp->composite = general_composite; imp->blt = general_blt; imp->fill = general_fill; - + return imp; } + diff --git a/lib/pixman/pixman/pixman-gradient-walker.c b/lib/pixman/pixman/pixman-gradient-walker.c index 6a47a8ea3..dd666b412 100644 --- a/lib/pixman/pixman/pixman-gradient-walker.c +++ b/lib/pixman/pixman/pixman-gradient-walker.c @@ -23,13 +23,15 @@ * SOFTWARE. 
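The scanline loop above follows a fixed fetch/combine/store rhythm: the mask is fetched before the source (so a source fetcher can in principle skip masked-out pixels), horizontal sources and masks are fetched once and their fetcher cleared, and when store is NULL the combiner writes straight into the destination bits. A simplified sketch of the loop; the callback signatures are hypothetical and the direct-write case is folded into store:

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*fetch_fn)   (int y, uint32_t *out, int width);
    typedef void (*combine_fn) (uint32_t *dest, const uint32_t *src,
                                const uint32_t *mask, int width);
    typedef void (*store_fn)   (int y, const uint32_t *in, int width);

    /* One combine pass per scanline; a NULL fetcher means the buffer is
     * either unused (e.g. no mask) or already valid (horizontal case). */
    static void
    composite_rows (int height, int width,
                    fetch_fn fetch_src, fetch_fn fetch_mask,
                    fetch_fn fetch_dest,
                    combine_fn combine, store_fn store,
                    uint32_t *src_buf, uint32_t *mask_buf, uint32_t *dest_buf)
    {
        int i;

        for (i = 0; i < height; ++i)
        {
            /* Mask first, so the source fetch can be optimized against it. */
            if (fetch_mask)
                fetch_mask (i, mask_buf, width);
            if (fetch_src)
                fetch_src (i, src_buf, width);
            if (fetch_dest)
                fetch_dest (i, dest_buf, width);

            combine (dest_buf, src_buf, fetch_mask ? mask_buf : NULL, width);
            store (i, dest_buf, width);
        }
    }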
*/ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif #include "pixman-private.h" void -_pixman_gradient_walker_init (GradientWalker *walker, - gradient_t *gradient, - unsigned int spread) +_pixman_gradient_walker_init (pixman_gradient_walker_t *walker, + gradient_t * gradient, + unsigned int spread) { walker->num_stops = gradient->n_stops; walker->stops = gradient->stops; @@ -41,21 +43,21 @@ _pixman_gradient_walker_init (GradientWalker *walker, walker->right_ag = 0; walker->right_rb = 0; walker->spread = spread; - + walker->need_reset = TRUE; } void -_pixman_gradient_walker_reset (GradientWalker *walker, - pixman_fixed_32_32_t pos) +_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker, + pixman_fixed_32_32_t pos) { - int32_t x, left_x, right_x; + int32_t x, left_x, right_x; pixman_color_t *left_c, *right_c; - int n, count = walker->num_stops; + int n, count = walker->num_stops; pixman_gradient_stop_t * stops = walker->stops; - - static const pixman_color_t transparent_black = { 0, 0, 0, 0 }; - + + static const pixman_color_t transparent_black = { 0, 0, 0, 0 }; + switch (walker->spread) { case PIXMAN_REPEAT_NORMAL: @@ -63,47 +65,59 @@ _pixman_gradient_walker_reset (GradientWalker *walker, for (n = 0; n < count; n++) if (x < stops[n].x) break; - if (n == 0) { - left_x = stops[count-1].x - 0x10000; - left_c = &stops[count-1].color; - } else { - left_x = stops[n-1].x; - left_c = &stops[n-1].color; - } - - if (n == count) { + if (n == 0) + { + left_x = stops[count - 1].x - 0x10000; + left_c = &stops[count - 1].color; + } + else + { + left_x = stops[n - 1].x; + left_c = &stops[n - 1].color; + } + + if (n == count) + { right_x = stops[0].x + 0x10000; right_c = &stops[0].color; - } else { + } + else + { right_x = stops[n].x; right_c = &stops[n].color; } left_x += (pos - x); right_x += (pos - x); break; - + case PIXMAN_REPEAT_PAD: for (n = 0; n < count; n++) if (pos < stops[n].x) break; - - if (n == 0) { + + if (n == 0) + { left_x = INT32_MIN; left_c = &stops[0].color; - } else { - left_x = stops[n-1].x; - left_c = &stops[n-1].color; } - - if (n == count) { + else + { + left_x = stops[n - 1].x; + left_c = &stops[n - 1].color; + } + + if (n == count) + { right_x = INT32_MAX; - right_c = &stops[n-1].color; - } else { + right_c = &stops[n - 1].color; + } + else + { right_x = stops[n].x; right_c = &stops[n].color; } break; - + case PIXMAN_REPEAT_REFLECT: x = (int32_t)pos & 0xFFFF; if ((int32_t)pos & 0x10000) @@ -111,46 +125,53 @@ _pixman_gradient_walker_reset (GradientWalker *walker, for (n = 0; n < count; n++) if (x < stops[n].x) break; - - if (n == 0) { + + if (n == 0) + { left_x = -stops[0].x; left_c = &stops[0].color; - } else { - left_x = stops[n-1].x; - left_c = &stops[n-1].color; - } - - if (n == count) { - right_x = 0x20000 - stops[n-1].x; - right_c = &stops[n-1].color; - } else { + } + else + { + left_x = stops[n - 1].x; + left_c = &stops[n - 1].color; + } + + if (n == count) + { + right_x = 0x20000 - stops[n - 1].x; + right_c = &stops[n - 1].color; + } + else + { right_x = stops[n].x; right_c = &stops[n].color; } - - if ((int32_t)pos & 0x10000) { + + if ((int32_t)pos & 0x10000) + { pixman_color_t *tmp_c; - int32_t tmp_x; - + int32_t tmp_x; + tmp_x = 0x10000 - right_x; right_x = 0x10000 - left_x; left_x = tmp_x; - + tmp_c = right_c; right_c = left_c; left_c = tmp_c; - + x = 0x10000 - x; } left_x += (pos - x); right_x += (pos - x); break; - - default: /* RepeatNone */ + + default: /* REPEAT_NONE */ for (n = 0; n < count; n++) if (pos < stops[n].x) break; - + if (n == 0) { left_x = 
INT32_MIN; @@ -159,74 +180,75 @@ _pixman_gradient_walker_reset (GradientWalker *walker, } else if (n == count) { - left_x = stops[n-1].x; + left_x = stops[n - 1].x; right_x = INT32_MAX; left_c = right_c = (pixman_color_t*) &transparent_black; } else { - left_x = stops[n-1].x; + left_x = stops[n - 1].x; right_x = stops[n].x; - left_c = &stops[n-1].color; + left_c = &stops[n - 1].color; right_c = &stops[n].color; } } - + walker->left_x = left_x; walker->right_x = right_x; walker->left_ag = ((left_c->alpha >> 8) << 16) | (left_c->green >> 8); walker->left_rb = ((left_c->red & 0xff00) << 8) | (left_c->blue >> 8); walker->right_ag = ((right_c->alpha >> 8) << 16) | (right_c->green >> 8); walker->right_rb = ((right_c->red & 0xff00) << 8) | (right_c->blue >> 8); - - if ( walker->left_x == walker->right_x || - ( walker->left_ag == walker->right_ag && - walker->left_rb == walker->right_rb ) ) + + if (walker->left_x == walker->right_x || + ( walker->left_ag == walker->right_ag && + walker->left_rb == walker->right_rb ) ) { walker->stepper = 0; } else { int32_t width = right_x - left_x; - walker->stepper = ((1 << 24) + width/2)/width; + walker->stepper = ((1 << 24) + width / 2) / width; } - + walker->need_reset = FALSE; } -#define PIXMAN_GRADIENT_WALKER_NEED_RESET(w,x) \ +#define PIXMAN_GRADIENT_WALKER_NEED_RESET(w, x) \ ( (w)->need_reset || (x) < (w)->left_x || (x) >= (w)->right_x) /* the following assumes that PIXMAN_GRADIENT_WALKER_NEED_RESET(w,x) is FALSE */ uint32_t -_pixman_gradient_walker_pixel (GradientWalker *walker, - pixman_fixed_32_32_t x) +_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker, + pixman_fixed_32_32_t x) { - int dist, idist; - uint32_t t1, t2, a, color; - + int dist, idist; + uint32_t t1, t2, a, color; + if (PIXMAN_GRADIENT_WALKER_NEED_RESET (walker, x)) - _pixman_gradient_walker_reset (walker, x); - - dist = ((int)(x - walker->left_x)*walker->stepper) >> 16; + _pixman_gradient_walker_reset (walker, x); + + dist = ((int)(x - walker->left_x) * walker->stepper) >> 16; idist = 256 - dist; - + /* combined INTERPOLATE and premultiply */ - t1 = walker->left_rb*idist + walker->right_rb*dist; + t1 = walker->left_rb * idist + walker->right_rb * dist; t1 = (t1 >> 8) & 0xff00ff; - - t2 = walker->left_ag*idist + walker->right_ag*dist; + + t2 = walker->left_ag * idist + walker->right_ag * dist; t2 &= 0xff00ff00; - + color = t2 & 0xff000000; a = t2 >> 24; - - t1 = t1*a + 0x800080; + + t1 = t1 * a + 0x800080; t1 = (t1 + ((t1 >> 8) & 0xff00ff)) >> 8; - - t2 = (t2 >> 8)*a + 0x800080; + + t2 = (t2 >> 8) * a + 0x800080; t2 = (t2 + ((t2 >> 8) & 0xff00ff)); - + return (color | (t1 & 0xff00ff) | (t2 & 0xff00)); } + diff --git a/lib/pixman/pixman/pixman-image.c b/lib/pixman/pixman/pixman-image.c index c8295f882..fff0497f1 100644 --- a/lib/pixman/pixman/pixman-image.c +++ b/lib/pixman/pixman/pixman-image.c @@ -30,13 +30,12 @@ #include <assert.h> #include "pixman-private.h" - -#define Alpha(x) ((x) >> 24) +#include "pixman-combine32.h" pixman_bool_t -_pixman_init_gradient (gradient_t *gradient, - const pixman_gradient_stop_t *stops, - int n_stops) +_pixman_init_gradient (gradient_t * gradient, + const pixman_gradient_stop_t *stops, + int n_stops) { return_val_if_fail (n_stops > 0, FALSE); @@ -59,33 +58,40 @@ _pixman_init_gradient (gradient_t *gradient, /* * By default, just evaluate the image at 32bpp and expand. Individual image * types can plug in a better scanline getter if they want to. 
For example - * we could produce smoother gradients by evaluating them at higher color depth, but - * that's a project for the future. + * we could produce smoother gradients by evaluating them at higher color + * depth, but that's a project for the future. */ void -_pixman_image_get_scanline_64_generic (pixman_image_t * pict, int x, int y, int width, - uint64_t *buffer, uint64_t *mask, uint32_t maskBits) +_pixman_image_get_scanline_generic_64 (pixman_image_t * image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t * mask, + uint32_t mask_bits) { uint32_t *mask8 = NULL; - // Contract the mask image, if one exists, so that the 32-bit fetch function - // can use it. - if (mask) { - mask8 = pixman_malloc_ab(width, sizeof(uint32_t)); + /* Contract the mask image, if one exists, so that the 32-bit fetch + * function can use it. + */ + if (mask) + { + mask8 = pixman_malloc_ab (width, sizeof(uint32_t)); if (!mask8) return; - - pixman_contract(mask8, mask, width); + + pixman_contract (mask8, (uint64_t *)mask, width); } - // Fetch the source image into the first half of buffer. - _pixman_image_get_scanline_32 (pict, x, y, width, (uint32_t*)buffer, mask8, - maskBits); + /* Fetch the source image into the first half of buffer. */ + _pixman_image_get_scanline_32 (image, x, y, width, (uint32_t*)buffer, mask8, + mask_bits); - // Expand from 32bpp to 64bpp in place. - pixman_expand(buffer, (uint32_t*)buffer, PIXMAN_a8r8g8b8, width); + /* Expand from 32bpp to 64bpp in place. */ + pixman_expand ((uint64_t *)buffer, buffer, PIXMAN_a8r8g8b8, width); - free(mask8); + free (mask8); } pixman_image_t * @@ -97,10 +103,10 @@ _pixman_image_allocate (void) { image_common_t *common = &image->common; - pixman_region32_init (&common->full_region); pixman_region32_init (&common->clip_region); - common->src_clip = &common->full_region; - common->has_client_clip = FALSE; + + common->have_clip_region = FALSE; + common->clip_sources = FALSE; common->transform = NULL; common->repeat = PIXMAN_REPEAT_NONE; common->filter = PIXMAN_FILTER_NEAREST; @@ -109,20 +115,23 @@ _pixman_image_allocate (void) common->alpha_map = NULL; common->component_alpha = FALSE; common->ref_count = 1; - common->read_func = NULL; - common->write_func = NULL; common->classify = NULL; + common->client_clip = FALSE; + common->destroy_func = NULL; + common->destroy_data = NULL; + common->need_workaround = FALSE; + common->dirty = TRUE; } return image; } -source_pict_class_t +source_image_class_t _pixman_image_classify (pixman_image_t *image, - int x, - int y, - int width, - int height) + int x, + int y, + int width, + int height) { if (image->common.classify) return image->common.classify (image, x, y, width, height); @@ -131,44 +140,36 @@ _pixman_image_classify (pixman_image_t *image, } void -_pixman_image_get_scanline_32 (pixman_image_t *image, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t mask_bits) +_pixman_image_get_scanline_32 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { image->common.get_scanline_32 (image, x, y, width, buffer, mask, mask_bits); } -void -_pixman_image_get_scanline_64 (pixman_image_t *image, int x, int y, int width, - uint32_t *buffer, uint32_t *unused, uint32_t unused2) -{ - image->common.get_scanline_64 (image, x, y, width, buffer, unused, unused2); -} - /* Even thought the type of buffer is uint32_t *, the function actually expects * a uint64_t *buffer. 
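The generic 64-bit scanline path above never fetches wide data directly: it contracts the mask from 16 to 8 bits per channel so the ordinary 32-bit fetcher can consume it, fetches at 32bpp, then widens the result in place with pixman_expand. For 8-bit channels the widening is exact byte replication, since (v << 8) | v equals v * 65535 / 255; and because the output overlaps and outgrows the input, the in-place expansion has to walk the buffer from the end. A sketch of the per-pixel widening step:

    #include <stdint.h>

    /* Widen one a8r8g8b8 pixel to 16 bits per channel by byte replication:
     * an 8-bit value v maps to (v << 8) | v, which is exactly
     * v * 65535 / 255. */
    static uint64_t
    expand_8888_to_16bpc (uint32_t p)
    {
        uint64_t a = (p >> 24) & 0xff;
        uint64_t r = (p >> 16) & 0xff;
        uint64_t g = (p >>  8) & 0xff;
        uint64_t b = (p >>  0) & 0xff;

        return ((a << 8 | a) << 48) | ((r << 8 | r) << 32) |
               ((g << 8 | g) << 16) |  (b << 8 | b);
    }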
*/ - -scanFetchProc -_pixman_image_get_fetcher (pixman_image_t *image, - int wide) +void +_pixman_image_get_scanline_64 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *unused, + uint32_t unused2) { - assert (image->common.get_scanline_64); - assert (image->common.get_scanline_32); - - if (wide) - return image->common.get_scanline_64; - else - return image->common.get_scanline_32; + image->common.get_scanline_64 (image, x, y, width, buffer, unused, unused2); } -#define WRITE_ACCESS(f) ((image->common.write_func)? f##_accessors : f) - static void image_property_changed (pixman_image_t *image) { - - - image->common.property_changed (image); + image->common.dirty = TRUE; } /* Ref Counting */ @@ -190,8 +191,10 @@ pixman_image_unref (pixman_image_t *image) if (common->ref_count == 0) { + if (image->common.destroy_func) + image->common.destroy_func (image, image->common.destroy_data); + pixman_region32_fini (&common->clip_region); - pixman_region32_fini (&common->full_region); if (common->transform) free (common->transform); @@ -202,21 +205,14 @@ pixman_image_unref (pixman_image_t *image) if (common->alpha_map) pixman_image_unref ((pixman_image_t *)common->alpha_map); -#if 0 - if (image->type == BITS && image->bits.indexed) - free (image->bits.indexed); -#endif - -#if 0 - memset (image, 0xaa, sizeof (pixman_image_t)); -#endif - if (image->type == LINEAR || image->type == RADIAL || image->type == CONICAL) + if (image->type == LINEAR || + image->type == RADIAL || + image->type == CONICAL) { if (image->gradient.stops) free (image->gradient.stops); } - if (image->type == BITS && image->bits.free_me) free (image->bits.free_me); @@ -228,34 +224,45 @@ pixman_image_unref (pixman_image_t *image) return FALSE; } -/* Constructors */ +PIXMAN_EXPORT void +pixman_image_set_destroy_function (pixman_image_t * image, + pixman_image_destroy_func_t func, + void * data) +{ + image->common.destroy_func = func; + image->common.destroy_data = data; +} void _pixman_image_reset_clip_region (pixman_image_t *image) { - pixman_region32_fini (&image->common.clip_region); + image->common.have_clip_region = FALSE; +} - if (image->type == BITS) - { - pixman_region32_init_rect (&image->common.clip_region, 0, 0, - image->bits.width, image->bits.height); - } - else +void +_pixman_image_validate (pixman_image_t *image) +{ + if (image->common.dirty) { - pixman_region32_init (&image->common.clip_region); + image->common.property_changed (image); + image->common.dirty = FALSE; } + + if (image->common.alpha_map) + _pixman_image_validate (image->common.alpha_map); } PIXMAN_EXPORT pixman_bool_t -pixman_image_set_clip_region32 (pixman_image_t *image, - pixman_region32_t *region) +pixman_image_set_clip_region32 (pixman_image_t * image, + pixman_region32_t *region) { image_common_t *common = (image_common_t *)image; pixman_bool_t result; if (region) { - result = pixman_region32_copy (&common->clip_region, region); + if ((result = pixman_region32_copy (&common->clip_region, region))) + image->common.have_clip_region = TRUE; } else { @@ -269,17 +276,17 @@ pixman_image_set_clip_region32 (pixman_image_t *image, return result; } - PIXMAN_EXPORT pixman_bool_t -pixman_image_set_clip_region (pixman_image_t *image, - pixman_region16_t *region) +pixman_image_set_clip_region (pixman_image_t * image, + pixman_region16_t *region) { image_common_t *common = (image_common_t *)image; pixman_bool_t result; if (region) { - result = pixman_region32_copy_from_region16 (&common->clip_region, region); + if ((result = 
pixman_region32_copy_from_region16 (&common->clip_region, region))) + image->common.have_clip_region = TRUE; } else { @@ -293,27 +300,22 @@ pixman_image_set_clip_region (pixman_image_t *image, return result; } -/* Sets whether the clip region includes a clip region set by the client - */ PIXMAN_EXPORT void pixman_image_set_has_client_clip (pixman_image_t *image, - pixman_bool_t client_clip) + pixman_bool_t client_clip) { - image->common.has_client_clip = client_clip; - - image_property_changed (image); + image->common.client_clip = client_clip; } PIXMAN_EXPORT pixman_bool_t -pixman_image_set_transform (pixman_image_t *image, - const pixman_transform_t *transform) +pixman_image_set_transform (pixman_image_t * image, + const pixman_transform_t *transform) { static const pixman_transform_t id = { { { pixman_fixed_1, 0, 0 }, { 0, pixman_fixed_1, 0 }, - { 0, 0, pixman_fixed_1 } - } + { 0, 0, pixman_fixed_1 } } }; image_common_t *common = (image_common_t *)image; @@ -324,9 +326,10 @@ pixman_image_set_transform (pixman_image_t *image, if (memcmp (&id, transform, sizeof (pixman_transform_t)) == 0) { - free(common->transform); + free (common->transform); common->transform = NULL; result = TRUE; + goto out; } @@ -336,20 +339,23 @@ pixman_image_set_transform (pixman_image_t *image, if (common->transform == NULL) { result = FALSE; + goto out; } - memcpy(common->transform, transform, sizeof(pixman_transform_t)); + memcpy (common->transform, transform, sizeof(pixman_transform_t)); + + result = TRUE; out: image_property_changed (image); - - return TRUE; + + return result; } PIXMAN_EXPORT void -pixman_image_set_repeat (pixman_image_t *image, - pixman_repeat_t repeat) +pixman_image_set_repeat (pixman_image_t *image, + pixman_repeat_t repeat) { image->common.repeat = repeat; @@ -357,10 +363,10 @@ pixman_image_set_repeat (pixman_image_t *image, } PIXMAN_EXPORT pixman_bool_t -pixman_image_set_filter (pixman_image_t *image, - pixman_filter_t filter, - const pixman_fixed_t *params, - int n_params) +pixman_image_set_filter (pixman_image_t * image, + pixman_filter_t filter, + const pixman_fixed_t *params, + int n_params) { image_common_t *common = (image_common_t *)image; pixman_fixed_t *new_params; @@ -376,7 +382,7 @@ pixman_image_set_filter (pixman_image_t *image, return FALSE; memcpy (new_params, - params, n_params * sizeof (pixman_fixed_t)); + params, n_params * sizeof (pixman_fixed_t)); } common->filter = filter; @@ -392,15 +398,10 @@ pixman_image_set_filter (pixman_image_t *image, } PIXMAN_EXPORT void -pixman_image_set_source_clipping (pixman_image_t *image, - pixman_bool_t source_clipping) +pixman_image_set_source_clipping (pixman_image_t *image, + pixman_bool_t clip_sources) { - image_common_t *common = &image->common; - - if (source_clipping) - common->src_clip = &common->clip_region; - else - common->src_clip = &common->full_region; + image->common.clip_sources = clip_sources; image_property_changed (image); } @@ -410,8 +411,8 @@ pixman_image_set_source_clipping (pixman_image_t *image, * way, way too expensive. 
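The setters above now funnel through image_property_changed, which no longer recomputes anything: it just records that the image is dirty, and _pixman_image_validate performs the single property_changed call right before the image is actually used, recursing into the alpha map. A miniature of the dirty-flag pattern, with a trimmed hypothetical struct:

    #include <stdbool.h>

    typedef struct image image_t;
    struct image
    {
        bool dirty;
        void (*property_changed) (image_t *);
    };

    static void
    set_some_property (image_t *image)
    {
        /* ... store the new property value ... */
        image->dirty = true;             /* cheap: just record the change */
    }

    static void
    validate (image_t *image)
    {
        if (image->dirty)
        {
            image->property_changed (image); /* recompute fetchers etc. once */
            image->dirty = false;
        }
    }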
*/ PIXMAN_EXPORT void -pixman_image_set_indexed (pixman_image_t *image, - const pixman_indexed_t *indexed) +pixman_image_set_indexed (pixman_image_t * image, + const pixman_indexed_t *indexed) { bits_image_t *bits = (bits_image_t *)image; @@ -422,9 +423,9 @@ pixman_image_set_indexed (pixman_image_t *image, PIXMAN_EXPORT void pixman_image_set_alpha_map (pixman_image_t *image, - pixman_image_t *alpha_map, - int16_t x, - int16_t y) + pixman_image_t *alpha_map, + int16_t x, + int16_t y) { image_common_t *common = (image_common_t *)image; @@ -441,33 +442,35 @@ pixman_image_set_alpha_map (pixman_image_t *image, common->alpha_map = NULL; } - common->alpha_origin.x = x; - common->alpha_origin.y = y; + common->alpha_origin_x = x; + common->alpha_origin_y = y; image_property_changed (image); } PIXMAN_EXPORT void -pixman_image_set_component_alpha (pixman_image_t *image, - pixman_bool_t component_alpha) +pixman_image_set_component_alpha (pixman_image_t *image, + pixman_bool_t component_alpha) { image->common.component_alpha = component_alpha; image_property_changed (image); } - PIXMAN_EXPORT void -pixman_image_set_accessors (pixman_image_t *image, - pixman_read_memory_func_t read_func, - pixman_write_memory_func_t write_func) +pixman_image_set_accessors (pixman_image_t * image, + pixman_read_memory_func_t read_func, + pixman_write_memory_func_t write_func) { return_if_fail (image != NULL); - image->common.read_func = read_func; - image->common.write_func = write_func; + if (image->type == BITS) + { + image->bits.read_func = read_func; + image->bits.write_func = write_func; - image_property_changed (image); + image_property_changed (image); + } } PIXMAN_EXPORT uint32_t * @@ -515,234 +518,92 @@ pixman_image_get_depth (pixman_image_t *image) return 0; } -static uint32_t -color_to_uint32 (const pixman_color_t *color) +pixman_bool_t +_pixman_image_is_solid (pixman_image_t *image) { - return - (color->alpha >> 8 << 24) | - (color->red >> 8 << 16) | - (color->green & 0xff00) | - (color->blue >> 8); -} + if (image->type == SOLID) + return TRUE; -static pixman_bool_t -color_to_pixel (pixman_color_t *color, - uint32_t *pixel, - pixman_format_code_t format) -{ - uint32_t c = color_to_uint32 (color); - - if (!(format == PIXMAN_a8r8g8b8 || - format == PIXMAN_x8r8g8b8 || - format == PIXMAN_a8b8g8r8 || - format == PIXMAN_x8b8g8r8 || - format == PIXMAN_b8g8r8a8 || - format == PIXMAN_b8g8r8x8 || - format == PIXMAN_r5g6b5 || - format == PIXMAN_b5g6r5 || - format == PIXMAN_a8)) + if (image->type != BITS || + image->bits.width != 1 || + image->bits.height != 1) { return FALSE; } - if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_ABGR) - { - c = ((c & 0xff000000) >> 0) | - ((c & 0x00ff0000) >> 16) | - ((c & 0x0000ff00) >> 0) | - ((c & 0x000000ff) << 16); - } - if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_BGRA) - { - c = ((c & 0xff000000) >> 24) | - ((c & 0x00ff0000) >> 8) | - ((c & 0x0000ff00) << 8) | - ((c & 0x000000ff) << 24); - } - - if (format == PIXMAN_a8) - c = c >> 24; - else if (format == PIXMAN_r5g6b5 || - format == PIXMAN_b5g6r5) - c = cvt8888to0565 (c); - -#if 0 - printf ("color: %x %x %x %x\n", color->alpha, color->red, color->green, color->blue); - printf ("pixel: %x\n", c); -#endif + if (image->common.repeat == PIXMAN_REPEAT_NONE) + return FALSE; - *pixel = c; return TRUE; } -PIXMAN_EXPORT pixman_bool_t -pixman_image_fill_rectangles (pixman_op_t op, - pixman_image_t *dest, - pixman_color_t *color, - int n_rects, - const pixman_rectangle16_t *rects) +uint32_t +_pixman_image_get_solid (pixman_image_t * image, 
+ pixman_format_code_t format) { - pixman_image_t *solid; - pixman_color_t c; - int i; - - if (color->alpha == 0xffff) - { - if (op == PIXMAN_OP_OVER) - op = PIXMAN_OP_SRC; - } - - if (op == PIXMAN_OP_CLEAR) - { - c.red = 0; - c.green = 0; - c.blue = 0; - c.alpha = 0; - - color = &c; - - op = PIXMAN_OP_SRC; - } - - if (op == PIXMAN_OP_SRC) - { - uint32_t pixel; - - if (color_to_pixel (color, &pixel, dest->bits.format)) - { - for (i = 0; i < n_rects; ++i) - { - pixman_region32_t fill_region; - int n_boxes, j; - pixman_box32_t *boxes; - - pixman_region32_init_rect (&fill_region, rects[i].x, rects[i].y, rects[i].width, rects[i].height); - if (!pixman_region32_intersect (&fill_region, - &fill_region, - &dest->common.clip_region)) - return FALSE; - - - boxes = pixman_region32_rectangles (&fill_region, &n_boxes); - for (j = 0; j < n_boxes; ++j) - { - const pixman_box32_t *box = &(boxes[j]); - pixman_fill (dest->bits.bits, dest->bits.rowstride, PIXMAN_FORMAT_BPP (dest->bits.format), - box->x1, box->y1, box->x2 - box->x1, box->y2 - box->y1, - pixel); - } - - pixman_region32_fini (&fill_region); - } - return TRUE; - } - } + uint32_t result; - solid = pixman_image_create_solid_fill (color); - if (!solid) - return FALSE; + _pixman_image_get_scanline_32 (image, 0, 0, 1, &result, NULL, 0); - for (i = 0; i < n_rects; ++i) + /* If necessary, convert RGB <--> BGR. */ + if (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB) { - const pixman_rectangle16_t *rect = &(rects[i]); - - pixman_image_composite (op, solid, NULL, dest, - 0, 0, 0, 0, - rect->x, rect->y, - rect->width, rect->height); + result = (((result & 0xff000000) >> 0) | + ((result & 0x00ff0000) >> 16) | + ((result & 0x0000ff00) >> 0) | + ((result & 0x000000ff) << 16)); } - pixman_image_unref (solid); - - return TRUE; + return result; } pixman_bool_t -pixman_image_can_get_solid (pixman_image_t *image) +_pixman_image_is_opaque (pixman_image_t *image) { - if (image->type == SOLID) - return TRUE; - - if (image->type != BITS || - image->bits.width != 1 || - image->bits.height != 1) - { - return FALSE; - } + int i; - if (image->common.repeat != PIXMAN_REPEAT_NORMAL) + if (image->common.alpha_map) return FALSE; - switch (image->bits.format) + switch (image->type) { - case PIXMAN_a8r8g8b8: - case PIXMAN_x8r8g8b8: - case PIXMAN_a8b8g8r8: - case PIXMAN_x8b8g8r8: - case PIXMAN_b8g8r8a8: - case PIXMAN_b8g8r8x8: - case PIXMAN_r8g8b8: - case PIXMAN_b8g8r8: - case PIXMAN_r5g6b5: - case PIXMAN_b5g6r5: - return TRUE; - default: - return FALSE; - } -} + case BITS: + if (image->common.repeat == PIXMAN_REPEAT_NONE) + return FALSE; -pixman_bool_t -pixman_image_is_opaque(pixman_image_t *image) -{ - int i = 0; - int gradientNumberOfColors = 0; + if (PIXMAN_FORMAT_A (image->bits.format)) + return FALSE; + break; - if(image->common.alpha_map) - return FALSE; + case LINEAR: + case RADIAL: + if (image->common.repeat == PIXMAN_REPEAT_NONE) + return FALSE; - switch(image->type) - { - case BITS: - if(PIXMAN_FORMAT_A(image->bits.format)) - return FALSE; - break; + for (i = 0; i < image->gradient.n_stops; ++i) + { + if (image->gradient.stops[i].color.alpha != 0xffff) + return FALSE; + } + break; - case LINEAR: case CONICAL: - case RADIAL: - gradientNumberOfColors = image->gradient.n_stops; - i=0; - while(i<gradientNumberOfColors) - { - if(image->gradient.stops[i].color.alpha != 0xffff) - return FALSE; - i++; - } - break; + /* Conical gradients always have a transparent border */ + return FALSE; + break; case SOLID: - if(Alpha(image->solid.color) != 0xff) - return FALSE; - break; 
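_pixman_image_get_solid above reads one pixel through the ordinary a8r8g8b8 scanline fetch and then, for target formats whose type is not ARGB, swaps the red and blue bytes while leaving alpha and green in place; for instance 0xff336699 (ARGB) becomes 0xff996633. The same swizzle in isolation, with a worked check:

    #include <assert.h>
    #include <stdint.h>

    /* Swap R and B within an 0xAARRGGBB pixel; A and G are untouched. */
    static uint32_t
    swap_r_b (uint32_t argb)
    {
        return (argb & 0xff00ff00) |          /* keep A and G       */
               ((argb & 0x00ff0000) >> 16) |  /* R down to B's byte */
               ((argb & 0x000000ff) << 16);   /* B up to R's byte   */
    }

    int
    main (void)
    {
        assert (swap_r_b (0xff336699) == 0xff996633);
        return 0;
    }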
+ if (ALPHA_8 (image->solid.color) != 0xff) + return FALSE; + break; } - /* Convolution filters can introduce translucency if the sum of the weights - is lower than 1. */ + /* Convolution filters can introduce translucency if the sum of the + * weights is lower than 1. + */ if (image->common.filter == PIXMAN_FILTER_CONVOLUTION) - return FALSE; - - if (image->common.repeat == PIXMAN_REPEAT_NONE) - { - if (image->common.filter != PIXMAN_FILTER_NEAREST) - return FALSE; - - if (image->common.transform) - return FALSE; - - /* Gradients do not necessarily cover the entire compositing area */ - if (image->type == LINEAR || image->type == CONICAL || image->type == RADIAL) - return FALSE; - } + return FALSE; - return TRUE; + return TRUE; } + diff --git a/lib/pixman/pixman/pixman-implementation.c b/lib/pixman/pixman/pixman-implementation.c index 86c2f3773..bcda9fe85 100644 --- a/lib/pixman/pixman/pixman-implementation.c +++ b/lib/pixman/pixman/pixman-implementation.c @@ -21,142 +21,141 @@ * SOFTWARE. */ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif #include <stdlib.h> #include "pixman-private.h" static void -delegate_composite (pixman_implementation_t * imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +delegate_composite (pixman_implementation_t * imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { _pixman_implementation_composite (imp->delegate, - op, - src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height); + op, + src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height); } static void -delegate_combine_32 (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) +delegate_combine_32 (pixman_implementation_t * imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { _pixman_implementation_combine_32 (imp->delegate, - op, dest, src, mask, width); + op, dest, src, mask, width); } static void -delegate_combine_64 (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) +delegate_combine_64 (pixman_implementation_t * imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width) { _pixman_implementation_combine_64 (imp->delegate, - op, dest, src, mask, width); + op, dest, src, mask, width); } static void -delegate_combine_32_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) +delegate_combine_32_ca (pixman_implementation_t * imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { _pixman_implementation_combine_32_ca (imp->delegate, - op, dest, src, mask, width); + op, dest, src, mask, width); } static void -delegate_combine_64_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) +delegate_combine_64_ca (pixman_implementation_t * imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int 
width) { _pixman_implementation_combine_64_ca (imp->delegate, - op, dest, src, mask, width); + op, dest, src, mask, width); } static pixman_bool_t -delegate_blt (pixman_implementation_t * imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) +delegate_blt (pixman_implementation_t * imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) { - return _pixman_implementation_blt (imp->delegate, src_bits, dst_bits, src_stride, dst_stride, - src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, - width, height); + return _pixman_implementation_blt ( + imp->delegate, src_bits, dst_bits, src_stride, dst_stride, + src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, + width, height); } static pixman_bool_t delegate_fill (pixman_implementation_t *imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) { - return _pixman_implementation_fill (imp->delegate, bits, stride, bpp, x, y, width, height, xor); + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); } pixman_implementation_t * -_pixman_implementation_create (pixman_implementation_t *toplevel, - pixman_implementation_t *delegate) +_pixman_implementation_create (pixman_implementation_t *delegate) { pixman_implementation_t *imp = malloc (sizeof (pixman_implementation_t)); + pixman_implementation_t *d; int i; - + if (!imp) return NULL; - - if (toplevel) - imp->toplevel = toplevel; - else - imp->toplevel = imp; - - if (delegate) - delegate->toplevel = imp->toplevel; - + + /* Make sure the whole delegate chain has the right toplevel */ imp->delegate = delegate; - + for (d = imp; d != NULL; d = d->delegate) + d->toplevel = imp; + /* Fill out function pointers with ones that just delegate */ imp->composite = delegate_composite; imp->blt = delegate_blt; imp->fill = delegate_fill; - + for (i = 0; i < PIXMAN_OP_LAST; ++i) { imp->combine_32[i] = delegate_combine_32; @@ -164,105 +163,106 @@ _pixman_implementation_create (pixman_implementation_t *toplevel, imp->combine_32_ca[i] = delegate_combine_32_ca; imp->combine_64_ca[i] = delegate_combine_64_ca; } - + return imp; } void -_pixman_implementation_combine_32 (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width) +_pixman_implementation_combine_32 (pixman_implementation_t * imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { - (* imp->combine_32[op]) (imp, op, dest, src, mask, width); + (*imp->combine_32[op]) (imp, op, dest, src, mask, width); } void -_pixman_implementation_combine_64 (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) +_pixman_implementation_combine_64 (pixman_implementation_t * imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width) { - (* imp->combine_64[op]) (imp, op, dest, src, mask, width); + (*imp->combine_64[op]) (imp, op, dest, src, mask, width); } void -_pixman_implementation_combine_32_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const 
uint32_t * src, - const uint32_t * mask, - int width) +_pixman_implementation_combine_32_ca (pixman_implementation_t * imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { - (* imp->combine_32_ca[op]) (imp, op, dest, src, mask, width); + (*imp->combine_32_ca[op]) (imp, op, dest, src, mask, width); } void -_pixman_implementation_combine_64_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width) +_pixman_implementation_combine_64_ca (pixman_implementation_t * imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width) { - (* imp->combine_64_ca[op]) (imp, op, dest, src, mask, width); + (*imp->combine_64_ca[op]) (imp, op, dest, src, mask, width); } void -_pixman_implementation_composite (pixman_implementation_t * imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +_pixman_implementation_composite (pixman_implementation_t * imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { - (* imp->composite) (imp, op, - src, mask, dest, - src_x, src_y, mask_x, mask_y, dest_x, dest_y, - width, height); + (*imp->composite) (imp, op, + src, mask, dest, + src_x, src_y, mask_x, mask_y, dest_x, dest_y, + width, height); } pixman_bool_t -_pixman_implementation_blt (pixman_implementation_t * imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height) +_pixman_implementation_blt (pixman_implementation_t * imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) { - return (* imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride, - src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, - width, height); + return (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride, + src_bpp, dst_bpp, src_x, src_y, dst_x, dst_y, + width, height); } pixman_bool_t _pixman_implementation_fill (pixman_implementation_t *imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) { - return (* imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor); + return (*imp->fill) (imp, bits, stride, bpp, x, y, width, height, xor); } + diff --git a/lib/pixman/pixman/pixman-linear-gradient.c b/lib/pixman/pixman/pixman-linear-gradient.c index ea2975036..d9409fe50 100644 --- a/lib/pixman/pixman/pixman-linear-gradient.c +++ b/lib/pixman/pixman/pixman-linear-gradient.c @@ -24,29 +24,33 @@ * SOFTWARE. 
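_pixman_implementation_create above is what keeps the delegate chain consistent: each new level wraps its delegate and re-aims every toplevel pointer in the chain at itself, so operations invoked through imp->toplevel always start from the outermost (fastest) implementation, while the delegate_* stubs forward anything a level does not override to the next one down. The chain bookkeeping in isolation, with the struct trimmed to the two pointers:

    #include <stdlib.h>

    typedef struct impl impl_t;
    struct impl
    {
        impl_t *delegate;   /* next, more general implementation    */
        impl_t *toplevel;   /* outermost wrapper, used for dispatch */
    };

    /* Wrap `delegate`, making the new level the toplevel of the chain. */
    static impl_t *
    impl_create (impl_t *delegate)
    {
        impl_t *imp = malloc (sizeof *imp);
        impl_t *d;

        if (!imp)
            return NULL;

        imp->delegate = delegate;

        /* Make sure the whole delegate chain dispatches from here. */
        for (d = imp; d != NULL; d = d->delegate)
            d->toplevel = imp;

        return imp;
    }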
*/ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif #include <stdlib.h> #include "pixman-private.h" -static source_pict_class_t +static source_image_class_t linear_gradient_classify (pixman_image_t *image, - int x, - int y, - int width, - int height) + int x, + int y, + int width, + int height) { linear_gradient_t *linear = (linear_gradient_t *)image; - pixman_vector_t v; + pixman_vector_t v; pixman_fixed_32_32_t l; pixman_fixed_48_16_t dx, dy, a, b, off; pixman_fixed_48_16_t factors[4]; - int i; - + int i; + image->source.class = SOURCE_IMAGE_CLASS_UNKNOWN; - + dx = linear->p2.x - linear->p1.x; dy = linear->p2.y - linear->p1.y; + l = dx * dx + dy * dy; + if (l) { a = (dx << 32) / l; @@ -56,40 +60,45 @@ linear_gradient_classify (pixman_image_t *image, { a = b = 0; } - + off = (-a * linear->p1.x - -b * linear->p1.y) >> 16; - + -b * linear->p1.y) >> 16; + for (i = 0; i < 3; i++) { v.vector[0] = pixman_int_to_fixed ((i % 2) * (width - 1) + x); v.vector[1] = pixman_int_to_fixed ((i / 2) * (height - 1) + y); v.vector[2] = pixman_fixed_1; - + if (image->common.transform) { if (!pixman_transform_point_3d (image->common.transform, &v)) { image->source.class = SOURCE_IMAGE_CLASS_UNKNOWN; - + return image->source.class; } } - + factors[i] = ((a * v.vector[0] + b * v.vector[1]) >> 16) + off; } - + if (factors[2] == factors[0]) image->source.class = SOURCE_IMAGE_CLASS_HORIZONTAL; else if (factors[1] == factors[0]) image->source.class = SOURCE_IMAGE_CLASS_VERTICAL; - + return image->source.class; } static void -linear_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits) +linear_gradient_get_scanline_32 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { pixman_vector_t v, unit; pixman_fixed_32_32_t l; @@ -97,83 +106,102 @@ linear_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, gradient_t *gradient = (gradient_t *)image; source_image_t *source = (source_image_t *)image; linear_gradient_t *linear = (linear_gradient_t *)image; - uint32_t *end = buffer + width; - GradientWalker walker; - + uint32_t *end = buffer + width; + pixman_gradient_walker_t walker; + _pixman_gradient_walker_init (&walker, gradient, source->common.repeat); - + /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed(x) + pixman_fixed_1/2; - v.vector[1] = pixman_int_to_fixed(y) + pixman_fixed_1/2; + v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; v.vector[2] = pixman_fixed_1; - if (source->common.transform) { + + if (source->common.transform) + { if (!pixman_transform_point_3d (source->common.transform, &v)) return; + unit.vector[0] = source->common.transform->matrix[0][0]; unit.vector[1] = source->common.transform->matrix[1][0]; unit.vector[2] = source->common.transform->matrix[2][0]; - } else { + } + else + { unit.vector[0] = pixman_fixed_1; unit.vector[1] = 0; unit.vector[2] = 0; } - + dx = linear->p2.x - linear->p1.x; dy = linear->p2.y - linear->p1.y; - l = dx*dx + dy*dy; - if (l != 0) { + + l = dx * dx + dy * dy; + + if (l != 0) + { a = (dx << 32) / l; b = (dy << 32) / l; - off = (-a*linear->p1.x - b*linear->p1.y)>>16; + off = (-a * linear->p1.x + -b * linear->p1.y) >> 16; } - if (l == 0 || (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)) { + + if (l == 0 || (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)) + { pixman_fixed_48_16_t inc, t; + /* 
affine transformation only */ - if (l == 0) { + if (l == 0) + { t = 0; inc = 0; - } else { - t = ((a*v.vector[0] + b*v.vector[1]) >> 16) + off; + } + else + { + t = ((a * v.vector[0] + b * v.vector[1]) >> 16) + off; inc = (a * unit.vector[0] + b * unit.vector[1]) >> 16; } - + if (source->class == SOURCE_IMAGE_CLASS_VERTICAL) { register uint32_t color; - - color = _pixman_gradient_walker_pixel( &walker, t ); + + color = _pixman_gradient_walker_pixel (&walker, t); while (buffer < end) - *(buffer++) = color; + *buffer++ = color; } else { - if (!mask) { + if (!mask) + { while (buffer < end) { - *(buffer) = _pixman_gradient_walker_pixel (&walker, t); - buffer += 1; - t += inc; + *buffer++ = _pixman_gradient_walker_pixel (&walker, t); + + t += inc; } - } else { - while (buffer < end) { - if (*mask++ & maskBits) - { - *(buffer) = _pixman_gradient_walker_pixel (&walker, t); - } - buffer += 1; - t += inc; + } + else + { + while (buffer < end) + { + if (*mask++ & mask_bits) + *buffer = _pixman_gradient_walker_pixel (&walker, t); + + buffer++; + t += inc; } } } } - else /* projective transformation */ + else { + /* projective transformation */ pixman_fixed_48_16_t t; - + if (source->class == SOURCE_IMAGE_CLASS_VERTICAL) { register uint32_t color; - + if (v.vector[2] == 0) { t = 0; @@ -181,33 +209,39 @@ linear_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, else { pixman_fixed_48_16_t x, y; - + x = ((pixman_fixed_48_16_t) v.vector[0] << 16) / v.vector[2]; y = ((pixman_fixed_48_16_t) v.vector[1] << 16) / v.vector[2]; t = ((a * x + b * y) >> 16) + off; } - - color = _pixman_gradient_walker_pixel( &walker, t ); + + color = _pixman_gradient_walker_pixel (&walker, t); while (buffer < end) - *(buffer++) = color; + *buffer++ = color; } else { while (buffer < end) { - if (!mask || *mask++ & maskBits) + if (!mask || *mask++ & mask_bits) { - if (v.vector[2] == 0) { + if (v.vector[2] == 0) + { t = 0; - } else { + } + else + { pixman_fixed_48_16_t x, y; x = ((pixman_fixed_48_16_t)v.vector[0] << 16) / v.vector[2]; y = ((pixman_fixed_48_16_t)v.vector[1] << 16) / v.vector[2]; - t = ((a*x + b*y) >> 16) + off; + t = ((a * x + b * y) >> 16) + off; } - *(buffer) = _pixman_gradient_walker_pixel (&walker, t); + + *buffer = _pixman_gradient_walker_pixel (&walker, t); } + ++buffer; + v.vector[0] += unit.vector[0]; v.vector[1] += unit.vector[1]; v.vector[2] += unit.vector[2]; @@ -219,43 +253,42 @@ linear_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, static void linear_gradient_property_changed (pixman_image_t *image) { - image->common.get_scanline_32 = (scanFetchProc)linear_gradient_get_scanline_32; - image->common.get_scanline_64 = (scanFetchProc)_pixman_image_get_scanline_64_generic; + image->common.get_scanline_32 = linear_gradient_get_scanline_32; + image->common.get_scanline_64 = _pixman_image_get_scanline_generic_64; } PIXMAN_EXPORT pixman_image_t * -pixman_image_create_linear_gradient (pixman_point_fixed_t *p1, - pixman_point_fixed_t *p2, - const pixman_gradient_stop_t *stops, - int n_stops) +pixman_image_create_linear_gradient (pixman_point_fixed_t * p1, + pixman_point_fixed_t * p2, + const pixman_gradient_stop_t *stops, + int n_stops) { pixman_image_t *image; linear_gradient_t *linear; - + return_val_if_fail (n_stops >= 2, NULL); - - image = _pixman_image_allocate(); - + + image = _pixman_image_allocate (); + if (!image) return NULL; - + linear = &image->linear; - + if (!_pixman_init_gradient (&linear->common, stops, n_stops)) { free (image); return NULL; } - + 
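    /* The endpoints stored below define the gradient axis.  The scanline
     * code above projects each pixel onto it in fixed point: with
     * dx = p2.x - p1.x, dy = p2.y - p1.y and l = dx*dx + dy*dy,
     *     a = (dx << 32) / l,   b = (dy << 32) / l,
     *     t = ((a * x + b * y) >> 16) + off,
     * so t is 0 at p1 and pixman_fixed_1 at p2, and along an affine row it
     * advances by the constant (a * unit[0] + b * unit[1]) >> 16.
     */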
linear->p1 = *p1; linear->p2 = *p2; - + image->type = LINEAR; image->source.class = SOURCE_IMAGE_CLASS_UNKNOWN; image->common.classify = linear_gradient_classify; image->common.property_changed = linear_gradient_property_changed; - - linear_gradient_property_changed (image); - + return image; } + diff --git a/lib/pixman/pixman/pixman-matrix.c b/lib/pixman/pixman/pixman-matrix.c index 79dae8de1..abdfa0525 100644 --- a/lib/pixman/pixman/pixman-matrix.c +++ b/lib/pixman/pixman/pixman-matrix.c @@ -32,595 +32,737 @@ #include <string.h> #include "pixman-private.h" -#define F(x) pixman_int_to_fixed(x) +#define F(x) pixman_int_to_fixed (x) PIXMAN_EXPORT void -pixman_transform_init_identity(struct pixman_transform *matrix) +pixman_transform_init_identity (struct pixman_transform *matrix) { - int i; + int i; - memset(matrix, '\0', sizeof (struct pixman_transform)); - for (i = 0; i < 3; i++) - matrix->matrix[i][i] = F(1); + memset (matrix, '\0', sizeof (struct pixman_transform)); + for (i = 0; i < 3; i++) + matrix->matrix[i][i] = F (1); } -typedef pixman_fixed_32_32_t pixman_fixed_34_30_t; +typedef pixman_fixed_32_32_t pixman_fixed_34_30_t; PIXMAN_EXPORT pixman_bool_t -pixman_transform_point_3d(const struct pixman_transform *transform, - struct pixman_vector *vector) +pixman_transform_point_3d (const struct pixman_transform *transform, + struct pixman_vector * vector) { - struct pixman_vector result; - pixman_fixed_32_32_t partial; - pixman_fixed_48_16_t v; - int i, j; + struct pixman_vector result; + pixman_fixed_32_32_t partial; + pixman_fixed_48_16_t v; + int i, j; - for (j = 0; j < 3; j++) + for (j = 0; j < 3; j++) + { + v = 0; + for (i = 0; i < 3; i++) { - v = 0; - for (i = 0; i < 3; i++) - { - partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] * - (pixman_fixed_48_16_t) vector->vector[i]); - v += partial >> 16; - } - if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16) - return FALSE; - result.vector[j] = (pixman_fixed_t) v; + partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] * + (pixman_fixed_48_16_t) vector->vector[i]); + v += partial >> 16; } - *vector = result; - if (!result.vector[2]) - return FALSE; - return TRUE; + + if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16) + return FALSE; + + result.vector[j] = (pixman_fixed_t) v; + } + + *vector = result; + + if (!result.vector[2]) + return FALSE; + + return TRUE; } PIXMAN_EXPORT pixman_bool_t -pixman_transform_point(const struct pixman_transform *transform, - struct pixman_vector *vector) +pixman_transform_point (const struct pixman_transform *transform, + struct pixman_vector * vector) { - pixman_fixed_32_32_t partial; - pixman_fixed_34_30_t v[3]; - pixman_fixed_48_16_t quo; - int i, j; + pixman_fixed_32_32_t partial; + pixman_fixed_34_30_t v[3]; + pixman_fixed_48_16_t quo; + int i, j; - for (j = 0; j < 3; j++) - { - v[j] = 0; - for (i = 0; i < 3; i++) - { - partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] * - (pixman_fixed_32_32_t) vector->vector[i]); - v[j] += partial >> 2; - } - } - if (!(v[2] >> 16)) - return FALSE; - for (j = 0; j < 2; j++) + for (j = 0; j < 3; j++) + { + v[j] = 0; + + for (i = 0; i < 3; i++) { - quo = v[j] / (v[2] >> 16); - if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16) - return FALSE; - vector->vector[j] = (pixman_fixed_t) quo; + partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] * + (pixman_fixed_32_32_t) vector->vector[i]); + v[j] += partial >> 2; } - vector->vector[2] = pixman_fixed_1; - return TRUE; + } + + if (!(v[2] >> 16)) + return FALSE; + 
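    /* The v[] accumulators hold 34.30 fixed-point values: each 16.16 by
     * 16.16 partial product is 32.32, and the >> 2 above repacks it as
     * 34.30 before summing.  The loop below performs the homogeneous
     * divide: v[j] (30 fractional bits) over v[2] >> 16 (14 fractional
     * bits) leaves the 16 fractional bits of a pixman_fixed_t, which is
     * range-checked before being stored.
     */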
+ for (j = 0; j < 2; j++) + { + quo = v[j] / (v[2] >> 16); + if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16) + return FALSE; + vector->vector[j] = (pixman_fixed_t) quo; + } + + vector->vector[2] = pixman_fixed_1; + return TRUE; } PIXMAN_EXPORT pixman_bool_t -pixman_transform_multiply (struct pixman_transform *dst, - const struct pixman_transform *l, - const struct pixman_transform *r) -{ - struct pixman_transform d; - int dx, dy; - int o; - - for (dy = 0; dy < 3; dy++) - for (dx = 0; dx < 3; dx++) { - pixman_fixed_48_16_t v; - pixman_fixed_32_32_t partial; - v = 0; - for (o = 0; o < 3; o++) { - partial = (pixman_fixed_32_32_t) l->matrix[dy][o] * (pixman_fixed_32_32_t) r->matrix[o][dx]; - v += partial >> 16; - } - if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16) - return FALSE; - d.matrix[dy][dx] = (pixman_fixed_t) v; - } - *dst = d; - return TRUE; +pixman_transform_multiply (struct pixman_transform * dst, + const struct pixman_transform *l, + const struct pixman_transform *r) +{ + struct pixman_transform d; + int dx, dy; + int o; + + for (dy = 0; dy < 3; dy++) + { + for (dx = 0; dx < 3; dx++) + { + pixman_fixed_48_16_t v; + pixman_fixed_32_32_t partial; + + v = 0; + for (o = 0; o < 3; o++) + { + partial = + (pixman_fixed_32_32_t) l->matrix[dy][o] * + (pixman_fixed_32_32_t) r->matrix[o][dx]; + + v += partial >> 16; + } + + if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16) + return FALSE; + + d.matrix[dy][dx] = (pixman_fixed_t) v; + } + } + + *dst = d; + return TRUE; } PIXMAN_EXPORT void pixman_transform_init_scale (struct pixman_transform *t, - pixman_fixed_t sx, - pixman_fixed_t sy) + pixman_fixed_t sx, + pixman_fixed_t sy) { - memset (t, '\0', sizeof (struct pixman_transform)); - t->matrix[0][0] = sx; - t->matrix[1][1] = sy; - t->matrix[2][2] = F (1); + memset (t, '\0', sizeof (struct pixman_transform)); + + t->matrix[0][0] = sx; + t->matrix[1][1] = sy; + t->matrix[2][2] = F (1); } static pixman_fixed_t -fixed_inverse(pixman_fixed_t x) +fixed_inverse (pixman_fixed_t x) { - return (pixman_fixed_t) ((((pixman_fixed_48_16_t) F(1)) * F(1)) / x); + return (pixman_fixed_t) ((((pixman_fixed_48_16_t) F (1)) * F (1)) / x); } PIXMAN_EXPORT pixman_bool_t -pixman_transform_scale(struct pixman_transform *forward, - struct pixman_transform *reverse, - pixman_fixed_t sx, pixman_fixed_t sy) -{ - struct pixman_transform t; - - if (sx == 0 || sy == 0) - return FALSE; - - if (forward) { - pixman_transform_init_scale (&t, sx, sy); - if (!pixman_transform_multiply (forward, &t, forward)) - return FALSE; - } - if (reverse) { - pixman_transform_init_scale (&t, fixed_inverse (sx), - fixed_inverse (sy)); - if (!pixman_transform_multiply (reverse, reverse, &t)) - return FALSE; - } - return TRUE; +pixman_transform_scale (struct pixman_transform *forward, + struct pixman_transform *reverse, + pixman_fixed_t sx, + pixman_fixed_t sy) +{ + struct pixman_transform t; + + if (sx == 0 || sy == 0) + return FALSE; + + if (forward) + { + pixman_transform_init_scale (&t, sx, sy); + if (!pixman_transform_multiply (forward, &t, forward)) + return FALSE; + } + + if (reverse) + { + pixman_transform_init_scale (&t, fixed_inverse (sx), + fixed_inverse (sy)); + if (!pixman_transform_multiply (reverse, reverse, &t)) + return FALSE; + } + + return TRUE; } PIXMAN_EXPORT void -pixman_transform_init_rotate(struct pixman_transform *t, - pixman_fixed_t c, - pixman_fixed_t s) +pixman_transform_init_rotate (struct pixman_transform *t, + pixman_fixed_t c, + pixman_fixed_t s) { - memset(t, '\0', 
sizeof (struct pixman_transform)); - t->matrix[0][0] = c; - t->matrix[0][1] = -s; - t->matrix[1][0] = s; - t->matrix[1][1] = c; - t->matrix[2][2] = F (1); + memset (t, '\0', sizeof (struct pixman_transform)); + + t->matrix[0][0] = c; + t->matrix[0][1] = -s; + t->matrix[1][0] = s; + t->matrix[1][1] = c; + t->matrix[2][2] = F (1); } PIXMAN_EXPORT pixman_bool_t -pixman_transform_rotate(struct pixman_transform *forward, - struct pixman_transform *reverse, - pixman_fixed_t c, pixman_fixed_t s) -{ - struct pixman_transform t; - - if (forward) { - pixman_transform_init_rotate(&t, c, s); - if (!pixman_transform_multiply(forward, &t, forward)) - return FALSE; - } - - if (reverse) { - pixman_transform_init_rotate(&t, c, -s); - if (!pixman_transform_multiply (reverse, reverse, &t)) - return FALSE; - } - return TRUE; +pixman_transform_rotate (struct pixman_transform *forward, + struct pixman_transform *reverse, + pixman_fixed_t c, + pixman_fixed_t s) +{ + struct pixman_transform t; + + if (forward) + { + pixman_transform_init_rotate (&t, c, s); + if (!pixman_transform_multiply (forward, &t, forward)) + return FALSE; + } + + if (reverse) + { + pixman_transform_init_rotate (&t, c, -s); + if (!pixman_transform_multiply (reverse, reverse, &t)) + return FALSE; + } + + return TRUE; } PIXMAN_EXPORT void -pixman_transform_init_translate(struct pixman_transform *t, - pixman_fixed_t tx, pixman_fixed_t ty) +pixman_transform_init_translate (struct pixman_transform *t, + pixman_fixed_t tx, + pixman_fixed_t ty) { - memset(t, '\0', sizeof (struct pixman_transform)); - t->matrix[0][0] = F (1); - t->matrix[0][2] = tx; - t->matrix[1][1] = F (1); - t->matrix[1][2] = ty; - t->matrix[2][2] = F (1); + memset (t, '\0', sizeof (struct pixman_transform)); + + t->matrix[0][0] = F (1); + t->matrix[0][2] = tx; + t->matrix[1][1] = F (1); + t->matrix[1][2] = ty; + t->matrix[2][2] = F (1); } PIXMAN_EXPORT pixman_bool_t -pixman_transform_translate(struct pixman_transform *forward, - struct pixman_transform *reverse, - pixman_fixed_t tx, pixman_fixed_t ty) +pixman_transform_translate (struct pixman_transform *forward, + struct pixman_transform *reverse, + pixman_fixed_t tx, + pixman_fixed_t ty) { - struct pixman_transform t; + struct pixman_transform t; - if (forward) { - pixman_transform_init_translate(&t, tx, ty); - if (!pixman_transform_multiply(forward, &t, forward)) - return FALSE; - } + if (forward) + { + pixman_transform_init_translate (&t, tx, ty); - if (reverse) { - pixman_transform_init_translate(&t, -tx, -ty); - if (!pixman_transform_multiply(reverse, reverse, &t)) - return FALSE; - } - return TRUE; + if (!pixman_transform_multiply (forward, &t, forward)) + return FALSE; + } + + if (reverse) + { + pixman_transform_init_translate (&t, -tx, -ty); + + if (!pixman_transform_multiply (reverse, reverse, &t)) + return FALSE; + } + return TRUE; } PIXMAN_EXPORT pixman_bool_t -pixman_transform_bounds(const struct pixman_transform *matrix, - struct pixman_box16 *b) - -{ - struct pixman_vector v[4]; - int i; - int x1, y1, x2, y2; - - v[0].vector[0] = F (b->x1); v[0].vector[1] = F (b->y1); v[0].vector[2] = F(1); - v[1].vector[0] = F (b->x2); v[1].vector[1] = F (b->y1); v[1].vector[2] = F(1); - v[2].vector[0] = F (b->x2); v[2].vector[1] = F (b->y2); v[2].vector[2] = F(1); - v[3].vector[0] = F (b->x1); v[3].vector[1] = F (b->y2); v[3].vector[2] = F(1); - for (i = 0; i < 4; i++) +pixman_transform_bounds (const struct pixman_transform *matrix, + struct pixman_box16 * b) + +{ + struct pixman_vector v[4]; + int i; + int x1, y1, x2, y2; + + 
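    /*
     * The loop below maps all four corners of the box through the
     * matrix, then keeps the truncated integer parts of the minima and
     * pixman_fixed_ceil () of the maxima, so the integer box handed
     * back is a conservative cover of the transformed (possibly
     * rotated or sheared) quadrilateral.
     */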
v[0].vector[0] = F (b->x1); + v[0].vector[1] = F (b->y1); + v[0].vector[2] = F (1); + + v[1].vector[0] = F (b->x2); + v[1].vector[1] = F (b->y1); + v[1].vector[2] = F (1); + + v[2].vector[0] = F (b->x2); + v[2].vector[1] = F (b->y2); + v[2].vector[2] = F (1); + + v[3].vector[0] = F (b->x1); + v[3].vector[1] = F (b->y2); + v[3].vector[2] = F (1); + + for (i = 0; i < 4; i++) + { + if (!pixman_transform_point (matrix, &v[i])) + return FALSE; + + x1 = pixman_fixed_to_int (v[i].vector[0]); + y1 = pixman_fixed_to_int (v[i].vector[1]); + x2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[0])); + y2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[1])); + + if (i == 0) + { + b->x1 = x1; + b->y1 = y1; + b->x2 = x2; + b->y2 = y2; + } + else { - if (!pixman_transform_point(matrix, &v[i])) - return FALSE; - x1 = pixman_fixed_to_int(v[i].vector[0]); - y1 = pixman_fixed_to_int(v[i].vector[1]); - x2 = pixman_fixed_to_int(pixman_fixed_ceil (v[i].vector[0])); - y2 = pixman_fixed_to_int(pixman_fixed_ceil (v[i].vector[1])); - if (i == 0) - { - b->x1 = x1; b->y1 = y1; - b->x2 = x2; b->y2 = y2; - } - else - { - if (x1 < b->x1) b->x1 = x1; - if (y1 < b->y1) b->y1 = y1; - if (x2 > b->x2) b->x2 = x2; - if (y2 > b->y2) b->y2 = y2; - } + if (x1 < b->x1) b->x1 = x1; + if (y1 < b->y1) b->y1 = y1; + if (x2 > b->x2) b->x2 = x2; + if (y2 > b->y2) b->y2 = y2; } - return TRUE; + } + + return TRUE; } PIXMAN_EXPORT pixman_bool_t -pixman_transform_invert (struct pixman_transform *dst, - const struct pixman_transform *src) +pixman_transform_invert (struct pixman_transform * dst, + const struct pixman_transform *src) { - struct pixman_f_transform m, r; + struct pixman_f_transform m, r; - pixman_f_transform_from_pixman_transform (&m, src); - if (!pixman_f_transform_invert (&r, &m)) - return FALSE; - if (!pixman_transform_from_pixman_f_transform (dst, &r)) - return FALSE; - return TRUE; + pixman_f_transform_from_pixman_transform (&m, src); + + if (!pixman_f_transform_invert (&r, &m)) + return FALSE; + + if (!pixman_transform_from_pixman_f_transform (dst, &r)) + return FALSE; + + return TRUE; } static pixman_bool_t -within_epsilon(pixman_fixed_t a, pixman_fixed_t b, pixman_fixed_t epsilon) +within_epsilon (pixman_fixed_t a, + pixman_fixed_t b, + pixman_fixed_t epsilon) { - pixman_fixed_t t = a - b; - if (t < 0) t = -t; - return t <= epsilon; + pixman_fixed_t t = a - b; + + if (t < 0) + t = -t; + + return t <= epsilon; } -#define epsilon (pixman_fixed_t) (2) +#define EPSILON (pixman_fixed_t) (2) -#define is_same(a,b) (within_epsilon(a, b, epsilon)) -#define is_zero(a) (within_epsilon(a, 0, epsilon)) -#define is_one(a) (within_epsilon(a, F(1), epsilon)) -#define is_unit(a) (within_epsilon(a, F( 1), epsilon) || \ - within_epsilon(a, F(-1), epsilon) || \ - is_zero(a)) -#define is_int(a) (is_zero(pixman_fixed_frac(a))) +#define IS_SAME(a, b) (within_epsilon (a, b, EPSILON)) +#define IS_ZERO(a) (within_epsilon (a, 0, EPSILON)) +#define IS_ONE(a) (within_epsilon (a, F (1), EPSILON)) +#define IS_UNIT(a) \ + (within_epsilon (a, F (1), EPSILON) || \ + within_epsilon (a, F (-1), EPSILON) || \ + IS_ZERO (a)) +#define IS_INT(a) (IS_ZERO (pixman_fixed_frac (a))) PIXMAN_EXPORT pixman_bool_t -pixman_transform_is_identity(const struct pixman_transform *t) +pixman_transform_is_identity (const struct pixman_transform *t) { - return ( is_same(t->matrix[0][0], t->matrix[1][1]) && - is_same(t->matrix[0][0], t->matrix[2][2]) && - !is_zero(t->matrix[0][0]) && - is_zero(t->matrix[0][1]) && - is_zero(t->matrix[0][2]) && - 
is_zero(t->matrix[1][0]) && - is_zero(t->matrix[1][2]) && - is_zero(t->matrix[2][0]) && - is_zero(t->matrix[2][1])); + return (IS_SAME (t->matrix[0][0], t->matrix[1][1]) && + IS_SAME (t->matrix[0][0], t->matrix[2][2]) && + !IS_ZERO (t->matrix[0][0]) && + IS_ZERO (t->matrix[0][1]) && + IS_ZERO (t->matrix[0][2]) && + IS_ZERO (t->matrix[1][0]) && + IS_ZERO (t->matrix[1][2]) && + IS_ZERO (t->matrix[2][0]) && + IS_ZERO (t->matrix[2][1])); } PIXMAN_EXPORT pixman_bool_t -pixman_transform_is_scale(const struct pixman_transform *t) +pixman_transform_is_scale (const struct pixman_transform *t) { - return (!is_zero(t->matrix[0][0]) && - is_zero(t->matrix[0][1]) && - is_zero(t->matrix[0][2]) && + return (!IS_ZERO (t->matrix[0][0]) && + IS_ZERO (t->matrix[0][1]) && + IS_ZERO (t->matrix[0][2]) && - is_zero(t->matrix[1][0]) && - !is_zero(t->matrix[1][1]) && - is_zero(t->matrix[1][2]) && + IS_ZERO (t->matrix[1][0]) && + !IS_ZERO (t->matrix[1][1]) && + IS_ZERO (t->matrix[1][2]) && - is_zero(t->matrix[2][0]) && - is_zero(t->matrix[2][1]) && - !is_zero(t->matrix[2][2])); + IS_ZERO (t->matrix[2][0]) && + IS_ZERO (t->matrix[2][1]) && + !IS_ZERO (t->matrix[2][2])); } PIXMAN_EXPORT pixman_bool_t -pixman_transform_is_int_translate(const struct pixman_transform *t) +pixman_transform_is_int_translate (const struct pixman_transform *t) { - return (is_one (t->matrix[0][0]) && - is_zero(t->matrix[0][1]) && - is_int (t->matrix[0][2]) && + return (IS_ONE (t->matrix[0][0]) && + IS_ZERO (t->matrix[0][1]) && + IS_INT (t->matrix[0][2]) && - is_zero(t->matrix[1][0]) && - is_one (t->matrix[1][1]) && - is_int (t->matrix[1][2]) && + IS_ZERO (t->matrix[1][0]) && + IS_ONE (t->matrix[1][1]) && + IS_INT (t->matrix[1][2]) && - is_zero(t->matrix[2][0]) && - is_zero(t->matrix[2][1]) && - is_one (t->matrix[2][2])); + IS_ZERO (t->matrix[2][0]) && + IS_ZERO (t->matrix[2][1]) && + IS_ONE (t->matrix[2][2])); } PIXMAN_EXPORT pixman_bool_t -pixman_transform_is_inverse(const struct pixman_transform *a, - const struct pixman_transform *b) +pixman_transform_is_inverse (const struct pixman_transform *a, + const struct pixman_transform *b) { - struct pixman_transform t; + struct pixman_transform t; - pixman_transform_multiply(&t, a, b); - return pixman_transform_is_identity(&t); + pixman_transform_multiply (&t, a, b); + + return pixman_transform_is_identity (&t); } PIXMAN_EXPORT void -pixman_f_transform_from_pixman_transform (struct pixman_f_transform *ft, - const struct pixman_transform *t) +pixman_f_transform_from_pixman_transform (struct pixman_f_transform * ft, + const struct pixman_transform *t) { - int i, j; + int i, j; - for (j = 0; j < 3; j++) - for (i = 0; i < 3; i++) - ft->m[j][i] = pixman_fixed_to_double (t->matrix[j][i]); + for (j = 0; j < 3; j++) + { + for (i = 0; i < 3; i++) + ft->m[j][i] = pixman_fixed_to_double (t->matrix[j][i]); + } } PIXMAN_EXPORT pixman_bool_t -pixman_transform_from_pixman_f_transform (struct pixman_transform *t, - const struct pixman_f_transform *ft) +pixman_transform_from_pixman_f_transform (struct pixman_transform * t, + const struct pixman_f_transform *ft) { - int i, j; + int i, j; - for (j = 0; j < 3; j++) - for (i = 0; i < 3; i++) - { - double d = ft->m[j][i]; - if (d < -32767.0 || d > 32767.0) - return FALSE; - d = d * 65536.0 + 0.5; - t->matrix[j][i] = (pixman_fixed_t) floor (d); - } - return TRUE; + for (j = 0; j < 3; j++) + { + for (i = 0; i < 3; i++) + { + double d = ft->m[j][i]; + if (d < -32767.0 || d > 32767.0) + return FALSE; + d = d * 65536.0 + 0.5; + t->matrix[j][i] = (pixman_fixed_t) floor 
(d); + } + } + + return TRUE; } -static const int a[3] = { 3, 3, 2 }; -static const int b[3] = { 2, 1, 1 }; +static const int a[3] = { 3, 3, 2 }; +static const int b[3] = { 2, 1, 1 }; PIXMAN_EXPORT pixman_bool_t -pixman_f_transform_invert(struct pixman_f_transform *dst, - const struct pixman_f_transform *src) -{ - double det; - int i, j; - static int a[3] = { 2, 2, 1 }; - static int b[3] = { 1, 0, 0 }; - - det = 0; - for (i = 0; i < 3; i++) { - double p; - int ai = a[i]; - int bi = b[i]; - p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] - - src->m[ai][1] * src->m[bi][2]); - if (i == 1) - p = -p; - det += p; - } - if (det == 0) - return FALSE; - det = 1/det; - for (j = 0; j < 3; j++) { - for (i = 0; i < 3; i++) { - double p; - int ai = a[i]; - int aj = a[j]; - int bi = b[i]; - int bj = b[j]; - - p = (src->m[ai][aj] * src->m[bi][bj] - - src->m[ai][bj] * src->m[bi][aj]); - if (((i + j) & 1) != 0) - p = -p; - dst->m[j][i] = det * p; - } +pixman_f_transform_invert (struct pixman_f_transform * dst, + const struct pixman_f_transform *src) +{ + double det; + int i, j; + static int a[3] = { 2, 2, 1 }; + static int b[3] = { 1, 0, 0 }; + + det = 0; + for (i = 0; i < 3; i++) + { + double p; + int ai = a[i]; + int bi = b[i]; + p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] - + src->m[ai][1] * src->m[bi][2]); + if (i == 1) + p = -p; + det += p; + } + + if (det == 0) + return FALSE; + + det = 1 / det; + for (j = 0; j < 3; j++) + { + for (i = 0; i < 3; i++) + { + double p; + int ai = a[i]; + int aj = a[j]; + int bi = b[i]; + int bj = b[j]; + + p = (src->m[ai][aj] * src->m[bi][bj] - + src->m[ai][bj] * src->m[bi][aj]); + + if (((i + j) & 1) != 0) + p = -p; + + dst->m[j][i] = det * p; } - return TRUE; + } + + return TRUE; } PIXMAN_EXPORT pixman_bool_t -pixman_f_transform_point(const struct pixman_f_transform *t, - struct pixman_f_vector *v) +pixman_f_transform_point (const struct pixman_f_transform *t, + struct pixman_f_vector * v) { - struct pixman_f_vector result; - int i, j; - double a; + struct pixman_f_vector result; + int i, j; + double a; - for (j = 0; j < 3; j++) - { - a = 0; - for (i = 0; i < 3; i++) - a += t->m[j][i] * v->v[i]; - result.v[j] = a; - } - if (!result.v[2]) - return FALSE; - for (j = 0; j < 2; j++) - v->v[j] = result.v[j] / result.v[2]; - v->v[2] = 1; - return TRUE; + for (j = 0; j < 3; j++) + { + a = 0; + for (i = 0; i < 3; i++) + a += t->m[j][i] * v->v[i]; + result.v[j] = a; + } + + if (!result.v[2]) + return FALSE; + + for (j = 0; j < 2; j++) + v->v[j] = result.v[j] / result.v[2]; + + v->v[2] = 1; + + return TRUE; } PIXMAN_EXPORT void -pixman_f_transform_point_3d(const struct pixman_f_transform *t, - struct pixman_f_vector *v) +pixman_f_transform_point_3d (const struct pixman_f_transform *t, + struct pixman_f_vector * v) { - struct pixman_f_vector result; - int i, j; - double a; + struct pixman_f_vector result; + int i, j; + double a; - for (j = 0; j < 3; j++) - { - a = 0; - for (i = 0; i < 3; i++) - a += t->m[j][i] * v->v[i]; - result.v[j] = a; - } - *v = result; + for (j = 0; j < 3; j++) + { + a = 0; + for (i = 0; i < 3; i++) + a += t->m[j][i] * v->v[i]; + result.v[j] = a; + } + + *v = result; } PIXMAN_EXPORT void -pixman_f_transform_multiply(struct pixman_f_transform *dst, - const struct pixman_f_transform *l, - const struct pixman_f_transform *r) +pixman_f_transform_multiply (struct pixman_f_transform * dst, + const struct pixman_f_transform *l, + const struct pixman_f_transform *r) { - struct pixman_f_transform d; - int dx, dy; - int o; + struct pixman_f_transform d; + int 
dx, dy; + int o; - for (dy = 0; dy < 3; dy++) - for (dx = 0; dx < 3; dx++) - { - double v = 0; - for (o = 0; o < 3; o++) - v += l->m[dy][o] * r->m[o][dx]; - d.m[dy][dx] = v; - } - *dst = d; + for (dy = 0; dy < 3; dy++) + { + for (dx = 0; dx < 3; dx++) + { + double v = 0; + for (o = 0; o < 3; o++) + v += l->m[dy][o] * r->m[o][dx]; + d.m[dy][dx] = v; + } + } + + *dst = d; } PIXMAN_EXPORT void -pixman_f_transform_init_scale (struct pixman_f_transform *t, double sx, double sy) +pixman_f_transform_init_scale (struct pixman_f_transform *t, + double sx, + double sy) { - t->m[0][0] = sx; t->m[0][1] = 0; t->m[0][2] = 0; - t->m[1][0] = 0; t->m[1][1] = sy; t->m[1][2] = 0; - t->m[2][0] = 0; t->m[2][1] = 0; t->m[2][2] = 1; + t->m[0][0] = sx; + t->m[0][1] = 0; + t->m[0][2] = 0; + t->m[1][0] = 0; + t->m[1][1] = sy; + t->m[1][2] = 0; + t->m[2][0] = 0; + t->m[2][1] = 0; + t->m[2][2] = 1; } PIXMAN_EXPORT pixman_bool_t pixman_f_transform_scale (struct pixman_f_transform *forward, - struct pixman_f_transform *reverse, - double sx, double sy) -{ - struct pixman_f_transform t; - - if (sx == 0 || sy == 0) - return FALSE; - - if (forward) { - pixman_f_transform_init_scale (&t, sx, sy); - pixman_f_transform_multiply (forward, &t, forward); - } - if (reverse) { - pixman_f_transform_init_scale (&t, 1/sx, 1/sy); - pixman_f_transform_multiply (reverse, reverse, &t); - } - return TRUE; + struct pixman_f_transform *reverse, + double sx, + double sy) +{ + struct pixman_f_transform t; + + if (sx == 0 || sy == 0) + return FALSE; + + if (forward) + { + pixman_f_transform_init_scale (&t, sx, sy); + pixman_f_transform_multiply (forward, &t, forward); + } + + if (reverse) + { + pixman_f_transform_init_scale (&t, 1 / sx, 1 / sy); + pixman_f_transform_multiply (reverse, reverse, &t); + } + + return TRUE; } PIXMAN_EXPORT void -pixman_f_transform_init_rotate (struct pixman_f_transform *t, double c, double s) +pixman_f_transform_init_rotate (struct pixman_f_transform *t, + double c, + double s) { - t->m[0][0] = c; t->m[0][1] = -s; t->m[0][2] = 0; - t->m[1][0] = s; t->m[1][1] = c; t->m[1][2] = 0; - t->m[2][0] = 0; t->m[2][1] = 0; t->m[2][2] = 1; + t->m[0][0] = c; + t->m[0][1] = -s; + t->m[0][2] = 0; + t->m[1][0] = s; + t->m[1][1] = c; + t->m[1][2] = 0; + t->m[2][0] = 0; + t->m[2][1] = 0; + t->m[2][2] = 1; } PIXMAN_EXPORT pixman_bool_t pixman_f_transform_rotate (struct pixman_f_transform *forward, - struct pixman_f_transform *reverse, - double c, double s) + struct pixman_f_transform *reverse, + double c, + double s) { - struct pixman_f_transform t; + struct pixman_f_transform t; - if (forward) { - pixman_f_transform_init_rotate (&t, c, s); - pixman_f_transform_multiply (forward, &t, forward); - } - if (reverse) { - pixman_f_transform_init_rotate (&t, c, -s); - pixman_f_transform_multiply (reverse, reverse, &t); - } - return TRUE; + if (forward) + { + pixman_f_transform_init_rotate (&t, c, s); + pixman_f_transform_multiply (forward, &t, forward); + } + + if (reverse) + { + pixman_f_transform_init_rotate (&t, c, -s); + pixman_f_transform_multiply (reverse, reverse, &t); + } + + return TRUE; } PIXMAN_EXPORT void -pixman_f_transform_init_translate (struct pixman_f_transform *t, double tx, double ty) +pixman_f_transform_init_translate (struct pixman_f_transform *t, + double tx, + double ty) { - t->m[0][0] = 1; t->m[0][1] = 0; t->m[0][2] = tx; - t->m[1][0] = 0; t->m[1][1] = 1; t->m[1][2] = ty; - t->m[2][0] = 0; t->m[2][1] = 0; t->m[2][2] = 1; + t->m[0][0] = 1; + t->m[0][1] = 0; + t->m[0][2] = tx; + t->m[1][0] = 0; + t->m[1][1] = 1; + 
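    /* tx and ty land in the third column because pixman applies its
     * matrices to column vectors: [x' y' w']^T = M [x y 1]^T, as in
     * pixman_f_transform_point_3d () above. */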
t->m[1][2] = ty; + t->m[2][0] = 0; + t->m[2][1] = 0; + t->m[2][2] = 1; } PIXMAN_EXPORT pixman_bool_t pixman_f_transform_translate (struct pixman_f_transform *forward, - struct pixman_f_transform *reverse, - double tx, double ty) + struct pixman_f_transform *reverse, + double tx, + double ty) { - struct pixman_f_transform t; + struct pixman_f_transform t; - if (forward) { - pixman_f_transform_init_translate (&t, tx, ty); - pixman_f_transform_multiply (forward, &t, forward); - } - if (reverse) { - pixman_f_transform_init_translate (&t, -tx, -ty); - pixman_f_transform_multiply (reverse, reverse, &t); - } - return TRUE; + if (forward) + { + pixman_f_transform_init_translate (&t, tx, ty); + pixman_f_transform_multiply (forward, &t, forward); + } + + if (reverse) + { + pixman_f_transform_init_translate (&t, -tx, -ty); + pixman_f_transform_multiply (reverse, reverse, &t); + } + + return TRUE; } PIXMAN_EXPORT pixman_bool_t -pixman_f_transform_bounds(const struct pixman_f_transform *t, struct pixman_box16 *b) -{ - struct pixman_f_vector v[4]; - int i; - int x1, y1, x2, y2; - - v[0].v[0] = b->x1; v[0].v[1] = b->y1; v[0].v[2] = 1; - v[1].v[0] = b->x2; v[1].v[1] = b->y1; v[1].v[2] = 1; - v[2].v[0] = b->x2; v[2].v[1] = b->y2; v[2].v[2] = 1; - v[3].v[0] = b->x1; v[3].v[1] = b->y2; v[3].v[2] = 1; - for (i = 0; i < 4; i++) +pixman_f_transform_bounds (const struct pixman_f_transform *t, + struct pixman_box16 * b) +{ + struct pixman_f_vector v[4]; + int i; + int x1, y1, x2, y2; + + v[0].v[0] = b->x1; + v[0].v[1] = b->y1; + v[0].v[2] = 1; + v[1].v[0] = b->x2; + v[1].v[1] = b->y1; + v[1].v[2] = 1; + v[2].v[0] = b->x2; + v[2].v[1] = b->y2; + v[2].v[2] = 1; + v[3].v[0] = b->x1; + v[3].v[1] = b->y2; + v[3].v[2] = 1; + + for (i = 0; i < 4; i++) + { + if (!pixman_f_transform_point (t, &v[i])) + return FALSE; + + x1 = floor (v[i].v[0]); + y1 = floor (v[i].v[1]); + x2 = ceil (v[i].v[0]); + y2 = ceil (v[i].v[1]); + + if (i == 0) + { + b->x1 = x1; + b->y1 = y1; + b->x2 = x2; + b->y2 = y2; + } + else { - if (!pixman_f_transform_point (t, &v[i])) - return FALSE; - x1 = floor (v[i].v[0]); - y1 = floor (v[i].v[1]); - x2 = ceil (v[i].v[0]); - y2 = ceil (v[i].v[1]); - if (i == 0) - { - b->x1 = x1; b->y1 = y1; - b->x2 = x2; b->y2 = y2; - } - else - { - if (x1 < b->x1) b->x1 = x1; - if (y1 < b->y1) b->y1 = y1; - if (x2 > b->x2) b->x2 = x2; - if (y2 > b->y2) b->y2 = y2; - } + if (x1 < b->x1) b->x1 = x1; + if (y1 < b->y1) b->y1 = y1; + if (x2 > b->x2) b->x2 = x2; + if (y2 > b->y2) b->y2 = y2; } - return TRUE; + } + + return TRUE; } PIXMAN_EXPORT void pixman_f_transform_init_identity (struct pixman_f_transform *t) { - int i, j; + int i, j; - for (j = 0; j < 3; j++) - for (i = 0; i < 3; i++) - t->m[j][i] = i == j ? 1 : 0; + for (j = 0; j < 3; j++) + { + for (i = 0; i < 3; i++) + t->m[j][i] = i == j ? 
1 : 0; + } } diff --git a/lib/pixman/pixman/pixman-mmx.c b/lib/pixman/pixman/pixman-mmx.c index db87b1987..7dcc1dc96 100644 --- a/lib/pixman/pixman/pixman-mmx.c +++ b/lib/pixman/pixman/pixman-mmx.c @@ -37,11 +37,12 @@ #include <mmintrin.h> #include "pixman-private.h" +#include "pixman-combine32.h" -#define noVERBOSE +#define no_vERBOSE #ifdef VERBOSE -#define CHECKPOINT() ErrorF ("at %s %d\n", __FUNCTION__, __LINE__) +#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__) #else #define CHECKPOINT() #endif @@ -97,43 +98,43 @@ typedef struct mmxdatafield mmx_ffff0000ffff0000; mmxdatafield mmx_0000ffff00000000; mmxdatafield mmx_000000000000ffff; -} MMXData; +} mmx_data_t; #if defined(_MSC_VER) -# define MMXDATA_INIT(field, val) { val##UI64 } -#elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ -# define MMXDATA_INIT(field, val) field = { val##ULL } -#else /* __m64 is an integral type */ -# define MMXDATA_INIT(field, val) field = val##ULL +# define MMXDATA_INIT(field, val) { val ## UI64 } +#elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ +# define MMXDATA_INIT(field, val) field = { val ## ULL } +#else /* __m64 is an integral type */ +# define MMXDATA_INIT(field, val) field = val ## ULL #endif -static const MMXData c = -{ - MMXDATA_INIT(.mmx_4x00ff, 0x00ff00ff00ff00ff), - MMXDATA_INIT(.mmx_4x0080, 0x0080008000800080), - MMXDATA_INIT(.mmx_565_rgb, 0x000001f0003f001f), - MMXDATA_INIT(.mmx_565_unpack_multiplier, 0x0000008404100840), - MMXDATA_INIT(.mmx_565_r, 0x000000f800000000), - MMXDATA_INIT(.mmx_565_g, 0x0000000000fc0000), - MMXDATA_INIT(.mmx_565_b, 0x00000000000000f8), - MMXDATA_INIT(.mmx_mask_0, 0xffffffffffff0000), - MMXDATA_INIT(.mmx_mask_1, 0xffffffff0000ffff), - MMXDATA_INIT(.mmx_mask_2, 0xffff0000ffffffff), - MMXDATA_INIT(.mmx_mask_3, 0x0000ffffffffffff), - MMXDATA_INIT(.mmx_full_alpha, 0x00ff000000000000), - MMXDATA_INIT(.mmx_ffff0000ffff0000, 0xffff0000ffff0000), - MMXDATA_INIT(.mmx_0000ffff00000000, 0x0000ffff00000000), - MMXDATA_INIT(.mmx_000000000000ffff, 0x000000000000ffff), +static const mmx_data_t c = +{ + MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff), + MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), + MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), + MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), + MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), + MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), + MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), + MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), + MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), + MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff), + MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff), + MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000), + MMXDATA_INIT (.mmx_ffff0000ffff0000, 0xffff0000ffff0000), + MMXDATA_INIT (.mmx_0000ffff00000000, 0x0000ffff00000000), + MMXDATA_INIT (.mmx_000000000000ffff, 0x000000000000ffff), }; #ifdef __GNUC__ # ifdef __ICC -# define MC(x) M64(c.mmx_##x) +# define MC(x) M64 (c.mmx_ ## x) # else -# define MC(x) ((__m64)c.mmx_##x) +# define MC(x) ((__m64)c.mmx_ ## x) # endif #else -# define MC(x) c.mmx_##x +# define MC(x) c.mmx_ ## x #endif static force_inline __m64 @@ -141,12 +142,12 @@ M64 (uint64_t x) { #ifdef __ICC return _mm_cvtsi64_m64 (x); -#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ +#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ __m64 res; res.M64_MEMBER = x; return res; -#else /* __m64 is an integral type */ +#else /* __m64 is an integral type */ return (__m64)x; #endif } 
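The pix_multiply () helper in the hunks below performs the classic exact
rounded divide-by-255: form t = a * b + 0x80 in each 16-bit lane, add t's
own high byte back in, and shift right by 8.  A standalone scalar sketch of
the same identity -- mul_un8 is an illustrative name for this note, the
in-tree equivalent being the MUL_UN8 macro from pixman-combine32.h -- which
can be checked exhaustively:

#include <assert.h>
#include <stdint.h>

/* Scalar model of pix_multiply ()'s rounding: with t = a * b + 0x80,
 * (t + (t >> 8)) >> 8 equals the exactly rounded a * b / 255 for all
 * 8-bit a and b (255 is odd, so no product lands exactly halfway). */
static uint8_t
mul_un8 (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t)a * b + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}

int
main (void)
{
    unsigned a, b;

    for (a = 0; a < 256; a++)
    {
	for (b = 0; b < 256; b++)
	    assert (mul_un8 (a, b) == (a * b + 127) / 255);
    }

    return 0;
}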
@@ -156,16 +157,17 @@ UINT64 (__m64 x) { #ifdef __ICC return _mm_cvtm64_si64 (x); -#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ +#elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ uint64_t res = x.M64_MEMBER; return res; -#else /* __m64 is an integral type */ +#else /* __m64 is an integral type */ return (uint64_t)x; #endif } static force_inline __m64 -shift (__m64 v, int s) +shift (__m64 v, + int s) { if (s > 0) return _mm_slli_si64 (v, s); @@ -178,7 +180,7 @@ shift (__m64 v, int s) static force_inline __m64 negate (__m64 mask) { - return _mm_xor_si64 (mask, MC(4x00ff)); + return _mm_xor_si64 (mask, MC (4x00ff)); } static force_inline __m64 @@ -187,7 +189,7 @@ pix_multiply (__m64 a, __m64 b) __m64 res; res = _mm_mullo_pi16 (a, b); - res = _mm_adds_pu16 (res, MC(4x0080)); + res = _mm_adds_pu16 (res, MC (4x0080)); res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8)); res = _mm_srli_pi16 (res, 8); @@ -197,7 +199,7 @@ pix_multiply (__m64 a, __m64 b) static force_inline __m64 pix_add (__m64 a, __m64 b) { - return _mm_adds_pu8 (a, b); + return _mm_adds_pu8 (a, b); } static force_inline __m64 @@ -238,9 +240,9 @@ invert_colors (__m64 pixel) x = y = z = pixel; - x = _mm_and_si64 (x, MC(ffff0000ffff0000)); - y = _mm_and_si64 (y, MC(000000000000ffff)); - z = _mm_and_si64 (z, MC(0000ffff00000000)); + x = _mm_and_si64 (x, MC (ffff0000ffff0000)); + y = _mm_and_si64 (y, MC (000000000000ffff)); + z = _mm_and_si64 (z, MC (0000ffff00000000)); y = shift (y, 32); z = shift (z, -32); @@ -252,23 +254,24 @@ invert_colors (__m64 pixel) } static force_inline __m64 -over (__m64 src, __m64 srca, __m64 dest) +over (__m64 src, + __m64 srca, + __m64 dest) { - return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); + return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca))); } static force_inline __m64 over_rev_non_pre (__m64 src, __m64 dest) { __m64 srca = expand_alpha (src); - __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); + __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha)); - return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); + return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest); } static force_inline __m64 -in (__m64 src, - __m64 mask) +in (__m64 src, __m64 mask) { return pix_multiply (src, mask); } @@ -276,28 +279,29 @@ in (__m64 src, static force_inline __m64 in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest) { - src = _mm_or_si64 (src, MC(full_alpha)); + src = _mm_or_si64 (src, MC (full_alpha)); - return over(in (src, mask), mask, dest); + return over (in (src, mask), mask, dest); } #ifndef _MSC_VER static force_inline __m64 -in_over (__m64 src, - __m64 srca, - __m64 mask, - __m64 dest) +in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) { - return over(in(src, mask), pix_multiply(srca, mask), dest); + return over (in (src, mask), pix_multiply (srca, mask), dest); } + #else -#define in_over(src, srca, mask, dest) over(in(src, mask), pix_multiply(srca, mask), dest) + +#define in_over(src, srca, mask, dest) \ + over (in (src, mask), pix_multiply (srca, mask), dest) + #endif static force_inline __m64 load8888 (uint32_t v) { - return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ()); } static force_inline __m64 @@ -309,7 +313,7 @@ pack8888 (__m64 lo, __m64 hi) static force_inline uint32_t store8888 (__m64 v) { - return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64())); + return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ())); 
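    /* load8888 () widens an a8r8g8b8 pixel into four 16-bit lanes of
     * an __m64; store8888 () narrows back through a saturating
     * pack8888 ().  Keeping each channel in the low byte of a 16-bit
     * lane is what lets pix_multiply () do all four channel products
     * in a single _mm_mullo_pi16. */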
} /* Expand 16 bits positioned at @pos (0-3) of a mmx register into @@ -340,9 +344,9 @@ expand565 (__m64 pixel, int pos) p = _mm_or_si64 (t1, p); p = _mm_or_si64 (t2, p); - p = _mm_and_si64 (p, MC(565_rgb)); + p = _mm_and_si64 (p, MC (565_rgb)); - pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); + pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier)); return _mm_srli_pi16 (pixel, 8); } @@ -350,40 +354,40 @@ static force_inline __m64 expand8888 (__m64 in, int pos) { if (pos == 0) - return _mm_unpacklo_pi8 (in, _mm_setzero_si64()); + return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ()); else - return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); + return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ()); } static force_inline __m64 expandx888 (__m64 in, int pos) { - return _mm_or_si64 (expand8888 (in, pos), MC(full_alpha)); + return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); } static force_inline __m64 -pack565 (__m64 pixel, __m64 target, int pos) +pack_565 (__m64 pixel, __m64 target, int pos) { __m64 p = pixel; __m64 t = target; __m64 r, g, b; - r = _mm_and_si64 (p, MC(565_r)); - g = _mm_and_si64 (p, MC(565_g)); - b = _mm_and_si64 (p, MC(565_b)); + r = _mm_and_si64 (p, MC (565_r)); + g = _mm_and_si64 (p, MC (565_g)); + b = _mm_and_si64 (p, MC (565_b)); - r = shift (r, - (32 - 8) + pos * 16); - g = shift (g, - (16 - 3) + pos * 16); - b = shift (b, - (0 + 3) + pos * 16); + r = shift (r, -(32 - 8) + pos * 16); + g = shift (g, -(16 - 3) + pos * 16); + b = shift (b, -(0 + 3) + pos * 16); if (pos == 0) - t = _mm_and_si64 (t, MC(mask_0)); + t = _mm_and_si64 (t, MC (mask_0)); else if (pos == 1) - t = _mm_and_si64 (t, MC(mask_1)); + t = _mm_and_si64 (t, MC (mask_1)); else if (pos == 2) - t = _mm_and_si64 (t, MC(mask_2)); + t = _mm_and_si64 (t, MC (mask_2)); else if (pos == 3) - t = _mm_and_si64 (t, MC(mask_3)); + t = _mm_and_si64 (t, MC (mask_3)); p = _mm_or_si64 (r, t); p = _mm_or_si64 (g, p); @@ -392,26 +396,23 @@ pack565 (__m64 pixel, __m64 target, int pos) } #ifndef _MSC_VER + static force_inline __m64 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) { - x = _mm_mullo_pi16 (x, a); - y = _mm_mullo_pi16 (y, b); - x = _mm_adds_pu16 (x, MC(4x0080)); - x = _mm_adds_pu16 (x, y); - x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); - x = _mm_srli_pi16 (x, 8); + x = pix_multiply (x, a); + y = pix_multiply (y, b); - return x; + return pix_add (x, y); } + #else -#define pix_add_mul(x, a, y, b) \ -( x = _mm_mullo_pi16 (x, a), \ - y = _mm_mullo_pi16 (y, b), \ - x = _mm_adds_pu16 (x, MC(4x0080)), \ - x = _mm_adds_pu16 (x, y), \ - x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)), \ - _mm_srli_pi16 (x, 8) ) + +#define pix_add_mul(x, a, y, b) \ + ( x = pix_multiply (x, a), \ + y = pix_multiply (y, a), \ + pix_add (x, y) ) + #endif /* --------------- MMX code patch for fbcompose.c --------------------- */ @@ -435,532 +436,699 @@ combine (const uint32_t *src, const uint32_t *mask) return ssrc; } -static FASTCALL void -mmxCombineOverU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { + while (dest < end) + { uint32_t ssrc = combine (src, mask); uint32_t a = ssrc >> 24; - if (a == 0xff) { + + if (a == 0xff) + { *dest = ssrc; - } else if (ssrc) { + } + else if (ssrc) + { __m64 s, sa; - s = load8888(ssrc); - sa = expand_alpha(s); - 
*dest = store8888(over(s, sa, load8888(*dest))); + s = load8888 (ssrc); + sa = expand_alpha (s); + *dest = store8888 (over (s, sa, load8888 (*dest))); } + ++dest; ++src; if (mask) ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { + while (dest < end) + { __m64 d, da; uint32_t s = combine (src, mask); - d = load8888(*dest); - da = expand_alpha(d); - *dest = store8888(over (d, da, load8888(s))); - ++dest; - ++src; + + d = load8888 (*dest); + da = expand_alpha (d); + *dest = store8888 (over (d, da, load8888 (s))); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineInU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 x, a; - x = load8888 (combine (src, mask)); - a = load8888(*dest); - a = expand_alpha(a); - x = pix_multiply(x, a); - *dest = store8888(x); - ++dest; - ++src; + while (dest < end) + { + __m64 x, a; + + x = load8888 (combine (src, mask)); + a = load8888 (*dest); + a = expand_alpha (a); + x = pix_multiply (x, a); + + *dest = store8888 (x); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineInReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 x, a; - x = load8888(*dest); - a = load8888(combine (src, mask)); - a = expand_alpha(a); - x = pix_multiply(x, a); - *dest = store8888(x); - ++dest; - ++src; + while (dest < end) + { + __m64 x, a; + + x = load8888 (*dest); + a = load8888 (combine (src, mask)); + a = expand_alpha (a); + x = pix_multiply (x, a); + *dest = store8888 (x); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOutU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 x, a; - x = load8888(combine (src, mask)); - a = load8888(*dest); - a = expand_alpha(a); - a = negate(a); - x = pix_multiply(x, a); - *dest = store8888(x); - ++dest; - ++src; + while (dest < end) + { + __m64 x, a; + + x = load8888 (combine (src, mask)); + a = load8888 (*dest); + a = expand_alpha (a); + a = negate (a); + x = pix_multiply (x, a); + *dest = store8888 (x); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t 
*src, const uint32_t *mask, int width) +static void +mmx_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 x, a; - x = load8888(*dest); - a = load8888(combine (src, mask)); - a = expand_alpha(a); - a = negate(a); - x = pix_multiply(x, a); - *dest = store8888(x); - ++dest; - ++src; + while (dest < end) + { + __m64 x, a; + + x = load8888 (*dest); + a = load8888 (combine (src, mask)); + a = expand_alpha (a); + a = negate (a); + x = pix_multiply (x, a); + + *dest = store8888 (x); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAtopU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 s, da, d, sia; - s = load8888(combine (src, mask)); - d = load8888(*dest); - sia = expand_alpha(s); - sia = negate(sia); - da = expand_alpha(d); - s = pix_add_mul (s, da, d, sia); - *dest = store8888(s); - ++dest; - ++src; + while (dest < end) + { + __m64 s, da, d, sia; + + s = load8888 (combine (src, mask)); + d = load8888 (*dest); + sia = expand_alpha (s); + sia = negate (sia); + da = expand_alpha (d); + s = pix_add_mul (s, da, d, sia); + *dest = store8888 (s); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end; end = dest + width; - while (dest < end) { - __m64 s, dia, d, sa; - s = load8888(combine(src, mask)); - d = load8888(*dest); - sa = expand_alpha(s); - dia = expand_alpha(d); - dia = negate(dia); + while (dest < end) + { + __m64 s, dia, d, sa; + + s = load8888 (combine (src, mask)); + d = load8888 (*dest); + sa = expand_alpha (s); + dia = expand_alpha (d); + dia = negate (dia); s = pix_add_mul (s, dia, d, sa); - *dest = store8888(s); - ++dest; - ++src; + *dest = store8888 (s); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineXorU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 s, dia, d, sia; - s = load8888(combine(src, mask)); - d = load8888(*dest); - sia = expand_alpha(s); - dia = expand_alpha(d); - sia = negate(sia); - dia = negate(dia); + while (dest < end) + { + __m64 s, dia, d, sia; + + s = load8888 (combine (src, mask)); + d = load8888 (*dest); + sia = expand_alpha (s); + dia = expand_alpha (d); + sia = negate (sia); + dia = negate (dia); s = pix_add_mul (s, dia, d, sia); - *dest = store8888(s); - ++dest; - ++src; + *dest = store8888 (s); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAddU 
(pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - __m64 s, d; - s = load8888(combine(src,mask)); - d = load8888(*dest); - s = pix_add(s, d); - *dest = store8888(s); - ++dest; - ++src; + + while (dest < end) + { + __m64 s, d; + + s = load8888 (combine (src, mask)); + d = load8888 (*dest); + s = pix_add (s, d); + *dest = store8888 (s); + + ++dest; + ++src; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineSaturateU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = dest + width; - while (dest < end) { - uint32_t s = combine(src,mask); - uint32_t d = *dest; - __m64 ms = load8888(s); - __m64 md = load8888(d); - uint32_t sa = s >> 24; - uint32_t da = ~d >> 24; - - if (sa > da) { - __m64 msa = load8888(FbIntDiv(da, sa) << 24); - msa = expand_alpha(msa); - ms = pix_multiply(ms, msa); - } - md = pix_add(md, ms); - *dest = store8888(md); - ++src; - ++dest; + + while (dest < end) + { + uint32_t s = combine (src, mask); + uint32_t d = *dest; + __m64 ms = load8888 (s); + __m64 md = load8888 (d); + uint32_t sa = s >> 24; + uint32_t da = ~d >> 24; + + if (sa > da) + { + __m64 msa = load8888 (DIV_UN8 (da, sa) << 24); + msa = expand_alpha (msa); + ms = pix_multiply (ms, msa); + } + + md = pix_add (md, ms); + *dest = store8888 (md); + + ++src; + ++dest; if (mask) mask++; } - _mm_empty(); + _mm_empty (); } - -static FASTCALL void -mmxCombineSrcC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - s = pix_multiply(s, a); - *dest = store8888(s); - ++src; - ++mask; - ++dest; + + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + + s = pix_multiply (s, a); + *dest = store8888 (s); + + ++src; + ++mask; + ++dest; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOverC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 sa = expand_alpha(s); - *dest = store8888(in_over (s, sa, a, d)); + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 sa = expand_alpha (s); - ++src; - ++dest; - ++mask; + *dest = store8888 (in_over (s, sa, a, d)); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t 
*dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - *dest = store8888(over (d, da, in (s, a))); + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 da = expand_alpha (d); + + *dest = store8888 (over (d, da, in (s, a))); - ++src; - ++dest; - ++mask; + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } - -static FASTCALL void -mmxCombineInC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - s = pix_multiply(s, a); - s = pix_multiply(s, da); - *dest = store8888(s); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 da = expand_alpha (d); + + s = pix_multiply (s, a); + s = pix_multiply (s, da); + *dest = store8888 (s); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineInReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 sa = expand_alpha(s); - a = pix_multiply(a, sa); - d = pix_multiply(d, a); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 sa = expand_alpha (s); + + a = pix_multiply (a, sa); + d = pix_multiply (d, a); + *dest = store8888 (d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOutC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - da = negate(da); - s = pix_multiply(s, a); - s = pix_multiply(s, da); - *dest = store8888(s); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 da = expand_alpha (d); + + da = negate (da); + s = pix_multiply (s, a); + s = pix_multiply (s, da); + *dest = store8888 (s); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineOutReverseC (pixman_implementation_t *imp, 
pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 sa = expand_alpha(s); - a = pix_multiply(a, sa); - a = negate(a); - d = pix_multiply(d, a); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 sa = expand_alpha (s); + + a = pix_multiply (a, sa); + a = negate (a); + d = pix_multiply (d, a); + *dest = store8888 (d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAtopC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - __m64 sa = expand_alpha(s); - s = pix_multiply(s, a); - a = pix_multiply(a, sa); - a = negate(a); + + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 da = expand_alpha (d); + __m64 sa = expand_alpha (s); + + s = pix_multiply (s, a); + a = pix_multiply (a, sa); + a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + *dest = store8888 (d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - __m64 sa = expand_alpha(s); - s = pix_multiply(s, a); - a = pix_multiply(a, sa); - da = negate(da); + + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 da = expand_alpha (d); + __m64 sa = expand_alpha (s); + + s = pix_multiply (s, a); + a = pix_multiply (a, sa); + da = negate (da); d = pix_add_mul (d, a, s, da); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + *dest = store8888 (d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineXorC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - __m64 da = expand_alpha(d); - __m64 sa = expand_alpha(s); - s = pix_multiply(s, a); - a = pix_multiply(a, sa); - da = negate(da); - a = negate(a); + + 
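    /*
     * Component-alpha XOR, per channel, with m the per-channel mask:
     *
     *     dest = s * m * (1 - da) + d * (1 - sa * m)
     *
     * The rewritten body below forms s * m first, builds the two
     * negated factors with negate (), and combines everything in one
     * pix_add_mul () step.
     */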
while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + __m64 da = expand_alpha (d); + __m64 sa = expand_alpha (s); + + s = pix_multiply (s, a); + a = pix_multiply (a, sa); + da = negate (da); + a = negate (a); d = pix_add_mul (d, a, s, da); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + *dest = store8888 (d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -static FASTCALL void -mmxCombineAddC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +mmx_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { const uint32_t *end = src + width; - while (src < end) { - __m64 a = load8888(*mask); - __m64 s = load8888(*src); - __m64 d = load8888(*dest); - s = pix_multiply(s, a); - d = pix_add(s, d); - *dest = store8888(d); - ++src; - ++dest; - ++mask; + + while (src < end) + { + __m64 a = load8888 (*mask); + __m64 s = load8888 (*src); + __m64 d = load8888 (*dest); + + s = pix_multiply (s, a); + d = pix_add (s, d); + *dest = store8888 (d); + + ++src; + ++dest; + ++mask; } - _mm_empty(); + _mm_empty (); } -/* ------------------ MMX code paths called from fbpict.c ----------------------- */ +/* ------------- MMX code paths called from fbpict.c -------------------- */ static void -fbCompositeSolid_nx8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src; - uint32_t *dstLine, *dst; - uint16_t w; - int dstStride; - __m64 vsrc, vsrca; - - CHECKPOINT(); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); - - if (src >> 24 == 0) +mmx_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, *dst; + uint16_t w; + int dst_stride; + __m64 vsrc, vsrca; + + CHECKPOINT (); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); vsrc = load8888 (src); vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; + dst = dst_line; + dst_line += dst_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { - *dst = store8888(over(vsrc, vsrca, load8888(*dst))); + *dst = store8888 (over (vsrc, vsrca, load8888 (*dst))); w--; dst++; @@ -973,76 +1141,77 @@ fbCompositeSolid_nx8888mmx (pixman_implementation_t *imp, vdest = *(__m64 *)dst; - dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); - dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); + dest0 = over (vsrc, vsrca, expand8888 (vdest, 0)); + dest1 = over (vsrc, vsrca, expand8888 (vdest, 1)); - *(__m64 *)dst = pack8888(dest0, dest1); + *(__m64 *)dst = pack8888 (dest0, dest1); dst += 2; w -= 2; } - CHECKPOINT(); + CHECKPOINT (); while (w) { - *dst = store8888(over(vsrc, vsrca, load8888(*dst))); + *dst = store8888 (over (vsrc, vsrca, load8888 (*dst))); w--; dst++; 
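	    /* Tail loop: pixels left over after the aligned
	     * two-at-a-time __m64 body are composited one at a time,
	     * mirroring the head loop that brought dst up to 8-byte
	     * alignment. */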
} } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSolid_nx0565mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src; - uint16_t *dstLine, *dst; - uint16_t w; - int dstStride; - __m64 vsrc, vsrca; - - CHECKPOINT(); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); - - if (src >> 24 == 0) +mmx_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst; + uint16_t w; + int dst_stride; + __m64 vsrc, vsrca; + + CHECKPOINT (); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); vsrc = load8888 (src); vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; + dst = dst_line; + dst_line += dst_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); - vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); - *dst = UINT64(vdest); + __m64 vdest = expand565 (M64 (d), 0); + + vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); + *dst = UINT64 (vdest); w--; dst++; @@ -1054,10 +1223,10 @@ fbCompositeSolid_nx0565mmx (pixman_implementation_t *imp, vdest = *(__m64 *)dst; - vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); - vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); - vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); - vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); + vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; @@ -1065,63 +1234,64 @@ fbCompositeSolid_nx0565mmx (pixman_implementation_t *imp, w -= 4; } - CHECKPOINT(); + CHECKPOINT (); while (w) { uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); - vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); - *dst = UINT64(vdest); + __m64 vdest = expand565 (M64 (d), 0); + + vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); + *dst = UINT64 (vdest); w--; dst++; } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSolidMask_nx8888x8888Cmmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dstLine; - uint32_t *maskLine; - int dstStride, maskStride; - __m64 vsrc, vsrca; - - CHECKPOINT(); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t 
src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line; + uint32_t *mask_line; + int dst_stride, mask_stride; + __m64 vsrc, vsrca; + + CHECKPOINT (); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - vsrc = load8888(src); - vsrca = expand_alpha(vsrc); + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); while (height--) { int twidth = width; - uint32_t *p = (uint32_t *)maskLine; - uint32_t *q = (uint32_t *)dstLine; + uint32_t *p = (uint32_t *)mask_line; + uint32_t *q = (uint32_t *)dst_line; while (twidth && (unsigned long)q & 7) { @@ -1129,9 +1299,9 @@ fbCompositeSolidMask_nx8888x8888Cmmx (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888(*q); - vdest = in_over(vsrc, vsrca, load8888(m), vdest); - *q = store8888(vdest); + __m64 vdest = load8888 (*q); + vdest = in_over (vsrc, vsrca, load8888 (m), vdest); + *q = store8888 (vdest); } twidth--; @@ -1150,12 +1320,12 @@ fbCompositeSolidMask_nx8888x8888Cmmx (pixman_implementation_t *imp, __m64 dest0, dest1; __m64 vdest = *(__m64 *)q; - dest0 = in_over(vsrc, vsrca, load8888(m0), - expand8888 (vdest, 0)); - dest1 = in_over(vsrc, vsrca, load8888(m1), - expand8888 (vdest, 1)); + dest0 = in_over (vsrc, vsrca, load8888 (m0), + expand8888 (vdest, 0)); + dest1 = in_over (vsrc, vsrca, load8888 (m1), + expand8888 (vdest, 1)); - *(__m64 *)q = pack8888(dest0, dest1); + *(__m64 *)q = pack8888 (dest0, dest1); } p += 2; @@ -1169,9 +1339,9 @@ fbCompositeSolidMask_nx8888x8888Cmmx (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888(*q); - vdest = in_over(vsrc, vsrca, load8888(m), vdest); - *q = store8888(vdest); + __m64 vdest = load8888 (*q); + vdest = in_over (vsrc, vsrca, load8888 (m), vdest); + *q = store8888 (vdest); } twidth--; @@ -1179,52 +1349,52 @@ fbCompositeSolidMask_nx8888x8888Cmmx (pixman_implementation_t *imp, q++; } - dstLine += dstStride; - maskLine += maskStride; + dst_line += dst_stride; + mask_line += mask_stride; } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSrc_8888x8x8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - __m64 vmask; - int dstStride, srcStride; - uint16_t w; - __m64 srca; - - CHECKPOINT(); - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - - fbComposeGetSolid (pMask, mask, pDst->bits.format); +mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + __m64 
vmask; + int dst_stride, src_stride; + uint16_t w; + __m64 srca; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); mask = mask | mask >> 8 | mask >> 16 | mask >> 24; vmask = load8888 (mask); - srca = MC(4x00ff); + srca = MC (4x00ff); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) @@ -1247,8 +1417,8 @@ fbCompositeSrc_8888x8x8888mmx (pixman_implementation_t *imp, __m64 vsrc1 = expand8888 (vs, 1); *(__m64 *)dst = pack8888 ( - in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), - in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); + in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), + in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); w -= 2; dst += 2; @@ -1268,48 +1438,48 @@ fbCompositeSrc_8888x8x8888mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSrc_x888xnx8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - __m64 vmask; - int dstStride, srcStride; - uint16_t w; - __m64 srca; - - CHECKPOINT(); - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetSolid (pMask, mask, pDst->bits.format); +mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + __m64 vmask; + int dst_stride, src_stride; + uint16_t w; + __m64 srca; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); mask = mask | mask >> 8 | mask >> 16 | mask >> 24; vmask = load8888 (mask); - srca = MC(4x00ff); + srca = MC (4x00ff); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) @@ -1345,36 +1515,36 @@ fbCompositeSrc_x888xnx8888mmx (pixman_implementation_t *imp, __m64 vs7 = *(__m64 *)(src + 14); vd0 = pack8888 ( - in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), - in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); + in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), + in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); vd1 = pack8888 ( - in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), - in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); + 
in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), + in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); vd2 = pack8888 ( - in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), - in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); + in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), + in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); vd3 = pack8888 ( - in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), - in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); + in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), + in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); vd4 = pack8888 ( - in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), - in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); + in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), + in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); vd5 = pack8888 ( - in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), - in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); + in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), + in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); - vd6 = pack8888 ( - in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), - in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); + vd6 = pack8888 ( + in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), + in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); vd7 = pack8888 ( - in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), - in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); + in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), + in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); *(__m64 *)(dst + 0) = vd0; *(__m64 *)(dst + 2) = vd1; @@ -1403,135 +1573,141 @@ fbCompositeSrc_x888xnx8888mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSrc_8888x8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t s; - int dstStride, srcStride; - uint8_t a; - uint16_t w; - - CHECKPOINT(); - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +mmx_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t s; + int dst_stride, src_stride; + uint8_t a; + uint16_t w; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w--) { s = *src++; a = s >> 24; + if (a == 0xff) + { *dst = s; - 
else if (s) { + } + else if (s) + { __m64 ms, sa; - ms = load8888(s); - sa = expand_alpha(ms); - *dst = store8888(over(ms, sa, load8888(*dst))); + ms = load8888 (s); + sa = expand_alpha (ms); + *dst = store8888 (over (ms, sa, load8888 (*dst))); } + dst++; } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSrc_8888x0565mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint16_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - - CHECKPOINT(); - - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +mmx_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { __m64 vsrc = load8888 (*src); uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); + __m64 vdest = expand565 (M64 (d), 0); - vdest = pack565(over(vsrc, expand_alpha(vsrc), vdest), vdest, 0); + vdest = pack_565 ( + over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); - *dst = UINT64(vdest); + *dst = UINT64 (vdest); w--; dst++; src++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 4) { __m64 vsrc0, vsrc1, vsrc2, vsrc3; __m64 vdest; - vsrc0 = load8888(*(src + 0)); - vsrc1 = load8888(*(src + 1)); - vsrc2 = load8888(*(src + 2)); - vsrc3 = load8888(*(src + 3)); + vsrc0 = load8888 (*(src + 0)); + vsrc1 = load8888 (*(src + 1)); + vsrc2 = load8888 (*(src + 2)); + vsrc3 = load8888 (*(src + 3)); vdest = *(__m64 *)dst; - vdest = pack565(over(vsrc0, expand_alpha(vsrc0), expand565(vdest, 0)), vdest, 0); - vdest = pack565(over(vsrc1, expand_alpha(vsrc1), expand565(vdest, 1)), vdest, 1); - vdest = pack565(over(vsrc2, expand_alpha(vsrc2), expand565(vdest, 2)), vdest, 2); - vdest = pack565(over(vsrc3, expand_alpha(vsrc3), expand565(vdest, 3)), vdest, 3); + vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; @@ -1540,17 +1716,17 @@ fbCompositeSrc_8888x0565mmx (pixman_implementation_t *imp, src += 4; } - CHECKPOINT(); + CHECKPOINT (); while (w) { __m64 vsrc = load8888 (*src); uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); + __m64 vdest = 
expand565 (M64 (d), 0); - vdest = pack565(over(vsrc, expand_alpha(vsrc), vdest), vdest, 0); + vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); - *dst = UINT64(vdest); + *dst = UINT64 (vdest); w--; dst++; @@ -1558,57 +1734,57 @@ fbCompositeSrc_8888x0565mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSolidMask_nx8x8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - __m64 vsrc, vsrca; - uint64_t srcsrc; - - CHECKPOINT(); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + __m64 vsrc, vsrca; + uint64_t srcsrc; + + CHECKPOINT (); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) return; srcsrc = (uint64_t)src << 32 | src; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); vsrc = load8888 (src); vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { @@ -1616,8 +1792,11 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_implementation_t *imp, if (m) { - __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev (M64(m)), load8888(*dst)); - *dst = store8888(vdest); + __m64 vdest = in_over (vsrc, vsrca, + expand_alpha_rev (M64 (m)), + load8888 (*dst)); + + *dst = store8888 (vdest); } w--; @@ -1625,11 +1804,12 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_implementation_t *imp, dst++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 2) { uint64_t m0, m1; + m0 = *mask; m1 = *(mask + 1); @@ -1644,10 +1824,12 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_implementation_t *imp, vdest = *(__m64 *)dst; - dest0 = in_over(vsrc, vsrca, expand_alpha_rev (M64(m0)), expand8888(vdest, 0)); - dest1 = in_over(vsrc, vsrca, expand_alpha_rev (M64(m1)), expand8888(vdest, 1)); + dest0 = in_over (vsrc, vsrca, expand_alpha_rev (M64 (m0)), + expand8888 (vdest, 0)); + dest1 = in_over (vsrc, vsrca, expand_alpha_rev (M64 (m1)), + expand8888 (vdest, 1)); - *(__m64 *)dst = pack8888(dest0, dest1); + *(__m64 *)dst = pack8888 (dest0, dest1); } mask += 2; @@ -1655,7 +1837,7 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_implementation_t *imp, w -= 2; } - CHECKPOINT(); + CHECKPOINT (); while (w) { @@ -1663,9 +1845,11 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888(*dst); - vdest = 
in_over(vsrc, vsrca, expand_alpha_rev (M64(m)), vdest); - *dst = store8888(vdest); + __m64 vdest = load8888 (*dst); + + vdest = in_over ( + vsrc, vsrca, expand_alpha_rev (M64 (m)), vdest); + *dst = store8888 (vdest); } w--; @@ -1674,25 +1858,26 @@ fbCompositeSolidMask_nx8x8888mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } pixman_bool_t pixman_fill_mmx (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - uint64_t fill; - __m64 vfill; - uint32_t byte_width; - uint8_t *byte_line; + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + uint64_t fill; + __m64 vfill; + uint32_t byte_width; + uint8_t *byte_line; + #ifdef __GNUC__ - __m64 v1, v2, v3, v4, v5, v6, v7; + __m64 v1, v2, v3, v4, v5, v6, v7; #endif if (bpp != 16 && bpp != 32 && bpp != 8) @@ -1702,12 +1887,12 @@ pixman_fill_mmx (uint32_t *bits, return FALSE; if (bpp == 8 && - ((xor >> 16 != (xor & 0xffff)) || - (xor >> 24 != (xor & 0x00ff) >> 16))) + ((xor >> 16 != (xor & 0xffff)) || + (xor >> 24 != (xor & 0x00ff) >> 16))) { return FALSE; } - + if (bpp == 8) { stride = stride * (int) sizeof (uint32_t) / 1; @@ -1731,19 +1916,19 @@ pixman_fill_mmx (uint32_t *bits, } fill = ((uint64_t)xor << 32) | xor; - vfill = M64(fill); + vfill = M64 (fill); #ifdef __GNUC__ __asm__ ( - "movq %7, %0\n" - "movq %7, %1\n" - "movq %7, %2\n" - "movq %7, %3\n" - "movq %7, %4\n" - "movq %7, %5\n" - "movq %7, %6\n" + "movq %7, %0\n" + "movq %7, %1\n" + "movq %7, %2\n" + "movq %7, %3\n" + "movq %7, %4\n" + "movq %7, %5\n" + "movq %7, %6\n" : "=y" (v1), "=y" (v2), "=y" (v3), - "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7) + "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7) : "y" (vfill)); #endif @@ -1751,6 +1936,7 @@ pixman_fill_mmx (uint32_t *bits, { int w; uint8_t *d = byte_line; + byte_line += stride; w = byte_width; @@ -1760,7 +1946,7 @@ pixman_fill_mmx (uint32_t *bits, w--; d++; } - + while (w >= 2 && ((unsigned long)d & 3)) { *(uint16_t *)d = xor; @@ -1780,18 +1966,18 @@ pixman_fill_mmx (uint32_t *bits, { #ifdef __GNUC__ __asm__ ( - "movq %1, (%0)\n" - "movq %2, 8(%0)\n" - "movq %3, 16(%0)\n" - "movq %4, 24(%0)\n" - "movq %5, 32(%0)\n" - "movq %6, 40(%0)\n" - "movq %7, 48(%0)\n" - "movq %8, 56(%0)\n" + "movq %1, (%0)\n" + "movq %2, 8(%0)\n" + "movq %3, 16(%0)\n" + "movq %4, 24(%0)\n" + "movq %5, 32(%0)\n" + "movq %6, 40(%0)\n" + "movq %7, 48(%0)\n" + "movq %8, 56(%0)\n" : : "r" (d), - "y" (vfill), "y" (v1), "y" (v2), "y" (v3), - "y" (v4), "y" (v5), "y" (v6), "y" (v7) + "y" (vfill), "y" (v1), "y" (v2), "y" (v3), + "y" (v4), "y" (v5), "y" (v6), "y" (v7) : "memory"); #else *(__m64*) (d + 0) = vfill; @@ -1826,65 +2012,66 @@ pixman_fill_mmx (uint32_t *bits, w--; d++; } - + } - _mm_empty(); + _mm_empty (); return TRUE; } static void -fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - __m64 vsrc, vsrca; - uint64_t srcsrc; - - CHECKPOINT(); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t 
mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + __m64 vsrc, vsrca; + uint64_t srcsrc; + + CHECKPOINT (); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) { - pixman_fill_mmx (pDst->bits.bits, pDst->bits.rowstride, PIXMAN_FORMAT_BPP (pDst->bits.format), - xDst, yDst, width, height, 0); + pixman_fill_mmx (dst_image->bits.bits, dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, width, height, 0); return; } srcsrc = (uint64_t)src << 32 | src; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); vsrc = load8888 (src); vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { @@ -1892,8 +2079,9 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_implementation_t *imp, if (m) { - __m64 vdest = in(vsrc, expand_alpha_rev (M64(m))); - *dst = store8888(vdest); + __m64 vdest = in (vsrc, expand_alpha_rev (M64 (m))); + + *dst = store8888 (vdest); } else { @@ -1905,7 +2093,7 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_implementation_t *imp, dst++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 2) { @@ -1924,10 +2112,10 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_implementation_t *imp, vdest = *(__m64 *)dst; - dest0 = in(vsrc, expand_alpha_rev (M64(m0))); - dest1 = in(vsrc, expand_alpha_rev (M64(m1))); + dest0 = in (vsrc, expand_alpha_rev (M64 (m0))); + dest1 = in (vsrc, expand_alpha_rev (M64 (m1))); - *(__m64 *)dst = pack8888(dest0, dest1); + *(__m64 *)dst = pack8888 (dest0, dest1); } else { @@ -1939,7 +2127,7 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_implementation_t *imp, w -= 2; } - CHECKPOINT(); + CHECKPOINT (); while (w) { @@ -1947,9 +2135,10 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_implementation_t *imp, if (m) { - __m64 vdest = load8888(*dst); - vdest = in(vsrc, expand_alpha_rev (M64(m))); - *dst = store8888(vdest); + __m64 vdest = load8888 (*dst); + + vdest = in (vsrc, expand_alpha_rev (M64 (m))); + *dst = store8888 (vdest); } else { @@ -1962,61 +2151,62 @@ fbCompositeSolidMaskSrc_nx8x8888mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSolidMask_nx8x0565mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint16_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - __m64 vsrc, vsrca, tmp; +mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t 
width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + __m64 vsrc, vsrca, tmp; uint64_t srcsrcsrcsrc, src16; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); vsrc = load8888 (src); vsrca = expand_alpha (vsrc); - tmp = pack565(vsrc, _mm_setzero_si64(), 0); - src16 = UINT64(tmp); + tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); + src16 = UINT64 (tmp); - srcsrcsrcsrc = (uint64_t)src16 << 48 | (uint64_t)src16 << 32 | + srcsrcsrcsrc = + (uint64_t)src16 << 48 | (uint64_t)src16 << 32 | (uint64_t)src16 << 16 | (uint64_t)src16; while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { @@ -2025,10 +2215,12 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_implementation_t *imp, if (m) { uint64_t d = *dst; - __m64 vd = M64(d); - __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev (M64 (m)), expand565(vd, 0)); - vd = pack565(vdest, _mm_setzero_si64(), 0); - *dst = UINT64(vd); + __m64 vd = M64 (d); + __m64 vdest = in_over ( + vsrc, vsrca, expand_alpha_rev (M64 (m)), expand565 (vd, 0)); + + vd = pack_565 (vdest, _mm_setzero_si64 (), 0); + *dst = UINT64 (vd); } w--; @@ -2036,7 +2228,7 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_implementation_t *imp, dst++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 4) { @@ -2057,14 +2249,18 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_implementation_t *imp, vdest = *(__m64 *)dst; - vm0 = M64(m0); - vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); - vm1 = M64(m1); - vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); - vm2 = M64(m2); - vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); - vm3 = M64(m3); - vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); + vm0 = M64 (m0); + vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0), + expand565 (vdest, 0)), vdest, 0); + vm1 = M64 (m1); + vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1), + expand565 (vdest, 1)), vdest, 1); + vm2 = M64 (m2); + vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2), + expand565 (vdest, 2)), vdest, 2); + vm3 = M64 (m3); + vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3), + expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2074,7 +2270,7 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_implementation_t *imp, dst += 4; } - CHECKPOINT(); + CHECKPOINT (); while (w) { @@ -2083,10 +2279,11 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_implementation_t *imp, if (m) { uint64_t d = *dst; - __m64 vd = M64(d); - __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev (M64(m)), expand565(vd, 0)); - vd = pack565(vdest, _mm_setzero_si64(), 0); - *dst = UINT64(vd); + __m64 vd = M64 (d); + __m64 vdest = 
in_over (vsrc, vsrca, expand_alpha_rev (M64 (m)), + expand565 (vd, 0)); + vd = pack_565 (vdest, _mm_setzero_si64 (), 0); + *dst = UINT64 (vd); } w--; @@ -2095,65 +2292,65 @@ fbCompositeSolidMask_nx8x0565mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSrc_8888RevNPx0565mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint16_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - - CHECKPOINT(); - - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; - CHECKPOINT(); + CHECKPOINT (); while (w && (unsigned long)dst & 7) { __m64 vsrc = load8888 (*src); uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); + __m64 vdest = expand565 (M64 (d), 0); - vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); + vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); - *dst = UINT64(vdest); + *dst = UINT64 (vdest); w--; dst++; src++; } - CHECKPOINT(); + CHECKPOINT (); while (w >= 4) { @@ -2173,10 +2370,10 @@ fbCompositeSrc_8888RevNPx0565mmx (pixman_implementation_t *imp, if ((a0 & a1 & a2 & a3) == 0xFF) { __m64 vdest; - vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0); - vdest = pack565(invert_colors(load8888(s1)), vdest, 1); - vdest = pack565(invert_colors(load8888(s2)), vdest, 2); - vdest = pack565(invert_colors(load8888(s3)), vdest, 3); + vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0); + vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1); + vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2); + vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2184,10 +2381,10 @@ fbCompositeSrc_8888RevNPx0565mmx (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)dst; - vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); - vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); - vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); - vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); + vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over_rev_non_pre 
(load8888 (s2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)dst = vdest; } @@ -2197,17 +2394,17 @@ fbCompositeSrc_8888RevNPx0565mmx (pixman_implementation_t *imp, src += 4; } - CHECKPOINT(); + CHECKPOINT (); while (w) { __m64 vsrc = load8888 (*src); uint64_t d = *dst; - __m64 vdest = expand565 (M64(d), 0); + __m64 vdest = expand565 (M64 (d), 0); - vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); + vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); - *dst = UINT64(vdest); + *dst = UINT64 (vdest); w--; dst++; @@ -2215,47 +2412,45 @@ fbCompositeSrc_8888RevNPx0565mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } -/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ - static void -fbCompositeSrc_8888RevNPx8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - - CHECKPOINT(); - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) @@ -2284,8 +2479,8 @@ fbCompositeSrc_8888RevNPx8888mmx (pixman_implementation_t *imp, if ((a0 & a1) == 0xFF) { - d0 = invert_colors(load8888(s0)); - d1 = invert_colors(load8888(s1)); + d0 = invert_colors (load8888 (s0)); + d1 = invert_colors (load8888 (s1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2293,8 +2488,8 @@ fbCompositeSrc_8888RevNPx8888mmx (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)dst; - d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); - d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); + d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0)); + d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1)); *(__m64 *)dst = pack8888 (d0, d1); } @@ -2317,40 +2512,40 @@ fbCompositeSrc_8888RevNPx8888mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSolidMask_nx8888x0565Cmmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint16_t *dstLine; - uint32_t 
*maskLine; - int dstStride, maskStride; - __m64 vsrc, vsrca; - - CHECKPOINT(); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line; + uint32_t *mask_line; + int dst_stride, mask_stride; + __m64 vsrc, vsrca; + + CHECKPOINT (); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; - if (srca == 0) + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); vsrc = load8888 (src); vsrca = expand_alpha (vsrc); @@ -2358,8 +2553,8 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_implementation_t *imp, while (height--) { int twidth = width; - uint32_t *p = (uint32_t *)maskLine; - uint16_t *q = (uint16_t *)dstLine; + uint32_t *p = (uint32_t *)mask_line; + uint16_t *q = (uint16_t *)dst_line; while (twidth && ((unsigned long)q & 7)) { @@ -2368,9 +2563,9 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_implementation_t *imp, if (m) { uint64_t d = *q; - __m64 vdest = expand565 (M64(d), 0); - vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); - *q = UINT64(vdest); + __m64 vdest = expand565 (M64 (d), 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + *q = UINT64 (vdest); } twidth--; @@ -2391,10 +2586,10 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_implementation_t *imp, { __m64 vdest = *(__m64 *)q; - vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); - vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); - vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); - vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3); *(__m64 *)q = vdest; } @@ -2411,9 +2606,9 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_implementation_t *imp, if (m) { uint64_t d = *q; - __m64 vdest = expand565(M64(d), 0); - vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); - *q = UINT64(vdest); + __m64 vdest = expand565 (M64 (d), 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + *q = UINT64 (vdest); } twidth--; @@ -2421,58 +2616,56 @@ fbCompositeSolidMask_nx8888x0565Cmmx (pixman_implementation_t *imp, q++; } - maskLine += maskStride; - dstLine += dstStride; + mask_line += mask_stride; + dst_line += dst_stride; } _mm_empty (); } static void -fbCompositeIn_nx8x8mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t 
yDst, - int32_t width, - int32_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t src; - uint8_t sa; - __m64 vsrc, vsrca; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +mmx_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + uint32_t src; + uint8_t sa; + __m64 vsrc, vsrca; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; - if (sa == 0) - return; - vsrc = load8888(src); - vsrca = expand_alpha(vsrc); + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - if ((((unsigned long)pDst & 3) == 0) && - (((unsigned long)pSrc & 3) == 0)) + if ((((unsigned long)dst_image & 3) == 0) && + (((unsigned long)src_image & 3) == 0)) { while (w >= 4) { @@ -2495,57 +2688,56 @@ fbCompositeIn_nx8x8mmx (pixman_implementation_t *imp, while (w--) { - uint16_t tmp; - uint8_t a; - uint32_t m, d; - uint32_t r; + uint16_t tmp; + uint8_t a; + uint32_t m, d; a = *mask++; d = *dst; - m = FbInU (sa, 0, a, tmp); - r = FbInU (m, 0, d, tmp); + m = MUL_UN8 (sa, a, tmp); + d = MUL_UN8 (m, d, tmp); - *dst++ = r; + *dst++ = d; } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeIn_8x8mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int srcStride, dstStride; - uint16_t w; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); +mmx_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int src_stride, dst_stride; + uint16_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; - if ((((unsigned long)pDst & 3) == 0) && - (((unsigned long)pSrc & 3) == 0)) + if ((((unsigned long)dst_image & 3) == 0) && + (((unsigned long)src_image & 3) == 0)) { while (w >= 4) { @@ 
-2568,7 +2760,7 @@ fbCompositeIn_8x8mmx (pixman_implementation_t *imp, s = *src; d = *dst; - *dst = FbInU (s, 0, d, tmp); + *dst = MUL_UN8 (s, d, tmp); src++; dst++; @@ -2579,50 +2771,51 @@ fbCompositeIn_8x8mmx (pixman_implementation_t *imp, } static void -fbCompositeSrcAdd_8888x8x8mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t src; - uint8_t sa; - __m64 vsrc, vsrca; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +mmx_composite_add_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + uint32_t src; + uint8_t sa; + __m64 vsrc, vsrca; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; - if (sa == 0) + + if (src == 0) return; - vsrc = load8888(src); - vsrca = expand_alpha(vsrc); + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; - if ((((unsigned long)pMask & 3) == 0) && - (((unsigned long)pDst & 3) == 0)) + if ((((unsigned long)mask_image & 3) == 0) && + (((unsigned long)dst_image & 3) == 0)) { while (w >= 4) { @@ -2639,57 +2832,57 @@ fbCompositeSrcAdd_8888x8x8mmx (pixman_implementation_t *imp, while (w--) { - uint16_t tmp; - uint16_t a; - uint32_t m, d; - uint32_t r; + uint16_t tmp; + uint16_t a; + uint32_t m, d; + uint32_t r; a = *mask++; d = *dst; - m = FbInU (sa, 0, a, tmp); - r = FbAdd (m, d, 0, tmp); + m = MUL_UN8 (sa, a, tmp); + r = ADD_UN8 (m, d, tmp); *dst++ = r; } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSrcAdd_8000x8000mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - uint8_t s, d; - uint16_t t; - - CHECKPOINT(); - - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); +mmx_composite_add_8000_8000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int 
dst_stride, src_stride; + uint16_t w; + uint8_t s, d; + uint16_t t; + + CHECKPOINT (); + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) @@ -2707,7 +2900,7 @@ fbCompositeSrcAdd_8000x8000mmx (pixman_implementation_t *imp, while (w >= 8) { - *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); + *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst); dst += 8; src += 8; w -= 8; @@ -2727,47 +2920,47 @@ fbCompositeSrcAdd_8000x8000mmx (pixman_implementation_t *imp, } } - _mm_empty(); + _mm_empty (); } static void -fbCompositeSrcAdd_8888x8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) +mmx_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { __m64 dst64; - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; - CHECKPOINT(); + CHECKPOINT (); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; w = width; while (w && (unsigned long)dst & 7) { - *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), - _mm_cvtsi32_si64(*dst))); + *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src), + _mm_cvtsi32_si64 (*dst))); dst++; src++; w--; @@ -2775,8 +2968,8 @@ fbCompositeSrcAdd_8888x8888mmx (pixman_implementation_t *imp, while (w >= 2) { - dst64 = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); - *(uint64_t*)dst = UINT64(dst64); + dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst); + *(uint64_t*)dst = UINT64 (dst64); dst += 2; src += 2; w -= 2; @@ -2784,29 +2977,32 @@ fbCompositeSrcAdd_8888x8888mmx (pixman_implementation_t *imp, if (w) { - *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), - _mm_cvtsi32_si64(*dst))); + *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src), + _mm_cvtsi32_si64 (*dst))); } } - _mm_empty(); + _mm_empty (); } static pixman_bool_t pixman_blt_mmx (uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) -{ - uint8_t * src_bytes; - uint8_t * dst_bytes; - int byte_width; + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int 
height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; if (src_bpp != dst_bpp) return FALSE; @@ -2820,7 +3016,9 @@ pixman_blt_mmx (uint32_t *src_bits, byte_width = 2 * width; src_stride *= 2; dst_stride *= 2; - } else if (src_bpp == 32) { + } + else if (src_bpp == 32) + { src_stride = src_stride * (int) sizeof (uint32_t) / 4; dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); @@ -2828,7 +3026,9 @@ pixman_blt_mmx (uint32_t *src_bits, byte_width = 4 * width; src_stride *= 4; dst_stride *= 4; - } else { + } + else + { return FALSE; } @@ -2862,28 +3062,28 @@ pixman_blt_mmx (uint32_t *src_bits, { #if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)) __asm__ ( - "movq (%1), %%mm0\n" - "movq 8(%1), %%mm1\n" - "movq 16(%1), %%mm2\n" - "movq 24(%1), %%mm3\n" - "movq 32(%1), %%mm4\n" - "movq 40(%1), %%mm5\n" - "movq 48(%1), %%mm6\n" - "movq 56(%1), %%mm7\n" - - "movq %%mm0, (%0)\n" - "movq %%mm1, 8(%0)\n" - "movq %%mm2, 16(%0)\n" - "movq %%mm3, 24(%0)\n" - "movq %%mm4, 32(%0)\n" - "movq %%mm5, 40(%0)\n" - "movq %%mm6, 48(%0)\n" - "movq %%mm7, 56(%0)\n" + "movq (%1), %%mm0\n" + "movq 8(%1), %%mm1\n" + "movq 16(%1), %%mm2\n" + "movq 24(%1), %%mm3\n" + "movq 32(%1), %%mm4\n" + "movq 40(%1), %%mm5\n" + "movq 48(%1), %%mm6\n" + "movq 56(%1), %%mm7\n" + + "movq %%mm0, (%0)\n" + "movq %%mm1, 8(%0)\n" + "movq %%mm2, 16(%0)\n" + "movq %%mm3, 24(%0)\n" + "movq %%mm4, 32(%0)\n" + "movq %%mm5, 40(%0)\n" + "movq %%mm6, 48(%0)\n" + "movq %%mm7, 56(%0)\n" : : "r" (d), "r" (s) : "memory", - "%mm0", "%mm1", "%mm2", "%mm3", - "%mm4", "%mm5", "%mm6", "%mm7"); + "%mm0", "%mm1", "%mm2", "%mm3", + "%mm4", "%mm5", "%mm6", "%mm7"); #else __m64 v0 = *(__m64 *)(s + 0); __m64 v1 = *(__m64 *)(s + 8); @@ -2924,68 +3124,68 @@ pixman_blt_mmx (uint32_t *src_bits, } } - _mm_empty(); + _mm_empty (); return TRUE; } static void -fbCompositeCopyAreammx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - pixman_blt_mmx (pSrc->bits.bits, - pDst->bits.bits, - pSrc->bits.rowstride, - pDst->bits.rowstride, - PIXMAN_FORMAT_BPP (pSrc->bits.format), - PIXMAN_FORMAT_BPP (pDst->bits.format), - xSrc, ySrc, xDst, yDst, width, height); +mmx_composite_copy_area (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + pixman_blt_mmx (src_image->bits.bits, + dst_image->bits.bits, + src_image->bits.rowstride, + dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (src_image->bits.format), + PIXMAN_FORMAT_BPP (dst_image->bits.format), + src_x, src_y, dest_x, dest_y, width, height); } static void -fbCompositeOver_x888x8x8888mmx (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *src, *srcLine; - uint32_t *dst, *dstLine; - uint8_t *mask, *maskLine; - int srcStride, maskStride, dstStride; +mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * 
src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line; + uint32_t *dst, *dst_line; + uint8_t *mask, *mask_line; + int src_stride, mask_stride, dst_stride; uint16_t w; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - src = srcLine; - srcLine += srcStride; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; w = width; @@ -2998,12 +3198,14 @@ fbCompositeOver_x888x8x8888mmx (pixman_implementation_t *imp, __m64 s = load8888 (*src | 0xff000000); if (m == 0xff) + { *dst = store8888 (s); + } else { __m64 sa = expand_alpha (s); - __m64 vm = expand_alpha_rev (M64(m)); - __m64 vdest = in_over(s, sa, vm, load8888 (*dst)); + __m64 vm = expand_alpha_rev (M64 (m)); + __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); *dst = store8888 (vdest); } @@ -3015,137 +3217,142 @@ fbCompositeOver_x888x8x8888mmx (pixman_implementation_t *imp, } } - _mm_empty(); -} - -static const FastPathInfo mmx_fast_paths[] = -{ - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8888x8888Cmmx, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8888x8888Cmmx, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8888x0565Cmmx, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8888x8888Cmmx, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8888x8888Cmmx, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8888x0565Cmmx, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888RevNPx8888mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888RevNPx8888mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888RevNPx8888mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, 
fbCompositeSrc_8888RevNPx8888mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSrc_8888RevNPx0565mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, fbCompositeSrc_8888RevNPx0565mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888RevNPx8888mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888RevNPx8888mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888RevNPx8888mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888RevNPx8888mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, fbCompositeSrc_8888RevNPx0565mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSrc_8888RevNPx0565mmx, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_x888xnx8888mmx, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_x888xnx8888mmx, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSrc_x888xnx8888mmx, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSrc_x888xnx8888mmx, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888mmx, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888mmx, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8x8888mmx, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8x8888mmx, NEED_SOLID_MASK }, + _mm_empty (); +} + +static const pixman_fast_path_t mmx_fast_paths[] = +{ + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, mmx_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, mmx_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, mmx_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, mmx_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, mmx_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, mmx_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, mmx_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, mmx_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, mmx_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, mmx_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, mmx_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, mmx_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, mmx_composite_over_pixbuf_8888, 
NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, mmx_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, mmx_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, mmx_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, mmx_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, mmx_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, mmx_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, mmx_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, mmx_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, mmx_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, mmx_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, mmx_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, mmx_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, mmx_composite_over_8888_n_8888, NEED_SOLID_MASK }, #if 0 /* FIXME: This code is commented out since it's apparently not actually faster than the generic code. 
*/ - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeOver_x888x8x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8r8g8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeOver_x888x8x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8r8g8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888mmx, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, mmx_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, mmx_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8r8g8, PIXMAN_a8, PIXMAN_x8b8g8r8, mmx_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8r8g8, PIXMAN_a8, PIXMAN_a8r8g8b8, mmx_composite_over_x888_8_8888, 0 }, #endif - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSolid_nx8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSolid_nx8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSolid_nx0565mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreammx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreammx, 0 }, - - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_8888x0565mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888mmx, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_8888x0565mmx, 0 }, - - { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrcAdd_8888x8888mmx, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrcAdd_8888x8888mmx, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000mmx, 0 }, - { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8mmx, 0 }, - - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMaskSrc_nx8x8888mmx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMaskSrc_nx8x8888mmx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMaskSrc_nx8x8888mmx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMaskSrc_nx8x8888mmx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeCopyAreammx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeCopyAreammx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreammx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreammx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreammx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreammx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeCopyAreammx, 0 }, - { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeCopyAreammx, 0 }, - - { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeIn_8x8mmx, 0 }, - { 
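/* Note on the OVER/copy_area rows further down: an x8r8g8b8 or
 * x8b8g8r8 source has no alpha channel, so every pixel is effectively
 * opaque.  For an opaque source, OVER reduces to SRC:
 *
 *     dest = src + (1 - alpha_src)·dest = src + 0·dest = src
 *
 * which is why a straight copy (mmx_composite_copy_area) is a valid
 * fast path for those entries.
 */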
PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeIn_nx8x8mmx, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, mmx_composite_over_n_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, mmx_composite_over_n_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, mmx_composite_over_n_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, mmx_composite_copy_area, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, mmx_composite_copy_area, 0 }, + + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, mmx_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, mmx_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, mmx_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, mmx_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, mmx_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, mmx_composite_over_8888_0565, 0 }, + + { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, mmx_composite_add_8888_8888, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, mmx_composite_add_8888_8888, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, mmx_composite_add_8000_8000, 0 }, + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, mmx_composite_add_n_8_8, 0 }, + + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, mmx_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, mmx_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, mmx_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, mmx_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, mmx_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, mmx_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, mmx_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, mmx_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, mmx_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, mmx_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, mmx_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, mmx_composite_copy_area, 0 }, + + { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, mmx_composite_in_8_8, 0 }, + { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, mmx_composite_in_n_8_8, 0 }, { PIXMAN_OP_NONE }, }; static void mmx_composite (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *src, - pixman_image_t *mask, - pixman_image_t *dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { if (_pixman_run_fast_path (mmx_fast_paths, imp, - op, src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height)) + op, src, mask, 
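/* A simplified sketch of how a table like mmx_fast_paths is consumed
 * (the real _pixman_run_fast_path also derives the PIXMAN_solid and
 * PIXMAN_null pseudo-formats from the images and validates transforms,
 * repeat modes and the NEED_* flags before committing to an entry):
 *
 *     const pixman_fast_path_t *info;
 *
 *     for (info = paths; info->op != PIXMAN_OP_NONE; ++info)
 *     {
 *         if (info->op == op &&
 *             info->src_format == src_format &&
 *             info->mask_format == mask_format &&
 *             info->dest_format == dest_format)
 *         {
 *             info->func (imp, op, src, mask, dest, src_x, src_y,
 *                         mask_x, mask_y, dest_x, dest_y, width, height);
 *             return TRUE;
 *         }
 *     }
 *     return FALSE;
 */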
dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height)) + { return; + } _pixman_implementation_composite (imp->delegate, - op, src, mask, dest, src_x, src_y, - mask_x, mask_y, dest_x, dest_y, - width, height); + op, src, mask, dest, src_x, src_y, + mask_x, mask_y, dest_x, dest_y, + width, height); } static pixman_bool_t mmx_blt (pixman_implementation_t *imp, - uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) { if (!pixman_blt_mmx ( - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height)) + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height)) { return _pixman_implementation_blt ( - imp->delegate, - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height); + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height); } return TRUE; @@ -3153,58 +3360,58 @@ mmx_blt (pixman_implementation_t *imp, static pixman_bool_t mmx_fill (pixman_implementation_t *imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) { if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor)) { return _pixman_implementation_fill ( - imp->delegate, bits, stride, bpp, x, y, width, height, xor); + imp->delegate, bits, stride, bpp, x, y, width, height, xor); } return TRUE; } pixman_implementation_t * -_pixman_implementation_create_mmx (pixman_implementation_t *toplevel) -{ - pixman_implementation_t *general = _pixman_implementation_create_fast_path (NULL); - pixman_implementation_t *imp = _pixman_implementation_create (toplevel, general); - - imp->combine_32[PIXMAN_OP_OVER] = mmxCombineOverU; - imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmxCombineOverReverseU; - imp->combine_32[PIXMAN_OP_IN] = mmxCombineInU; - imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmxCombineInReverseU; - imp->combine_32[PIXMAN_OP_OUT] = mmxCombineOutU; - imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmxCombineOutReverseU; - imp->combine_32[PIXMAN_OP_ATOP] = mmxCombineAtopU; - imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmxCombineAtopReverseU; - imp->combine_32[PIXMAN_OP_XOR] = mmxCombineXorU; - imp->combine_32[PIXMAN_OP_ADD] = mmxCombineAddU; - imp->combine_32[PIXMAN_OP_SATURATE] = mmxCombineSaturateU; - - imp->combine_32_ca[PIXMAN_OP_SRC] = mmxCombineSrcC; - imp->combine_32_ca[PIXMAN_OP_OVER] = mmxCombineOverC; - imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmxCombineOverReverseC; - imp->combine_32_ca[PIXMAN_OP_IN] = mmxCombineInC; - imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmxCombineInReverseC; - imp->combine_32_ca[PIXMAN_OP_OUT] = mmxCombineOutC; - imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmxCombineOutReverseC; - imp->combine_32_ca[PIXMAN_OP_ATOP] = mmxCombineAtopC; - imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmxCombineAtopReverseC; - imp->combine_32_ca[PIXMAN_OP_XOR] = mmxCombineXorC; - imp->combine_32_ca[PIXMAN_OP_ADD] = mmxCombineAddC; +_pixman_implementation_create_mmx (void) +{ + pixman_implementation_t *general = 
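/* Note the delegation chain being built here: the MMX implementation
 * wraps the fast-path implementation, which in turn wraps the general
 * (plain C) one.  Each hook tries its own specialized code and falls
 * back to imp->delegate on failure, exactly as mmx_fill and mmx_blt
 * above do, so the general implementation only ever runs when nothing
 * faster applied.
 */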
_pixman_implementation_create_fast_path (); + pixman_implementation_t *imp = _pixman_implementation_create (general); + + imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; + imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; + + imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; imp->composite = mmx_composite; imp->blt = mmx_blt; imp->fill = mmx_fill; - + return imp; } diff --git a/lib/pixman/pixman/pixman-pict.c b/lib/pixman/pixman/pixman-pict.c deleted file mode 100644 index 17bd566e2..000000000 --- a/lib/pixman/pixman/pixman-pict.c +++ /dev/null @@ -1,175 +0,0 @@ -/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ -/* - * Copyright © 2000 SuSE, Inc. - * Copyright © 2007 Red Hat, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of SuSE not be used in advertising or - * publicity pertaining to distribution of the software without specific, - * written prior permission. SuSE makes no representations about the - * suitability of this software for any purpose. It is provided "as is" - * without express or implied warranty. - * - * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE - * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * Author: Keith Packard, SuSE, Inc. 
- */ - -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif -#include "pixman-private.h" - -/* - * Operator optimizations based on source or destination opacity - */ -typedef struct -{ - pixman_op_t op; - pixman_op_t opSrcDstOpaque; - pixman_op_t opSrcOpaque; - pixman_op_t opDstOpaque; -} OptimizedOperatorInfo; - -static const OptimizedOperatorInfo optimized_operators[] = -{ - /* Input Operator SRC&DST Opaque SRC Opaque DST Opaque */ - { PIXMAN_OP_OVER, PIXMAN_OP_SRC, PIXMAN_OP_SRC, PIXMAN_OP_OVER }, - { PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_DST, PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_DST }, - { PIXMAN_OP_IN, PIXMAN_OP_SRC, PIXMAN_OP_IN, PIXMAN_OP_SRC }, - { PIXMAN_OP_IN_REVERSE, PIXMAN_OP_DST, PIXMAN_OP_DST, PIXMAN_OP_IN_REVERSE }, - { PIXMAN_OP_OUT, PIXMAN_OP_CLEAR, PIXMAN_OP_OUT, PIXMAN_OP_CLEAR }, - { PIXMAN_OP_OUT_REVERSE, PIXMAN_OP_CLEAR, PIXMAN_OP_CLEAR, PIXMAN_OP_OUT_REVERSE }, - { PIXMAN_OP_ATOP, PIXMAN_OP_SRC, PIXMAN_OP_IN, PIXMAN_OP_OVER }, - { PIXMAN_OP_ATOP_REVERSE, PIXMAN_OP_DST, PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_IN_REVERSE }, - { PIXMAN_OP_XOR, PIXMAN_OP_CLEAR, PIXMAN_OP_OUT, PIXMAN_OP_OUT_REVERSE }, - { PIXMAN_OP_SATURATE, PIXMAN_OP_DST, PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_DST }, - { PIXMAN_OP_NONE } -}; - -/* - * Check if the current operator could be optimized - */ -static const OptimizedOperatorInfo* -pixman_operator_can_be_optimized(pixman_op_t op) -{ - const OptimizedOperatorInfo *info; - - for (info = optimized_operators; info->op != PIXMAN_OP_NONE; info++) - { - if(info->op == op) - return info; - } - return NULL; -} - -/* - * Optimize the current operator based on opacity of source or destination - * The output operator should be mathematically equivalent to the source. - */ -static pixman_op_t -pixman_optimize_operator(pixman_op_t op, pixman_image_t *pSrc, pixman_image_t *pMask, pixman_image_t *pDst ) -{ - pixman_bool_t is_source_opaque; - pixman_bool_t is_dest_opaque; - const OptimizedOperatorInfo *info = pixman_operator_can_be_optimized(op); - - if(!info || pMask) - return op; - - is_source_opaque = pixman_image_is_opaque(pSrc); - is_dest_opaque = pixman_image_is_opaque(pDst); - - if(is_source_opaque == FALSE && is_dest_opaque == FALSE) - return op; - - if(is_source_opaque && is_dest_opaque) - return info->opSrcDstOpaque; - else if(is_source_opaque) - return info->opSrcOpaque; - else if(is_dest_opaque) - return info->opDstOpaque; - - return op; - -} - -static pixman_implementation_t *imp; - -PIXMAN_EXPORT void -pixman_image_composite (pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int16_t src_x, - int16_t src_y, - int16_t mask_x, - int16_t mask_y, - int16_t dest_x, - int16_t dest_y, - uint16_t width, - uint16_t height) -{ - /* - * Check if we can replace our operator by a simpler one if the src or dest are opaque - * The output operator should be mathematically equivalent to the source. 
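 *
 * Worked example: OVER computes dest = src + (1 - alpha_src)·dest.
 * With an opaque source, alpha_src = 1, so dest = src and OVER can be
 * replaced by the cheaper SRC (see the optimized_operators table
 * above).  Similarly IN, which computes dest = src·alpha_dest,
 * collapses to SRC when the destination is opaque.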
- */ - op = pixman_optimize_operator(op, src, mask, dest); - if(op == PIXMAN_OP_DST) - return; - - if (!imp) - imp = _pixman_choose_implementation(); - - _pixman_implementation_composite (imp, op, - src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height); -} - -PIXMAN_EXPORT pixman_bool_t -pixman_blt (uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) -{ - if (!imp) - imp = _pixman_choose_implementation(); - - return _pixman_implementation_blt (imp, src_bits, dst_bits, src_stride, dst_stride, - src_bpp, dst_bpp, - src_x, src_y, - dst_x, dst_y, - width, height); -} - -PIXMAN_EXPORT pixman_bool_t -pixman_fill (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - if (!imp) - imp = _pixman_choose_implementation(); - - return _pixman_implementation_fill (imp, bits, stride, bpp, x, y, width, height, xor); -} diff --git a/lib/pixman/pixman/pixman-private.h b/lib/pixman/pixman/pixman-private.h index 9e770f6f5..ff7a65f88 100644 --- a/lib/pixman/pixman/pixman-private.h +++ b/lib/pixman/pixman/pixman-private.h @@ -9,133 +9,11 @@ #include <time.h> #include <assert.h> -#ifndef FALSE -#define FALSE 0 -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -#define MSBFirst 0 -#define LSBFirst 1 - -#ifdef WORDS_BIGENDIAN -# define IMAGE_BYTE_ORDER MSBFirst -# define BITMAP_BIT_ORDER MSBFirst -#else -# define IMAGE_BYTE_ORDER LSBFirst -# define BITMAP_BIT_ORDER LSBFirst -#endif - -#undef DEBUG -#define DEBUG 0 - -#if defined (__GNUC__) -# define FUNC ((const char*) (__PRETTY_FUNCTION__)) -#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) -# define FUNC ((const char*) (__func__)) -#else -# define FUNC ((const char*) ("???")) -#endif - -#ifndef INT16_MIN -# define INT16_MIN (-32767-1) -#endif - -#ifndef INT16_MAX -# define INT16_MAX (32767) -#endif - -#ifndef INT32_MIN -# define INT32_MIN (-2147483647-1) -#endif - -#ifndef INT32_MAX -# define INT32_MAX (2147483647) -#endif - -#ifndef UINT32_MIN -# define UINT32_MIN (0) -#endif - -#ifndef UINT32_MAX -# define UINT32_MAX (4294967295U) -#endif - -#ifndef M_PI -# define M_PI 3.14159265358979323846 -#endif - -#ifdef _MSC_VER -/* 'inline' is available only in C++ in MSVC */ -# define inline __inline -# define force_inline __forceinline -#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)) -# define inline __inline__ -# define force_inline __inline__ __attribute__ ((__always_inline__)) -#else -# ifndef force_inline -# define force_inline inline -# endif -#endif - -#define FB_SHIFT 5 -#define FB_UNIT (1 << FB_SHIFT) -#define FB_HALFUNIT (1 << (FB_SHIFT-1)) -#define FB_MASK (FB_UNIT - 1) -#define FB_ALLONES ((uint32_t) -1) - -/* Memory allocation helpers */ -void *pixman_malloc_ab (unsigned int n, unsigned int b); -void *pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c); -pixman_bool_t pixman_multiply_overflows_int (unsigned int a, unsigned int b); -pixman_bool_t pixman_addition_overflows_int (unsigned int a, unsigned int b); - -#if DEBUG - -#define return_if_fail(expr) \ - do \ - { \ - if (!(expr)) \ - { \ - fprintf(stderr, "In %s: %s failed\n", FUNC, #expr); \ - return; \ - } \ - } \ - while (0) - -#define return_val_if_fail(expr, retval) \ - do \ - { \ - if (!(expr)) \ - { \ - fprintf(stderr, "In %s: %s failed\n", FUNC, #expr); \ - return (retval); \ - } \ - } \ - while (0) - -#else - 
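/* The overflow-checked allocation helpers declared above guard the
 * usual width · height · bpp size computations.  A minimal sketch of
 * the idea behind pixman_malloc_ab (assumed shape, not the verbatim
 * implementation):
 *
 *     void *
 *     pixman_malloc_ab (unsigned int a, unsigned int b)
 *     {
 *         if (b && a >= INT32_MAX / b)
 *             return NULL;            // a * b would overflow
 *
 *         return malloc (a * b);
 *     }
 */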
-#define return_if_fail(expr) \ - do \ - { \ - if (!(expr)) \ - return; \ - } \ - while (0) - -#define return_val_if_fail(expr, retval) \ - do \ - { \ - if (!(expr)) \ - return (retval); \ - } \ - while (0) - -#endif +#include "pixman-compiler.h" +/* + * Images + */ typedef struct image_common image_common_t; typedef struct source_image source_image_t; typedef struct solid_fill solid_fill_t; @@ -147,86 +25,28 @@ typedef struct conical_gradient conical_gradient_t; typedef struct radial_gradient radial_gradient_t; typedef struct bits_image bits_image_t; typedef struct circle circle_t; -typedef struct point point_t; -/* FIXME - the types and structures below should be give proper names - */ +typedef void (*fetch_scanline_t) (pixman_image_t *image, + int x, + int y, + int width, + uint32_t *buffer, + const uint32_t *mask, + uint32_t mask_bits); + +typedef uint32_t (*fetch_pixel_32_t) (bits_image_t *image, + int x, + int y); -#define FASTCALL -typedef FASTCALL void (*CombineFunc32) (uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width); -typedef FASTCALL void (*fetchProc32)(bits_image_t *pict, int x, int y, int width, - uint32_t *buffer); -typedef FASTCALL uint32_t (*fetchPixelProc32)(bits_image_t *pict, int offset, int line); -typedef FASTCALL void (*storeProc32)(pixman_image_t *, uint32_t *bits, - const uint32_t *values, int x, int width, - const pixman_indexed_t *); - -typedef FASTCALL void (*CombineFunc64) (uint64_t *dest, const uint64_t *src, const uint64_t *mask, int width); -typedef FASTCALL void (*fetchProc64)(bits_image_t *pict, int x, int y, int width, - uint64_t *buffer); -typedef FASTCALL uint64_t (*fetchPixelProc64)(bits_image_t *pict, int offset, int line); -typedef FASTCALL void (*storeProc64)(pixman_image_t *, uint32_t *bits, - const uint64_t *values, int x, int width, - const pixman_indexed_t *); - -typedef struct _FbComposeData { - uint8_t op; - pixman_image_t *src; - pixman_image_t *mask; - pixman_image_t *dest; - int16_t xSrc; - int16_t ySrc; - int16_t xMask; - int16_t yMask; - int16_t xDest; - int16_t yDest; - uint16_t width; - uint16_t height; -} FbComposeData; - -void pixman_composite_rect_general_accessors (const FbComposeData *data, - void *src_buffer, - void *mask_buffer, - void *dest_buffer, - const int wide); -void pixman_composite_rect_general (const FbComposeData *data); - -fetchProc32 pixman_fetchProcForPicture32 (bits_image_t *); -fetchPixelProc32 pixman_fetchPixelProcForPicture32 (bits_image_t *); -storeProc32 pixman_storeProcForPicture32 (bits_image_t *); -fetchProc32 pixman_fetchProcForPicture32_accessors (bits_image_t *); -fetchPixelProc32 pixman_fetchPixelProcForPicture32_accessors (bits_image_t *); -storeProc32 pixman_storeProcForPicture32_accessors (bits_image_t *); - -fetchProc64 pixman_fetchProcForPicture64 (bits_image_t *); -fetchPixelProc64 pixman_fetchPixelProcForPicture64 (bits_image_t *); -storeProc64 pixman_storeProcForPicture64 (bits_image_t *); -fetchProc64 pixman_fetchProcForPicture64_accessors (bits_image_t *); -fetchPixelProc64 pixman_fetchPixelProcForPicture64_accessors (bits_image_t *); -storeProc64 pixman_storeProcForPicture64_accessors (bits_image_t *); - -void pixman_expand(uint64_t *dst, const uint32_t *src, pixman_format_code_t, int width); -void pixman_contract(uint32_t *dst, const uint64_t *src, int width); - -void pixmanFetchGradient (gradient_t *, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits); -void _pixman_image_get_scanline_64_generic (pixman_image_t * pict, int x, int y, int 
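/* The fetch_scanline_t/fetch_pixel_32_t typedefs above define the
 * per-format accessor interface.  For a format that already matches
 * the canonical 32-bit layout the fetcher degenerates to a copy;
 * roughly (a sketch only, ignoring the mask/mask_bits arguments,
 * which such a trivial fetcher does not need):
 *
 *     static void
 *     fetch_scanline_a8r8g8b8 (pixman_image_t *image,
 *                              int x, int y, int width,
 *                              uint32_t *buffer,
 *                              const uint32_t *mask, uint32_t mask_bits)
 *     {
 *         const uint32_t *bits =
 *             image->bits.bits + y * image->bits.rowstride;
 *
 *         memcpy (buffer, bits + x, width * sizeof (uint32_t));
 *     }
 */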
width, - uint64_t *buffer, uint64_t *mask, uint32_t maskBits); -void fbFetchTransformed(bits_image_t *, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits); -void fbFetchExternalAlpha(bits_image_t *, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits); - -void fbFetchTransformed_accessors(bits_image_t *, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, - uint32_t maskBits); -void fbStoreExternalAlpha_accessors(bits_image_t *, int x, int y, int width, - uint32_t *buffer); -void fbFetchExternalAlpha_accessors(bits_image_t *, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, - uint32_t maskBits); - -/* end */ +typedef uint64_t (*fetch_pixel_64_t) (bits_image_t *image, + int x, + int y); + +typedef void (*store_scanline_t) (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *values); typedef enum { @@ -237,119 +57,78 @@ typedef enum SOLID } image_type_t; -#define IS_SOURCE_IMAGE(img) (((image_common_t *)img)->type > BITS) - typedef enum { SOURCE_IMAGE_CLASS_UNKNOWN, SOURCE_IMAGE_CLASS_HORIZONTAL, SOURCE_IMAGE_CLASS_VERTICAL, -} source_pict_class_t; - -typedef void (*scanStoreProc)(bits_image_t *img, int x, int y, int width, uint32_t *buffer); -typedef void (*scanFetchProc)(pixman_image_t *, int, int, int, uint32_t *, - uint32_t *, uint32_t); - -source_pict_class_t _pixman_image_classify (pixman_image_t *image, - int x, - int y, - int width, - int height); - -void -_pixman_image_get_scanline_32 (pixman_image_t *image, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t mask_bits); - -/* Even thought the type of buffer is uint32_t *, the function actually expects - * a uint64_t *buffer. - */ -void -_pixman_image_get_scanline_64 (pixman_image_t *image, int x, int y, int width, - uint32_t *buffer, uint32_t *unused, uint32_t unused2); - -void -_pixman_image_store_scanline_32 (bits_image_t *image, int x, int y, int width, - uint32_t *buffer); -/* Even thought the type of buffer is uint32_t *, the function actually expects - * a uint64_t *buffer. 
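 * (The 64-bit variants exist for formats wider than 8 bits per
 * channel: scanlines are widened to 16 bits per component first,
 * where an 8-bit value x expands to x·257 (so 0xff becomes 0xffff),
 * and pixman_contract() narrows the result back after compositing.)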
- */ -void -_pixman_image_store_scanline_64 (bits_image_t *image, int x, int y, int width, - uint32_t *buffer); - -pixman_image_t * -_pixman_image_allocate (void); - -pixman_bool_t -_pixman_init_gradient (gradient_t *gradient, - const pixman_gradient_stop_t *stops, - int n_stops); -void -_pixman_image_reset_clip_region (pixman_image_t *image); +} source_image_class_t; -struct point -{ - int16_t x, y; -}; - -typedef source_pict_class_t (* classify_func_t) (pixman_image_t *image, - int x, - int y, - int width, - int height); -typedef void (* property_changed_func_t) (pixman_image_t *image); +typedef source_image_class_t (*classify_func_t) (pixman_image_t *image, + int x, + int y, + int width, + int height); +typedef void (*property_changed_func_t) (pixman_image_t *image); struct image_common { - image_type_t type; - int32_t ref_count; - pixman_region32_t full_region; - pixman_region32_t clip_region; - pixman_region32_t *src_clip; - pixman_bool_t has_client_clip; - pixman_transform_t *transform; - pixman_repeat_t repeat; - pixman_filter_t filter; - pixman_fixed_t *filter_params; - int n_filter_params; - bits_image_t *alpha_map; - point_t alpha_origin; - pixman_bool_t component_alpha; - pixman_read_memory_func_t read_func; - pixman_write_memory_func_t write_func; - classify_func_t classify; - property_changed_func_t property_changed; - scanFetchProc get_scanline_32; - scanFetchProc get_scanline_64; + image_type_t type; + int32_t ref_count; + pixman_region32_t clip_region; + pixman_bool_t have_clip_region; /* FALSE if there is no clip */ + pixman_bool_t client_clip; /* Whether the source clip was + set by a client */ + pixman_bool_t clip_sources; /* Whether the clip applies when + * the image is used as a source + */ + pixman_bool_t dirty; + pixman_bool_t need_workaround; + pixman_transform_t * transform; + pixman_repeat_t repeat; + pixman_filter_t filter; + pixman_fixed_t * filter_params; + int n_filter_params; + bits_image_t * alpha_map; + int alpha_origin_x; + int alpha_origin_y; + pixman_bool_t component_alpha; + classify_func_t classify; + property_changed_func_t property_changed; + fetch_scanline_t get_scanline_32; + fetch_scanline_t get_scanline_64; + + pixman_image_destroy_func_t destroy_func; + void * destroy_data; }; struct source_image { - image_common_t common; - source_pict_class_t class; + image_common_t common; + source_image_class_t class; }; struct solid_fill { - source_image_t common; - uint32_t color; /* FIXME: shouldn't this be a pixman_color_t? */ + source_image_t common; + uint32_t color; /* FIXME: shouldn't this be a pixman_color_t? 
*/ }; struct gradient { - source_image_t common; - int n_stops; - pixman_gradient_stop_t * stops; - int stop_range; - uint32_t * color_table; - int color_table_size; + source_image_t common; + int n_stops; + pixman_gradient_stop_t *stops; + int stop_range; + uint32_t * color_table; + int color_table_size; }; struct linear_gradient { - gradient_t common; - pixman_point_fixed_t p1; - pixman_point_fixed_t p2; + gradient_t common; + pixman_point_fixed_t p1; + pixman_point_fixed_t p2; }; struct circle @@ -361,721 +140,658 @@ struct circle struct radial_gradient { - gradient_t common; - - circle_t c1; - circle_t c2; - double cdx; - double cdy; - double dr; - double A; + gradient_t common; + + circle_t c1; + circle_t c2; + double cdx; + double cdy; + double dr; + double A; }; struct conical_gradient { - gradient_t common; - pixman_point_fixed_t center; - pixman_fixed_t angle; + gradient_t common; + pixman_point_fixed_t center; + pixman_fixed_t angle; }; struct bits_image { - image_common_t common; - pixman_format_code_t format; - const pixman_indexed_t *indexed; - int width; - int height; - uint32_t * bits; - uint32_t * free_me; - int rowstride; /* in number of uint32_t's */ - - scanStoreProc store_scanline_32; - scanStoreProc store_scanline_64; + image_common_t common; + pixman_format_code_t format; + const pixman_indexed_t * indexed; + int width; + int height; + uint32_t * bits; + uint32_t * free_me; + int rowstride; /* in number of uint32_t's */ + + /* Fetch a pixel, disregarding alpha maps, transformations etc. */ + fetch_pixel_32_t fetch_pixel_raw_32; + fetch_pixel_64_t fetch_pixel_raw_64; + + /* Fetch a pixel, taking alpha maps into account */ + fetch_pixel_32_t fetch_pixel_32; + fetch_pixel_64_t fetch_pixel_64; + + /* Fetch raw scanlines, with no regard for transformations, alpha maps etc. 
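 * The raw/non-raw split keeps the concerns layered: a raw accessor
 * reads or writes the bits in the image's own format and nothing
 * else, while the non-raw wrapper calls the raw one and then merges
 * in the alpha component fetched from common.alpha_map, so per-format
 * code never has to know about alpha maps.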
*/ + fetch_scanline_t fetch_scanline_raw_32; + fetch_scanline_t fetch_scanline_raw_64; + + /* Store scanlines with no regard for alpha maps */ + store_scanline_t store_scanline_raw_32; + store_scanline_t store_scanline_raw_64; + + /* Store a scanline, taking alpha maps into account */ + store_scanline_t store_scanline_32; + store_scanline_t store_scanline_64; + + /* Used for indirect access to the bits */ + pixman_read_memory_func_t read_func; + pixman_write_memory_func_t write_func; }; union pixman_image { - image_type_t type; - image_common_t common; - bits_image_t bits; - source_image_t source; - gradient_t gradient; - linear_gradient_t linear; - conical_gradient_t conical; - radial_gradient_t radial; - solid_fill_t solid; + image_type_t type; + image_common_t common; + bits_image_t bits; + source_image_t source; + gradient_t gradient; + linear_gradient_t linear; + conical_gradient_t conical; + radial_gradient_t radial; + solid_fill_t solid; }; -/* Gradient walker - */ -typedef struct -{ - uint32_t left_ag; - uint32_t left_rb; - uint32_t right_ag; - uint32_t right_rb; - int32_t left_x; - int32_t right_x; - int32_t stepper; - pixman_gradient_stop_t *stops; - int num_stops; - unsigned int spread; +void +_pixman_bits_image_setup_raw_accessors (bits_image_t *image); - int need_reset; -} GradientWalker; +void +_pixman_image_get_scanline_generic_64 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits); + +source_image_class_t +_pixman_image_classify (pixman_image_t *image, + int x, + int y, + int width, + int height); void -_pixman_gradient_walker_init (GradientWalker *walker, - gradient_t *gradient, - unsigned int spread); +_pixman_image_get_scanline_32 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits); +/* Even thought the type of buffer is uint32_t *, the function actually expects + * a uint64_t *buffer. + */ void -_pixman_gradient_walker_reset (GradientWalker *walker, - pixman_fixed_32_32_t pos); +_pixman_image_get_scanline_64 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *unused, + uint32_t unused2); -uint32_t -_pixman_gradient_walker_pixel (GradientWalker *walker, - pixman_fixed_32_32_t x); +void +_pixman_image_store_scanline_32 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *buffer); +void +_pixman_image_fetch_pixels (bits_image_t *image, + uint32_t * buffer, + int n_pixels); +/* Even though the type of buffer is uint32_t *, the function + * actually expects a uint64_t *buffer. 
+ */ +void +_pixman_image_store_scanline_64 (bits_image_t * image, + int x, + int y, + int width, + const uint32_t *buffer); +pixman_image_t * +_pixman_image_allocate (void); -#define LOG2_BITMAP_PAD 5 -#define FB_STIP_SHIFT LOG2_BITMAP_PAD -#define FB_STIP_UNIT (1 << FB_STIP_SHIFT) -#define FB_STIP_MASK (FB_STIP_UNIT - 1) -#define FB_STIP_ALLONES ((uint32_t) -1) +pixman_bool_t +_pixman_init_gradient (gradient_t * gradient, + const pixman_gradient_stop_t *stops, + int n_stops); +void +_pixman_image_reset_clip_region (pixman_image_t *image); -#if BITMAP_BIT_ORDER == LSBFirst -#define FbScrLeft(x,n) ((x) >> (n)) -#define FbScrRight(x,n) ((x) << (n)) -#define FbLeftStipBits(x,n) ((x) & ((((uint32_t) 1) << (n)) - 1)) -#else -#define FbScrLeft(x,n) ((x) << (n)) -#define FbScrRight(x,n) ((x) >> (n)) -#define FbLeftStipBits(x,n) ((x) >> (FB_STIP_UNIT - (n))) -#endif +void +_pixman_image_validate (pixman_image_t *image); -#define FbStipLeft(x,n) FbScrLeft(x,n) -#define FbStipRight(x,n) FbScrRight(x,n) -#define FbStipMask(x,w) (FbStipRight(FB_STIP_ALLONES,(x) & FB_STIP_MASK) & \ - FbStipLeft(FB_STIP_ALLONES,(FB_STIP_UNIT - ((x)+(w))) & FB_STIP_MASK)) - -#define FbLeftMask(x) ( ((x) & FB_MASK) ? \ - FbScrRight(FB_ALLONES,(x) & FB_MASK) : 0) -#define FbRightMask(x) ( ((FB_UNIT - (x)) & FB_MASK) ? \ - FbScrLeft(FB_ALLONES,(FB_UNIT - (x)) & FB_MASK) : 0) - -#define FbMaskBits(x,w,l,n,r) { \ - n = (w); \ - r = FbRightMask((x)+n); \ - l = FbLeftMask(x); \ - if (l) { \ - n -= FB_UNIT - ((x) & FB_MASK); \ - if (n < 0) { \ - n = 0; \ - l &= r; \ - r = 0; \ - } \ - } \ - n >>= FB_SHIFT; \ - } +pixman_bool_t +_pixman_image_is_opaque (pixman_image_t *image); -#if IMAGE_BYTE_ORDER == MSBFirst -#define Fetch24(img, a) ((unsigned long) (a) & 1 ? \ - ((READ(img, a) << 16) | READ(img, (uint16_t *) ((a)+1))) : \ - ((READ(img, (uint16_t *) (a)) << 8) | READ(img, (a)+2))) -#define Store24(img,a,v) ((unsigned long) (a) & 1 ? \ - (WRITE(img, a, (uint8_t) ((v) >> 16)), \ - WRITE(img, (uint16_t *) ((a)+1), (uint16_t) (v))) : \ - (WRITE(img, (uint16_t *) (a), (uint16_t) ((v) >> 8)), \ - WRITE(img, (a)+2, (uint8_t) (v)))) -#else -#define Fetch24(img,a) ((unsigned long) (a) & 1 ? \ - (READ(img, a) | (READ(img, (uint16_t *) ((a)+1)) << 8)) : \ - (READ(img, (uint16_t *) (a)) | (READ(img, (a)+2) << 16))) -#define Store24(img,a,v) ((unsigned long) (a) & 1 ? 
\ - (WRITE(img, a, (uint8_t) (v)), \ - WRITE(img, (uint16_t *) ((a)+1), (uint16_t) ((v) >> 8))) : \ - (WRITE(img, (uint16_t *) (a), (uint16_t) (v)), \ - WRITE(img, (a)+2, (uint8_t) ((v) >> 16)))) -#endif +pixman_bool_t +_pixman_image_is_solid (pixman_image_t *image); -#define CvtR8G8B8toY15(s) (((((s) >> 16) & 0xff) * 153 + \ - (((s) >> 8) & 0xff) * 301 + \ - (((s) ) & 0xff) * 58) >> 2) -#define miCvtR8G8B8to15(s) ((((s) >> 3) & 0x001f) | \ - (((s) >> 6) & 0x03e0) | \ - (((s) >> 9) & 0x7c00)) -#define miIndexToEnt15(mif,rgb15) ((mif)->ent[rgb15]) -#define miIndexToEnt24(mif,rgb24) miIndexToEnt15(mif,miCvtR8G8B8to15(rgb24)) +uint32_t +_pixman_image_get_solid (pixman_image_t * image, + pixman_format_code_t format); + +#define PIXMAN_IMAGE_GET_LINE(image, x, y, type, out_stride, line, mul) \ + do \ + { \ + uint32_t *__bits__; \ + int __stride__; \ + \ + __bits__ = image->bits.bits; \ + __stride__ = image->bits.rowstride; \ + (out_stride) = \ + __stride__ * (int) sizeof (uint32_t) / (int) sizeof (type); \ + (line) = \ + ((type *) __bits__) + (out_stride) * (y) + (mul) * (x); \ + } while (0) -#define miIndexToEntY24(mif,rgb24) ((mif)->ent[CvtR8G8B8toY15(rgb24)]) +/* + * Gradient walker + */ +typedef struct +{ + uint32_t left_ag; + uint32_t left_rb; + uint32_t right_ag; + uint32_t right_rb; + int32_t left_x; + int32_t right_x; + int32_t stepper; + pixman_gradient_stop_t *stops; + int num_stops; + unsigned int spread; -#define FbIntMult(a,b,t) ( (t) = (a) * (b) + 0x80, ( ( ( (t)>>8 ) + (t) )>>8 ) ) -#define FbIntDiv(a,b) (((uint16_t) (a) * 255) / (b)) + int need_reset; +} pixman_gradient_walker_t; -#define FbGet8(v,i) ((uint16_t) (uint8_t) ((v) >> i)) +void +_pixman_gradient_walker_init (pixman_gradient_walker_t *walker, + gradient_t * gradient, + unsigned int spread); +void +_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker, + pixman_fixed_32_32_t pos); -#define cvt8888to0565(s) ((((s) >> 3) & 0x001f) | \ - (((s) >> 5) & 0x07e0) | \ - (((s) >> 8) & 0xf800)) -#define cvt0565to0888(s) (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | \ - ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | \ - ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000))) +uint32_t +_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker, + pixman_fixed_32_32_t x); /* - * There are two ways of handling alpha -- either as a single unified value or - * a separate value for each component, hence each macro must have two - * versions. The unified alpha version has a 'U' at the end of the name, - * the component version has a 'C'. Similarly, functions which deal with - * this difference will have two versions using the same convention. + * Edges */ -#define FbOverU(x,y,i,a,t) ((t) = FbIntMult(FbGet8(y,i),(a),(t)) + FbGet8(x,i), \ - (uint32_t) ((uint8_t) ((t) | (0 - ((t) >> 8)))) << (i)) +#define MAX_ALPHA(n) ((1 << (n)) - 1) +#define N_Y_FRAC(n) ((n) == 1 ? 1 : (1 << ((n) / 2)) - 1) +#define N_X_FRAC(n) ((n) == 1 ? 
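/* Worked example of the sampling grid these macros describe: with
 * n = 4 bits of alpha, MAX_ALPHA(4) = 15 coverage levels,
 * N_Y_FRAC(4) = (1 << 2) - 1 = 3 sample rows per pixel and
 * N_X_FRAC(4) = (1 << 2) + 1 = 5 sample columns per row, i.e.
 * 3 · 5 = 15 samples, one increment of coverage per sample.
 */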
1 : (1 << ((n) / 2)) + 1) -#define FbOverC(x,y,i,a,t) ((t) = FbIntMult(FbGet8(y,i),FbGet8(a,i),(t)) + FbGet8(x,i), \ - (uint32_t) ((uint8_t) ((t) | (0 - ((t) >> 8)))) << (i)) +#define STEP_Y_SMALL(n) (pixman_fixed_1 / N_Y_FRAC (n)) +#define STEP_Y_BIG(n) (pixman_fixed_1 - (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n)) -#define FbInU(x,i,a,t) ((uint32_t) FbIntMult(FbGet8(x,i),(a),(t)) << (i)) +#define Y_FRAC_FIRST(n) (STEP_Y_SMALL (n) / 2) +#define Y_FRAC_LAST(n) (Y_FRAC_FIRST (n) + (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n)) -#define FbInC(x,i,a,t) ((uint32_t) FbIntMult(FbGet8(x,i),FbGet8(a,i),(t)) << (i)) +#define STEP_X_SMALL(n) (pixman_fixed_1 / N_X_FRAC (n)) +#define STEP_X_BIG(n) (pixman_fixed_1 - (N_X_FRAC (n) - 1) * STEP_X_SMALL (n)) -#define FbAdd(x,y,i,t) ((t) = FbGet8(x,i) + FbGet8(y,i), \ - (uint32_t) ((uint8_t) ((t) | (0 - ((t) >> 8)))) << (i)) +#define X_FRAC_FIRST(n) (STEP_X_SMALL (n) / 2) +#define X_FRAC_LAST(n) (X_FRAC_FIRST (n) + (N_X_FRAC (n) - 1) * STEP_X_SMALL (n)) -#define div_255(x) (((x) + 0x80 + (((x) + 0x80) >> 8)) >> 8) -#define div_65535(x) (((x) + 0x8000 + (((x) + 0x8000) >> 16)) >> 16) +#define RENDER_SAMPLES_X(x, n) \ + ((n) == 1? 0 : (pixman_fixed_frac (x) + \ + X_FRAC_FIRST (n)) / STEP_X_SMALL (n)) -#define MOD(a,b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b)) +void +pixman_rasterize_edges_accessors (pixman_image_t *image, + pixman_edge_t * l, + pixman_edge_t * r, + pixman_fixed_t t, + pixman_fixed_t b); -#define DIV(a,b) ((((a) < 0) == ((b) < 0)) ? (a) / (b) : \ - ((a) - (b) + 1 - (((b) < 0) << 1)) / (b)) +/* + * Implementations + */ -#define CLIP(a,b,c) ((a) < (b) ? (b) : ((a) > (c) ? (c) : (a))) +typedef struct pixman_implementation_t pixman_implementation_t; -#if 0 -/* FIXME: the MOD macro above is equivalent, but faster I think */ -#define mod(a,b) ((b) == 1 ? 0 : (a) >= 0 ? (a) % (b) : (b) - (-a) % (b)) -#endif +typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width); + +typedef void (*pixman_combine_64_func_t) (pixman_implementation_t *imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width); + +typedef void (*pixman_composite_func_t) (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height); +typedef pixman_bool_t (*pixman_blt_func_t) (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height); +typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor); -/* FIXME: the (void)__read_func hides lots of warnings (which is what they - * are supposed to do), but some of them are real. 
For example the one - * where Fetch4 doesn't have a READ - */ +void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp); +void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp); -#if 0 -/* Framebuffer access support macros */ -#define ACCESS_MEM(code) \ - do { \ - const image_common_t *const com__ = \ - (image_common_t *)image; \ - \ - if (!com__->read_func && !com__->write_func) \ - { \ - const int do_access__ = 0; \ - const pixman_read_memory_func_t read_func__ = NULL; \ - const pixman_write_memory_func_t write_func__ = NULL; \ - (void)read_func__; \ - (void)write_func__; \ - (void)do_access__; \ - \ - {code} \ - } \ - else \ - { \ - const int do_access__ = 1; \ - const pixman_read_memory_func_t read_func__ = \ - com__->read_func; \ - const pixman_write_memory_func_t write_func__ = \ - com__->write_func; \ - (void)read_func__; \ - (void)write_func__; \ - (void)do_access__; \ - \ - {code} \ - } \ - } while (0) -#endif +struct pixman_implementation_t +{ + pixman_implementation_t *toplevel; + pixman_implementation_t *delegate; -#ifdef PIXMAN_FB_ACCESSORS + pixman_composite_func_t composite; + pixman_blt_func_t blt; + pixman_fill_func_t fill; -#define ACCESS(sym) sym##_accessors + pixman_combine_32_func_t combine_32[PIXMAN_OP_LAST]; + pixman_combine_32_func_t combine_32_ca[PIXMAN_OP_LAST]; + pixman_combine_64_func_t combine_64[PIXMAN_OP_LAST]; + pixman_combine_64_func_t combine_64_ca[PIXMAN_OP_LAST]; +}; -#define READ(img, ptr) \ - ((img)->common.read_func ((ptr), sizeof(*(ptr)))) -#define WRITE(img, ptr,val) \ - ((img)->common.write_func ((ptr), (val), sizeof (*(ptr)))) +pixman_implementation_t * +_pixman_implementation_create (pixman_implementation_t *delegate); -#define MEMCPY_WRAPPED(img, dst, src, size) \ - do { \ - size_t _i; \ - uint8_t *_dst = (uint8_t*)(dst), *_src = (uint8_t*)(src); \ - for(_i = 0; _i < size; _i++) { \ - WRITE((img), _dst +_i, READ((img), _src + _i)); \ - } \ - } while (0) +void +_pixman_implementation_combine_32 (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width); +void +_pixman_implementation_combine_64 (pixman_implementation_t *imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width); +void +_pixman_implementation_combine_32_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width); +void +_pixman_implementation_combine_64_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint64_t * dest, + const uint64_t * src, + const uint64_t * mask, + int width); +void +_pixman_implementation_composite (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height); -#define MEMSET_WRAPPED(img, dst, val, size) \ - do { \ - size_t _i; \ - uint8_t *_dst = (uint8_t*)(dst); \ - for(_i = 0; _i < (size_t) size; _i++) { \ - WRITE((img), _dst +_i, (val)); \ - } \ - } while (0) +pixman_bool_t +_pixman_implementation_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height); -#else +pixman_bool_t +_pixman_implementation_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + 
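/* PIXMAN_FB_ACCESSORS above is what produces the *_accessors symbol
 * variants seen elsewhere in this header: the same sources compile
 * twice, once with READ/WRITE as plain pointer dereferences and once
 * routed through the installed read_func/write_func, so indirect
 * framebuffer access costs nothing unless a caller actually set up
 * memory accessors.
 */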
int bpp, + int x, + int y, + int width, + int height, + uint32_t xor); -#define ACCESS(sym) sym +/* Specific implementations */ +pixman_implementation_t * +_pixman_implementation_create_general (void); -#define READ(img, ptr) (*(ptr)) -#define WRITE(img, ptr, val) (*(ptr) = (val)) -#define MEMCPY_WRAPPED(img, dst, src, size) \ - memcpy(dst, src, size) -#define MEMSET_WRAPPED(img, dst, val, size) \ - memset(dst, val, size) +pixman_implementation_t * +_pixman_implementation_create_fast_path (void); +#ifdef USE_MMX +pixman_implementation_t * +_pixman_implementation_create_mmx (void); #endif -#define fbComposeGetSolid(img, res, fmt) \ - do \ - { \ - pixman_format_code_t format__; \ - if (img->type == SOLID) \ - { \ - format__ = PIXMAN_a8r8g8b8; \ - (res) = img->solid.color; \ - } \ - else \ - { \ - uint32_t *bits__ = (img)->bits.bits; \ - format__ = (img)->bits.format; \ - \ - switch (PIXMAN_FORMAT_BPP((img)->bits.format)) \ - { \ - case 32: \ - (res) = READ(img, (uint32_t *)bits__); \ - break; \ - case 24: \ - (res) = Fetch24(img, (uint8_t *) bits__); \ - break; \ - case 16: \ - (res) = READ(img, (uint16_t *) bits__); \ - (res) = cvt0565to0888(res); \ - break; \ - case 8: \ - (res) = READ(img, (uint8_t *) bits__); \ - (res) = (res) << 24; \ - break; \ - case 1: \ - (res) = READ(img, (uint32_t *) bits__); \ - (res) = FbLeftStipBits((res),1) ? 0xff000000 : 0x00000000; \ - break; \ - default: \ - return; \ - } \ - /* manage missing src alpha */ \ - if (!PIXMAN_FORMAT_A((img)->bits.format)) \ - (res) |= 0xff000000; \ - } \ - \ - /* If necessary, convert RGB <--> BGR. */ \ - if (PIXMAN_FORMAT_TYPE (format__) != PIXMAN_FORMAT_TYPE(fmt)) \ - { \ - (res) = ((((res) & 0xff000000) >> 0) | \ - (((res) & 0x00ff0000) >> 16) | \ - (((res) & 0x0000ff00) >> 0) | \ - (((res) & 0x000000ff) << 16)); \ - } \ - } \ - while (0) +#ifdef USE_SSE2 +pixman_implementation_t * +_pixman_implementation_create_sse2 (void); +#endif + +#ifdef USE_ARM_SIMD +pixman_implementation_t * +_pixman_implementation_create_arm_simd (void); +#endif + +#ifdef USE_ARM_NEON +pixman_implementation_t * +_pixman_implementation_create_arm_neon (void); +#endif + +#ifdef USE_VMX +pixman_implementation_t * +_pixman_implementation_create_vmx (void); +#endif + +pixman_implementation_t * +_pixman_choose_implementation (void); -#define fbComposeGetStart(pict,x,y,type,out_stride,line,mul) do { \ - uint32_t *__bits__; \ - int __stride__; \ - int __bpp__; \ - \ - __bits__ = pict->bits.bits; \ - __stride__ = pict->bits.rowstride; \ - __bpp__ = PIXMAN_FORMAT_BPP(pict->bits.format); \ - (out_stride) = __stride__ * (int) sizeof (uint32_t) / (int) sizeof (type); \ - (line) = ((type *) __bits__) + \ - (out_stride) * (y) + (mul) * (x); \ - } while (0) -#define PIXMAN_FORMAT_16BPC(f) (PIXMAN_FORMAT_A(f) > 8 || \ - PIXMAN_FORMAT_R(f) > 8 || \ - PIXMAN_FORMAT_G(f) > 8 || \ - PIXMAN_FORMAT_B(f) > 8) /* - * Edges + * Utilities + */ + +/* These "formats" both have depth 0, so they + * will never clash with any real ones */ +#define PIXMAN_null PIXMAN_FORMAT (0, 0, 0, 0, 0, 0) +#define PIXMAN_solid PIXMAN_FORMAT (0, 1, 0, 0, 0, 0) -#define MAX_ALPHA(n) ((1 << (n)) - 1) -#define N_Y_FRAC(n) ((n) == 1 ? 1 : (1 << ((n)/2)) - 1) -#define N_X_FRAC(n) ((n) == 1 ? 
1 : (1 << ((n)/2)) + 1) +#define NEED_COMPONENT_ALPHA (1 << 0) +#define NEED_PIXBUF (1 << 1) +#define NEED_SOLID_MASK (1 << 2) -#define STEP_Y_SMALL(n) (pixman_fixed_1 / N_Y_FRAC(n)) -#define STEP_Y_BIG(n) (pixman_fixed_1 - (N_Y_FRAC(n) - 1) * STEP_Y_SMALL(n)) +typedef struct +{ + pixman_op_t op; + pixman_format_code_t src_format; + pixman_format_code_t mask_format; + pixman_format_code_t dest_format; + pixman_composite_func_t func; + uint32_t flags; +} pixman_fast_path_t; -#define Y_FRAC_FIRST(n) (STEP_Y_SMALL(n) / 2) -#define Y_FRAC_LAST(n) (Y_FRAC_FIRST(n) + (N_Y_FRAC(n) - 1) * STEP_Y_SMALL(n)) +/* Memory allocation helpers */ +void * +pixman_malloc_ab (unsigned int n, unsigned int b); -#define STEP_X_SMALL(n) (pixman_fixed_1 / N_X_FRAC(n)) -#define STEP_X_BIG(n) (pixman_fixed_1 - (N_X_FRAC(n) - 1) * STEP_X_SMALL(n)) +void * +pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c); -#define X_FRAC_FIRST(n) (STEP_X_SMALL(n) / 2) -#define X_FRAC_LAST(n) (X_FRAC_FIRST(n) + (N_X_FRAC(n) - 1) * STEP_X_SMALL(n)) +pixman_bool_t +pixman_multiply_overflows_int (unsigned int a, unsigned int b); -#define RenderSamplesX(x,n) ((n) == 1 ? 0 : (pixman_fixed_frac (x) + X_FRAC_FIRST(n)) / STEP_X_SMALL(n)) +pixman_bool_t +pixman_addition_overflows_int (unsigned int a, unsigned int b); -/* - * Step across a small sample grid gap - */ -#define RenderEdgeStepSmall(edge) { \ - edge->x += edge->stepx_small; \ - edge->e += edge->dx_small; \ - if (edge->e > 0) \ - { \ - edge->e -= edge->dy; \ - edge->x += edge->signdx; \ - } \ -} +/* Compositing utilities */ +pixman_bool_t +_pixman_run_fast_path (const pixman_fast_path_t *paths, + pixman_implementation_t * imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height); -/* - * Step across a large sample grid gap - */ -#define RenderEdgeStepBig(edge) { \ - edge->x += edge->stepx_big; \ - edge->e += edge->dx_big; \ - if (edge->e > 0) \ - { \ - edge->e -= edge->dy; \ - edge->x += edge->signdx; \ - } \ -} +void +_pixman_walk_composite_region (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int16_t src_x, + int16_t src_y, + int16_t mask_x, + int16_t mask_y, + int16_t dest_x, + int16_t dest_y, + uint16_t width, + uint16_t height, + pixman_composite_func_t composite_rect); void -pixman_rasterize_edges_accessors (pixman_image_t *image, - pixman_edge_t *l, - pixman_edge_t *r, - pixman_fixed_t t, - pixman_fixed_t b); +pixman_expand (uint64_t * dst, + const uint32_t * src, + pixman_format_code_t format, + int width); -pixman_bool_t -pixman_image_is_opaque(pixman_image_t *image); +void +pixman_contract (uint32_t * dst, + const uint64_t *src, + int width); + +/* Region Helpers */ pixman_bool_t -pixman_image_can_get_solid (pixman_image_t *image); +pixman_region32_copy_from_region16 (pixman_region32_t *dst, + pixman_region16_t *src); pixman_bool_t -pixman_compute_composite_region32 (pixman_region32_t * pRegion, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height); - -/* GCC visibility */ -#if defined(__GNUC__) && __GNUC__ >= 4 -#define PIXMAN_EXPORT __attribute__ ((visibility("default"))) -/* Sun Studio 8 visibility */ -#elif defined(__SUNPRO_C) 
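/* The NEED_* flags defined above qualify a fast-path entry beyond its
 * format triple (roughly): NEED_COMPONENT_ALPHA requires the mask to
 * be in component-alpha mode, NEED_PIXBUF requires source and mask to
 * be the same pixbuf-style image, and NEED_SOLID_MASK requires the
 * mask to be a solid image, as in the mmx_fast_paths table earlier.
 */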
&& (__SUNPRO_C >= 0x550) -#define PIXMAN_EXPORT __global -#else -#define PIXMAN_EXPORT +pixman_region16_copy_from_region32 (pixman_region16_t *dst, + pixman_region32_t *src); + + +/* Misc macros */ + +#ifndef FALSE +# define FALSE 0 #endif -/* Region Helpers */ -pixman_bool_t pixman_region32_copy_from_region16 (pixman_region32_t *dst, - pixman_region16_t *src); -pixman_bool_t pixman_region16_copy_from_region32 (pixman_region16_t *dst, - pixman_region32_t *src); -void pixman_region_internal_set_static_pointers (pixman_box16_t *empty_box, - pixman_region16_data_t *empty_data, - pixman_region16_data_t *broken_data); +#ifndef TRUE +# define TRUE 1 +#endif -#ifdef PIXMAN_TIMING +#ifndef MIN +# define MIN(a, b) ((a < b) ? a : b) +#endif -/* Timing */ -static inline uint64_t -oil_profile_stamp_rdtsc (void) -{ - uint64_t ts; - __asm__ __volatile__("rdtsc\n" : "=A" (ts)); - return ts; -} -#define OIL_STAMP oil_profile_stamp_rdtsc +#ifndef MAX +# define MAX(a, b) ((a > b) ? a : b) +#endif -typedef struct PixmanTimer PixmanTimer; +/* Integer division that rounds towards -infinity */ +#define DIV(a, b) \ + ((((a) < 0) == ((b) < 0)) ? (a) / (b) : \ + ((a) - (b) + 1 - (((b) < 0) << 1)) / (b)) -struct PixmanTimer -{ - int initialized; - const char *name; - uint64_t n_times; - uint64_t total; - PixmanTimer *next; -}; +/* Modulus that produces the remainder wrt. DIV */ +#define MOD(a, b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b)) -extern int timer_defined; -void pixman_timer_register (PixmanTimer *timer); +#define CLIP(v, low, high) ((v) < (low) ? (low) : ((v) > (high) ? (high) : (v))) -#define TIMER_BEGIN(tname) \ - { \ - static PixmanTimer timer##tname; \ - uint64_t begin##tname; \ - \ - if (!timer##tname.initialized) \ - { \ - timer##tname.initialized = 1; \ - timer##tname.name = #tname; \ - pixman_timer_register (&timer##tname); \ - } \ - \ - timer##tname.n_times++; \ - begin##tname = OIL_STAMP(); +/* Conversion between 8888 and 0565 */ -#define TIMER_END(tname) \ - timer##tname.total += OIL_STAMP() - begin##tname; \ - } +#define CONVERT_8888_TO_0565(s) \ + ((((s) >> 3) & 0x001f) | \ + (((s) >> 5) & 0x07e0) | \ + (((s) >> 8) & 0xf800)) -#endif /* PIXMAN_TIMING */ +#define CONVERT_0565_TO_0888(s) \ + (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) | \ + ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) | \ + ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000))) -typedef struct pixman_implementation_t pixman_implementation_t; +#define PIXMAN_FORMAT_IS_WIDE(f) \ + (PIXMAN_FORMAT_A (f) > 8 || \ + PIXMAN_FORMAT_R (f) > 8 || \ + PIXMAN_FORMAT_G (f) > 8 || \ + PIXMAN_FORMAT_B (f) > 8) -typedef void (* pixman_combine_32_func_t) (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width); - -typedef void (* pixman_combine_64_func_t) (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width); - -typedef void (* pixman_composite_func_t) (pixman_implementation_t * imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height); -typedef pixman_bool_t (* pixman_blt_func_t) (pixman_implementation_t * imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height); -typedef 
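/* Worked example for the DIV/MOD macros defined above: C division
 * truncates towards zero, so -7 / 2 == -3 with remainder -1, which is
 * useless for tiling.  DIV(-7, 2) evaluates (-7 - 2 + 1) / 2 == -4
 * and MOD(-7, 2) == (2 - (6 % 2)) - 1 == 1, so DIV(a,b)·b + MOD(a,b)
 * still equals a while the remainder stays in [0, b), which is what
 * NORMAL repeat needs for pixels left of the origin.
 */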
pixman_bool_t (* pixman_fill_func_t) (pixman_implementation_t *imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor); +/* + * Various debugging code + */ -void -_pixman_walk_composite_region (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height, - pixman_bool_t srcRepeat, - pixman_bool_t maskRepeat, - pixman_composite_func_t compositeRect); +#undef DEBUG +#define DEBUG 0 -void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp); -void _pixman_setup_combiner_functions_64 (pixman_implementation_t *imp); +#if DEBUG -/* These "formats" both have depth 0, so they - * will never clash with any real ones +#define return_if_fail(expr) \ + do \ + { \ + if (!(expr)) \ + { \ + fprintf (stderr, "In %s: %s failed\n", FUNC, # expr); \ + return; \ + } \ + } \ + while (0) + +#define return_val_if_fail(expr, retval) \ + do \ + { \ + if (!(expr)) \ + { \ + fprintf (stderr, "In %s: %s failed\n", FUNC, # expr); \ + return (retval); \ + } \ + } \ + while (0) + +#else + +#define return_if_fail(expr) \ + do \ + { \ + if (!(expr)) \ + return; \ + } \ + while (0) + +#define return_val_if_fail(expr, retval) \ + do \ + { \ + if (!(expr)) \ + return (retval); \ + } \ + while (0) + +#endif + +/* + * Timers */ -#define PIXMAN_null PIXMAN_FORMAT(0,0,0,0,0,0) -#define PIXMAN_solid PIXMAN_FORMAT(0,1,0,0,0,0) -#define NEED_COMPONENT_ALPHA (1 << 0) -#define NEED_PIXBUF (1 << 1) -#define NEED_SOLID_MASK (1 << 2) +#ifdef PIXMAN_TIMERS -typedef struct +static inline uint64_t +oil_profile_stamp_rdtsc (void) { - pixman_op_t op; - pixman_format_code_t src_format; - pixman_format_code_t mask_format; - pixman_format_code_t dest_format; - pixman_composite_func_t func; - uint32_t flags; -} FastPathInfo; + uint64_t ts; -struct pixman_implementation_t + __asm__ __volatile__ ("rdtsc\n" : "=A" (ts)); + return ts; +} + +#define OIL_STAMP oil_profile_stamp_rdtsc + +typedef struct pixman_timer_t pixman_timer_t; + +struct pixman_timer_t { - pixman_implementation_t * toplevel; - pixman_implementation_t * delegate; - - pixman_composite_func_t composite; - pixman_blt_func_t blt; - pixman_fill_func_t fill; - - pixman_combine_32_func_t combine_32[PIXMAN_OP_LAST]; - pixman_combine_32_func_t combine_32_ca[PIXMAN_OP_LAST]; - pixman_combine_64_func_t combine_64[PIXMAN_OP_LAST]; - pixman_combine_64_func_t combine_64_ca[PIXMAN_OP_LAST]; + int initialized; + const char * name; + uint64_t n_times; + uint64_t total; + pixman_timer_t *next; }; -pixman_implementation_t * -_pixman_implementation_create (pixman_implementation_t *toplevel, - pixman_implementation_t *delegate); +extern int timer_defined; -void -_pixman_implementation_combine_32 (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width); -void -_pixman_implementation_combine_64 (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width); -void -_pixman_implementation_combine_32_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint32_t * dest, - const uint32_t * src, - const uint32_t * mask, - int width); -void -_pixman_implementation_combine_64_ca (pixman_implementation_t * imp, - pixman_op_t op, - uint64_t * dest, - const uint64_t * src, - const uint64_t * mask, - int width); 
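/* All combine_32 hooks share one contract: composite `width` pixels
 * from src into dest in place.  A plain C reference for the OVER
 * combiner (a sketch of the unified-alpha case; the real combiners
 * also receive the implementation pointer and an optional mask):
 *
 *     static void
 *     combine_over_u (uint32_t *dest, const uint32_t *src, int width)
 *     {
 *         int i;
 *
 *         for (i = 0; i < width; ++i)
 *         {
 *             uint32_t s = src[i];
 *             uint32_t ia = 0xff - (s >> 24);  // inverse source alpha
 *             uint32_t d = dest[i], r = 0;
 *             int shift;
 *
 *             // per channel: dest = src + dest · (1 - alpha_src)
 *             for (shift = 0; shift < 32; shift += 8)
 *             {
 *                 uint32_t t = ((d >> shift) & 0xff) * ia + 0x80;
 *                 uint32_t c = ((s >> shift) & 0xff)
 *                            + ((t + (t >> 8)) >> 8);  // div_255
 *
 *                 r |= (c > 0xff ? 0xff : c) << shift;
 *             }
 *             dest[i] = r;
 *         }
 *     }
 */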
-void -_pixman_implementation_composite (pixman_implementation_t * imp, - pixman_op_t op, - pixman_image_t * src, - pixman_image_t * mask, - pixman_image_t * dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height); +void pixman_timer_register (pixman_timer_t *timer); + +#define TIMER_BEGIN(tname) \ + { \ + static pixman_timer_t timer ## tname; \ + uint64_t begin ## tname; \ + \ + if (!timer ## tname.initialized) \ + { \ + timer ## tname.initialized = 1; \ + timer ## tname.name = # tname; \ + pixman_timer_register (&timer ## tname); \ + } \ + \ + timer ## tname.n_times++; \ + begin ## tname = OIL_STAMP (); -pixman_bool_t -_pixman_implementation_blt (pixman_implementation_t * imp, - uint32_t * src_bits, - uint32_t * dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, - int src_y, - int dst_x, - int dst_y, - int width, - int height); -pixman_bool_t -_pixman_implementation_fill (pixman_implementation_t * imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor); - -/* Specific implementations */ -pixman_implementation_t * -_pixman_implementation_create_general (pixman_implementation_t *toplevel); -pixman_implementation_t * -_pixman_implementation_create_fast_path (pixman_implementation_t *toplevel); -#ifdef USE_MMX -pixman_implementation_t * -_pixman_implementation_create_mmx (pixman_implementation_t *toplevel); -#endif -#ifdef USE_SSE2 -pixman_implementation_t * -_pixman_implementation_create_sse2 (pixman_implementation_t *toplevel); -#endif -#ifdef USE_ARM_SIMD -pixman_implementation_t * -_pixman_implementation_create_arm_simd (pixman_implementation_t *toplevel); -#endif -#ifdef USE_ARM_NEON -pixman_implementation_t * -_pixman_implementation_create_arm_neon (pixman_implementation_t *toplevel); -#endif -#ifdef USE_VMX -pixman_implementation_t * -_pixman_implementation_create_vmx (pixman_implementation_t *toplevel); -#endif +#define TIMER_END(tname) \ + timer ## tname.total += OIL_STAMP () - begin ## tname; \ + } -pixman_bool_t -_pixman_run_fast_path (const FastPathInfo *paths, - pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *src, - pixman_image_t *mask, - pixman_image_t *dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height); - -pixman_implementation_t * -_pixman_choose_implementation (void); +#endif /* PIXMAN_TIMERS */ #endif /* PIXMAN_PRIVATE_H */ diff --git a/lib/pixman/pixman/pixman-radial-gradient.c b/lib/pixman/pixman/pixman-radial-gradient.c index 4a4543001..022157b9b 100644 --- a/lib/pixman/pixman/pixman-radial-gradient.c +++ b/lib/pixman/pixman/pixman-radial-gradient.c @@ -26,14 +26,21 @@ * SOFTWARE. 
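With the new PIXMAN_TIMERS configure knob enabled, the TIMER_BEGIN/TIMER_END pair above brackets a code section, counts invocations, and accumulates rdtsc cycle stamps into a static pixman_timer_t that registers itself on first use. A portable approximation of the same pattern, assuming nothing from the patch beyond its structure; clock () replaces the x86-only rdtsc stamp and the registry is a single global list:

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    typedef struct sketch_timer sketch_timer;
    struct sketch_timer
    {
        int           initialized;
        const char   *name;
        uint64_t      n_times;
        uint64_t      total;
        sketch_timer *next;
    };

    static sketch_timer *all_timers;

    static void
    timer_register (sketch_timer *timer)
    {
        timer->next = all_timers;
        all_timers = timer;
    }

    /* First use initializes and registers the static timer, exactly
     * like the pixman macros; later uses only count and stamp. */
    #define TIMER_BEGIN(tname)                       \
        {                                            \
            static sketch_timer timer ## tname;      \
            uint64_t begin ## tname;                 \
                                                     \
            if (!timer ## tname.initialized)         \
            {                                        \
                timer ## tname.initialized = 1;      \
                timer ## tname.name = # tname;       \
                timer_register (&timer ## tname);    \
            }                                        \
            timer ## tname.n_times++;                \
            begin ## tname = (uint64_t) clock ();

    #define TIMER_END(tname)                                              \
            timer ## tname.total += (uint64_t) clock () - begin ## tname; \
        }

    int main (void)
    {
        volatile long sink = 0;
        sketch_timer *t;
        int i;
        long j;

        for (i = 0; i < 100; i++)
        {
            TIMER_BEGIN (busy_loop)
            for (j = 0; j < 100000; j++)
                sink += j;
            TIMER_END (busy_loop)
        }

        for (t = all_timers; t; t = t->next)
        {
            printf ("%s: %llu calls, %llu ticks\n", t->name,
                    (unsigned long long) t->n_times,
                    (unsigned long long) t->total);
        }
        return 0;
    }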
*/ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif #include <stdlib.h> #include <math.h> #include "pixman-private.h" static void -radial_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits) +radial_gradient_get_scanline_32 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { /* * In the radial gradient problem we are given two circles (c₁,r₁) and @@ -153,8 +160,8 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, gradient_t *gradient = (gradient_t *)image; source_image_t *source = (source_image_t *)image; radial_gradient_t *radial = (radial_gradient_t *)image; - uint32_t *end = buffer + width; - GradientWalker walker; + uint32_t *end = buffer + width; + pixman_gradient_walker_t walker; pixman_bool_t affine = TRUE; double cx = 1.; double cy = 0.; @@ -162,67 +169,101 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, double rx = x + 0.5; double ry = y + 0.5; double rz = 1.; - + _pixman_gradient_walker_init (&walker, gradient, source->common.repeat); - - if (source->common.transform) { + + if (source->common.transform) + { pixman_vector_t v; /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed(x) + pixman_fixed_1/2; - v.vector[1] = pixman_int_to_fixed(y) + pixman_fixed_1/2; + v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2; + v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2; v.vector[2] = pixman_fixed_1; + if (!pixman_transform_point_3d (source->common.transform, &v)) return; + + cx = source->common.transform->matrix[0][0] / 65536.; + cy = source->common.transform->matrix[1][0] / 65536.; + cz = source->common.transform->matrix[2][0] / 65536.; - cx = source->common.transform->matrix[0][0]/65536.; - cy = source->common.transform->matrix[1][0]/65536.; - cz = source->common.transform->matrix[2][0]/65536.; - rx = v.vector[0]/65536.; - ry = v.vector[1]/65536.; - rz = v.vector[2]/65536.; - affine = source->common.transform->matrix[2][0] == 0 && v.vector[2] == pixman_fixed_1; + rx = v.vector[0] / 65536.; + ry = v.vector[1] / 65536.; + rz = v.vector[2] / 65536.; + + affine = + source->common.transform->matrix[2][0] == 0 && + v.vector[2] == pixman_fixed_1; } - - if (affine) { - while (buffer < end) { - if (!mask || *mask++ & maskBits) + + if (affine) + { + /* When computing t over a scanline, we notice that some expressions + * are constant so we can compute them just once. Given: + * + * t = (-2·B ± ⎷(B² - 4·A·C)) / 2·A + * + * where + * + * A = cdx² + cdy² - dr² [precomputed as radial->A] + * B = -2·(pdx·cdx + pdy·cdy + r₁·dr) + * C = pdx² + pdy² - r₁² + * + * Since we have an affine transformation, we know that (pdx, pdy) + * increase linearly with each pixel, + * + * pdx = pdx₀ + n·cx, + * pdy = pdy₀ + n·cy, + * + * we can then express B in terms of an linear increment along + * the scanline: + * + * B = B₀ + n·cB, with + * B₀ = -2·(pdx₀·cdx + pdy₀·cdy + r₁·dr) and + * cB = -2·(cx·cdx + cy·cdy) + * + * Thus we can replace the full evaluation of B per-pixel (4 multiplies, + * 2 additions) with a single addition. + */ + double r1 = radial->c1.radius / 65536.; + double r1sq = r1 * r1; + double pdx = rx - radial->c1.x / 65536.; + double pdy = ry - radial->c1.y / 65536.; + double A = radial->A; + double invA = -65536. / (2. * A); + double A4 = -4. * A; + double B = -2. * (pdx*radial->cdx + pdy*radial->cdy + r1*radial->dr); + double cB = -2. 
* (cx*radial->cdx + cy*radial->cdy); + pixman_bool_t invert = A * radial->dr < 0; + + while (buffer < end) + { + if (!mask || *mask++ & mask_bits) { - double pdx, pdy; - double B, C; - double det; - double c1x = radial->c1.x / 65536.0; - double c1y = radial->c1.y / 65536.0; - double r1 = radial->c1.radius / 65536.0; pixman_fixed_48_16_t t; - - pdx = rx - c1x; - pdy = ry - c1y; - - B = -2 * ( pdx * radial->cdx - + pdy * radial->cdy - + r1 * radial->dr); - C = (pdx * pdx + pdy * pdy - r1 * r1); - - det = (B * B) - (4 * radial->A * C); - if (det < 0.0) - det = 0.0; - - if (radial->A < 0) - t = (pixman_fixed_48_16_t) ((- B - sqrt(det)) / (2.0 * radial->A) * 65536); + double det = B * B + A4 * (pdx * pdx + pdy * pdy - r1sq); + if (det <= 0.) + t = (pixman_fixed_48_16_t) (B * invA); + else if (invert) + t = (pixman_fixed_48_16_t) ((B + sqrt (det)) * invA); else - t = (pixman_fixed_48_16_t) ((- B + sqrt(det)) / (2.0 * radial->A) * 65536); - - *(buffer) = _pixman_gradient_walker_pixel (&walker, t); + t = (pixman_fixed_48_16_t) ((B - sqrt (det)) * invA); + + *buffer = _pixman_gradient_walker_pixel (&walker, t); } ++buffer; - - rx += cx; - ry += cy; + + pdx += cx; + pdy += cy; + B += cB; } - } else { + } + else + { /* projective */ - while (buffer < end) { - if (!mask || *mask++ & maskBits) + while (buffer < end) + { + if (!mask || *mask++ & mask_bits) { double pdx, pdy; double B, C; @@ -232,78 +273,81 @@ radial_gradient_get_scanline_32 (pixman_image_t *image, int x, int y, int width, double r1 = radial->c1.radius / 65536.0; pixman_fixed_48_16_t t; double x, y; - - if (rz != 0) { - x = rx/rz; - y = ry/rz; - } else { + + if (rz != 0) + { + x = rx / rz; + y = ry / rz; + } + else + { x = y = 0.; } - + pdx = x - c1x; pdy = y - c1y; - - B = -2 * ( pdx * radial->cdx - + pdy * radial->cdy - + r1 * radial->dr); + + B = -2 * (pdx * radial->cdx + + pdy * radial->cdy + + r1 * radial->dr); C = (pdx * pdx + pdy * pdy - r1 * r1); - + det = (B * B) - (4 * radial->A * C); if (det < 0.0) det = 0.0; - - if (radial->A < 0) - t = (pixman_fixed_48_16_t) ((- B - sqrt(det)) / (2.0 * radial->A) * 65536); + + if (radial->A * radial->dr < 0) + t = (pixman_fixed_48_16_t) ((-B - sqrt (det)) / (2.0 * radial->A) * 65536); else - t = (pixman_fixed_48_16_t) ((- B + sqrt(det)) / (2.0 * radial->A) * 65536); - - *(buffer) = _pixman_gradient_walker_pixel (&walker, t); + t = (pixman_fixed_48_16_t) ((-B + sqrt (det)) / (2.0 * radial->A) * 65536); + + *buffer = _pixman_gradient_walker_pixel (&walker, t); } - ++buffer; + ++buffer; + rx += cx; ry += cy; rz += cz; } } - } static void radial_gradient_property_changed (pixman_image_t *image) { - image->common.get_scanline_32 = (scanFetchProc)radial_gradient_get_scanline_32; - image->common.get_scanline_64 = (scanFetchProc)_pixman_image_get_scanline_64_generic; + image->common.get_scanline_32 = radial_gradient_get_scanline_32; + image->common.get_scanline_64 = _pixman_image_get_scanline_generic_64; } PIXMAN_EXPORT pixman_image_t * -pixman_image_create_radial_gradient (pixman_point_fixed_t *inner, - pixman_point_fixed_t *outer, - pixman_fixed_t inner_radius, - pixman_fixed_t outer_radius, - const pixman_gradient_stop_t *stops, - int n_stops) +pixman_image_create_radial_gradient (pixman_point_fixed_t * inner, + pixman_point_fixed_t * outer, + pixman_fixed_t inner_radius, + pixman_fixed_t outer_radius, + const pixman_gradient_stop_t *stops, + int n_stops) { pixman_image_t *image; radial_gradient_t *radial; - + return_val_if_fail (n_stops >= 2, NULL); - - image = _pixman_image_allocate(); - + + 
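The derivation in the comment above is the point of this rewrite of the affine path: since (pdx, pdy) advance linearly along the scanline, B = -2·(pdx·cdx + pdy·cdy + r₁·dr) can be carried forward with a single addition per pixel instead of four multiplies and two additions. A standalone check of that identity; the circle geometry and per-pixel increments below are made up for illustration and do not come from the patch:

    #include <stdio.h>

    int main (void)
    {
        /* hypothetical gradient geometry and per-pixel step */
        double cdx = 3.0, cdy = 1.0, dr = 2.0, r1 = 1.5;
        double cx  = 0.25, cy = 0.0;
        double pdx = -4.0, pdy = 2.0;   /* first pixel of the scanline */

        double B  = -2.0 * (pdx * cdx + pdy * cdy + r1 * dr);
        double cB = -2.0 * (cx * cdx + cy * cdy);
        int    n;

        for (n = 0; n < 4; n++)
        {
            double x = pdx + n * cx;
            double y = pdy + n * cy;
            double direct = -2.0 * (x * cdx + y * cdy + r1 * dr);

            /* incremental and per-pixel evaluation must agree */
            printf ("n=%d  incremental=%g  direct=%g\n", n, B, direct);

            B += cB;  /* the single addition replacing the re-evaluation */
        }
        return 0;
    }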
image = _pixman_image_allocate (); + if (!image) return NULL; - + radial = &image->radial; - + if (!_pixman_init_gradient (&radial->common, stops, n_stops)) { free (image); return NULL; } - + image->type = RADIAL; - + radial->c1.x = inner->x; radial->c1.y = inner->y; radial->c1.radius = inner_radius; @@ -313,14 +357,12 @@ pixman_image_create_radial_gradient (pixman_point_fixed_t *inner, radial->cdx = pixman_fixed_to_double (radial->c2.x - radial->c1.x); radial->cdy = pixman_fixed_to_double (radial->c2.y - radial->c1.y); radial->dr = pixman_fixed_to_double (radial->c2.radius - radial->c1.radius); - radial->A = (radial->cdx * radial->cdx - + radial->cdy * radial->cdy - - radial->dr * radial->dr); - + radial->A = (radial->cdx * radial->cdx + + radial->cdy * radial->cdy - + radial->dr * radial->dr); + image->common.property_changed = radial_gradient_property_changed; - - radial_gradient_property_changed (image); - + return image; } diff --git a/lib/pixman/pixman/pixman-region.c b/lib/pixman/pixman/pixman-region.c index dec2c9de3..8ce5deb77 100644 --- a/lib/pixman/pixman/pixman-region.c +++ b/lib/pixman/pixman/pixman-region.c @@ -1,112 +1,156 @@ -/*********************************************************** - -Copyright 1987, 1988, 1989, 1998 The Open Group - -Permission to use, copy, modify, distribute, and sell this software and its -documentation for any purpose is hereby granted without fee, provided that -the above copyright notice appear in all copies and that both that -copyright notice and this permission notice appear in supporting -documentation. - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN -AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -Except as contained in this notice, the name of The Open Group shall not be -used in advertising or otherwise to promote the sale, use or other dealings -in this Software without prior written authorization from The Open Group. - -Copyright 1987, 1988, 1989 by -Digital Equipment Corporation, Maynard, Massachusetts. - - All Rights Reserved - -Permission to use, copy, modify, and distribute this software and its -documentation for any purpose and without fee is hereby granted, -provided that the above copyright notice appear in all copies and that -both that copyright notice and this permission notice appear in -supporting documentation, and that the name of Digital not be -used in advertising or publicity pertaining to distribution of the -software without specific, written prior permission. - -DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING -ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL -DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR -ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, -ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS -SOFTWARE. 
- -******************************************************************/ +/* + * Copyright 1987, 1988, 1989, 1998 The Open Group + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation. + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Except as contained in this notice, the name of The Open Group shall not be + * used in advertising or otherwise to promote the sale, use or other dealings + * in this Software without prior written authorization from The Open Group. + * + * Copyright 1987, 1988, 1989 by + * Digital Equipment Corporation, Maynard, Massachusetts. + * + * All Rights Reserved + * + * Permission to use, copy, modify, and distribute this software and its + * documentation for any purpose and without fee is hereby granted, + * provided that the above copyright notice appear in all copies and that + * both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of Digital not be + * used in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. + * + * DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING + * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL + * DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, + * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + * + */ #include <stdlib.h> #include <limits.h> #include <string.h> #include <stdio.h> +#include "pixman-private.h" #define PIXREGION_NIL(reg) ((reg)->data && !(reg)->data->numRects) /* not a region */ -#define PIXREGION_NAR(reg) ((reg)->data == pixman_brokendata) -#define PIXREGION_NUM_RECTS(reg) ((reg)->data ? (reg)->data->numRects : 1) +#define PIXREGION_NAR(reg) ((reg)->data == pixman_broken_data) +#define PIXREGION_NUMRECTS(reg) ((reg)->data ? (reg)->data->numRects : 1) #define PIXREGION_SIZE(reg) ((reg)->data ? (reg)->data->size : 0) -#define PIXREGION_RECTS(reg) ((reg)->data ? (box_type_t *)((reg)->data + 1) \ - : &(reg)->extents) +#define PIXREGION_RECTS(reg) \ + ((reg)->data ? 
(box_type_t *)((reg)->data + 1) \ + : &(reg)->extents) #define PIXREGION_BOXPTR(reg) ((box_type_t *)((reg)->data + 1)) -#define PIXREGION_BOX(reg,i) (&PIXREGION_BOXPTR(reg)[i]) -#define PIXREGION_TOP(reg) PIXREGION_BOX(reg, (reg)->data->numRects) -#define PIXREGION_END(reg) PIXREGION_BOX(reg, (reg)->data->numRects - 1) +#define PIXREGION_BOX(reg, i) (&PIXREGION_BOXPTR (reg)[i]) +#define PIXREGION_TOP(reg) PIXREGION_BOX (reg, (reg)->data->numRects) +#define PIXREGION_END(reg) PIXREGION_BOX (reg, (reg)->data->numRects - 1) +#define GOOD_RECT(rect) ((rect)->x1 < (rect)->x2 && (rect)->y1 < (rect)->y2) +#define BAD_RECT(rect) ((rect)->x1 > (rect)->x2 || (rect)->y1 > (rect)->y2) -#undef assert -#ifdef DEBUG_PIXREGION -#define assert(expr) {if (!(expr)) \ - FatalError("Assertion failed file %s, line %d: expr\n", \ - __FILE__, __LINE__); } +/* Turn on debugging depending on what type of release this is + */ + +#if ((PIXMAN_VERSION_MICRO % 2) == 1) +/* Random git checkout. + * + * Those are often used for performance work, so we don't turn on the + * full self-checking, but we do turn on the asserts. + */ +# define FATAL_BUGS +# define noSELF_CHECKS +#elif ((PIXMAN_VERSION_MINOR % 2) == 0) +/* Stable release. + * + * We don't want assertions because the X server should stay alive + * if possible. We also don't want self-checks for performance-reasons. + */ +# define noFATAL_BUGS +# define noSELF_CHECKS #else -#define assert(expr) +/* Development snapshot. + * + * These are the things that get shipped in development distributions + * such as Rawhide. We want both self-checking and fatal assertions + * to catch as many bugs as possible. + */ +# define FATAL_BUGS +# define SELF_CHECKS #endif -#define good(reg) assert(PREFIX(_selfcheck) (reg)) +#ifndef FATAL_BUGS +# undef assert +# undef abort +# define assert(expr) +# define abort() +#endif -#undef MIN -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#undef MAX -#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#ifdef SELF_CHECKS -static const box_type_t PREFIX(_emptyBox_) = {0, 0, 0, 0}; -static const region_data_type_t PREFIX(_emptyData_) = {0, 0}; -static const region_data_type_t PREFIX(_brokendata_) = {0, 0}; +static void +log_region_error (const char *function, const char *message) +{ + static int n_messages = 0; -static box_type_t *pixman_region_emptyBox = (box_type_t *)&PREFIX(_emptyBox_); -static region_data_type_t *pixman_region_emptyData = (region_data_type_t *)&PREFIX(_emptyData_); -static region_data_type_t *pixman_brokendata = (region_data_type_t *)&PREFIX(_brokendata_); + if (n_messages < 50) + { + fprintf (stderr, + "*** BUG ***\n" + "%s: %s\n" + "Set a breakpoint on 'log_region_error' to debug\n\n", + function, message); -/* This function exists only to make it possible to preserve the X ABI - it should - * go away at first opportunity. - * - * The problem is that the X ABI exports the three structs and has used - * them through macros. So the X server calls this function with - * the addresses of those structs which makes the existing code continue to - * work. 
- */ -void -PREFIX(_internal_set_static_pointers) (box_type_t *empty_box, - region_data_type_t *empty_data, - region_data_type_t *broken_data) -{ - pixman_region_emptyBox = empty_box; - pixman_region_emptyData = empty_data; - pixman_brokendata = broken_data; + abort (); /* This is #defined away unless FATAL_BUGS is defined */ + + n_messages++; + } } +#define GOOD(reg) \ + do \ + { \ + if (!PREFIX (_selfcheck (reg))) \ + log_region_error (FUNC, "Malformed region " # reg); \ + } while (0) + +#else + +#define log_region_error(function, name) +#define GOOD(reg) + +#endif + +static const box_type_t PREFIX (_empty_box_) = { 0, 0, 0, 0 }; +static const region_data_type_t PREFIX (_empty_data_) = { 0, 0 }; +static const region_data_type_t PREFIX (_broken_data_) = { 0, 0 }; + +static box_type_t *pixman_region_empty_box = + (box_type_t *)&PREFIX (_empty_box_); +static region_data_type_t *pixman_region_empty_data = + (region_data_type_t *)&PREFIX (_empty_data_); +static region_data_type_t *pixman_broken_data = + (region_data_type_t *)&PREFIX (_broken_data_); + static pixman_bool_t -pixman_break (region_type_t *pReg); +pixman_break (region_type_t *region); /* * The functions in this file implement the Region abstraction used extensively @@ -149,197 +193,275 @@ pixman_break (region_type_t *pReg); * * Adam de Boor wrote most of the original region code. Joel McCormack * substantially modified or rewrote most of the core arithmetic routines, and - * added pixman_region_validate in order to support several speed improvements to - * pixman_region_validateTree. Bob Scheifler changed the representation to be more - * compact when empty or a single rectangle, and did a bunch of gratuitous - * reformatting. Carl Worth did further gratuitous reformatting while re-merging - * the server and client region code into libpixregion. + * added pixman_region_validate in order to support several speed improvements + * to pixman_region_validate_tree. Bob Scheifler changed the representation + * to be more compact when empty or a single rectangle, and did a bunch of + * gratuitous reformatting. Carl Worth did further gratuitous reformatting + * while re-merging the server and client region code into libpixregion. + * Soren Sandmann did even more gratuitous reformatting. 
*/ /* true iff two Boxes overlap */ -#define EXTENTCHECK(r1,r2) \ - (!( ((r1)->x2 <= (r2)->x1) || \ - ((r1)->x1 >= (r2)->x2) || \ - ((r1)->y2 <= (r2)->y1) || \ - ((r1)->y1 >= (r2)->y2) ) ) +#define EXTENTCHECK(r1, r2) \ + (!( ((r1)->x2 <= (r2)->x1) || \ + ((r1)->x1 >= (r2)->x2) || \ + ((r1)->y2 <= (r2)->y1) || \ + ((r1)->y1 >= (r2)->y2) ) ) /* true iff (x,y) is in Box */ -#define INBOX(r,x,y) \ - ( ((r)->x2 > x) && \ - ((r)->x1 <= x) && \ - ((r)->y2 > y) && \ - ((r)->y1 <= y) ) +#define INBOX(r, x, y) \ + ( ((r)->x2 > x) && \ + ((r)->x1 <= x) && \ + ((r)->y2 > y) && \ + ((r)->y1 <= y) ) /* true iff Box r1 contains Box r2 */ -#define SUBSUMES(r1,r2) \ - ( ((r1)->x1 <= (r2)->x1) && \ - ((r1)->x2 >= (r2)->x2) && \ - ((r1)->y1 <= (r2)->y1) && \ - ((r1)->y2 >= (r2)->y2) ) +#define SUBSUMES(r1, r2) \ + ( ((r1)->x1 <= (r2)->x1) && \ + ((r1)->x2 >= (r2)->x2) && \ + ((r1)->y1 <= (r2)->y1) && \ + ((r1)->y2 >= (r2)->y2) ) static size_t -PIXREGION_SZOF(size_t n) +PIXREGION_SZOF (size_t n) { size_t size = n * sizeof(box_type_t); + if (n > UINT32_MAX / sizeof(box_type_t)) - return 0; + return 0; if (sizeof(region_data_type_t) > UINT32_MAX - size) - return 0; + return 0; return size + sizeof(region_data_type_t); } static void * -allocData(size_t n) +alloc_data (size_t n) { - size_t sz = PIXREGION_SZOF(n); + size_t sz = PIXREGION_SZOF (n); + if (!sz) return NULL; - return malloc(sz); + return malloc (sz); } -#define freeData(reg) if ((reg)->data && (reg)->data->size) free((reg)->data) +#define FREE_DATA(reg) if ((reg)->data && (reg)->data->size) free ((reg)->data) -#define RECTALLOC_BAIL(pReg,n,bail) \ -if (!(pReg)->data || (((pReg)->data->numRects + (n)) > (pReg)->data->size)) \ - if (!pixman_rect_alloc(pReg, n)) { goto bail; } - -#define RECTALLOC(pReg,n) \ -if (!(pReg)->data || (((pReg)->data->numRects + (n)) > (pReg)->data->size)) \ - if (!pixman_rect_alloc(pReg, n)) { return FALSE; } - -#define ADDRECT(pNextRect,nx1,ny1,nx2,ny2) \ -{ \ - pNextRect->x1 = nx1; \ - pNextRect->y1 = ny1; \ - pNextRect->x2 = nx2; \ - pNextRect->y2 = ny2; \ - pNextRect++; \ -} +#define RECTALLOC_BAIL(region, n, bail) \ + do \ + { \ + if (!(region)->data || \ + (((region)->data->numRects + (n)) > (region)->data->size)) \ + { \ + if (!pixman_rect_alloc (region, n)) \ + goto bail; \ + } \ + } while (0) -#define NEWRECT(pReg,pNextRect,nx1,ny1,nx2,ny2) \ -{ \ - if (!(pReg)->data || ((pReg)->data->numRects == (pReg)->data->size))\ +#define RECTALLOC(region, n) \ + do \ { \ - if (!pixman_rect_alloc(pReg, 1)) \ - return FALSE; \ - pNextRect = PIXREGION_TOP(pReg); \ - } \ - ADDRECT(pNextRect,nx1,ny1,nx2,ny2); \ - pReg->data->numRects++; \ - assert(pReg->data->numRects<=pReg->data->size); \ -} + if (!(region)->data || \ + (((region)->data->numRects + (n)) > (region)->data->size)) \ + { \ + if (!pixman_rect_alloc (region, n)) { \ + return FALSE; \ + } \ + } \ + } while (0) + +#define ADDRECT(next_rect, nx1, ny1, nx2, ny2) \ + do \ + { \ + next_rect->x1 = nx1; \ + next_rect->y1 = ny1; \ + next_rect->x2 = nx2; \ + next_rect->y2 = ny2; \ + next_rect++; \ + } \ + while (0) + +#define NEWRECT(region, next_rect, nx1, ny1, nx2, ny2) \ + do \ + { \ + if (!(region)->data || \ + ((region)->data->numRects == (region)->data->size)) \ + { \ + if (!pixman_rect_alloc (region, 1)) \ + return FALSE; \ + next_rect = PIXREGION_TOP (region); \ + } \ + ADDRECT (next_rect, nx1, ny1, nx2, ny2); \ + region->data->numRects++; \ + assert (region->data->numRects <= region->data->size); \ + } while (0) -#define DOWNSIZE(reg,numRects) \ - if (((numRects) < 
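PIXREGION_SZOF above refuses to produce a wrapped-around byte count: it returns 0 whenever n boxes plus the region header would exceed UINT32_MAX, and alloc_data maps that 0 to an allocation failure. The same pattern in a compressed, self-contained form, with the box and header types reduced to stand-in structs:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct { short x1, y1, x2, y2; } box_t;    /* stand-in box    */
    typedef struct { long size, num_rects; } header_t; /* stand-in header */

    /* Returns 0 on overflow; callers treat 0 as "cannot allocate" */
    static size_t
    region_size_of (size_t n)
    {
        size_t size = n * sizeof (box_t);

        if (n > UINT32_MAX / sizeof (box_t))
            return 0;
        if (sizeof (header_t) > UINT32_MAX - size)
            return 0;

        return size + sizeof (header_t);
    }

    static void *
    alloc_region_data (size_t n)
    {
        size_t sz = region_size_of (n);

        return sz ? malloc (sz) : NULL;   /* overflow becomes NULL */
    }

    int main (void)
    {
        void *ok  = alloc_region_data (16);
        void *bad = alloc_region_data ((size_t) UINT32_MAX);

        printf ("16 boxes: %s, 2^32-1 boxes: %s\n",
                ok ? "allocated" : "NULL", bad ? "allocated" : "NULL");
        free (ok);
        return 0;
    }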
((reg)->data->size >> 1)) && ((reg)->data->size > 50)) \ +#define DOWNSIZE(reg, numRects) \ + do \ { \ - region_data_type_t * NewData; \ - size_t data_size = PIXREGION_SZOF(numRects); \ - if (!data_size) \ - NewData = NULL; \ - else \ - NewData = (region_data_type_t *)realloc((reg)->data, data_size); \ - if (NewData) \ + if (((numRects) < ((reg)->data->size >> 1)) && \ + ((reg)->data->size > 50)) \ { \ - NewData->size = (numRects); \ - (reg)->data = NewData; \ + region_data_type_t * new_data; \ + size_t data_size = PIXREGION_SZOF (numRects); \ + \ + if (!data_size) \ + { \ + new_data = NULL; \ + } \ + else \ + { \ + new_data = (region_data_type_t *) \ + realloc ((reg)->data, data_size); \ + } \ + \ + if (new_data) \ + { \ + new_data->size = (numRects); \ + (reg)->data = new_data; \ + } \ } \ - } + } while (0) PIXMAN_EXPORT pixman_bool_t -PREFIX(_equal) (reg1, reg2) - region_type_t * reg1; - region_type_t * reg2; +PREFIX (_equal) (region_type_t *reg1, region_type_t *reg2) { int i; box_type_t *rects1; box_type_t *rects2; - if (reg1->extents.x1 != reg2->extents.x1) return FALSE; - if (reg1->extents.x2 != reg2->extents.x2) return FALSE; - if (reg1->extents.y1 != reg2->extents.y1) return FALSE; - if (reg1->extents.y2 != reg2->extents.y2) return FALSE; - if (PIXREGION_NUM_RECTS(reg1) != PIXREGION_NUM_RECTS(reg2)) return FALSE; + if (reg1->extents.x1 != reg2->extents.x1) + return FALSE; + + if (reg1->extents.x2 != reg2->extents.x2) + return FALSE; + + if (reg1->extents.y1 != reg2->extents.y1) + return FALSE; + + if (reg1->extents.y2 != reg2->extents.y2) + return FALSE; + + if (PIXREGION_NUMRECTS (reg1) != PIXREGION_NUMRECTS (reg2)) + return FALSE; - rects1 = PIXREGION_RECTS(reg1); - rects2 = PIXREGION_RECTS(reg2); - for (i = 0; i != PIXREGION_NUM_RECTS(reg1); i++) { - if (rects1[i].x1 != rects2[i].x1) return FALSE; - if (rects1[i].x2 != rects2[i].x2) return FALSE; - if (rects1[i].y1 != rects2[i].y1) return FALSE; - if (rects1[i].y2 != rects2[i].y2) return FALSE; + rects1 = PIXREGION_RECTS (reg1); + rects2 = PIXREGION_RECTS (reg2); + + for (i = 0; i != PIXREGION_NUMRECTS (reg1); i++) + { + if (rects1[i].x1 != rects2[i].x1) + return FALSE; + + if (rects1[i].x2 != rects2[i].x2) + return FALSE; + + if (rects1[i].y1 != rects2[i].y1) + return FALSE; + + if (rects1[i].y2 != rects2[i].y2) + return FALSE; } + return TRUE; } int -PREFIX(_print) (rgn) - region_type_t * rgn; +PREFIX (_print) (region_type_t *rgn) { int num, size; int i; box_type_t * rects; - num = PIXREGION_NUM_RECTS(rgn); - size = PIXREGION_SIZE(rgn); - rects = PIXREGION_RECTS(rgn); - fprintf(stderr, "num: %d size: %d\n", num, size); - fprintf(stderr, "extents: %d %d %d %d\n", - rgn->extents.x1, rgn->extents.y1, rgn->extents.x2, rgn->extents.y2); + num = PIXREGION_NUMRECTS (rgn); + size = PIXREGION_SIZE (rgn); + rects = PIXREGION_RECTS (rgn); + + fprintf (stderr, "num: %d size: %d\n", num, size); + fprintf (stderr, "extents: %d %d %d %d\n", + rgn->extents.x1, + rgn->extents.y1, + rgn->extents.x2, + rgn->extents.y2); + for (i = 0; i < num; i++) - fprintf(stderr, "%d %d %d %d \n", - rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2); - fprintf(stderr, "\n"); + { + fprintf (stderr, "%d %d %d %d \n", + rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2); + } + + fprintf (stderr, "\n"); + return(num); } PIXMAN_EXPORT void -PREFIX(_init) (region_type_t *region) +PREFIX (_init) (region_type_t *region) { - region->extents = *pixman_region_emptyBox; - region->data = pixman_region_emptyData; + region->extents = *pixman_region_empty_box; + 
region->data = pixman_region_empty_data; } PIXMAN_EXPORT void -PREFIX(_init_rect) (region_type_t *region, - int x, int y, unsigned int width, unsigned int height) +PREFIX (_init_rect) (region_type_t * region, + int x, + int y, + unsigned int width, + unsigned int height) { region->extents.x1 = x; region->extents.y1 = y; region->extents.x2 = x + width; region->extents.y2 = y + height; + + if (!GOOD_RECT (®ion->extents)) + { + if (BAD_RECT (®ion->extents)) + log_region_error (FUNC, "Invalid rectangle passed"); + PREFIX (_init) (region); + return; + } + region->data = NULL; } PIXMAN_EXPORT void -PREFIX(_init_with_extents) (region_type_t *region, box_type_t *extents) +PREFIX (_init_with_extents) (region_type_t *region, box_type_t *extents) { + if (!GOOD_RECT (extents)) + { + if (BAD_RECT (extents)) + log_region_error (FUNC, "Invalid rectangle passed"); + PREFIX (_init) (region); + return; + } region->extents = *extents; + region->data = NULL; } PIXMAN_EXPORT void -PREFIX(_fini) (region_type_t *region) +PREFIX (_fini) (region_type_t *region) { - good (region); - freeData (region); + GOOD (region); + FREE_DATA (region); } PIXMAN_EXPORT int -PREFIX(_n_rects) (region_type_t *region) +PREFIX (_n_rects) (region_type_t *region) { - return PIXREGION_NUM_RECTS (region); + return PIXREGION_NUMRECTS (region); } PIXMAN_EXPORT box_type_t * -PREFIX(_rectangles) (region_type_t *region, - int *n_rects) +PREFIX (_rectangles) (region_type_t *region, + int *n_rects) { if (n_rects) - *n_rects = PIXREGION_NUM_RECTS (region); + *n_rects = PIXREGION_NUMRECTS (region); return PIXREGION_RECTS (region); } @@ -347,81 +469,110 @@ PREFIX(_rectangles) (region_type_t *region, static pixman_bool_t pixman_break (region_type_t *region) { - freeData (region); - region->extents = *pixman_region_emptyBox; - region->data = pixman_brokendata; + FREE_DATA (region); + + region->extents = *pixman_region_empty_box; + region->data = pixman_broken_data; + return FALSE; } static pixman_bool_t -pixman_rect_alloc (region_type_t * region, int n) +pixman_rect_alloc (region_type_t * region, + int n) { region_data_type_t *data; if (!region->data) { n++; - region->data = allocData(n); + region->data = alloc_data (n); + if (!region->data) return pixman_break (region); + region->data->numRects = 1; - *PIXREGION_BOXPTR(region) = region->extents; + *PIXREGION_BOXPTR (region) = region->extents; } else if (!region->data->size) { - region->data = allocData(n); + region->data = alloc_data (n); + if (!region->data) return pixman_break (region); + region->data->numRects = 0; } else { size_t data_size; + if (n == 1) { n = region->data->numRects; if (n > 500) /* XXX pick numbers out of a hat */ n = 250; } + n += region->data->numRects; - data_size = PIXREGION_SZOF(n); + data_size = PIXREGION_SZOF (n); + if (!data_size) + { data = NULL; + } else - data = (region_data_type_t *)realloc(region->data, PIXREGION_SZOF(n)); + { + data = (region_data_type_t *) + realloc (region->data, PIXREGION_SZOF (n)); + } + if (!data) return pixman_break (region); + region->data = data; } + region->data->size = n; + return TRUE; } PIXMAN_EXPORT pixman_bool_t -PREFIX(_copy) (region_type_t *dst, region_type_t *src) +PREFIX (_copy) (region_type_t *dst, region_type_t *src) { - good(dst); - good(src); + GOOD (dst); + GOOD (src); + if (dst == src) return TRUE; + dst->extents = src->extents; + if (!src->data || !src->data->size) { - freeData(dst); + FREE_DATA (dst); dst->data = src->data; return TRUE; } + if (!dst->data || (dst->data->size < src->data->numRects)) { - freeData(dst); - 
dst->data = allocData(src->data->numRects); + FREE_DATA (dst); + + dst->data = alloc_data (src->data->numRects); + if (!dst->data) return pixman_break (dst); + dst->data->size = src->data->numRects; } + dst->data->numRects = src->data->numRects; - memmove((char *)PIXREGION_BOXPTR(dst),(char *)PIXREGION_BOXPTR(src), - dst->data->numRects * sizeof(box_type_t)); + + memmove ((char *)PIXREGION_BOXPTR (dst), (char *)PIXREGION_BOXPTR (src), + dst->data->numRects * sizeof(box_type_t)); + return TRUE; } @@ -448,30 +599,30 @@ PREFIX(_copy) (region_type_t *dst, region_type_t *src) *----------------------------------------------------------------------- */ static inline int -pixman_coalesce ( - region_type_t * region, /* Region to coalesce */ - int prevStart, /* Index of start of previous band */ - int curStart) /* Index of start of current band */ +pixman_coalesce (region_type_t * region, /* Region to coalesce */ + int prev_start, /* Index of start of previous band */ + int cur_start) /* Index of start of current band */ { - box_type_t * pPrevBox; /* Current box in previous band */ - box_type_t * pCurBox; /* Current box in current band */ - int numRects; /* Number rectangles in both bands */ - int y2; /* Bottom of current band */ + box_type_t *prev_box; /* Current box in previous band */ + box_type_t *cur_box; /* Current box in current band */ + int numRects; /* Number rectangles in both bands */ + int y2; /* Bottom of current band */ + /* * Figure out how many rectangles are in the band. */ - numRects = curStart - prevStart; - assert(numRects == region->data->numRects - curStart); + numRects = cur_start - prev_start; + assert (numRects == region->data->numRects - cur_start); - if (!numRects) return curStart; + if (!numRects) return cur_start; /* * The bands may only be coalesced if the bottom of the previous * matches the top scanline of the current. */ - pPrevBox = PIXREGION_BOX(region, prevStart); - pCurBox = PIXREGION_BOX(region, curStart); - if (pPrevBox->y2 != pCurBox->y1) return curStart; + prev_box = PIXREGION_BOX (region, prev_start); + cur_box = PIXREGION_BOX (region, cur_start); + if (prev_box->y2 != cur_box->y1) return cur_start; /* * Make sure the bands have boxes in the same places. This @@ -479,43 +630,51 @@ pixman_coalesce ( * cover the most area possible. I.e. two boxes in a band must * have some horizontal space between them. */ - y2 = pCurBox->y2; + y2 = cur_box->y2; - do { - if ((pPrevBox->x1 != pCurBox->x1) || (pPrevBox->x2 != pCurBox->x2)) { - return (curStart); - } - pPrevBox++; - pCurBox++; + do + { + if ((prev_box->x1 != cur_box->x1) || (prev_box->x2 != cur_box->x2)) + return (cur_start); + + prev_box++; + cur_box++; numRects--; - } while (numRects); + } + while (numRects); /* * The bands may be merged, so set the bottom y of each box * in the previous band to the bottom y of the current band. 
*/ - numRects = curStart - prevStart; + numRects = cur_start - prev_start; region->data->numRects -= numRects; - do { - pPrevBox--; - pPrevBox->y2 = y2; + + do + { + prev_box--; + prev_box->y2 = y2; numRects--; - } while (numRects); - return prevStart; + } + while (numRects); + + return prev_start; } /* Quicky macro to avoid trivial reject procedure calls to pixman_coalesce */ -#define Coalesce(newReg, prevBand, curBand) \ - if (curBand - prevBand == newReg->data->numRects - curBand) { \ - prevBand = pixman_coalesce(newReg, prevBand, curBand); \ - } else { \ - prevBand = curBand; \ - } +#define COALESCE(new_reg, prev_band, cur_band) \ + do \ + { \ + if (cur_band - prev_band == new_reg->data->numRects - cur_band) \ + prev_band = pixman_coalesce (new_reg, prev_band, cur_band); \ + else \ + prev_band = cur_band; \ + } while (0) /*- *----------------------------------------------------------------------- - * pixman_region_appendNonO -- + * pixman_region_append_non_o -- * Handle a non-overlapping band for the union and subtract operations. * Just adds the (top/bottom-clipped) rectangles into the region. * Doesn't have to check for subsumption or anything. @@ -529,55 +688,58 @@ pixman_coalesce ( * *----------------------------------------------------------------------- */ - static inline pixman_bool_t -pixman_region_appendNonO ( - region_type_t * region, - box_type_t * r, - box_type_t * rEnd, - int y1, - int y2) +pixman_region_append_non_o (region_type_t * region, + box_type_t * r, + box_type_t * r_end, + int y1, + int y2) { - box_type_t * pNextRect; - int newRects; + box_type_t *next_rect; + int new_rects; - newRects = rEnd - r; + new_rects = r_end - r; - assert(y1 < y2); - assert(newRects != 0); + assert (y1 < y2); + assert (new_rects != 0); /* Make sure we have enough space for all rectangles to be added */ - RECTALLOC(region, newRects); - pNextRect = PIXREGION_TOP(region); - region->data->numRects += newRects; - do { - assert(r->x1 < r->x2); - ADDRECT(pNextRect, r->x1, y1, r->x2, y2); + RECTALLOC (region, new_rects); + next_rect = PIXREGION_TOP (region); + region->data->numRects += new_rects; + + do + { + assert (r->x1 < r->x2); + ADDRECT (next_rect, r->x1, y1, r->x2, y2); r++; - } while (r != rEnd); + } + while (r != r_end); return TRUE; } -#define FindBand(r, rBandEnd, rEnd, ry1) \ -{ \ - ry1 = r->y1; \ - rBandEnd = r+1; \ - while ((rBandEnd != rEnd) && (rBandEnd->y1 == ry1)) { \ - rBandEnd++; \ - } \ -} - -#define AppendRegions(newReg, r, rEnd) \ -{ \ - int newRects; \ - if ((newRects = rEnd - r)) { \ - RECTALLOC_BAIL(newReg, newRects, bail); \ - memmove((char *)PIXREGION_TOP(newReg),(char *)r, \ - newRects * sizeof(box_type_t)); \ - newReg->data->numRects += newRects; \ - } \ -} +#define FIND_BAND(r, r_band_end, r_end, ry1) \ + do \ + { \ + ry1 = r->y1; \ + r_band_end = r + 1; \ + while ((r_band_end != r_end) && (r_band_end->y1 == ry1)) { \ + r_band_end++; \ + } \ + } while (0) + +#define APPEND_REGIONS(new_reg, r, r_end) \ + do \ + { \ + int new_rects; \ + if ((new_rects = r_end - r)) { \ + RECTALLOC_BAIL (new_reg, new_rects, bail); \ + memmove ((char *)PIXREGION_TOP (new_reg), (char *)r, \ + new_rects * sizeof(box_type_t)); \ + new_reg->data->numRects += new_rects; \ + } \ + } while (0) /*- *----------------------------------------------------------------------- @@ -591,15 +753,15 @@ pixman_region_appendNonO ( * * Side Effects: * The new region is overwritten. - * pOverlap set to TRUE if overlapFunc ever returns TRUE. + * overlap set to TRUE if overlap_func ever returns TRUE. 
* * Notes: * The idea behind this function is to view the two regions as sets. * Together they cover a rectangle of area that this function divides * into horizontal bands where points are covered only by one region - * or by both. For the first case, the nonOverlapFunc is called with + * or by both. For the first case, the non_overlap_func is called with * each the band and the band's upper and lower extents. For the - * second, the overlapFunc is called to process the entire band. It + * second, the overlap_func is called to process the entire band. It * is responsible for clipping the rectangles in the band, though * this function provides the boundaries. * At the end of each band, the new region is coalesced, if possible, @@ -608,92 +770,101 @@ pixman_region_appendNonO ( *----------------------------------------------------------------------- */ -typedef pixman_bool_t (*OverlapProcPtr)( - region_type_t *region, - box_type_t *r1, - box_type_t *r1End, - box_type_t *r2, - box_type_t *r2End, - int y1, - int y2, - int *pOverlap); +typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region, + box_type_t * r1, + box_type_t * r1_end, + box_type_t * r2, + box_type_t * r2_end, + int y1, + int y2, + int * overlap); static pixman_bool_t -pixman_op( - region_type_t *newReg, /* Place to store result */ - region_type_t * reg1, /* First region in operation */ - region_type_t * reg2, /* 2d region in operation */ - OverlapProcPtr overlapFunc, /* Function to call for over- - * lapping bands */ - int appendNon1, /* Append non-overlapping bands */ - /* in region 1 ? */ - int appendNon2, /* Append non-overlapping bands */ - /* in region 2 ? */ - int *pOverlap) +pixman_op (region_type_t * new_reg, /* Place to store result */ + region_type_t * reg1, /* First region in operation */ + region_type_t * reg2, /* 2d region in operation */ + overlap_proc_ptr overlap_func, /* Function to call for over- + * lapping bands */ + int append_non1, /* Append non-overlapping bands + * in region 1 ? + */ + int append_non2, /* Append non-overlapping bands + * in region 2 ? 
+ */ + int * overlap) { - box_type_t * r1; /* Pointer into first region */ - box_type_t * r2; /* Pointer into 2d region */ - box_type_t * r1End; /* End of 1st region */ - box_type_t * r2End; /* End of 2d region */ - int ybot; /* Bottom of intersection */ - int ytop; /* Top of intersection */ - region_data_type_t * oldData; /* Old data for newReg */ - int prevBand; /* Index of start of - * previous band in newReg */ - int curBand; /* Index of start of current - * band in newReg */ - box_type_t * r1BandEnd; /* End of current band in r1 */ - box_type_t * r2BandEnd; /* End of current band in r2 */ - int top; /* Top of non-overlapping band */ - int bot; /* Bottom of non-overlapping band*/ - int r1y1; /* Temps for r1->y1 and r2->y1 */ - int r2y1; - int newSize; - int numRects; + box_type_t *r1; /* Pointer into first region */ + box_type_t *r2; /* Pointer into 2d region */ + box_type_t *r1_end; /* End of 1st region */ + box_type_t *r2_end; /* End of 2d region */ + int ybot; /* Bottom of intersection */ + int ytop; /* Top of intersection */ + region_data_type_t *old_data; /* Old data for new_reg */ + int prev_band; /* Index of start of + * previous band in new_reg */ + int cur_band; /* Index of start of current + * band in new_reg */ + box_type_t * r1_band_end; /* End of current band in r1 */ + box_type_t * r2_band_end; /* End of current band in r2 */ + int top; /* Top of non-overlapping band */ + int bot; /* Bottom of non-overlapping band*/ + int r1y1; /* Temps for r1->y1 and r2->y1 */ + int r2y1; + int new_size; + int numRects; /* * Break any region computed from a broken region */ - if (PIXREGION_NAR (reg1) || PIXREGION_NAR(reg2)) - return pixman_break (newReg); + if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2)) + return pixman_break (new_reg); /* * Initialization: - * set r1, r2, r1End and r2End appropriately, save the rectangles + * set r1, r2, r1_end and r2_end appropriately, save the rectangles * of the destination region until the end in case it's one of * the two source regions, then mark the "new" region empty, allocating * another array of rectangles for it to use. 
*/ - r1 = PIXREGION_RECTS(reg1); - newSize = PIXREGION_NUM_RECTS(reg1); - r1End = r1 + newSize; - numRects = PIXREGION_NUM_RECTS(reg2); - r2 = PIXREGION_RECTS(reg2); - r2End = r2 + numRects; - assert(r1 != r1End); - assert(r2 != r2End); + r1 = PIXREGION_RECTS (reg1); + new_size = PIXREGION_NUMRECTS (reg1); + r1_end = r1 + new_size; + + numRects = PIXREGION_NUMRECTS (reg2); + r2 = PIXREGION_RECTS (reg2); + r2_end = r2 + numRects; + + assert (r1 != r1_end); + assert (r2 != r2_end); - oldData = (region_data_type_t *)NULL; - if (((newReg == reg1) && (newSize > 1)) || - ((newReg == reg2) && (numRects > 1))) + old_data = (region_data_type_t *)NULL; + + if (((new_reg == reg1) && (new_size > 1)) || + ((new_reg == reg2) && (numRects > 1))) { - oldData = newReg->data; - newReg->data = pixman_region_emptyData; + old_data = new_reg->data; + new_reg->data = pixman_region_empty_data; } + /* guess at new size */ - if (numRects > newSize) - newSize = numRects; - newSize <<= 1; - if (!newReg->data) - newReg->data = pixman_region_emptyData; - else if (newReg->data->size) - newReg->data->numRects = 0; - if (newSize > newReg->data->size) { - if (!pixman_rect_alloc(newReg, newSize)) { - if (oldData) - free (oldData); - return FALSE; + if (numRects > new_size) + new_size = numRects; + + new_size <<= 1; + + if (!new_reg->data) + new_reg->data = pixman_region_empty_data; + else if (new_reg->data->size) + new_reg->data->numRects = 0; + + if (new_size > new_reg->data->size) + { + if (!pixman_rect_alloc (new_reg, new_size)) + { + if (old_data) + free (old_data); + return FALSE; } } @@ -702,7 +873,7 @@ pixman_op( * In the upcoming loop, ybot and ytop serve different functions depending * on whether the band being handled is an overlapping or non-overlapping * band. - * In the case of a non-overlapping band (only one of the regions + * In the case of a non-overlapping band (only one of the regions * has points in the band), ybot is the bottom of the most recent * intersection and thus clips the top of the rectangles in that band. * ytop is the top of the next intersection between the two regions and @@ -711,34 +882,35 @@ pixman_op( * the top of the rectangles of both regions and ybot clips the bottoms. */ - ybot = MIN(r1->y1, r2->y1); + ybot = MIN (r1->y1, r2->y1); /* - * prevBand serves to mark the start of the previous band so rectangles + * prev_band serves to mark the start of the previous band so rectangles * can be coalesced into larger rectangles. qv. pixman_coalesce, above. - * In the beginning, there is no previous band, so prevBand == curBand - * (curBand is set later on, of course, but the first band will always - * start at index 0). prevBand and curBand must be indices because of + * In the beginning, there is no previous band, so prev_band == cur_band + * (cur_band is set later on, of course, but the first band will always + * start at index 0). prev_band and cur_band must be indices because of * the possible expansion, and resultant moving, of the new region's * array of rectangles. */ - prevBand = 0; + prev_band = 0; - do { - /* + do + { + /* * This algorithm proceeds one source-band (as opposed to a * destination band, which is determined by where the two regions - * intersect) at a time. r1BandEnd and r2BandEnd serve to mark the + * intersect) at a time. r1_band_end and r2_band_end serve to mark the * rectangle after the last one in the current band for their * respective regions. 
*/ - assert(r1 != r1End); - assert(r2 != r2End); + assert (r1 != r1_end); + assert (r2 != r2_end); - FindBand(r1, r1BandEnd, r1End, r1y1); - FindBand(r2, r2BandEnd, r2End, r2y1); + FIND_BAND (r1, r1_band_end, r1_end, r1y1); + FIND_BAND (r2, r2_band_end, r2_end, r2y1); - /* + /* * First handle the band that doesn't intersect, if any. * * Note that attention is restricted to one band in the @@ -746,58 +918,79 @@ pixman_op( * bands between the current position and the next place it overlaps * the other, this entire loop will be passed through n times. */ - if (r1y1 < r2y1) { - if (appendNon1) { - top = MAX(r1y1, ybot); - bot = MIN(r1->y2, r2y1); - if (top != bot) { - curBand = newReg->data->numRects; - if (!pixman_region_appendNonO(newReg, r1, r1BandEnd, top, bot)) + if (r1y1 < r2y1) + { + if (append_non1) + { + top = MAX (r1y1, ybot); + bot = MIN (r1->y2, r2y1); + if (top != bot) + { + cur_band = new_reg->data->numRects; + if (!pixman_region_append_non_o (new_reg, r1, r1_band_end, top, bot)) goto bail; - Coalesce(newReg, prevBand, curBand); + COALESCE (new_reg, prev_band, cur_band); } } - ytop = r2y1; - } else if (r2y1 < r1y1) { - if (appendNon2) { - top = MAX(r2y1, ybot); - bot = MIN(r2->y2, r1y1); - if (top != bot) { - curBand = newReg->data->numRects; - if (!pixman_region_appendNonO(newReg, r2, r2BandEnd, top, bot)) + ytop = r2y1; + } + else if (r2y1 < r1y1) + { + if (append_non2) + { + top = MAX (r2y1, ybot); + bot = MIN (r2->y2, r1y1); + + if (top != bot) + { + cur_band = new_reg->data->numRects; + + if (!pixman_region_append_non_o (new_reg, r2, r2_band_end, top, bot)) goto bail; - Coalesce(newReg, prevBand, curBand); + + COALESCE (new_reg, prev_band, cur_band); } } - ytop = r1y1; - } else { - ytop = r1y1; + ytop = r1y1; + } + else + { + ytop = r1y1; } - /* + /* * Now see if we've hit an intersecting band. The two bands only * intersect if ybot > ytop */ - ybot = MIN(r1->y2, r2->y2); - if (ybot > ytop) { - curBand = newReg->data->numRects; - if (!(* overlapFunc)(newReg, - r1, r1BandEnd, - r2, r2BandEnd, - ytop, ybot, - pOverlap)) + ybot = MIN (r1->y2, r2->y2); + if (ybot > ytop) + { + cur_band = new_reg->data->numRects; + + if (!(*overlap_func)(new_reg, + r1, r1_band_end, + r2, r2_band_end, + ytop, ybot, + overlap)) + { goto bail; - Coalesce(newReg, prevBand, curBand); + } + + COALESCE (new_reg, prev_band, cur_band); } - /* + /* * If we've finished with a band (y2 == ybot) we skip forward * in the region to the next band. */ - if (r1->y2 == ybot) r1 = r1BandEnd; - if (r2->y2 == ybot) r2 = r2BandEnd; + if (r1->y2 == ybot) + r1 = r1_band_end; + + if (r2->y2 == ybot) + r2 = r2_band_end; - } while (r1 != r1End && r2 != r2End); + } + while (r1 != r1_end && r2 != r2_end); /* * Deal with whichever region (if any) still has rectangles left. @@ -807,64 +1000,79 @@ pixman_op( * regardless of how many bands, into one final append to the list. 
*/ - if ((r1 != r1End) && appendNon1) { - /* Do first nonOverlap1Func call, which may be able to coalesce */ - FindBand(r1, r1BandEnd, r1End, r1y1); - curBand = newReg->data->numRects; - if (!pixman_region_appendNonO(newReg, - r1, r1BandEnd, - MAX(r1y1, ybot), r1->y2)) + if ((r1 != r1_end) && append_non1) + { + /* Do first non_overlap1Func call, which may be able to coalesce */ + FIND_BAND (r1, r1_band_end, r1_end, r1y1); + + cur_band = new_reg->data->numRects; + + if (!pixman_region_append_non_o (new_reg, + r1, r1_band_end, + MAX (r1y1, ybot), r1->y2)) + { goto bail; - Coalesce(newReg, prevBand, curBand); - /* Just append the rest of the boxes */ - AppendRegions(newReg, r1BandEnd, r1End); - - } else if ((r2 != r2End) && appendNon2) { - /* Do first nonOverlap2Func call, which may be able to coalesce */ - FindBand(r2, r2BandEnd, r2End, r2y1); - curBand = newReg->data->numRects; - if (!pixman_region_appendNonO(newReg, - r2, r2BandEnd, - MAX(r2y1, ybot), r2->y2)) + } + + COALESCE (new_reg, prev_band, cur_band); + + /* Just append the rest of the boxes */ + APPEND_REGIONS (new_reg, r1_band_end, r1_end); + } + else if ((r2 != r2_end) && append_non2) + { + /* Do first non_overlap2Func call, which may be able to coalesce */ + FIND_BAND (r2, r2_band_end, r2_end, r2y1); + + cur_band = new_reg->data->numRects; + + if (!pixman_region_append_non_o (new_reg, + r2, r2_band_end, + MAX (r2y1, ybot), r2->y2)) + { goto bail; - Coalesce(newReg, prevBand, curBand); - /* Append rest of boxes */ - AppendRegions(newReg, r2BandEnd, r2End); + } + + COALESCE (new_reg, prev_band, cur_band); + + /* Append rest of boxes */ + APPEND_REGIONS (new_reg, r2_band_end, r2_end); } - if (oldData) - free(oldData); + if (old_data) + free (old_data); - if (!(numRects = newReg->data->numRects)) + if (!(numRects = new_reg->data->numRects)) { - freeData(newReg); - newReg->data = pixman_region_emptyData; + FREE_DATA (new_reg); + new_reg->data = pixman_region_empty_data; } else if (numRects == 1) { - newReg->extents = *PIXREGION_BOXPTR(newReg); - freeData(newReg); - newReg->data = (region_data_type_t *)NULL; + new_reg->extents = *PIXREGION_BOXPTR (new_reg); + FREE_DATA (new_reg); + new_reg->data = (region_data_type_t *)NULL; } else { - DOWNSIZE(newReg, numRects); + DOWNSIZE (new_reg, numRects); } return TRUE; bail: - if (oldData) - free(oldData); - return pixman_break (newReg); + if (old_data) + free (old_data); + + return pixman_break (new_reg); } /*- *----------------------------------------------------------------------- * pixman_set_extents -- * Reset the extents of a region to what they should be. Called by - * pixman_region_subtract and pixman_region_intersect as they can't figure it out along the - * way or do so easily, as pixman_region_union can. + * pixman_region_subtract and pixman_region_intersect as they can't + * figure it out along the way or do so easily, as pixman_region_union can. * * Results: * None. 
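pixman_op above is the band machine shared by intersection, union, and subtraction, and pixman_coalesce is what keeps its output minimal: after each band is emitted, it is merged with the previous band when the two cover identical x-spans and touch vertically. Both behaviours can be observed through the public region API declared in pixman.h; in this sketch (shape coordinates invented) two stacked, x-aligned rectangles coalesce into a single box, while an offset pair stays banded as two:

    #include <stdio.h>
    #include <pixman.h>

    static void
    dump (const char *label, pixman_region16_t *r)
    {
        int i, n;
        pixman_box16_t *b = pixman_region_rectangles (r, &n);

        printf ("%s: %d box(es)\n", label, n);
        for (i = 0; i < n; i++)
            printf ("  [%d,%d - %d,%d]\n",
                    b[i].x1, b[i].y1, b[i].x2, b[i].y2);
    }

    int main (void)
    {
        pixman_region16_t a, b, u;

        pixman_region_init (&u);

        /* x-aligned stacked rectangles: the two bands coalesce */
        pixman_region_init_rect (&a, 0, 0, 10, 5);
        pixman_region_init_rect (&b, 0, 5, 10, 5);
        pixman_region_union (&u, &a, &b);
        dump ("stacked", &u);            /* one 10x10 box */

        /* offset rectangles: the union stays banded */
        pixman_region_fini (&b);
        pixman_region_init_rect (&b, 5, 5, 10, 5);
        pixman_region_union (&u, &a, &b);
        dump ("L-shape", &u);            /* two boxes */

        pixman_region_fini (&a);
        pixman_region_fini (&b);
        pixman_region_fini (&u);
        return 0;
    }

Building against the installed library is the usual pkg-config dance, e.g. cc demo.c $(pkg-config --cflags --libs pixman-1).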
@@ -877,42 +1085,45 @@ bail: static void pixman_set_extents (region_type_t *region) { - box_type_t *box, *boxEnd; + box_type_t *box, *box_end; if (!region->data) return; + if (!region->data->size) { - region->extents.x2 = region->extents.x1; - region->extents.y2 = region->extents.y1; - return; + region->extents.x2 = region->extents.x1; + region->extents.y2 = region->extents.y1; + return; } - box = PIXREGION_BOXPTR(region); - boxEnd = PIXREGION_END(region); + box = PIXREGION_BOXPTR (region); + box_end = PIXREGION_END (region); /* * Since box is the first rectangle in the region, it must have the - * smallest y1 and since boxEnd is the last rectangle in the region, + * smallest y1 and since box_end is the last rectangle in the region, * it must have the largest y2, because of banding. Initialize x1 and - * x2 from box and boxEnd, resp., as good things to initialize them + * x2 from box and box_end, resp., as good things to initialize them * to... */ region->extents.x1 = box->x1; region->extents.y1 = box->y1; - region->extents.x2 = boxEnd->x2; - region->extents.y2 = boxEnd->y2; + region->extents.x2 = box_end->x2; + region->extents.y2 = box_end->y2; - assert(region->extents.y1 < region->extents.y2); - while (box <= boxEnd) { - if (box->x1 < region->extents.x1) + assert (region->extents.y1 < region->extents.y2); + + while (box <= box_end) + { + if (box->x1 < region->extents.x1) region->extents.x1 = box->x1; - if (box->x2 > region->extents.x2) + if (box->x2 > region->extents.x2) region->extents.x2 = box->x2; - box++; - }; + box++; + } - assert(region->extents.x1 < region->extents.x2); + assert (region->extents.x1 < region->extents.x2); } /*====================================================================== @@ -920,7 +1131,7 @@ pixman_set_extents (region_type_t *region) *====================================================================*/ /*- *----------------------------------------------------------------------- - * pixman_region_intersectO -- + * pixman_region_intersect_o -- * Handle an overlapping band for pixman_region_intersect. * * Results: @@ -933,125 +1144,145 @@ pixman_set_extents (region_type_t *region) */ /*ARGSUSED*/ static pixman_bool_t -pixman_region_intersectO (region_type_t *region, - box_type_t *r1, - box_type_t *r1End, - box_type_t *r2, - box_type_t *r2End, - int y1, - int y2, - int *pOverlap) +pixman_region_intersect_o (region_type_t *region, + box_type_t * r1, + box_type_t * r1_end, + box_type_t * r2, + box_type_t * r2_end, + int y1, + int y2, + int * overlap) { - int x1; - int x2; - box_type_t * pNextRect; + int x1; + int x2; + box_type_t * next_rect; - pNextRect = PIXREGION_TOP(region); + next_rect = PIXREGION_TOP (region); - assert(y1 < y2); - assert(r1 != r1End && r2 != r2End); + assert (y1 < y2); + assert (r1 != r1_end && r2 != r2_end); - do { - x1 = MAX(r1->x1, r2->x1); - x2 = MIN(r1->x2, r2->x2); + do + { + x1 = MAX (r1->x1, r2->x1); + x2 = MIN (r1->x2, r2->x2); - /* + /* * If there's any overlap between the two rectangles, add that * overlap to the new region. */ - if (x1 < x2) - NEWRECT(region, pNextRect, x1, y1, x2, y2); + if (x1 < x2) + NEWRECT (region, next_rect, x1, y1, x2, y2); - /* + /* * Advance the pointer(s) with the leftmost right side, since the next * rectangle on that list may still overlap the other region's * current rectangle. 
*/ - if (r1->x2 == x2) { - r1++; + if (r1->x2 == x2) + { + r1++; } - if (r2->x2 == x2) { - r2++; + if (r2->x2 == x2) + { + r2++; } - } while ((r1 != r1End) && (r2 != r2End)); + } + while ((r1 != r1_end) && (r2 != r2_end)); return TRUE; } PIXMAN_EXPORT pixman_bool_t -PREFIX(_intersect) (region_type_t * newReg, - region_type_t * reg1, - region_type_t * reg2) +PREFIX (_intersect) (region_type_t * new_reg, + region_type_t * reg1, + region_type_t * reg2) { - good(reg1); - good(reg2); - good(newReg); - /* check for trivial reject */ - if (PIXREGION_NIL(reg1) || PIXREGION_NIL(reg2) || - !EXTENTCHECK(®1->extents, ®2->extents)) - { - /* Covers about 20% of all cases */ - freeData(newReg); - newReg->extents.x2 = newReg->extents.x1; - newReg->extents.y2 = newReg->extents.y1; - if (PIXREGION_NAR(reg1) || PIXREGION_NAR(reg2)) + GOOD (reg1); + GOOD (reg2); + GOOD (new_reg); + + /* check for trivial reject */ + if (PIXREGION_NIL (reg1) || PIXREGION_NIL (reg2) || + !EXTENTCHECK (®1->extents, ®2->extents)) + { + /* Covers about 20% of all cases */ + FREE_DATA (new_reg); + new_reg->extents.x2 = new_reg->extents.x1; + new_reg->extents.y2 = new_reg->extents.y1; + if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2)) + { + new_reg->data = pixman_broken_data; + return FALSE; + } + else { - newReg->data = pixman_brokendata; - return FALSE; + new_reg->data = pixman_region_empty_data; } - else - newReg->data = pixman_region_emptyData; } else if (!reg1->data && !reg2->data) { - /* Covers about 80% of cases that aren't trivially rejected */ - newReg->extents.x1 = MAX(reg1->extents.x1, reg2->extents.x1); - newReg->extents.y1 = MAX(reg1->extents.y1, reg2->extents.y1); - newReg->extents.x2 = MIN(reg1->extents.x2, reg2->extents.x2); - newReg->extents.y2 = MIN(reg1->extents.y2, reg2->extents.y2); - freeData(newReg); - newReg->data = (region_data_type_t *)NULL; + /* Covers about 80% of cases that aren't trivially rejected */ + new_reg->extents.x1 = MAX (reg1->extents.x1, reg2->extents.x1); + new_reg->extents.y1 = MAX (reg1->extents.y1, reg2->extents.y1); + new_reg->extents.x2 = MIN (reg1->extents.x2, reg2->extents.x2); + new_reg->extents.y2 = MIN (reg1->extents.y2, reg2->extents.y2); + + FREE_DATA (new_reg); + + new_reg->data = (region_data_type_t *)NULL; } - else if (!reg2->data && SUBSUMES(®2->extents, ®1->extents)) + else if (!reg2->data && SUBSUMES (®2->extents, ®1->extents)) { - return PREFIX(_copy) (newReg, reg1); + return PREFIX (_copy) (new_reg, reg1); } - else if (!reg1->data && SUBSUMES(®1->extents, ®2->extents)) + else if (!reg1->data && SUBSUMES (®1->extents, ®2->extents)) { - return PREFIX(_copy) (newReg, reg2); + return PREFIX (_copy) (new_reg, reg2); } else if (reg1 == reg2) { - return PREFIX(_copy) (newReg, reg1); + return PREFIX (_copy) (new_reg, reg1); } else { - /* General purpose intersection */ - int overlap; /* result ignored */ - if (!pixman_op(newReg, reg1, reg2, pixman_region_intersectO, FALSE, FALSE, - &overlap)) + /* General purpose intersection */ + int overlap; /* result ignored */ + + if (!pixman_op (new_reg, reg1, reg2, pixman_region_intersect_o, FALSE, FALSE, + &overlap)) + { return FALSE; - pixman_set_extents(newReg); + } + + pixman_set_extents (new_reg); } - good(newReg); + GOOD (new_reg); return(TRUE); } -#define MERGERECT(r) \ -{ \ - if (r->x1 <= x2) { \ - /* Merge with current rectangle */ \ - if (r->x1 < x2) *pOverlap = TRUE; \ - if (x2 < r->x2) x2 = r->x2; \ - } else { \ - /* Add current rectangle, start new one */ \ - NEWRECT(region, pNextRect, x1, y1, x2, y2); \ - x1 = r->x1; \ - x2 = 
-    }								\
-    r++;							\
-}
+#define MERGERECT(r)						\
+    do								\
+    {								\
+        if (r->x1 <= x2)					\
+        {							\
+            /* Merge with current rectangle */			\
+            if (r->x1 < x2)					\
+                *overlap = TRUE;				\
+								\
+            if (x2 < r->x2)					\
+                x2 = r->x2;					\
+        }							\
+        else							\
+        {							\
+            /* Add current rectangle, start new one */		\
+            NEWRECT (region, next_rect, x1, y1, x2, y2);	\
+            x1 = r->x1;						\
+            x2 = r->x2;						\
+        }							\
+        r++;							\
+    } while (0)
 
 /*======================================================================
  *	    Region Union
@@ -1059,7 +1290,7 @@ PREFIX(_intersect) (region_type_t * newReg,
 
 /*-
  *-----------------------------------------------------------------------
- * pixman_region_unionO --
+ * pixman_region_union_o --
  *	Handle an overlapping band for the union operation. Picks the
  *	left-most rectangle each time and merges it into the region.
  *
@@ -1068,66 +1299,70 @@ PREFIX(_intersect) (region_type_t * newReg,
  *
  * Side Effects:
  *	region is overwritten.
- *	pOverlap is set to TRUE if any boxes overlap.
+ *	overlap is set to TRUE if any boxes overlap.
 *
 *-----------------------------------------------------------------------
 */
 static pixman_bool_t
-pixman_region_unionO (
-    region_type_t *region,
-    box_type_t    *r1,
-    box_type_t    *r1End,
-    box_type_t    *r2,
-    box_type_t    *r2End,
-    int            y1,
-    int            y2,
-    int           *pOverlap)
+pixman_region_union_o (region_type_t *region,
+                       box_type_t *   r1,
+                       box_type_t *   r1_end,
+                       box_type_t *   r2,
+                       box_type_t *   r2_end,
+                       int            y1,
+                       int            y2,
+                       int *          overlap)
 {
-    box_type_t * pNextRect;
-    int x1;     /* left and right side of current union */
-    int x2;
+    box_type_t *next_rect;
+    int x1;     /* left and right side of current union */
+    int x2;
 
     assert (y1 < y2);
-    assert(r1 != r1End && r2 != r2End);
+    assert (r1 != r1_end && r2 != r2_end);
 
-    pNextRect = PIXREGION_TOP(region);
+    next_rect = PIXREGION_TOP (region);
 
     /* Start off current rectangle */
     if (r1->x1 < r2->x1)
     {
-	x1 = r1->x1;
-	x2 = r1->x2;
-	r1++;
+	x1 = r1->x1;
+	x2 = r1->x2;
+	r1++;
     }
     else
     {
-	x1 = r2->x1;
-	x2 = r2->x2;
-	r2++;
+	x1 = r2->x1;
+	x2 = r2->x2;
+	r2++;
     }
-    while (r1 != r1End && r2 != r2End)
+    while (r1 != r1_end && r2 != r2_end)
     {
-	if (r1->x1 < r2->x1) MERGERECT(r1) else MERGERECT(r2);
+	if (r1->x1 < r2->x1)
+	    MERGERECT (r1);
+	else
+	    MERGERECT (r2);
     }
 
     /* Finish off whoever (if any) is left */
-    if (r1 != r1End)
+    if (r1 != r1_end)
     {
-	do
-	{
-	    MERGERECT(r1);
-	} while (r1 != r1End);
+	do
+	{
+	    MERGERECT (r1);
+	}
+	while (r1 != r1_end);
     }
-    else if (r2 != r2End)
+    else if (r2 != r2_end)
     {
-	do
-	{
-	    MERGERECT(r2);
-	} while (r2 != r2End);
+	do
+	{
+	    MERGERECT (r2);
+	}
+	while (r2 != r2_end);
     }
 
     /* Add current rectangle */
-    NEWRECT(region, pNextRect, x1, y1, x2, y2);
+    NEWRECT (region, next_rect, x1, y1, x2, y2);
 
     return TRUE;
 }
 
@@ -1136,99 +1371,114 @@ pixman_region_unionO (
 * single rectangle
 */
 PIXMAN_EXPORT pixman_bool_t
-PREFIX(_union_rect) (region_type_t *dest,
-		     region_type_t *source,
-		     int x, int y,
-		     unsigned int width, unsigned int height)
+PREFIX (_union_rect) (region_type_t *dest,
+                      region_type_t *source,
+                      int            x,
+                      int            y,
+                      unsigned int   width,
+                      unsigned int   height)
 {
     region_type_t region;
 
-    if (!width || !height)
-	return PREFIX(_copy) (dest, source);
-    region.data = NULL;
     region.extents.x1 = x;
     region.extents.y1 = y;
     region.extents.x2 = x + width;
     region.extents.y2 = y + height;
 
-    return PREFIX(_union) (dest, source, &region);
+    if (!GOOD_RECT (&region.extents))
+    {
+	if (BAD_RECT (&region.extents))
+	    log_region_error (FUNC, "Invalid rectangle passed");
+	return PREFIX (_copy) (dest, source);
+    }
+
+    region.data = NULL;
+
+    return PREFIX (_union) (dest, source, &region);
 }
 
 PIXMAN_EXPORT pixman_bool_t
-PREFIX(_union) (region_type_t *newReg,
-		region_type_t *reg1,
-		region_type_t *reg2)
+PREFIX (_union) (region_type_t *new_reg,
+                 region_type_t *reg1,
+                 region_type_t *reg2)
 {
     int overlap; /* result ignored */
 
     /* Return TRUE if some overlap
     * between reg1, reg2
     */
-    good(reg1);
-    good(reg2);
-    good(newReg);
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+
     /*  checks all the simple cases */
 
     /*
     * Region 1 and 2 are the same
     */
     if (reg1 == reg2)
-    {
-	return PREFIX(_copy) (newReg, reg1);
-    }
+	return PREFIX (_copy) (new_reg, reg1);
 
     /*
     * Region 1 is empty
     */
-    if (PIXREGION_NIL(reg1))
+    if (PIXREGION_NIL (reg1))
     {
-	if (PIXREGION_NAR(reg1))
-	    return pixman_break (newReg);
-	if (newReg != reg2)
-	    return PREFIX(_copy) (newReg, reg2);
-	return TRUE;
+	if (PIXREGION_NAR (reg1))
+	    return pixman_break (new_reg);
+
+	if (new_reg != reg2)
+	    return PREFIX (_copy) (new_reg, reg2);
+
+	return TRUE;
     }
 
     /*
     * Region 2 is empty
    */
-    if (PIXREGION_NIL(reg2))
+    if (PIXREGION_NIL (reg2))
     {
-	if (PIXREGION_NAR(reg2))
-	    return pixman_break (newReg);
-	if (newReg != reg1)
-	    return PREFIX(_copy) (newReg, reg1);
-	return TRUE;
+	if (PIXREGION_NAR (reg2))
+	    return pixman_break (new_reg);
+
+	if (new_reg != reg1)
+	    return PREFIX (_copy) (new_reg, reg1);
+
+	return TRUE;
     }
 
     /*
     * Region 1 completely subsumes region 2
    */
-    if (!reg1->data && SUBSUMES(&reg1->extents, &reg2->extents))
+    if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
     {
-	if (newReg != reg1)
-	    return PREFIX(_copy) (newReg, reg1);
-	return TRUE;
+	if (new_reg != reg1)
+	    return PREFIX (_copy) (new_reg, reg1);
+
+	return TRUE;
     }
 
     /*
    * Region 2 completely subsumes region 1
    */
-    if (!reg2->data && SUBSUMES(&reg2->extents, &reg1->extents))
+    if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
     {
-	if (newReg != reg2)
-	    return PREFIX(_copy) (newReg, reg2);
-	return TRUE;
+	if (new_reg != reg2)
+	    return PREFIX (_copy) (new_reg, reg2);
+
+	return TRUE;
     }
 
-    if (!pixman_op(newReg, reg1, reg2, pixman_region_unionO, TRUE, TRUE, &overlap))
+    if (!pixman_op (new_reg, reg1, reg2, pixman_region_union_o, TRUE, TRUE, &overlap))
	return FALSE;
 
-    newReg->extents.x1 = MIN(reg1->extents.x1, reg2->extents.x1);
-    newReg->extents.y1 = MIN(reg1->extents.y1, reg2->extents.y1);
-    newReg->extents.x2 = MAX(reg1->extents.x2, reg2->extents.x2);
-    newReg->extents.y2 = MAX(reg1->extents.y2, reg2->extents.y2);
-    good(newReg);
+    new_reg->extents.x1 = MIN (reg1->extents.x1, reg2->extents.x1);
+    new_reg->extents.y1 = MIN (reg1->extents.y1, reg2->extents.y1);
+    new_reg->extents.x2 = MAX (reg1->extents.x2, reg2->extents.x2);
+    new_reg->extents.y2 = MAX (reg1->extents.y2, reg2->extents.y2);
+
+    GOOD (new_reg);
+
     return TRUE;
 }
 
@@ -1236,71 +1486,83 @@ PREFIX(_union) (region_type_t *newReg,
 *	    Batch Rectangle Union
 *====================================================================*/
 
-#define ExchangeRects(a, b)	\
-{				\
-    box_type_t t;		\
-    t = rects[a];		\
-    rects[a] = rects[b];	\
-    rects[b] = t;		\
-}
+#define EXCHANGE_RECTS(a, b)	\
+    {				\
+        box_type_t t;		\
+        t = rects[a];		\
+        rects[a] = rects[b];	\
+        rects[b] = t;		\
+    }
 
 static void
-QuickSortRects(
-    box_type_t rects[],
+quick_sort_rects (
+    box_type_t rects[],
     int numRects)
 {
     int y1;
     int x1;
     int i, j;
     box_type_t *r;
 
     /* Always called with numRects > 1 */
 
     do
     {
-	if (numRects == 2)
-	{
-	    if (rects[0].y1 > rects[1].y1 ||
-
(rects[0].y1 == rects[1].y1 && rects[0].x1 > rects[1].x1)) + { + EXCHANGE_RECTS (0, 1); + } + + return; } - /* Choose partition element, stick in location 0 */ - ExchangeRects(0, numRects >> 1); - y1 = rects[0].y1; - x1 = rects[0].x1; + /* Choose partition element, stick in location 0 */ + EXCHANGE_RECTS (0, numRects >> 1); + y1 = rects[0].y1; + x1 = rects[0].x1; /* Partition array */ i = 0; j = numRects; + do - { - r = &(rects[i]); - do - { - r++; - i++; - } while (i != numRects && - (r->y1 < y1 || (r->y1 == y1 && r->x1 < x1))); + { + r = &(rects[i]); + do + { + r++; + i++; + } + + while (i != numRects && (r->y1 < y1 || (r->y1 == y1 && r->x1 < x1))) + ; + r = &(rects[j]); - do - { - r--; - j--; - } while (y1 < r->y1 || (y1 == r->y1 && x1 < r->x1)); + do + { + r--; + j--; + } + while (y1 < r->y1 || (y1 == r->y1 && x1 < r->x1)); + if (i < j) - ExchangeRects(i, j); - } while (i < j); + EXCHANGE_RECTS (i, j); + } + while (i < j); /* Move partition element back to middle */ - ExchangeRects(0, j); + EXCHANGE_RECTS (0, j); + + /* Recurse */ + if (numRects - j - 1 > 1) + quick_sort_rects (&rects[j + 1], numRects - j - 1); - /* Recurse */ - if (numRects-j-1 > 1) - QuickSortRects(&rects[j+1], numRects-j-1); numRects = j; - } while (numRects > 1); + } + while (numRects > 1); } /*- @@ -1316,7 +1578,7 @@ QuickSortRects( * * Side Effects: * The passed-in ``region'' may be modified. - * pOverlap set to TRUE if any retangles overlapped, + * overlap set to TRUE if any retangles overlapped, * else FALSE; * * Strategy: @@ -1338,208 +1600,247 @@ QuickSortRects( static pixman_bool_t validate (region_type_t * badreg, - int *pOverlap) + int * overlap) { /* Descriptor for regions under construction in Step 2. */ - typedef struct { - region_type_t reg; - int prevBand; - int curBand; - } RegionInfo; - - RegionInfo stack_regions[64]; - - int numRects; /* Original numRects for badreg */ - RegionInfo *ri; /* Array of current regions */ - int numRI; /* Number of entries used in ri */ - int sizeRI; /* Number of entries available in ri */ - int i; /* Index into rects */ - int j; /* Index into ri */ - RegionInfo *rit; /* &ri[j] */ - region_type_t * reg; /* ri[j].reg */ - box_type_t * box; /* Current box in rects */ - box_type_t * riBox; /* Last box in ri[j].reg */ - region_type_t * hreg; /* ri[j_half].reg */ + typedef struct + { + region_type_t reg; + int prev_band; + int cur_band; + } region_info_t; + + region_info_t stack_regions[64]; + + int numRects; /* Original numRects for badreg */ + region_info_t *ri; /* Array of current regions */ + int num_ri; /* Number of entries used in ri */ + int size_ri; /* Number of entries available in ri */ + int i; /* Index into rects */ + int j; /* Index into ri */ + region_info_t *rit; /* &ri[j] */ + region_type_t *reg; /* ri[j].reg */ + box_type_t *box; /* Current box in rects */ + box_type_t *ri_box; /* Last box in ri[j].reg */ + region_type_t *hreg; /* ri[j_half].reg */ pixman_bool_t ret = TRUE; - *pOverlap = FALSE; + *overlap = FALSE; if (!badreg->data) { - good(badreg); - return TRUE; + GOOD (badreg); + return TRUE; } + numRects = badreg->data->numRects; if (!numRects) { - if (PIXREGION_NAR(badreg)) + if (PIXREGION_NAR (badreg)) return FALSE; - good(badreg); - return TRUE; + GOOD (badreg); + return TRUE; } + if (badreg->extents.x1 < badreg->extents.x2) { - if ((numRects) == 1) - { - freeData(badreg); - badreg->data = (region_data_type_t *) NULL; + if ((numRects) == 1) + { + FREE_DATA (badreg); + badreg->data = (region_data_type_t *) NULL; } - else - { - DOWNSIZE(badreg, 
numRects); + else + { + DOWNSIZE (badreg, numRects); } - good(badreg); + + GOOD (badreg); + return TRUE; } /* Step 1: Sort the rects array into ascending (y1, x1) order */ - QuickSortRects(PIXREGION_BOXPTR(badreg), numRects); + quick_sort_rects (PIXREGION_BOXPTR (badreg), numRects); /* Step 2: Scatter the sorted array into the minimum number of regions */ /* Set up the first region to be the first rectangle in badreg */ /* Note that step 2 code will never overflow the ri[0].reg rects array */ ri = stack_regions; - sizeRI = sizeof (stack_regions) / sizeof (stack_regions[0]); - numRI = 1; - ri[0].prevBand = 0; - ri[0].curBand = 0; + size_ri = sizeof (stack_regions) / sizeof (stack_regions[0]); + num_ri = 1; + ri[0].prev_band = 0; + ri[0].cur_band = 0; ri[0].reg = *badreg; - box = PIXREGION_BOXPTR(&ri[0].reg); + box = PIXREGION_BOXPTR (&ri[0].reg); ri[0].reg.extents = *box; ri[0].reg.data->numRects = 1; - badreg->extents = *pixman_region_emptyBox; - badreg->data = pixman_region_emptyData; + badreg->extents = *pixman_region_empty_box; + badreg->data = pixman_region_empty_data; /* Now scatter rectangles into the minimum set of valid regions. If the - next rectangle to be added to a region would force an existing rectangle - in the region to be split up in order to maintain y-x banding, just - forget it. Try the next region. If it doesn't fit cleanly into any - region, make a new one. */ + * next rectangle to be added to a region would force an existing rectangle + * in the region to be split up in order to maintain y-x banding, just + * forget it. Try the next region. If it doesn't fit cleanly into any + * region, make a new one. + */ for (i = numRects; --i > 0;) { - box++; - /* Look for a region to append box to */ - for (j = numRI, rit = ri; --j >= 0; rit++) - { - reg = &rit->reg; - riBox = PIXREGION_END(reg); - - if (box->y1 == riBox->y1 && box->y2 == riBox->y2) - { - /* box is in same band as riBox. Merge or append it */ - if (box->x1 <= riBox->x2) - { - /* Merge it with riBox */ - if (box->x1 < riBox->x2) *pOverlap = TRUE; - if (box->x2 > riBox->x2) riBox->x2 = box->x2; + box++; + /* Look for a region to append box to */ + for (j = num_ri, rit = ri; --j >= 0; rit++) + { + reg = &rit->reg; + ri_box = PIXREGION_END (reg); + + if (box->y1 == ri_box->y1 && box->y2 == ri_box->y2) + { + /* box is in same band as ri_box. 
Merge or append it */ + if (box->x1 <= ri_box->x2) + { + /* Merge it with ri_box */ + if (box->x1 < ri_box->x2) + *overlap = TRUE; + + if (box->x2 > ri_box->x2) + ri_box->x2 = box->x2; } - else - { - RECTALLOC_BAIL(reg, 1, bail); - *PIXREGION_TOP(reg) = *box; - reg->data->numRects++; + else + { + RECTALLOC_BAIL (reg, 1, bail); + *PIXREGION_TOP (reg) = *box; + reg->data->numRects++; } - goto NextRect; /* So sue me */ + + goto next_rect; /* So sue me */ } - else if (box->y1 >= riBox->y2) - { - /* Put box into new band */ - if (reg->extents.x2 < riBox->x2) reg->extents.x2 = riBox->x2; - if (reg->extents.x1 > box->x1) reg->extents.x1 = box->x1; - Coalesce(reg, rit->prevBand, rit->curBand); - rit->curBand = reg->data->numRects; - RECTALLOC_BAIL(reg, 1, bail); - *PIXREGION_TOP(reg) = *box; - reg->data->numRects++; - goto NextRect; + else if (box->y1 >= ri_box->y2) + { + /* Put box into new band */ + if (reg->extents.x2 < ri_box->x2) + reg->extents.x2 = ri_box->x2; + + if (reg->extents.x1 > box->x1) + reg->extents.x1 = box->x1; + + COALESCE (reg, rit->prev_band, rit->cur_band); + rit->cur_band = reg->data->numRects; + RECTALLOC_BAIL (reg, 1, bail); + *PIXREGION_TOP (reg) = *box; + reg->data->numRects++; + + goto next_rect; } - /* Well, this region was inappropriate. Try the next one. */ + /* Well, this region was inappropriate. Try the next one. */ } /* for j */ - /* Uh-oh. No regions were appropriate. Create a new one. */ - if (sizeRI == numRI) - { - size_t data_size; - - /* Oops, allocate space for new region information */ - sizeRI <<= 1; - - data_size = sizeRI * sizeof(RegionInfo); - if (data_size / sizeRI != sizeof(RegionInfo)) - goto bail; - if (ri == stack_regions) { - rit = malloc (data_size); - if (!rit) + /* Uh-oh. No regions were appropriate. Create a new one. 
*/ + if (size_ri == num_ri) + { + size_t data_size; + + /* Oops, allocate space for new region information */ + size_ri <<= 1; + + data_size = size_ri * sizeof(region_info_t); + if (data_size / size_ri != sizeof(region_info_t)) + goto bail; + + if (ri == stack_regions) + { + rit = malloc (data_size); + if (!rit) goto bail; - memcpy (rit, ri, numRI * sizeof (RegionInfo)); - } else { - rit = (RegionInfo *) realloc(ri, data_size); - if (!rit) + memcpy (rit, ri, num_ri * sizeof (region_info_t)); + } + else + { + rit = (region_info_t *) realloc (ri, data_size); + if (!rit) goto bail; } - ri = rit; - rit = &ri[numRI]; + ri = rit; + rit = &ri[num_ri]; } - numRI++; - rit->prevBand = 0; - rit->curBand = 0; - rit->reg.extents = *box; - rit->reg.data = (region_data_type_t *)NULL; - if (!pixman_rect_alloc(&rit->reg, (i+numRI) / numRI)) /* MUST force allocation */ + num_ri++; + rit->prev_band = 0; + rit->cur_band = 0; + rit->reg.extents = *box; + rit->reg.data = (region_data_type_t *)NULL; + + /* MUST force allocation */ + if (!pixman_rect_alloc (&rit->reg, (i + num_ri) / num_ri)) goto bail; -NextRect: ; + + next_rect: ; } /* for i */ - /* Make a final pass over each region in order to Coalesce and set - extents.x2 and extents.y2 */ - - for (j = numRI, rit = ri; --j >= 0; rit++) + /* Make a final pass over each region in order to COALESCE and set + * extents.x2 and extents.y2 + */ + for (j = num_ri, rit = ri; --j >= 0; rit++) { - reg = &rit->reg; - riBox = PIXREGION_END(reg); - reg->extents.y2 = riBox->y2; - if (reg->extents.x2 < riBox->x2) reg->extents.x2 = riBox->x2; - Coalesce(reg, rit->prevBand, rit->curBand); + reg = &rit->reg; + ri_box = PIXREGION_END (reg); + reg->extents.y2 = ri_box->y2; + + if (reg->extents.x2 < ri_box->x2) + reg->extents.x2 = ri_box->x2; + + COALESCE (reg, rit->prev_band, rit->cur_band); + if (reg->data->numRects == 1) /* keep unions happy below */ - { - freeData(reg); - reg->data = (region_data_type_t *)NULL; + { + FREE_DATA (reg); + reg->data = (region_data_type_t *)NULL; } } /* Step 3: Union all regions into a single region */ - while (numRI > 1) + while (num_ri > 1) { - int half = numRI/2; - for (j = numRI & 1; j < (half + (numRI & 1)); j++) - { - reg = &ri[j].reg; - hreg = &ri[j+half].reg; - if (!pixman_op(reg, reg, hreg, pixman_region_unionO, TRUE, TRUE, pOverlap)) + int half = num_ri / 2; + for (j = num_ri & 1; j < (half + (num_ri & 1)); j++) + { + reg = &ri[j].reg; + hreg = &ri[j + half].reg; + + if (!pixman_op (reg, reg, hreg, pixman_region_union_o, TRUE, TRUE, overlap)) ret = FALSE; - if (hreg->extents.x1 < reg->extents.x1) + + if (hreg->extents.x1 < reg->extents.x1) reg->extents.x1 = hreg->extents.x1; - if (hreg->extents.y1 < reg->extents.y1) + + if (hreg->extents.y1 < reg->extents.y1) reg->extents.y1 = hreg->extents.y1; - if (hreg->extents.x2 > reg->extents.x2) + + if (hreg->extents.x2 > reg->extents.x2) reg->extents.x2 = hreg->extents.x2; - if (hreg->extents.y2 > reg->extents.y2) + + if (hreg->extents.y2 > reg->extents.y2) reg->extents.y2 = hreg->extents.y2; - freeData(hreg); + + FREE_DATA (hreg); } - numRI -= half; + + num_ri -= half; + if (!ret) goto bail; } + *badreg = ri[0].reg; + if (ri != stack_regions) - free(ri); - good(badreg); + free (ri); + + GOOD (badreg); return ret; + bail: - for (i = 0; i < numRI; i++) - freeData(&ri[i].reg); + for (i = 0; i < num_ri; i++) + FREE_DATA (&ri[i].reg); + if (ri != stack_regions) free (ri); @@ -1547,12 +1848,12 @@ bail: } /*====================================================================== - * Region Subtraction + * 
Region Subtraction *====================================================================*/ /*- *----------------------------------------------------------------------- - * pixman_region_subtractO -- + * pixman_region_subtract_o -- * Overlapping band subtraction. x1 is the left-most point not yet * checked. * @@ -1566,109 +1867,113 @@ bail: */ /*ARGSUSED*/ static pixman_bool_t -pixman_region_subtractO ( - region_type_t * region, - box_type_t * r1, - box_type_t * r1End, - box_type_t * r2, - box_type_t * r2End, - int y1, - int y2, - int *pOverlap) +pixman_region_subtract_o (region_type_t * region, + box_type_t * r1, + box_type_t * r1_end, + box_type_t * r2, + box_type_t * r2_end, + int y1, + int y2, + int * overlap) { - box_type_t * pNextRect; - int x1; + box_type_t * next_rect; + int x1; x1 = r1->x1; - assert(y1<y2); - assert(r1 != r1End && r2 != r2End); + assert (y1 < y2); + assert (r1 != r1_end && r2 != r2_end); - pNextRect = PIXREGION_TOP(region); + next_rect = PIXREGION_TOP (region); do { - if (r2->x2 <= x1) - { - /* + if (r2->x2 <= x1) + { + /* * Subtrahend entirely to left of minuend: go to next subtrahend. */ - r2++; + r2++; } - else if (r2->x1 <= x1) - { - /* + else if (r2->x1 <= x1) + { + /* * Subtrahend preceeds minuend: nuke left edge of minuend. */ - x1 = r2->x2; - if (x1 >= r1->x2) - { - /* + x1 = r2->x2; + if (x1 >= r1->x2) + { + /* * Minuend completely covered: advance to next minuend and * reset left fence to edge of new minuend. */ - r1++; - if (r1 != r1End) + r1++; + if (r1 != r1_end) x1 = r1->x1; } - else - { - /* + else + { + /* * Subtrahend now used up since it doesn't extend beyond * minuend */ - r2++; + r2++; } } - else if (r2->x1 < r1->x2) - { - /* + else if (r2->x1 < r1->x2) + { + /* * Left part of subtrahend covers part of minuend: add uncovered * part of minuend to region and skip to next subtrahend. */ - assert(x1<r2->x1); - NEWRECT(region, pNextRect, x1, y1, r2->x1, y2); + assert (x1 < r2->x1); + NEWRECT (region, next_rect, x1, y1, r2->x1, y2); - x1 = r2->x2; - if (x1 >= r1->x2) - { - /* + x1 = r2->x2; + if (x1 >= r1->x2) + { + /* * Minuend used up: advance to new... */ - r1++; - if (r1 != r1End) + r1++; + if (r1 != r1_end) x1 = r1->x1; } - else - { - /* + else + { + /* * Subtrahend used up */ - r2++; + r2++; } } - else - { - /* + else + { + /* * Minuend used up: add any remaining piece before advancing. */ - if (r1->x2 > x1) - NEWRECT(region, pNextRect, x1, y1, r1->x2, y2); - r1++; - if (r1 != r1End) + if (r1->x2 > x1) + NEWRECT (region, next_rect, x1, y1, r1->x2, y2); + + r1++; + + if (r1 != r1_end) x1 = r1->x1; } - } while ((r1 != r1End) && (r2 != r2End)); + } + while ((r1 != r1_end) && (r2 != r2_end)); /* * Add remaining minuend rectangles to region. */ - while (r1 != r1End) + while (r1 != r1_end) { - assert(x1<r1->x2); - NEWRECT(region, pNextRect, x1, y1, r1->x2, y2); - r1++; - if (r1 != r1End) + assert (x1 < r1->x2); + + NEWRECT (region, next_rect, x1, y1, r1->x2, y2); + + r1++; + if (r1 != r1_end) x1 = r1->x1; } return TRUE; @@ -1677,59 +1982,62 @@ pixman_region_subtractO ( /*- *----------------------------------------------------------------------- * pixman_region_subtract -- - * Subtract regS from regM and leave the result in regD. + * Subtract reg_s from reg_m and leave the result in reg_d. * S stands for subtrahend, M for minuend and D for difference. * * Results: * TRUE if successful. * * Side Effects: - * regD is overwritten. + * reg_d is overwritten. 
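
The overlapping-band walk above is easier to follow in scalar form: a left fence x1 sweeps across the minuend while subtrahend boxes chop pieces off it from the left. A minimal sketch of the same interval logic for one minuend box against the sorted subtrahend boxes of its band (emit_rect () is a hypothetical stand-in for the NEWRECT macro; all coordinates are half-open):

    /* Emit the parts of minuend box m not covered by the sorted
     * subtrahend boxes s[0..ns-1] sharing m's band. */
    static void
    subtract_one_box (box_type_t m, const box_type_t *s, int ns)
    {
        int i, x1 = m.x1;

        for (i = 0; i < ns && x1 < m.x2; i++)
        {
            if (s[i].x2 <= x1)
                continue;               /* subtrahend entirely to the left */

            if (s[i].x1 > x1)           /* uncovered piece before s[i] */
                emit_rect (x1, m.y1, MIN (s[i].x1, m.x2), m.y2);

            x1 = MAX (x1, s[i].x2);     /* advance the left fence */
        }

        if (x1 < m.x2)                  /* uncovered tail of the minuend */
            emit_rect (x1, m.y1, m.x2, m.y2);
    }
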
 *
 *-----------------------------------------------------------------------
 */
 PIXMAN_EXPORT pixman_bool_t
-PREFIX(_subtract) (region_type_t *	regD,
-		   region_type_t * 	regM,
-		   region_type_t *	regS)
+PREFIX (_subtract) (region_type_t *reg_d,
+                    region_type_t *reg_m,
+                    region_type_t *reg_s)
 {
     int overlap; /* result ignored */
 
-    good(regM);
-    good(regS);
-    good(regD);
-    /* check for trivial rejects */
-    if (PIXREGION_NIL(regM) || PIXREGION_NIL(regS) ||
-	!EXTENTCHECK(&regM->extents, &regS->extents))
+    GOOD (reg_m);
+    GOOD (reg_s);
+    GOOD (reg_d);
+
+    /* check for trivial rejects */
+    if (PIXREGION_NIL (reg_m) || PIXREGION_NIL (reg_s) ||
+	!EXTENTCHECK (&reg_m->extents, &reg_s->extents))
     {
-	if (PIXREGION_NAR (regS))
-	    return pixman_break (regD);
-	return PREFIX(_copy) (regD, regM);
+	if (PIXREGION_NAR (reg_s))
+	    return pixman_break (reg_d);
+
+	return PREFIX (_copy) (reg_d, reg_m);
     }
-    else if (regM == regS)
+    else if (reg_m == reg_s)
     {
-	freeData(regD);
-	regD->extents.x2 = regD->extents.x1;
-	regD->extents.y2 = regD->extents.y1;
-	regD->data = pixman_region_emptyData;
-	return TRUE;
+	FREE_DATA (reg_d);
+	reg_d->extents.x2 = reg_d->extents.x1;
+	reg_d->extents.y2 = reg_d->extents.y1;
+	reg_d->data = pixman_region_empty_data;
+
+	return TRUE;
     }
 
     /* Add those rectangles in region 1 that aren't in region 2,
       do yucky substraction for overlaps, and
      just throw away rectangles in region 2 that aren't in region 1 */
-    if (!pixman_op(regD, regM, regS, pixman_region_subtractO, TRUE, FALSE, &overlap))
+    if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE, &overlap))
	return FALSE;
 
     /*
-     * Can't alter RegD's extents before we call pixman_op because
+     * Can't alter reg_d's extents before we call pixman_op because
     * it might be one of the source regions and pixman_op depends
     * on the extents of those regions being unaltered. Besides, this
     * way there's no checking against rectangles that will be nuked
     * due to coalescing, so we have to examine fewer rectangles.
     */
-    pixman_set_extents(regD);
-    good(regD);
+    pixman_set_extents (reg_d);
+    GOOD (reg_d);
     return TRUE;
 }
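
In terms of the public API, the function above backs pixman_region_subtract () and pixman_region32_subtract (). A small usage sketch with the 32-bit variant (real entry points; the coordinates are arbitrary):

    pixman_region32_t window, dirty, exposed;

    pixman_region32_init_rect (&window, 0, 0, 640, 480);
    pixman_region32_init_rect (&dirty, 100, 100, 32, 32);
    pixman_region32_init (&exposed);

    /* exposed = window - dirty  (D = M - S in the terms used above) */
    pixman_region32_subtract (&exposed, &window, &dirty);

    pixman_region32_fini (&window);
    pixman_region32_fini (&dirty);
    pixman_region32_fini (&exposed);
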
@@ -1748,408 +2056,497 @@ PREFIX(_subtract) (region_type_t * regD,
 *	TRUE.
 *
 * Side Effects:
- *	newReg is overwritten.
+ *	new_reg is overwritten.
 *
 *-----------------------------------------------------------------------
 */
 pixman_bool_t
-PIXMAN_EXPORT PREFIX(_inverse) (region_type_t * newReg,  /* Destination region */
-	       region_type_t * reg1,    /* Region to invert */
-	       box_type_t *    invRect) /* Bounding box for inversion */
+PIXMAN_EXPORT PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region */
+                                 region_type_t *reg1,     /* Region to invert */
+                                 box_type_t *   inv_rect) /* Bounding box for inversion */
 {
-    region_type_t invReg;  /* Quick and dirty region made from the
-			    * bounding box */
-    int overlap;           /* result ignored */
-
-    good(reg1);
-    good(newReg);
-    /* check for trivial rejects */
-    if (PIXREGION_NIL(reg1) || !EXTENTCHECK(invRect, &reg1->extents))
-    {
-	if (PIXREGION_NAR(reg1))
-	    return pixman_break (newReg);
-	newReg->extents = *invRect;
-	freeData(newReg);
-	newReg->data = (region_data_type_t *)NULL;
+    region_type_t inv_reg; /* Quick and dirty region made from the
+                            * bounding box */
+    int overlap;           /* result ignored */
+
+    GOOD (reg1);
+    GOOD (new_reg);
+
+    /* check for trivial rejects */
+    if (PIXREGION_NIL (reg1) || !EXTENTCHECK (inv_rect, &reg1->extents))
+    {
+	if (PIXREGION_NAR (reg1))
+	    return pixman_break (new_reg);
+
+	new_reg->extents = *inv_rect;
+	FREE_DATA (new_reg);
+	new_reg->data = (region_data_type_t *)NULL;
+
	return TRUE;
     }
 
     /* Add those rectangles in region 1 that aren't in region 2,
-       do yucky substraction for overlaps, and
-       just throw away rectangles in region 2 that aren't in region 1 */
-    invReg.extents = *invRect;
-    invReg.data = (region_data_type_t *)NULL;
-    if (!pixman_op(newReg, &invReg, reg1, pixman_region_subtractO, TRUE, FALSE, &overlap))
+     * do yucky substraction for overlaps, and
+     * just throw away rectangles in region 2 that aren't in region 1
+     */
+    inv_reg.extents = *inv_rect;
+    inv_reg.data = (region_data_type_t *)NULL;
+    if (!pixman_op (new_reg, &inv_reg, reg1, pixman_region_subtract_o, TRUE, FALSE, &overlap))
	return FALSE;
 
     /*
-     * Can't alter newReg's extents before we call pixman_op because
+     * Can't alter new_reg's extents before we call pixman_op because
     * it might be one of the source regions and pixman_op depends
     * on the extents of those regions being unaltered. Besides, this
     * way there's no checking against rectangles that will be nuked
     * due to coalescing, so we have to examine fewer rectangles.
     */
-    pixman_set_extents(newReg);
-    good(newReg);
+    pixman_set_extents (new_reg);
+    GOOD (new_reg);
     return TRUE;
 }
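
The routine documented and defined next is exposed as pixman_region_contains_rectangle () / pixman_region32_contains_rectangle () and returns one of three overlap classes. A usage sketch (real API; the box values are arbitrary):

    pixman_box32_t box = { 10, 10, 20, 20 };    /* x1, y1, x2, y2 */

    switch (pixman_region32_contains_rectangle (&region, &box))
    {
    case PIXMAN_REGION_IN:   /* box is entirely inside the region */
        break;
    case PIXMAN_REGION_OUT:  /* box does not touch the region     */
        break;
    case PIXMAN_REGION_PART: /* box straddles the region boundary */
        break;
    }
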
 /*
- *   RectIn(region, rect)
+ * rect_in(region, rect)
 *   This routine takes a pointer to a region and a pointer to a box
 *   and determines if the box is outside/inside/partly inside the region.
 *
 *   The idea is to travel through the list of rectangles trying to cover the
 *   passed box with them. Anytime a piece of the rectangle isn't covered
- *   by a band of rectangles, partOut is set TRUE. Any time a rectangle in
- *   the region covers part of the box, partIn is set TRUE. The process ends
+ * by a band of rectangles, part_out is set TRUE. Any time a rectangle in
+ * the region covers part of the box, part_in is set TRUE. The process ends
 * when either the box has been completely covered (we reached a band that
- * doesn't overlap the box, partIn is TRUE and partOut is false), the
- * box has been partially covered (partIn == partOut == TRUE -- because of
+ * doesn't overlap the box, part_in is TRUE and part_out is false), the
+ * box has been partially covered (part_in == part_out == TRUE -- because of
 * the banding, the first time this is true we know the box is only
 * partially in the region) or is outside the region (we reached a band
- * that doesn't overlap the box at all and partIn is false)
+ * that doesn't overlap the box at all and part_in is false)
 */
 pixman_region_overlap_t
-PIXMAN_EXPORT PREFIX(_contains_rectangle) (region_type_t * region,
-					   box_type_t * prect)
+PIXMAN_EXPORT PREFIX (_contains_rectangle) (region_type_t * region,
+                                            box_type_t *    prect)
 {
-    int x;
-    int y;
     box_type_t *     pbox;
-    box_type_t *     pboxEnd;
-    int              partIn, partOut;
-    int              numRects;
+    box_type_t *     pbox_end;
+    int              part_in, part_out;
+    int              numRects;
+    int              x, y;
+
+    GOOD (region);
+
+    numRects = PIXREGION_NUMRECTS (region);
 
-    good(region);
-    numRects = PIXREGION_NUM_RECTS(region); /* useful optimization */
-    if (!numRects || !EXTENTCHECK(&region->extents, prect))
-	return(PIXMAN_REGION_OUT);
+
+    /* useful optimization */
+    if (!numRects || !EXTENTCHECK (&region->extents, prect))
+	return(PIXMAN_REGION_OUT);
 
     if (numRects == 1)
     {
-	/* We know that it must be PIXMAN_REGION_IN or PIXMAN_REGION_PART */
-	if (SUBSUMES(&region->extents, prect))
+	/* We know that it must be PIXMAN_REGION_IN or PIXMAN_REGION_PART */
+	if (SUBSUMES (&region->extents, prect))
	    return(PIXMAN_REGION_IN);
-	else
+	else
	    return(PIXMAN_REGION_PART);
     }
 
-    partOut = FALSE;
-    partIn = FALSE;
+    part_out = FALSE;
+    part_in = FALSE;
 
     /* (x,y) starts at upper left of rect, moving to the right and down */
     x = prect->x1;
     y = prect->y1;
 
-    /* can stop when both partOut and partIn are TRUE, or we reach prect->y2 */
-    for (pbox = PIXREGION_BOXPTR(region), pboxEnd = pbox + numRects;
-	 pbox != pboxEnd;
+    /* can stop when both part_out and part_in are TRUE, or we reach prect->y2 */
+    for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects;
+	 pbox != pbox_end;
	 pbox++)
     {
	if (pbox->y2 <= y)
-	    continue;	/* getting up to speed or skipping remainder of band */
+	    continue;	/* getting up to speed or skipping remainder of band */
 
	if (pbox->y1 > y)
	{
-	    partOut = TRUE;	/* missed part of rectangle above */
-	    if (partIn || (pbox->y1 >= prect->y2))
-		break;
-	    y = pbox->y1;	/* x guaranteed to be == prect->x1 */
-	}
+	    part_out = TRUE;	/* missed part of rectangle above */
+	    if (part_in || (pbox->y1 >= prect->y2))
+		break;
+	    y = pbox->y1;	/* x guaranteed to be == prect->x1 */
+	}
 
	if (pbox->x2 <= x)
-	    continue;		/* not far enough over yet */
+	    continue;		/* not far enough over yet */
 
	if (pbox->x1 > x)
	{
-	    partOut = TRUE;	/* missed part of rectangle to left */
-	    if (partIn)
-		break;
-	}
+	    part_out = TRUE;	/* missed part of rectangle to left */
+	    if (part_in)
+		break;
+	}
 
	if (pbox->x1 < prect->x2)
	{
-	    partIn = TRUE;	/* definitely overlap */
-	    if (partOut)
-		break;
-	}
+	    part_in = TRUE;	/* definitely overlap */
+	    if (part_out)
+		break;
+	}
 
	if (pbox->x2 >= prect->x2)
	{
-	    y = pbox->y2;	/* finished with this band */
-	    if (y >= prect->y2)
-		break;
-	    x = prect->x1;	/* reset x out to left again */
-	}
-	else
-	{
-	    /*
+	    y = pbox->y2;	/* finished with this band */
+	    if (y >= prect->y2)
+		break;
+	    x = prect->x1;	/* reset x out to left again */
+	}
+	else
+	{
+	    /*
	     * Because boxes in a band are maximal width, if the first box
	     * to overlap the
rectangle doesn't completely cover it in that * band, the rectangle must be partially out, since some of it - * will be uncovered in that band. partIn will have been set true + * will be uncovered in that band. part_in will have been set true * by now... */ - partOut = TRUE; - break; + part_out = TRUE; + break; } } - if (partIn) + if (part_in) { - if (y < prect->y2) + if (y < prect->y2) return PIXMAN_REGION_PART; - else + else return PIXMAN_REGION_IN; } else { - return PIXMAN_REGION_OUT; + return PIXMAN_REGION_OUT; } } /* PREFIX(_translate) (region, x, y) - translates in place -*/ + * translates in place + */ PIXMAN_EXPORT void -PREFIX(_translate) (region_type_t * region, int x, int y) +PREFIX (_translate) (region_type_t *region, int x, int y) { int x1, x2, y1, y2; int nbox; box_type_t * pbox; - good(region); + GOOD (region); region->extents.x1 = x1 = region->extents.x1 + x; region->extents.y1 = y1 = region->extents.y1 + y; region->extents.x2 = x2 = region->extents.x2 + x; region->extents.y2 = y2 = region->extents.y2 + y; - if (((x1 - SHRT_MIN)|(y1 - SHRT_MIN)|(SHRT_MAX - x2)|(SHRT_MAX - y2)) >= 0) + + if (((x1 - SHRT_MIN) | (y1 - SHRT_MIN) | (SHRT_MAX - x2) | (SHRT_MAX - y2)) >= 0) { - if (region->data && (nbox = region->data->numRects)) - { - for (pbox = PIXREGION_BOXPTR(region); nbox--; pbox++) - { - pbox->x1 += x; - pbox->y1 += y; - pbox->x2 += x; - pbox->y2 += y; + if (region->data && (nbox = region->data->numRects)) + { + for (pbox = PIXREGION_BOXPTR (region); nbox--; pbox++) + { + pbox->x1 += x; + pbox->y1 += y; + pbox->x2 += x; + pbox->y2 += y; } } - return; + return; } - if (((x2 - SHRT_MIN)|(y2 - SHRT_MIN)|(SHRT_MAX - x1)|(SHRT_MAX - y1)) <= 0) + + if (((x2 - SHRT_MIN) | (y2 - SHRT_MIN) | (SHRT_MAX - x1) | (SHRT_MAX - y1)) <= 0) { - region->extents.x2 = region->extents.x1; - region->extents.y2 = region->extents.y1; - freeData(region); - region->data = pixman_region_emptyData; - return; + region->extents.x2 = region->extents.x1; + region->extents.y2 = region->extents.y1; + FREE_DATA (region); + region->data = pixman_region_empty_data; + return; } + if (x1 < SHRT_MIN) region->extents.x1 = SHRT_MIN; else if (x2 > SHRT_MAX) region->extents.x2 = SHRT_MAX; + if (y1 < SHRT_MIN) region->extents.y1 = SHRT_MIN; else if (y2 > SHRT_MAX) region->extents.y2 = SHRT_MAX; + if (region->data && (nbox = region->data->numRects)) { - box_type_t * pboxout; + box_type_t * pbox_out; - for (pboxout = pbox = PIXREGION_BOXPTR(region); nbox--; pbox++) - { - pboxout->x1 = x1 = pbox->x1 + x; - pboxout->y1 = y1 = pbox->y1 + y; - pboxout->x2 = x2 = pbox->x2 + x; - pboxout->y2 = y2 = pbox->y2 + y; - if (((x2 - SHRT_MIN)|(y2 - SHRT_MIN)| - (SHRT_MAX - x1)|(SHRT_MAX - y1)) <= 0) - { - region->data->numRects--; - continue; + for (pbox_out = pbox = PIXREGION_BOXPTR (region); nbox--; pbox++) + { + pbox_out->x1 = x1 = pbox->x1 + x; + pbox_out->y1 = y1 = pbox->y1 + y; + pbox_out->x2 = x2 = pbox->x2 + x; + pbox_out->y2 = y2 = pbox->y2 + y; + + if (((x2 - SHRT_MIN) | (y2 - SHRT_MIN) | + (SHRT_MAX - x1) | (SHRT_MAX - y1)) <= 0) + { + region->data->numRects--; + continue; } - if (x1 < SHRT_MIN) - pboxout->x1 = SHRT_MIN; - else if (x2 > SHRT_MAX) - pboxout->x2 = SHRT_MAX; - if (y1 < SHRT_MIN) - pboxout->y1 = SHRT_MIN; - else if (y2 > SHRT_MAX) - pboxout->y2 = SHRT_MAX; - pboxout++; + + if (x1 < SHRT_MIN) + pbox_out->x1 = SHRT_MIN; + else if (x2 > SHRT_MAX) + pbox_out->x2 = SHRT_MAX; + + if (y1 < SHRT_MIN) + pbox_out->y1 = SHRT_MIN; + else if (y2 > SHRT_MAX) + pbox_out->y2 = SHRT_MAX; + + pbox_out++; } - if (pboxout != pbox) 
-	{
-	    if (region->data->numRects == 1)
+
+	if (pbox_out != pbox)
+	{
+	    if (region->data->numRects == 1)
+	    {
+		region->extents = *PIXREGION_BOXPTR (region);
+		FREE_DATA (region);
+		region->data = (region_data_type_t *)NULL;
+	    }
+	    else
	    {
-		region->extents = *PIXREGION_BOXPTR(region);
-		freeData(region);
-		region->data = (region_data_type_t *)NULL;
+		pixman_set_extents (region);
	    }
-	    else
-		pixman_set_extents(region);
	}
     }
+
+    GOOD (region);
 }
 
 PIXMAN_EXPORT void
-PREFIX(_reset) (region_type_t *region, box_type_t *box)
+PREFIX (_reset) (region_type_t *region, box_type_t *box)
 {
-    good(region);
-    assert(box->x1<=box->x2);
-    assert(box->y1<=box->y2);
+    GOOD (region);
+
+    assert (GOOD_RECT (box));
+
     region->extents = *box;
-    freeData(region);
-    region->data = (region_data_type_t *)NULL;
+
+    FREE_DATA (region);
+
+    region->data = NULL;
 }
 
 /* box is "return" value */
 PIXMAN_EXPORT int
-PREFIX(_contains_point) (region_type_t * region,
-			 int x, int y,
-			 box_type_t * box)
+PREFIX (_contains_point) (region_type_t * region,
+                          int x, int y,
+                          box_type_t * box)
 {
-    box_type_t *pbox, *pboxEnd;
+    box_type_t *pbox, *pbox_end;
     int numRects;
 
-    good(region);
-    numRects = PIXREGION_NUM_RECTS(region);
-    if (!numRects || !INBOX(&region->extents, x, y))
-	return(FALSE);
+    GOOD (region);
+    numRects = PIXREGION_NUMRECTS (region);
+
+    if (!numRects || !INBOX (&region->extents, x, y))
+	return(FALSE);
+
     if (numRects == 1)
     {
	if (box)
	    *box = region->extents;
 
-	return(TRUE);
+	return(TRUE);
     }
-    for (pbox = PIXREGION_BOXPTR(region), pboxEnd = pbox + numRects;
-	 pbox != pboxEnd;
+
+    for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects;
+	 pbox != pbox_end;
	 pbox++)
     {
	if (y >= pbox->y2)
-	    continue;		/* not there yet */
-	if ((y < pbox->y1) || (x < pbox->x1))
-	    break;		/* missed it */
-	if (x >= pbox->x2)
-	    continue;		/* not there yet */
+	    continue;		/* not there yet */
+
+	if ((y < pbox->y1) || (x < pbox->x1))
+	    break;		/* missed it */
+
+	if (x >= pbox->x2)
+	    continue;		/* not there yet */
 
	if (box)
	    *box = *pbox;
 
-	return(TRUE);
+	return(TRUE);
     }
+
     return(FALSE);
 }
 
 PIXMAN_EXPORT int
-PREFIX(_not_empty) (region_type_t * region)
+PREFIX (_not_empty) (region_type_t * region)
 {
-    good(region);
-    return(!PIXREGION_NIL(region));
+    GOOD (region);
+
+    return(!PIXREGION_NIL (region));
 }
 
 PIXMAN_EXPORT box_type_t *
-PREFIX(_extents) (region_type_t * region)
+PREFIX (_extents) (region_type_t * region)
 {
-    good(region);
+    GOOD (region);
+
     return(&region->extents);
 }
 
 /*
-    Clip a list of scanlines to a region.  The caller has allocated the
-    space.  FSorted is non-zero if the scanline origins are in ascending
-    order.
-    returns the number of new, clipped scanlines.
-*/
+ * Clip a list of scanlines to a region.  The caller has allocated the
+ * space.  FSorted is non-zero if the scanline origins are in ascending order.
+ *
+ * returns the number of new, clipped scanlines.
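
PREFIX (_selfcheck), which follows, verifies the y-x banding invariant that all of the code above relies on: boxes are stored sorted by (y1, x1); boxes within one band share exactly the same y1 and y2 and are disjoint and ascending in x; and each later band starts at a strictly greater y1. A compact restatement of the per-pair check (a sketch, not the library's code):

    /* TRUE if box n may legally follow box p in a y-x banded region */
    static int
    banded_order_ok (const box_type_t *p, const box_type_t *n)
    {
        if (n->x1 >= n->x2 || n->y1 >= n->y2)
            return 0;                   /* malformed box */

        if (n->y1 > p->y1)
            return 1;                   /* start of a later band */

        return n->y1 == p->y1 &&        /* same band: same y span,  */
               n->y2 == p->y2 &&        /* sorted and disjoint in x */
               n->x1 >= p->x2;
    }
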
+ */ PIXMAN_EXPORT pixman_bool_t -PREFIX(_selfcheck) (reg) - region_type_t * reg; +PREFIX (_selfcheck) (region_type_t *reg) { int i, numRects; if ((reg->extents.x1 > reg->extents.x2) || - (reg->extents.y1 > reg->extents.y2)) + (reg->extents.y1 > reg->extents.y2)) + { return FALSE; - numRects = PIXREGION_NUM_RECTS(reg); + } + + numRects = PIXREGION_NUMRECTS (reg); if (!numRects) + { return ((reg->extents.x1 == reg->extents.x2) && - (reg->extents.y1 == reg->extents.y2) && - (reg->data->size || (reg->data == pixman_region_emptyData))); + (reg->extents.y1 == reg->extents.y2) && + (reg->data->size || (reg->data == pixman_region_empty_data))); + } else if (numRects == 1) + { return (!reg->data); + } else { - box_type_t * pboxP, * pboxN; - box_type_t box; + box_type_t * pbox_p, * pbox_n; + box_type_t box; - pboxP = PIXREGION_RECTS(reg); - box = *pboxP; - box.y2 = pboxP[numRects-1].y2; - pboxN = pboxP + 1; - for (i = numRects; --i > 0; pboxP++, pboxN++) - { - if ((pboxN->x1 >= pboxN->x2) || - (pboxN->y1 >= pboxN->y2)) + pbox_p = PIXREGION_RECTS (reg); + box = *pbox_p; + box.y2 = pbox_p[numRects - 1].y2; + pbox_n = pbox_p + 1; + + for (i = numRects; --i > 0; pbox_p++, pbox_n++) + { + if ((pbox_n->x1 >= pbox_n->x2) || + (pbox_n->y1 >= pbox_n->y2)) + { return FALSE; - if (pboxN->x1 < box.x1) - box.x1 = pboxN->x1; - if (pboxN->x2 > box.x2) - box.x2 = pboxN->x2; - if ((pboxN->y1 < pboxP->y1) || - ((pboxN->y1 == pboxP->y1) && - ((pboxN->x1 < pboxP->x2) || (pboxN->y2 != pboxP->y2)))) + } + + if (pbox_n->x1 < box.x1) + box.x1 = pbox_n->x1; + + if (pbox_n->x2 > box.x2) + box.x2 = pbox_n->x2; + + if ((pbox_n->y1 < pbox_p->y1) || + ((pbox_n->y1 == pbox_p->y1) && + ((pbox_n->x1 < pbox_p->x2) || (pbox_n->y2 != pbox_p->y2)))) + { return FALSE; + } } - return ((box.x1 == reg->extents.x1) && - (box.x2 == reg->extents.x2) && - (box.y1 == reg->extents.y1) && - (box.y2 == reg->extents.y2)); + + return ((box.x1 == reg->extents.x1) && + (box.x2 == reg->extents.x2) && + (box.y1 == reg->extents.y1) && + (box.y2 == reg->extents.y2)); } } PIXMAN_EXPORT pixman_bool_t -PREFIX(_init_rects) (region_type_t *region, - box_type_t *boxes, int count) +PREFIX (_init_rects) (region_type_t *region, + box_type_t *boxes, int count) { - int overlap; + box_type_t *rects; + int displacement; + int i; /* if it's 1, then we just want to set the extents, so call * the existing method. */ - if (count == 1) { - PREFIX(_init_rect) (region, - boxes[0].x1, - boxes[0].y1, - boxes[0].x2 - boxes[0].x1, - boxes[0].y2 - boxes[0].y1); - return TRUE; + if (count == 1) + { + PREFIX (_init_rect) (region, + boxes[0].x1, + boxes[0].y1, + boxes[0].x2 - boxes[0].x1, + boxes[0].y2 - boxes[0].y1); + return TRUE; } - PREFIX(_init) (region); + PREFIX (_init) (region); /* if it's 0, don't call pixman_rect_alloc -- 0 rectangles is * a special case, and causing pixman_rect_alloc would cause * us to leak memory (because the 0-rect case should be the - * static pixman_region_emptyData data). + * static pixman_region_empty_data data). 
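
A usage sketch for this entry point, exposed as pixman_region_init_rects () / pixman_region32_init_rects () (real API; the boxes are arbitrary, and as the loop added further below shows, they may now contain empty or malformed entries, which get culled before validation):

    pixman_box32_t boxes[3] = {
        {  0,  0, 100, 10 },    /* x1, y1, x2, y2 */
        {  0, 10,  50, 20 },
        {  5,  5,   5,  5 }     /* degenerate: culled by the new code */
    };
    pixman_region32_t r;

    if (!pixman_region32_init_rects (&r, boxes, 3))
        ;   /* allocation failed or validation broke the region */
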
*/ if (count == 0) - return TRUE; + return TRUE; - if (!pixman_rect_alloc(region, count)) + if (!pixman_rect_alloc (region, count)) return FALSE; + rects = PIXREGION_RECTS (region); + /* Copy in the rects */ - memcpy (PIXREGION_RECTS(region), boxes, sizeof(box_type_t) * count); + memcpy (rects, boxes, sizeof(box_type_t) * count); region->data->numRects = count; + /* Eliminate empty and malformed rectangles */ + displacement = 0; + + for (i = 0; i < count; ++i) + { + box_type_t *box = &rects[i]; + + if (box->x1 >= box->x2 || box->y1 >= box->y2) + displacement++; + else if (displacement) + rects[i - displacement] = rects[i]; + } + + region->data->numRects -= displacement; + + /* If eliminating empty rectangles caused there + * to be only 0 or 1 rectangles, deal with that. + */ + if (region->data->numRects == 0) + { + FREE_DATA (region); + PREFIX (_init) (region); + + return TRUE; + } + + if (region->data->numRects == 1) + { + region->extents = rects[0]; + + FREE_DATA (region); + region->data = NULL; + + GOOD (region); + + return TRUE; + } + /* Validate */ region->extents.x1 = region->extents.x2 = 0; - return validate (region, &overlap); + + return validate (region, &i); } diff --git a/lib/pixman/pixman/pixman-region16.c b/lib/pixman/pixman/pixman-region16.c index acee0946b..46f5e26ea 100644 --- a/lib/pixman/pixman/pixman-region16.c +++ b/lib/pixman/pixman/pixman-region16.c @@ -42,42 +42,22 @@ typedef struct { #define PREFIX(x) pixman_region##x +#include "pixman-region.c" + +/* This function exists only to make it possible to preserve the X ABI - + * it should go away at first opportunity. + * + * The problem is that the X ABI exports the three structs and has used + * them through macros. So the X server calls this function with + * the addresses of those structs which makes the existing code continue to + * work. 
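
The #include of pixman-region.c just above is the mechanism behind the file pair: pixman-region16.c and pixman-region32.c each define the box and region typedefs plus a PREFIX macro, then include the same implementation, so every PREFIX (_foo) in it expands to pixman_region_foo or pixman_region32_foo respectively. Schematically (a sketch of the pattern; the typedef names follow what the hidden context of these files declares):

    /* pixman-region16.c */
    typedef pixman_box16_t         box_type_t;
    typedef pixman_region16_data_t region_data_type_t;
    typedef pixman_region16_t      region_type_t;
    #define PREFIX(x) pixman_region##x     /* PREFIX(_union) -> pixman_region_union */
    #include "pixman-region.c"

    /* pixman-region32.c */
    typedef pixman_box32_t         box_type_t;
    typedef pixman_region32_data_t region_data_type_t;
    typedef pixman_region32_t      region_type_t;
    #define PREFIX(x) pixman_region32##x   /* PREFIX(_union) -> pixman_region32_union */
    #include "pixman-region.c"
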
+ */ PIXMAN_EXPORT void pixman_region_set_static_pointers (pixman_box16_t *empty_box, pixman_region16_data_t *empty_data, pixman_region16_data_t *broken_data) { - pixman_region_internal_set_static_pointers (empty_box, empty_data, broken_data); -} - -pixman_bool_t -pixman_region16_copy_from_region32 (pixman_region16_t *dst, - pixman_region32_t *src) -{ - int n_boxes, i; - pixman_box32_t *boxes32; - pixman_box16_t *boxes16; - pixman_bool_t retval; - - boxes32 = pixman_region32_rectangles (src, &n_boxes); - - boxes16 = pixman_malloc_ab (n_boxes, sizeof (pixman_box16_t)); - - if (!boxes16) - return FALSE; - - for (i = 0; i < n_boxes; ++i) - { - boxes16[i].x1 = boxes32[i].x1; - boxes16[i].y1 = boxes32[i].y1; - boxes16[i].x2 = boxes32[i].x2; - boxes16[i].y2 = boxes32[i].y2; - } - - pixman_region_fini (dst); - retval = pixman_region_init_rects (dst, boxes16, n_boxes); - free (boxes16); - return retval; + pixman_region_empty_box = empty_box; + pixman_region_empty_data = empty_data; + pixman_broken_data = broken_data; } - -#include "pixman-region.c" diff --git a/lib/pixman/pixman/pixman-region32.c b/lib/pixman/pixman/pixman-region32.c index aac74f68f..aeee86cf9 100644 --- a/lib/pixman/pixman/pixman-region32.c +++ b/lib/pixman/pixman/pixman-region32.c @@ -40,43 +40,4 @@ typedef struct { #define PREFIX(x) pixman_region32##x -#define N_TMP_BOXES (16) - -pixman_bool_t -pixman_region32_copy_from_region16 (pixman_region32_t *dst, - pixman_region16_t *src) -{ - int n_boxes, i; - pixman_box16_t *boxes16; - pixman_box32_t *boxes32; - pixman_box32_t tmp_boxes[N_TMP_BOXES]; - pixman_bool_t retval; - - boxes16 = pixman_region_rectangles (src, &n_boxes); - - if (n_boxes > N_TMP_BOXES) - boxes32 = pixman_malloc_ab (n_boxes, sizeof (pixman_box32_t)); - else - boxes32 = tmp_boxes; - - if (!boxes32) - return FALSE; - - for (i = 0; i < n_boxes; ++i) - { - boxes32[i].x1 = boxes16[i].x1; - boxes32[i].y1 = boxes16[i].y1; - boxes32[i].x2 = boxes16[i].x2; - boxes32[i].y2 = boxes16[i].y2; - } - - pixman_region32_fini (dst); - retval = pixman_region32_init_rects (dst, boxes32, n_boxes); - - if (boxes32 != tmp_boxes) - free (boxes32); - - return retval; -} - #include "pixman-region.c" diff --git a/lib/pixman/pixman/pixman-solid-fill.c b/lib/pixman/pixman/pixman-solid-fill.c index 1805600d8..38675dca8 100644 --- a/lib/pixman/pixman/pixman-solid-fill.c +++ b/lib/pixman/pixman/pixman-solid-fill.c @@ -21,28 +21,35 @@ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
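
The color_to_uint32 () helper further down in this file packs a 16-bit-per-channel pixman_color_t into an a8r8g8b8 pixel by keeping the top byte of each channel; green is masked rather than shifted because its destination byte (bits 8-15) already lines up with the high byte of the 16-bit value. A worked example with arbitrary channel values:

    pixman_color_t c = { 0x8000, 0x4000, 0xc000, 0xffff };  /* r, g, b, a */

    /* alpha: 0xffff >> 8 << 24  ->  0xff000000
     * red:   0x8000 >> 8 << 16  ->  0x00800000
     * green: 0x4000 & 0xff00    ->  0x00004000
     * blue:  0xc000 >> 8        ->  0x000000c0
     * OR'd together             ->  0xff8040c0             */
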
*/ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif #include "pixman-private.h" static void -solid_fill_get_scanline_32 (pixman_image_t *image, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits) +solid_fill_get_scanline_32 (pixman_image_t *image, + int x, + int y, + int width, + uint32_t * buffer, + const uint32_t *mask, + uint32_t mask_bits) { uint32_t *end = buffer + width; register uint32_t color = ((solid_fill_t *)image)->color; - + while (buffer < end) *(buffer++) = color; - + return; } -static source_pict_class_t +static source_image_class_t solid_fill_classify (pixman_image_t *image, - int x, - int y, - int width, - int height) + int x, + int y, + int width, + int height) { return (image->source.class = SOURCE_IMAGE_CLASS_HORIZONTAL); } @@ -50,25 +57,25 @@ solid_fill_classify (pixman_image_t *image, static void solid_fill_property_changed (pixman_image_t *image) { - image->common.get_scanline_32 = (scanFetchProc)solid_fill_get_scanline_32; - image->common.get_scanline_64 = (scanFetchProc)_pixman_image_get_scanline_64_generic; + image->common.get_scanline_32 = solid_fill_get_scanline_32; + image->common.get_scanline_64 = _pixman_image_get_scanline_generic_64; } static uint32_t color_to_uint32 (const pixman_color_t *color) { return - (color->alpha >> 8 << 24) | - (color->red >> 8 << 16) | + (color->alpha >> 8 << 24) | + (color->red >> 8 << 16) | (color->green & 0xff00) | - (color->blue >> 8); + (color->blue >> 8); } PIXMAN_EXPORT pixman_image_t * pixman_image_create_solid_fill (pixman_color_t *color) { - pixman_image_t *img = _pixman_image_allocate(); - + pixman_image_t *img = _pixman_image_allocate (); + if (!img) return NULL; @@ -79,7 +86,6 @@ pixman_image_create_solid_fill (pixman_color_t *color) img->common.classify = solid_fill_classify; img->common.property_changed = solid_fill_property_changed; - solid_fill_property_changed (img); - return img; } + diff --git a/lib/pixman/pixman/pixman-sse2.c b/lib/pixman/pixman/pixman-sse2.c index 40e222893..bb74882b2 100644 --- a/lib/pixman/pixman/pixman-sse2.c +++ b/lib/pixman/pixman/pixman-sse2.c @@ -23,7 +23,7 @@ * * Author: Rodrigo Kumpera (kumpera@gmail.com) * André Tupinambá (andrelrt@gmail.com) - * + * * Based on work by Owen Taylor and Søren Sandmann */ #ifdef HAVE_CONFIG_H @@ -34,94 +34,110 @@ #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ #include <emmintrin.h> /* for SSE2 intrinsics */ #include "pixman-private.h" +#include "pixman-combine32.h" + +#if defined(_MSC_VER) && defined(_M_AMD64) +/* Windows 64 doesn't allow MMX to be used, so + * the pixman-x64-mmx-emulation.h file contains + * implementations of those MMX intrinsics that + * are used in the SSE2 implementation. 
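
The mask_0080 / mask_0101 constants declared just below implement pixman's exact divide-by-255 for 8-bit multiplies: mulhi (x * a + 0x0080, 0x0101) computes ((x * a + 128) * 257) >> 16, which equals x * a / 255 correctly rounded for all 8-bit x and a. The scalar equivalent is the MUL_UN8 macro from pixman-combine32.h, roughly:

    static inline uint8_t
    mul_un8 (uint8_t x, uint8_t a)
    {
        uint16_t t = x * a + 0x80;

        return (t + (t >> 8)) >> 8;    /* same value as (t * 0x101) >> 16 */
    }
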
+ */ +# include "pixman-x64-mmx-emulation.h" +#endif #ifdef USE_SSE2 -/* ------------------------------------------------------------------------------------------------- +/* -------------------------------------------------------------------- * Locals */ -static __m64 xMask0080; -static __m64 xMask00ff; -static __m64 xMask0101; -static __m64 xMaskAlpha; +static __m64 mask_x0080; +static __m64 mask_x00ff; +static __m64 mask_x0101; +static __m64 mask_x_alpha; -static __m64 xMask565rgb; -static __m64 xMask565Unpack; +static __m64 mask_x565_rgb; +static __m64 mask_x565_unpack; -static __m128i Mask0080; -static __m128i Mask00ff; -static __m128i Mask0101; -static __m128i Maskffff; -static __m128i Maskff000000; -static __m128i MaskAlpha; +static __m128i mask_0080; +static __m128i mask_00ff; +static __m128i mask_0101; +static __m128i mask_ffff; +static __m128i mask_ff000000; +static __m128i mask_alpha; -static __m128i Mask565r; -static __m128i Mask565g1, Mask565g2; -static __m128i Mask565b; -static __m128i MaskRed; -static __m128i MaskGreen; -static __m128i MaskBlue; +static __m128i mask_565_r; +static __m128i mask_565_g1, mask_565_g2; +static __m128i mask_565_b; +static __m128i mask_red; +static __m128i mask_green; +static __m128i mask_blue; -static __m128i Mask565FixRB; -static __m128i Mask565FixG; +static __m128i mask_565_fix_rb; +static __m128i mask_565_fix_g; -/* ------------------------------------------------------------------------------------------------- +/* ---------------------------------------------------------------------- * SSE2 Inlines */ static force_inline __m128i unpack_32_1x128 (uint32_t data) { - return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128()); + return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); } static force_inline void -unpack_128_2x128 (__m128i data, __m128i* dataLo, __m128i* dataHi) +unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) { - *dataLo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); - *dataHi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); + *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); + *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); } static force_inline __m128i -unpack565to8888 (__m128i lo) +unpack_565_to_8888 (__m128i lo) { __m128i r, g, b, rb, t; - - r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), MaskRed); - g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), MaskGreen); - b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), MaskBlue); + + r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); + g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); + b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); rb = _mm_or_si128 (r, b); - t = _mm_and_si128 (rb, Mask565FixRB); + t = _mm_and_si128 (rb, mask_565_fix_rb); t = _mm_srli_epi32 (t, 5); rb = _mm_or_si128 (rb, t); - t = _mm_and_si128 (g, Mask565FixG); + t = _mm_and_si128 (g, mask_565_fix_g); t = _mm_srli_epi32 (t, 6); g = _mm_or_si128 (g, t); - + return _mm_or_si128 (rb, g); } static force_inline void -unpack565_128_4x128 (__m128i data, __m128i* data0, __m128i* data1, __m128i* data2, __m128i* data3) +unpack_565_128_4x128 (__m128i data, + __m128i* data0, + __m128i* data1, + __m128i* data2, + __m128i* data3) { __m128i lo, hi; lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); - lo = unpack565to8888 (lo); - hi = unpack565to8888 (hi); + lo = unpack_565_to_8888 (lo); + hi = unpack_565_to_8888 (hi); unpack_128_2x128 (lo, data0, data1); unpack_128_2x128 (hi, data2, data3); } static 
force_inline uint16_t -pack565_32_16 (uint32_t pixel) +pack_565_32_16 (uint32_t pixel) { - return (uint16_t) (((pixel>>8) & 0xf800) | ((pixel>>5) & 0x07e0) | ((pixel>>3) & 0x001f)); + return (uint16_t) (((pixel >> 8) & 0xf800) | + ((pixel >> 5) & 0x07e0) | + ((pixel >> 3) & 0x001f)); } static force_inline __m128i @@ -131,308 +147,358 @@ pack_2x128_128 (__m128i lo, __m128i hi) } static force_inline __m128i -pack565_2x128_128 (__m128i lo, __m128i hi) +pack_565_2x128_128 (__m128i lo, __m128i hi) { __m128i data; __m128i r, g1, g2, b; - data = pack_2x128_128 ( lo, hi ); + data = pack_2x128_128 (lo, hi); - r = _mm_and_si128 (data , Mask565r); - g1 = _mm_and_si128 (_mm_slli_epi32 (data , 3), Mask565g1); - g2 = _mm_and_si128 (_mm_srli_epi32 (data , 5), Mask565g2); - b = _mm_and_si128 (_mm_srli_epi32 (data , 3), Mask565b); + r = _mm_and_si128 (data, mask_565_r); + g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); + g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); + b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); } static force_inline __m128i -pack565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) +pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) { - return _mm_packus_epi16 (pack565_2x128_128 (*xmm0, *xmm1), pack565_2x128_128 (*xmm2, *xmm3)); + return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), + pack_565_2x128_128 (*xmm2, *xmm3)); } static force_inline int -isOpaque (__m128i x) +is_opaque (__m128i x) { __m128i ffs = _mm_cmpeq_epi8 (x, x); + return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; } static force_inline int -isZero (__m128i x) +is_zero (__m128i x) { - return _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) == 0xffff; + return _mm_movemask_epi8 ( + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; } static force_inline int -isTransparent (__m128i x) +is_transparent (__m128i x) { - return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, _mm_setzero_si128())) & 0x8888) == 0x8888; + return (_mm_movemask_epi8 ( + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; } static force_inline __m128i -expandPixel_32_1x128 (uint32_t data) +expand_pixel_32_1x128 (uint32_t data) { - return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE(1, 0, 1, 0)); + return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); } static force_inline __m128i -expandAlpha_1x128 (__m128i data) +expand_alpha_1x128 (__m128i data) { - return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)); + return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, + _MM_SHUFFLE (3, 3, 3, 3)), + _MM_SHUFFLE (3, 3, 3, 3)); } static force_inline void -expandAlpha_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi) +expand_alpha_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi) { __m128i lo, hi; - lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 3, 3, 3)); - hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 3, 3, 3)); - *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(3, 3, 3, 3)); - *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 3, 3, 3)); + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); + + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); } static force_inline 
void -expandAlphaRev_2x128 (__m128i dataLo, __m128i dataHi, __m128i* alphaLo, __m128i* alphaHi) +expand_alpha_rev_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi) { __m128i lo, hi; - lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(0, 0, 0, 0)); - hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(0, 0, 0, 0)); - *alphaLo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE(0, 0, 0, 0)); - *alphaHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(0, 0, 0, 0)); + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); } static force_inline void -pixMultiply_2x128 (__m128i* dataLo, __m128i* dataHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* retLo, __m128i* retHi) +pix_multiply_2x128 (__m128i* data_lo, + __m128i* data_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* ret_lo, + __m128i* ret_hi) { __m128i lo, hi; - lo = _mm_mullo_epi16 (*dataLo, *alphaLo); - hi = _mm_mullo_epi16 (*dataHi, *alphaHi); - lo = _mm_adds_epu16 (lo, Mask0080); - hi = _mm_adds_epu16 (hi, Mask0080); - *retLo = _mm_mulhi_epu16 (lo, Mask0101); - *retHi = _mm_mulhi_epu16 (hi, Mask0101); + lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); + hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); + lo = _mm_adds_epu16 (lo, mask_0080); + hi = _mm_adds_epu16 (hi, mask_0080); + *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); + *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); } static force_inline void -pixAddMultiply_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaDstLo, __m128i* alphaDstHi, - __m128i* dstLo, __m128i* dstHi, __m128i* alphaSrcLo, __m128i* alphaSrcHi, - __m128i* retLo, __m128i* retHi) +pix_add_multiply_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_dst_lo, + __m128i* alpha_dst_hi, + __m128i* dst_lo, + __m128i* dst_hi, + __m128i* alpha_src_lo, + __m128i* alpha_src_hi, + __m128i* ret_lo, + __m128i* ret_hi) { - __m128i lo, hi; - __m128i mulLo, mulHi; + __m128i t1_lo, t1_hi; + __m128i t2_lo, t2_hi; + + pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); + pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); - lo = _mm_mullo_epi16 (*srcLo, *alphaDstLo); - hi = _mm_mullo_epi16 (*srcHi, *alphaDstHi); - mulLo = _mm_mullo_epi16 (*dstLo, *alphaSrcLo); - mulHi = _mm_mullo_epi16 (*dstHi, *alphaSrcHi); - lo = _mm_adds_epu16 (lo, Mask0080); - hi = _mm_adds_epu16 (hi, Mask0080); - lo = _mm_adds_epu16 (lo, mulLo); - hi = _mm_adds_epu16 (hi, mulHi); - *retLo = _mm_mulhi_epu16 (lo, Mask0101); - *retHi = _mm_mulhi_epu16 (hi, Mask0101); + *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); + *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); } static force_inline void -negate_2x128 (__m128i dataLo, __m128i dataHi, __m128i* negLo, __m128i* negHi) +negate_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* neg_lo, + __m128i* neg_hi) { - *negLo = _mm_xor_si128 (dataLo, Mask00ff); - *negHi = _mm_xor_si128 (dataHi, Mask00ff); + *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); + *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); } static force_inline void -invertColors_2x128 (__m128i dataLo, __m128i dataHi, __m128i* invLo, __m128i* invHi) +invert_colors_2x128 (__m128i data_lo, + __m128i data_hi, + __m128i* inv_lo, + __m128i* inv_hi) { __m128i lo, hi; - lo = _mm_shufflelo_epi16 (dataLo, _MM_SHUFFLE(3, 0, 1, 2)); - hi = _mm_shufflelo_epi16 (dataHi, _MM_SHUFFLE(3, 0, 1, 2)); - *invLo = _mm_shufflehi_epi16 (lo, 
_MM_SHUFFLE(3, 0, 1, 2)); - *invHi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE(3, 0, 1, 2)); + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); + *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); + *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); } static force_inline void -over_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, __m128i* dstLo, __m128i* dstHi) +over_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* dst_lo, + __m128i* dst_hi) { __m128i t1, t2; - negate_2x128 (*alphaLo, *alphaHi, &t1, &t2); + negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); - pixMultiply_2x128 (dstLo, dstHi, &t1, &t2, dstLo, dstHi); + pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); - *dstLo = _mm_adds_epu8 (*srcLo, *dstLo); - *dstHi = _mm_adds_epu8 (*srcHi, *dstHi); + *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); + *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); } static force_inline void -overRevNonPre_2x128 (__m128i srcLo, __m128i srcHi, __m128i* dstLo, __m128i* dstHi) +over_rev_non_pre_2x128 (__m128i src_lo, + __m128i src_hi, + __m128i* dst_lo, + __m128i* dst_hi) { __m128i lo, hi; - __m128i alphaLo, alphaHi; + __m128i alpha_lo, alpha_hi; - expandAlpha_2x128 (srcLo, srcHi, &alphaLo, &alphaHi); + expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); - lo = _mm_or_si128 (alphaLo, MaskAlpha); - hi = _mm_or_si128 (alphaHi, MaskAlpha); + lo = _mm_or_si128 (alpha_lo, mask_alpha); + hi = _mm_or_si128 (alpha_hi, mask_alpha); - invertColors_2x128 (srcLo, srcHi, &srcLo, &srcHi); + invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); - pixMultiply_2x128 (&srcLo, &srcHi, &lo, &hi, &lo, &hi); + pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); - over_2x128 (&lo, &hi, &alphaLo, &alphaHi, dstLo, dstHi); + over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); } static force_inline void -inOver_2x128 (__m128i* srcLo, __m128i* srcHi, __m128i* alphaLo, __m128i* alphaHi, - __m128i* maskLo, __m128i* maskHi, __m128i* dstLo, __m128i* dstHi) +in_over_2x128 (__m128i* src_lo, + __m128i* src_hi, + __m128i* alpha_lo, + __m128i* alpha_hi, + __m128i* mask_lo, + __m128i* mask_hi, + __m128i* dst_lo, + __m128i* dst_hi) { - __m128i sLo, sHi; - __m128i aLo, aHi; + __m128i s_lo, s_hi; + __m128i a_lo, a_hi; - pixMultiply_2x128 ( srcLo, srcHi, maskLo, maskHi, &sLo, &sHi); - pixMultiply_2x128 (alphaLo, alphaHi, maskLo, maskHi, &aLo, &aHi); + pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); + pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); - over_2x128 (&sLo, &sHi, &aLo, &aHi, dstLo, dstHi); + over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); } static force_inline void -cachePrefetch (__m128i* addr) +cache_prefetch (__m128i* addr) { - _mm_prefetch (addr, _MM_HINT_T0); + _mm_prefetch ((void const*)addr, _MM_HINT_T0); } static force_inline void -cachePrefetchNext (__m128i* addr) +cache_prefetch_next (__m128i* addr) { - _mm_prefetch (addr + 4, _MM_HINT_T0); // 64 bytes ahead + _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */ } /* load 4 pixels from a 16-byte boundary aligned address */ static force_inline __m128i -load128Aligned (__m128i* src) +load_128_aligned (__m128i* src) { return _mm_load_si128 (src); } /* load 4 pixels from an unaligned address */ static force_inline __m128i -load128Unaligned (const __m128i* src) +load_128_unaligned (const __m128i* src) { return 
_mm_loadu_si128 (src); } -/* save 4 pixels using Write Combining memory on a 16-byte boundary aligned address */ +/* save 4 pixels using Write Combining memory on a 16-byte + * boundary aligned address + */ static force_inline void -save128WriteCombining (__m128i* dst, __m128i data) +save_128_write_combining (__m128i* dst, + __m128i data) { _mm_stream_si128 (dst, data); } /* save 4 pixels on a 16-byte boundary aligned address */ static force_inline void -save128Aligned (__m128i* dst, __m128i data) +save_128_aligned (__m128i* dst, + __m128i data) { _mm_store_si128 (dst, data); } /* save 4 pixels on an unaligned address */ static force_inline void -save128Unaligned (__m128i* dst, __m128i data) +save_128_unaligned (__m128i* dst, + __m128i data) { _mm_storeu_si128 (dst, data); } -/* ------------------------------------------------------------------------------------------------- +/* ------------------------------------------------------------------ * MMX inlines */ static force_inline __m64 unpack_32_1x64 (uint32_t data) { - return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64()); + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ()); } static force_inline __m64 -expandAlpha_1x64 (__m64 data) +expand_alpha_1x64 (__m64 data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 3, 3, 3)); + return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3)); } static force_inline __m64 -expandAlphaRev_1x64 (__m64 data) +expand_alpha_rev_1x64 (__m64 data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE(0, 0, 0, 0)); + return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); } static force_inline __m64 -expandPixel_8_1x64 (uint8_t data) +expand_pixel_8_1x64 (uint8_t data) { - return _mm_shuffle_pi16 (unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE(0, 0, 0, 0)); + return _mm_shuffle_pi16 ( + unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); } static force_inline __m64 -pixMultiply_1x64 (__m64 data, __m64 alpha) +pix_multiply_1x64 (__m64 data, + __m64 alpha) { return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha), - xMask0080), - xMask0101); + mask_x0080), + mask_x0101); } static force_inline __m64 -pixAddMultiply_1x64 (__m64* src, __m64* alphaDst, __m64* dst, __m64* alphaSrc) +pix_add_multiply_1x64 (__m64* src, + __m64* alpha_dst, + __m64* dst, + __m64* alpha_src) { - return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (*src, *alphaDst), - xMask0080), - _mm_mullo_pi16 (*dst, *alphaSrc)), - xMask0101); + __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst); + __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src); + + return _mm_adds_pu8 (t1, t2); } static force_inline __m64 negate_1x64 (__m64 data) { - return _mm_xor_si64 (data, xMask00ff); + return _mm_xor_si64 (data, mask_x00ff); } static force_inline __m64 -invertColors_1x64 (__m64 data) +invert_colors_1x64 (__m64 data) { - return _mm_shuffle_pi16 (data, _MM_SHUFFLE(3, 0, 1, 2)); + return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); } static force_inline __m64 over_1x64 (__m64 src, __m64 alpha, __m64 dst) { - return _mm_adds_pu8 (src, pixMultiply_1x64 (dst, negate_1x64 (alpha))); + return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha))); } static force_inline __m64 -inOver_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst) +in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst) { - return over_1x64 (pixMultiply_1x64 (*src, *mask), - pixMultiply_1x64 (*alpha, *mask), + return over_1x64 (pix_multiply_1x64 (*src, *mask), + pix_multiply_1x64 (*alpha, *mask), *dst); }
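The pix_multiply_1x64 and pix_multiply_2x128 helpers above are the workhorse of every combiner in this file: per 8-bit channel they compute x * a / 255, rounded to nearest, without a division, by adding a 0x0080 bias (mask_0080/mask_x0080) and then keeping the high 16 bits of a multiply by 0x0101 (mask_0101/mask_x0101). A minimal scalar sketch of the same identity, exhaustively checkable against the rounded quotient; mul_un8 is a name chosen here for illustration (pixman's scalar macros are the UN8 family, cf. DIV_UN8 further down):

#include <assert.h>
#include <stdint.h>

/* Scalar equivalent of pix_multiply_1x64: with t = x * a + 0x80,
 * (t * 0x0101) >> 16 equals (t + (t >> 8)) >> 8, which is exactly
 * x * a / 255 rounded to nearest for all 8-bit x and a. */
static uint8_t
mul_un8 (uint8_t x, uint8_t a)
{
    unsigned int t = (unsigned int) x * a + 0x80;

    return (uint8_t) ((t + (t >> 8)) >> 8);
}

int
main (void)
{
    unsigned int x, a;

    for (x = 0; x < 256; x++)
	for (a = 0; a < 256; a++)
	    assert (mul_un8 (x, a) == (x * a + 127) / 255);

    return 0;
}

Given that primitive, over_1x64 above is the Porter-Duff OVER operator per channel, dst = src + mul_un8 (dst, 255 - alpha), with a saturating add; in_over_1x64 is the same operation with the source and its alpha first multiplied by the mask.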
static force_inline __m64 -overRevNonPre_1x64 (__m64 src, __m64 dst) +over_rev_non_pre_1x64 (__m64 src, __m64 dst) { - __m64 alpha = expandAlpha_1x64 (src); + __m64 alpha = expand_alpha_1x64 (src); - return over_1x64 (pixMultiply_1x64 (invertColors_1x64 (src), - _mm_or_si64 (alpha, xMaskAlpha)), + return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src), + _mm_or_si64 (alpha, mask_x_alpha)), alpha, dst); } static force_inline uint32_t -pack_1x64_32( __m64 data ) +pack_1x64_32 (__m64 data) { - return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64())); + return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ())); } /* Expand 16 bits positioned at @pos (0-3) of an mmx register into @@ -462,31 +528,32 @@ expand565_16_1x64 (uint16_t pixel) p = _mm_or_si64 (t1, p); p = _mm_or_si64 (t2, p); - p = _mm_and_si64 (p, xMask565rgb); - p = _mm_mullo_pi16 (p, xMask565Unpack); + p = _mm_and_si64 (p, mask_x565_rgb); + p = _mm_mullo_pi16 (p, mask_x565_unpack); return _mm_srli_pi16 (p, 8); } -/* ------------------------------------------------------------------------------------------------- +/* ---------------------------------------------------------------------------- * Compose Core transformations */ static force_inline uint32_t -coreCombineOverUPixelsse2 (uint32_t src, uint32_t dst) +core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) { - uint8_t a; - __m64 ms; + uint8_t a; + __m64 ms; a = src >> 24; if (a == 0xff) { - return src; + return src; } else if (src) { - ms = unpack_32_1x64 (src); - return pack_1x64_32 (over_1x64 (ms, expandAlpha_1x64 (ms), unpack_32_1x64 (dst))); + ms = unpack_32_1x64 (src); + return pack_1x64_32 ( + over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst))); } return dst; @@ -502,10 +569,10 @@ combine1 (const uint32_t *ps, const uint32_t *pm) __m64 ms, mm; mm = unpack_32_1x64 (*pm); - mm = expandAlpha_1x64 (mm); - + mm = expand_alpha_1x64 (mm); + ms = unpack_32_1x64 (s); - ms = pixMultiply_1x64 (ms, mm); + ms = pix_multiply_1x64 (ms, mm); s = pack_1x64_32 (ms); } @@ -516,270 +583,299 @@ combine1 (const uint32_t *ps, const uint32_t *pm) static force_inline __m128i combine4 (const __m128i *ps, const __m128i *pm) { - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmMskLo, xmmMskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_msk_lo, xmm_msk_hi; __m128i s; - + if (pm) { - xmmMskLo = load128Unaligned (pm); + xmm_msk_lo = load_128_unaligned (pm); - if (isTransparent (xmmMskLo)) + if (is_transparent (xmm_msk_lo)) return _mm_setzero_si128 (); } - - s = load128Unaligned (ps); - + + s = load_128_unaligned (ps); + if (pm) { - unpack_128_2x128 (s, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMskLo, &xmmMskLo, &xmmMskHi); - - expandAlpha_2x128 (xmmMskLo, xmmMskHi, &xmmMskLo, &xmmMskHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMskLo, &xmmMskHi, &xmmSrcLo, &xmmSrcHi); - - s = pack_2x128_128 (xmmSrcLo, xmmSrcHi); + unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); + + expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_msk_lo, &xmm_msk_hi, + &xmm_src_lo, &xmm_src_hi); + + s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); } return s; } static force_inline void -coreCombineOverUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) +core_combine_over_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) { uint32_t s, d; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i 
xmmAlphaLo, xmmAlphaHi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); /* Align dst on a 16-byte boundary */ - while (w && - ((unsigned long)pd & 15)) + while (w && ((unsigned long)pd & 15)) { - d = *pd; - s = combine1 (ps, pm); + d = *pd; + s = combine1 (ps, pm); - *pd++ = coreCombineOverUPixelsse2 (s, d); + *pd++ = core_combine_over_u_pixel_sse2 (s, d); ps++; if (pm) pm++; - w--; + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - /* I'm loading unaligned because I'm not sure about the address alignment. */ - xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm); - - if (isOpaque (xmmSrcHi)) - { - save128Aligned ((__m128i*)pd, xmmSrcHi); - } - else if (!isZero (xmmSrcHi)) - { - xmmDstHi = load128Aligned ((__m128i*) pd); - - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); - - over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi); - - /* rebuid the 4 pixel data and save*/ - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - w -= 4; - ps += 4; - pd += 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + /* I'm loading unaligned because I'm not sure about + * the address alignment. 
+ */ + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + + if (is_opaque (xmm_src_hi)) + { + save_128_aligned ((__m128i*)pd, xmm_src_hi); + } + else if (!is_zero (xmm_src_hi)) + { + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 ( + xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + /* rebuild the 4 pixel data and save */ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + ps += 4; + pd += 4; if (pm) pm += 4; } while (w) { - d = *pd; - s = combine1 (ps, pm); + d = *pd; + s = combine1 (ps, pm); - *pd++ = coreCombineOverUPixelsse2 (s, d); + *pd++ = core_combine_over_u_pixel_sse2 (s, d); ps++; if (pm) pm++; - w--; + + w--; } } static force_inline void -coreCombineOverReverseUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) +core_combine_over_reverse_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) { uint32_t s, d; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmAlphaLo, xmmAlphaHi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { - d = *pd; - s = combine1 (ps, pm); + d = *pd; + s = combine1 (ps, pm); - *pd++ = coreCombineOverUPixelsse2 (d, s); - w--; + *pd++ = core_combine_over_u_pixel_sse2 (d, s); + w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - /* I'm loading unaligned because I'm not sure about the address alignment. */ - xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*) pd); + /* I'm loading unaligned because I'm not sure + * about the address alignment. 
+ */ + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); - over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmSrcLo, &xmmSrcHi); + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_src_lo, &xmm_src_hi); - /* rebuid the 4 pixel data and save*/ - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmSrcLo, xmmSrcHi)); + /* rebuild the 4 pixel data and save */ + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_src_lo, xmm_src_hi)); + + w -= 4; + ps += 4; + pd += 4; - w -= 4; - ps += 4; - pd += 4; if (pm) pm += 4; } while (w) { - d = *pd; - s = combine1 (ps, pm); + d = *pd; + s = combine1 (ps, pm); - *pd++ = coreCombineOverUPixelsse2 (d, s); + *pd++ = core_combine_over_u_pixel_sse2 (d, s); ps++; - w--; + w--; if (pm) pm++; } } static force_inline uint32_t -coreCombineInUPixelsse2 (uint32_t src, uint32_t dst) +core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst) { uint32_t maska = src >> 24; if (maska == 0) { - return 0; + return 0; } else if (maska != 0xff) { - return pack_1x64_32(pixMultiply_1x64 (unpack_32_1x64 (dst), expandAlpha_1x64 (unpack_32_1x64 (src)))); + return pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (dst), + expand_alpha_1x64 (unpack_32_1x64 (src)))); } return dst; } static force_inline void -coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) +core_combine_in_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) { uint32_t s, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineInUPixelsse2 (d, s); - w--; + *pd++ = core_combine_in_u_pixelsse2 (d, s); + w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*) pd); - xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*) pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, 
&xmmSrcHi); - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ((__m128i*)pd, + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - w -= 4; + ps += 4; + pd += 4; + w -= 4; if (pm) pm += 4; } while (w) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineInUPixelsse2 (d, s); - w--; + *pd++ = core_combine_in_u_pixelsse2 (d, s); + w--; ps++; if (pm) pm++; @@ -787,67 +883,73 @@ coreCombineInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) } static force_inline void -coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) +core_combine_reverse_in_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t *pm, + int w) { uint32_t s, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineInUPixelsse2 (s, d); + *pd++ = core_combine_in_u_pixelsse2 (s, d); ps++; - w--; + w--; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*) pd); - xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - w -= 4; + ps += 4; + pd += 4; + w -= 4; if (pm) pm += 4; } while (w) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineInUPixelsse2 (s, d); - w--; + *pd++ = core_combine_in_u_pixelsse2 (s, d); + w--; ps++; if (pm) pm++; @@ -855,135 +957,161 @@ coreCombineReverseInUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, } static force_inline void 
-coreCombineReverseOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) +core_combine_reverse_out_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) { /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { - uint32_t s = combine1 (ps, pm); - uint32_t d = *pd; + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), negate_1x64 ( + expand_alpha_1x64 (unpack_32_1x64 (s))))); + if (pm) pm++; ps++; - w--; + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - negate_2x128 (xmmSrcLo, xmmSrcHi, &xmmSrcLo, &xmmSrcHi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; + ps += 4; + pd += 4; if (pm) pm += 4; - w -= 4; + + w -= 4; } while (w) { - uint32_t s = combine1 (ps, pm); - uint32_t d = *pd; + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (s))))); + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), negate_1x64 ( + expand_alpha_1x64 (unpack_32_1x64 (s))))); ps++; if (pm) pm++; - w--; + w--; } } static force_inline void -coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) +core_combine_out_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) { /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 
15)) { - uint32_t s = combine1 (ps, pm); - uint32_t d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); - w--; + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), negate_1x64 ( + expand_alpha_1x64 (unpack_32_1x64 (d))))); + w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmSrcHi = combine4 ((__m128i*) ps, (__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); - negate_2x128 (xmmDstLo, xmmDstHi, &xmmDstLo, &xmmDstHi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - w -= 4; + ps += 4; + pd += 4; + w -= 4; if (pm) pm += 4; } while (w) { - uint32_t s = combine1 (ps, pm); - uint32_t d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); - w--; + uint32_t s = combine1 (ps, pm); + uint32_t d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), negate_1x64 ( + expand_alpha_1x64 (unpack_32_1x64 (d))))); + w--; ps++; if (pm) pm++; @@ -991,87 +1119,96 @@ coreCombineOutUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w } static force_inline uint32_t -coreCombineAtopUPixelsse2 (uint32_t src, uint32_t dst) +core_combine_atop_u_pixel_sse2 (uint32_t src, + uint32_t dst) { __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); - __m64 sa = negate_1x64 (expandAlpha_1x64 (s)); - __m64 da = expandAlpha_1x64 (d); + __m64 sa = negate_1x64 (expand_alpha_1x64 (s)); + __m64 da = expand_alpha_1x64 (d); - return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa)); + return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); } static force_inline void -coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) +core_combine_atop_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) { uint32_t s, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - 
__m128i xmmAlphaDstLo, xmmAlphaDstHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineAtopUPixelsse2 (s, d); - w--; + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); + w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); - pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, - &xmmDstLo, &xmmDstHi ); + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - w -= 4; + ps += 4; + pd += 4; + w -= 4; if (pm) pm += 4; } while (w) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineAtopUPixelsse2 (s, d); - w--; + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); + w--; ps++; if (pm) pm++; @@ -1079,180 +1216,199 @@ coreCombineAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int } static force_inline uint32_t -coreCombineReverseAtopUPixelsse2 (uint32_t src, uint32_t dst) +core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, + uint32_t dst) { __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); - __m64 sa = expandAlpha_1x64 (s); - __m64 da = negate_1x64 (expandAlpha_1x64 (d)); + __m64 sa = expand_alpha_1x64 (s); + __m64 da = negate_1x64 (expand_alpha_1x64 (d)); - return pack_1x64_32 (pixAddMultiply_1x64 (&s, &da, &d, &sa)); + return pack_1x64_32 
(pix_add_multiply_1x64 (&s, &da, &d, &sa)); } static force_inline void -coreCombineReverseAtopUsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) +core_combine_reverse_atop_u_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t* pm, + int w) { uint32_t s, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - __m128i xmmAlphaDstLo, xmmAlphaDstHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineReverseAtopUPixelsse2 (s, d); + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); ps++; - w--; + w--; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmSrcHi = combine4 ((__m128i*)ps, (__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*) pd); + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*) pd); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); - pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, - &xmmDstLo, &xmmDstHi ); + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - w -= 4; + ps += 4; + pd += 4; + w -= 4; if (pm) pm += 4; } while (w) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineReverseAtopUPixelsse2 (s, d); + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); ps++; - w--; + w--; if (pm) pm++; } } static force_inline uint32_t -coreCombineXorUPixelsse2 (uint32_t src, uint32_t dst) +core_combine_xor_u_pixel_sse2 (uint32_t src, + uint32_t dst) { __m64 s = unpack_32_1x64 (src); 
__m64 d = unpack_32_1x64 (dst); - __m64 negD = negate_1x64 (expandAlpha_1x64 (d)); - __m64 negS = negate_1x64 (expandAlpha_1x64 (s)); + __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d)); + __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s)); - return pack_1x64_32 (pixAddMultiply_1x64 (&s, &negD, &d, &negS)); + return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s)); } static force_inline void -coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, int width) +core_combine_xor_u_sse2 (uint32_t* dst, + const uint32_t* src, + const uint32_t *mask, + int width) { int w = width; uint32_t s, d; uint32_t* pd = dst; const uint32_t* ps = src; const uint32_t* pm = mask; - - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - __m128i xmmAlphaDstLo, xmmAlphaDstHi; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineXorUPixelsse2 (s, d); - w--; + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); + w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmSrc = combine4 ((__m128i*) ps, (__m128i*) pm); - xmmDst = load128Aligned ((__m128i*) pd); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - negate_2x128 (xmmAlphaSrcLo, xmmAlphaSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixAddMultiply_2x128 ( &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, - &xmmDstLo, &xmmDstHi ); - - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); + xmm_dst = load_128_aligned ((__m128i*) pd); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + 
&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + w -= 4; if (pm) pm += 4; } while (w) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; - *pd++ = coreCombineXorUPixelsse2 (s, d); - w--; + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); + w--; ps++; if (pm) pm++; @@ -1260,68 +1416,77 @@ coreCombineXorUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, i } static force_inline void -coreCombineAddUsse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mask, int width) +core_combine_add_u_sse2 (uint32_t* dst, + const uint32_t* src, + const uint32_t* mask, + int width) { int w = width; - uint32_t s,d; + uint32_t s, d; uint32_t* pd = dst; const uint32_t* ps = src; const uint32_t* pm = mask; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; + ps++; if (pm) pm++; - *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); - w--; + *pd++ = _mm_cvtsi64_si32 ( + _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { __m128i s; - - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - s = combine4((__m128i*)ps,(__m128i*)pm); - - save128Aligned( (__m128i*)pd, - _mm_adds_epu8( s, load128Aligned ((__m128i*)pd)) ); - pd += 4; - ps += 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + s = combine4 ((__m128i*)ps, (__m128i*)pm); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); + + pd += 4; + ps += 4; if (pm) pm += 4; - w -= 4; + w -= 4; } while (w--) { - s = combine1 (ps, pm); - d = *pd; + s = combine1 (ps, pm); + d = *pd; + ps++; - *pd++ = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); + *pd++ = _mm_cvtsi64_si32 ( + _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); if (pm) pm++; } } static force_inline uint32_t -coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst) +core_combine_saturate_u_pixel_sse2 (uint32_t src, + uint32_t dst) { __m64 ms = unpack_32_1x64 (src); __m64 md = unpack_32_1x64 (dst); @@ -1330,99 +1495,107 @@ coreCombineSaturateUPixelsse2 (uint32_t src, uint32_t dst) if (sa > da) { - ms = pixMultiply_1x64 (ms, expandAlpha_1x64 (unpack_32_1x64 (FbIntDiv(da, sa) << 24))); + ms = pix_multiply_1x64 ( + ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24))); } return pack_1x64_32 (_mm_adds_pu16 (md, ms)); } static force_inline void -coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_saturate_u_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { - uint32_t s,d; + uint32_t s, d; - uint32_t packCmp; - __m128i xmmSrc, xmmDst; + uint32_t pack_cmp; + 
__m128i xmm_src, xmm_dst; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = combine1 (ps, pm); - d = *pd; - *pd++ = coreCombineSaturateUPixelsse2 (s, d); - w--; + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); + w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDst = load128Aligned ((__m128i*)pd); - xmmSrc = combine4 ((__m128i*)ps, (__m128i*)pm); - - packCmp = _mm_movemask_epi8 (_mm_cmpgt_epi32 (_mm_srli_epi32 (xmmSrc, 24), - _mm_srli_epi32 (_mm_xor_si128 (xmmDst, Maskff000000), 24))); - - /* if some alpha src is grater than respective ~alpha dst */ - if (packCmp) - { - s = combine1 (ps++, pm); - d = *pd; - *pd++ = coreCombineSaturateUPixelsse2 (s, d); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst = load_128_aligned ((__m128i*)pd); + xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); + + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpgt_epi32 ( + _mm_srli_epi32 (xmm_src, 24), + _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); + + /* if some alpha src is greater than respective ~alpha dst */ + if (pack_cmp) + { + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); if (pm) pm++; - s = combine1 (ps++, pm); - d = *pd; - *pd++ = coreCombineSaturateUPixelsse2 (s, d); + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); if (pm) pm++; - s = combine1 (ps++, pm); - d = *pd; - *pd++ = coreCombineSaturateUPixelsse2 (s, d); + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); if (pm) pm++; - s = combine1 (ps++, pm); - d = *pd; - *pd++ = coreCombineSaturateUPixelsse2 (s, d); + s = combine1 (ps++, pm); + d = *pd; + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); if (pm) pm++; - } - else - { - save128Aligned ((__m128i*)pd, _mm_adds_epu8 (xmmDst, xmmSrc)); + } + else + { + save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); - pd += 4; - ps += 4; + pd += 4; + ps += 4; if (pm) pm += 4; - } + } - w -= 4; + w -= 4; } while (w--) { - s = combine1 (ps, pm); - d = *pd; - *pd++ = coreCombineSaturateUPixelsse2 (s, d); + s = combine1 (ps, pm); + d = *pd; + + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); ps++; if (pm) pm++; @@ -1430,1683 +1603,2098 @@ coreCombineSaturateUsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, } static force_inline void -coreCombineSrcCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) +core_combine_src_ca_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t *pm, + int w) { uint32_t s, m; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmMaskLo, xmmMaskHi; - __m128i xmmDstLo, xmmDstHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst_lo, xmm_dst_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch 
((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); - w--; + s = *ps++; + m = *pm++; + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - pm += 4; - w -= 4; + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); - w--; + s = *ps++; + m = *pm++; + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); + w--; } } static force_inline uint32_t -coreCombineOverCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) +core_combine_over_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) { __m64 s = unpack_32_1x64 (src); - __m64 expAlpha = expandAlpha_1x64 (s); - __m64 unpkMask = unpack_32_1x64 (mask); - __m64 unpkDst = unpack_32_1x64 (dst); + __m64 expAlpha = expand_alpha_1x64 (s); + __m64 unpk_mask = unpack_32_1x64 (mask); + __m64 unpk_dst = unpack_32_1x64 (dst); - return pack_1x64_32 (inOver_1x64 (&s, &expAlpha, &unpkMask, &unpkDst)); + return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst)); } static force_inline void -coreCombineOverCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) +core_combine_over_ca_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = 
*pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineOverCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); - - inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineOverCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); + w--; } } static force_inline uint32_t -coreCombineOverReverseCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) +core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) { __m64 d = unpack_32_1x64 (dst); - return pack_1x64_32(over_1x64 (d, expandAlpha_1x64 (d), pixMultiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask)))); + return pack_1x64_32 ( + over_1x64 (d, expand_alpha_1x64 (d), + pix_multiply_1x64 (unpack_32_1x64 (src), + unpack_32_1x64 (mask)))); } static force_inline void -coreCombineOverReverseCsse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) +core_combine_over_reverse_ca_sse2 (uint32_t* pd, + const uint32_t* ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = 
*pd; - *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - over_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmMaskLo, xmmMaskHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineOverReverseCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); + w--; } } static force_inline void -coreCombineInCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_in_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), + expand_alpha_1x64 (unpack_32_1x64 (d)))); - *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - expandAlpha_1x64 (unpack_32_1x64 (d)))); - w--; + w--; } /* call prefetch hint to optimize cache load*/ 
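/* Note the shape shared by every combiner in this file: a scalar
 * head loop runs until pd reaches a 16-byte boundary, the main loop
 * then handles four pixels per iteration with the 2x128 helpers, and
 * a scalar tail finishes the remaining w % 4 pixels.  The prefetch
 * hints are re-issued between the phases, presumably because the
 * head loop may have advanced ps/pd/pm past the cache lines that
 * were hinted on entry. */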
- cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - pm += 4; - w -= 4; + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - expandAlpha_1x64 (unpack_32_1x64 (d)))); - w--; + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + expand_alpha_1x64 (unpack_32_1x64 (d)))); + + w--; } } static force_inline void -coreCombineInReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_in_reverse_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), - pixMultiply_1x64 (unpack_32_1x64 (m), - expandAlpha_1x64 (unpack_32_1x64 (s))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + pix_multiply_1x64 (unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s))))); + w--; } /* call prefetch hint to optimize cache 
load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi); - - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), - pixMultiply_1x64 (unpack_32_1x64 (m), - expandAlpha_1x64 (unpack_32_1x64 (s))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + pix_multiply_1x64 (unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s))))); + w--; } } static force_inline void -coreCombineOutCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_out_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + negate_1x64 (expand_alpha_1x64 
(unpack_32_1x64 (d))))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaLo, &xmmAlphaHi); - negate_2x128 (xmmAlphaLo, xmmAlphaHi, &xmmAlphaLo, &xmmAlphaHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), - negate_1x64 (expandAlpha_1x64 (unpack_32_1x64 (d))))); - w--; + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (m)), + negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); + + w--; } } static force_inline void -coreCombineOutReverseCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_out_reverse_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), - negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m), - 
expandAlpha_1x64 (unpack_32_1x64 (s)))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + negate_1x64 (pix_multiply_1x64 ( + unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s)))))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMaskLo, &xmmMaskHi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask_lo, &xmm_mask_hi); - negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); - pixMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - ps += 4; - pd += 4; - pm += 4; - w -= 4; + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (d), - negate_1x64 (pixMultiply_1x64 (unpack_32_1x64 (m), - expandAlpha_1x64 (unpack_32_1x64 (s)))))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (d), + negate_1x64 (pix_multiply_1x64 ( + unpack_32_1x64 (m), + expand_alpha_1x64 (unpack_32_1x64 (s)))))); + w--; } } static force_inline uint32_t -coreCombineAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) +core_combine_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) { __m64 m = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); - __m64 sa = expandAlpha_1x64 (s); - __m64 da = expandAlpha_1x64 (d); + __m64 sa = expand_alpha_1x64 (s); + __m64 da = expand_alpha_1x64 (d); - s = pixMultiply_1x64 (s, m); - m = negate_1x64 (pixMultiply_1x64 (m, sa)); + s = pix_multiply_1x64 (s, m); + m = negate_1x64 (pix_multiply_1x64 (m, sa)); - return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da)); + return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); } static force_inline 
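/* Note: core_combine_atop_ca_pixel_sse2 above is the per-pixel
 * component-alpha ATOP equation, dest = d*(1 - m*sa) + (s*m)*da,
 * with pix_add_multiply_1x64 evaluating the whole
 * (x*a + y*b)/255 sum in one step per channel. */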
void -coreCombineAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_atop_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - __m128i xmmAlphaDstLo, xmmAlphaDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineAtopCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); - - negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, - &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + 
&xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineAtopCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); + w--; } } static force_inline uint32_t -coreCombineReverseAtopCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) +core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) { __m64 m = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); - __m64 da = negate_1x64 (expandAlpha_1x64 (d)); - __m64 sa = expandAlpha_1x64 (s); + __m64 da = negate_1x64 (expand_alpha_1x64 (d)); + __m64 sa = expand_alpha_1x64 (s); - s = pixMultiply_1x64 (s, m); - m = pixMultiply_1x64 (m, sa); + s = pix_multiply_1x64 (s, m); + m = pix_multiply_1x64 (m, sa); - return pack_1x64_32 (pixAddMultiply_1x64 (&d, &m, &s, &da)); + return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); } static force_inline void -coreCombineReverseAtopCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_reverse_atop_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - __m128i xmmAlphaDstLo, xmmAlphaDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); - - negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, - &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd 
+= 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineReverseAtopCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); + w--; } } static force_inline uint32_t -coreCombineXorCPixelsse2 (uint32_t src, uint32_t mask, uint32_t dst) +core_combine_xor_ca_pixel_sse2 (uint32_t src, + uint32_t mask, + uint32_t dst) { __m64 a = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); - __m64 alphaDst = negate_1x64 (pixMultiply_1x64 (a, expandAlpha_1x64 (s))); - __m64 dest = pixMultiply_1x64 (s, a); - __m64 alphaSrc = negate_1x64 (expandAlpha_1x64 (d)); + __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 ( + a, expand_alpha_1x64 (s))); + __m64 dest = pix_multiply_1x64 (s, a); + __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d)); - return pack_1x64_32 (pixAddMultiply_1x64 (&d, - &alphaDst, - &dest, - &alphaSrc)); + return pack_1x64_32 (pix_add_multiply_1x64 (&d, + &alpha_dst, + &dest, + &alpha_src)); } static force_inline void -coreCombineXorCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_xor_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmAlphaSrcLo, xmmAlphaSrcHi; - __m128i xmmAlphaDstLo, xmmAlphaDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineXorCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + 
cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmDstHi = load128Aligned ((__m128i*)pd); - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi); - expandAlpha_2x128 (xmmDstLo, xmmDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmAlphaSrcLo, &xmmAlphaSrcHi, &xmmMaskLo, &xmmMaskHi); - - negate_2x128 (xmmAlphaDstLo, xmmAlphaDstHi, &xmmAlphaDstLo, &xmmAlphaDstHi); - negate_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - pixAddMultiply_2x128 (&xmmDstLo, &xmmDstHi, &xmmMaskLo, &xmmMaskHi, - &xmmSrcLo, &xmmSrcHi, &xmmAlphaDstLo, &xmmAlphaDstHi, - &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi); + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_alpha_src_lo, &xmm_alpha_src_hi, + &xmm_mask_lo, &xmm_mask_hi); + + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); + negate_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_add_multiply_2x128 ( + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; + s = *ps++; + m = *pm++; + d = *pd; - *pd++ = coreCombineXorCPixelsse2 (s, m, d); - w--; + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); + w--; } } static force_inline void -coreCombineAddCsse2 (uint32_t *pd, const uint32_t *ps, const uint32_t *pm, int w) +core_combine_add_ca_sse2 (uint32_t * pd, + const uint32_t *ps, + const uint32_t *pm, + int w) { uint32_t s, m, d; - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - __m128i xmmMaskLo, xmmMaskHi; + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch 
((__m128i*)pm); while (w && (unsigned long)pd & 15) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; } /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)ps); - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); + cache_prefetch ((__m128i*)ps); + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); while (w >= 4) { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)ps); - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmSrcHi = load128Unaligned ((__m128i*)ps); - xmmMaskHi = load128Unaligned ((__m128i*)pm); - xmmDstHi = load128Aligned ((__m128i*)pd); - - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmMaskLo, &xmmMaskHi, &xmmSrcLo, &xmmSrcHi); - - save128Aligned( (__m128i*)pd, pack_2x128_128 (_mm_adds_epu8 (xmmSrcLo, xmmDstLo), - _mm_adds_epu8 (xmmSrcHi, xmmDstHi))); - - ps += 4; - pd += 4; - pm += 4; - w -= 4; + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)ps); + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_src_hi = load_128_unaligned ((__m128i*)ps); + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); + xmm_dst_hi = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_src_lo, &xmm_src_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 ( + _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), + _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); + + ps += 4; + pd += 4; + pm += 4; + w -= 4; } while (w) { - s = *ps++; - m = *pm++; - d = *pd; - - *pd++ = pack_1x64_32 (_mm_adds_pu8 (pixMultiply_1x64 (unpack_32_1x64 (s), - unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; + s = *ps++; + m = *pm++; + d = *pd; + + *pd++ = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; } } -/* ------------------------------------------------------------------------------------------------- - * fbComposeSetupSSE2 +/* --------------------------------------------------- + * fb_compose_setup_sSE2 */ static force_inline __m64 -createMask_16_64 (uint16_t mask) +create_mask_16_64 (uint16_t mask) { return _mm_set1_pi16 (mask); } static force_inline __m128i -createMask_16_128 (uint16_t mask) +create_mask_16_128 (uint16_t mask) { return _mm_set1_epi16 (mask); } static force_inline __m64 -createMask_2x32_64 (uint32_t mask0, uint32_t mask1) +create_mask_2x32_64 (uint32_t mask0, + uint32_t mask1) { return _mm_set_pi32 (mask0, mask1); } +/* Work around a code generation bug in Sun Studio 12. 
*/ +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) +# define create_mask_2x32_128(mask0, mask1) \ + (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) +#else static force_inline __m128i -createMask_2x32_128 (uint32_t mask0, uint32_t mask1) +create_mask_2x32_128 (uint32_t mask0, + uint32_t mask1) { return _mm_set_epi32 (mask0, mask1, mask0, mask1); } +#endif /* SSE2 code patch for fbcompose.c */ -static FASTCALL void -sse2CombineOverU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineOverUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_over_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineOverReverseUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_over_reverse_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineInU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineInUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_in_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineInReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineReverseInUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_reverse_in_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOutU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineOutUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_out_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineReverseOutUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_reverse_out_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAtopU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineAtopUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + 
const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_atop_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineReverseAtopUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_reverse_atop_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineXorU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineXorUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_xor_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAddU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineAddUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_add_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineSaturateU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineSaturateUsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_saturate_u_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineSrcC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineSrcCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_src_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOverC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineOverCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_over_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineOverReverseCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_over_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineInC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const 
uint32_t *mask, int width) -{ - coreCombineInCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_in_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineInReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineInReverseCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_in_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOutC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineOutCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_out_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineOutReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineOutReverseCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_out_reverse_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAtopC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineAtopCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_atop_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineReverseAtopCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_reverse_atop_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineXorC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineXorCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + core_combine_xor_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -static FASTCALL void -sse2CombineAddC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dst, const uint32_t *src, const uint32_t *mask, int width) -{ - coreCombineAddCsse2 (dst, src, mask, width); - _mm_empty(); +static void +sse2_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dst, + const uint32_t * src, + const uint32_t * mask, + int width) +{ 
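/* These thin wrappers only adapt the core combiners to the
 * combiner entry-point signature pixman dispatches on; imp and
 * op are unused.  The trailing _mm_empty () is load-bearing:
 * the 1x64 helpers run in MMX registers, which alias the x87
 * floating-point stack, so EMMS must execute before any caller
 * touches floating point again. */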
+ core_combine_add_ca_sse2 (dst, src, mask, width); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolid_nx8888 +/* ------------------------------------------------------------------- + * composite_over_n_8888 */ static void -fbCompositeSolid_nx8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src; - uint32_t *dstLine, *dst, d; - uint16_t w; - int dstStride; - __m128i xmmSrc, xmmAlpha; - __m128i xmmDst, xmmDstLo, xmmDstHi; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +sse2_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, *dst, d; + uint16_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); while (height--) { - dst = dstLine; + dst = dst_line; - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); - dstLine += dstStride; - w = width; + dst_line += dst_stride; + w = width; - while (w && (unsigned long)dst & 15) - { - d = *dst; - *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), - _mm_movepi64_pi64 (xmmAlpha), - unpack_32_1x64 (d))); - w--; - } + while (w && (unsigned long)dst & 15) + { + d = *dst; + *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (d))); + w--; + } - cachePrefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)dst); - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)dst); + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)dst); - xmmDst = load128Aligned ((__m128i*)dst); + xmm_dst = load_128_aligned ((__m128i*)dst); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); - over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDstLo, &xmmDstHi); + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst_lo, &xmm_dst_hi); - /* rebuid the 4 pixel data and save*/ - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); + /* rebuid the 4 pixel data and save*/ + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); - w -= 4; - dst += 4; - } + w -= 4; + dst += 4; + } - while (w) - { - d = *dst; - *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), - _mm_movepi64_pi64 (xmmAlpha), - unpack_32_1x64 (d))); - w--; - } + while (w) + { + d = *dst; + *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + 
unpack_32_1x64 (d))); + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolid_nx0565 +/* --------------------------------------------------------------------- + * composite_over_n_0565 */ static void -fbCompositeSolid_nx0565sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src; - uint16_t *dstLine, *dst, d; - uint16_t w; - int dstStride; - __m128i xmmSrc, xmmAlpha; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +sse2_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst, d; + uint16_t w; + int dst_stride; + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); if (src == 0) - return; + return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); while (height--) { - dst = dstLine; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - - dstLine += dstStride; - w = width; - - while (w && (unsigned long)dst & 15) - { - d = *dst; - - *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), - _mm_movepi64_pi64 (xmmAlpha), - expand565_16_1x64 (d)))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)dst); - - xmmDst = load128Aligned ((__m128i*)dst); - - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - - over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst0, &xmmDst1); - over_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmDst2, &xmmDst3); - - xmmDst = pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - save128Aligned ((__m128i*)dst, xmmDst); - - dst += 8; - w -= 8; - } - - while (w--) - { - d = *dst; - *dst++ = pack565_32_16 (pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmmSrc), - _mm_movepi64_pi64 (xmmAlpha), - expand565_16_1x64 (d)))); - } + dst = dst_line; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + + dst_line += dst_stride; + w = width; + + while (w && (unsigned long)dst & 15) + { + d = *dst; + + *dst++ = pack_565_32_16 ( + pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + expand565_16_1x64 (d)))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)dst); + + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + 
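/* The r5g6b5 path reuses the 8888 helpers: unpack_565_128_4x128
 * widens eight 16-bit pixels into four registers of unpacked
 * channels, the usual over_2x128 runs on those, and
 * pack_565_4x128_128 narrows the result back to 565. */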
over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst0, &xmm_dst1); + over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_dst2, &xmm_dst3); + + xmm_dst = pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + save_128_aligned ((__m128i*)dst, xmm_dst); + + dst += 8; + w -= 8; + } + + while (w--) + { + d = *dst; + *dst++ = pack_565_32_16 ( + pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), + _mm_movepi64_pi64 (xmm_alpha), + expand565_16_1x64 (d)))); + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8888x8888C +/* ------------------------------ + * composite_add_n_8888_8888_ca */ - static void -fbCompositeSolidMask_nx8888x8888Csse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src; - uint32_t *dstLine, d; - uint32_t *maskLine, m; - uint32_t packCmp; - int dstStride, maskStride; - - __m128i xmmSrc, xmmAlpha; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - - __m64 mmxSrc, mmxAlpha, mmxMask, mmxDst; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); - +sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + srca = src >> 24; + if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - xmmSrc = _mm_unpacklo_epi8 (createMask_2x32_128 (src, src), _mm_setzero_si128 ()); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - mmxSrc = _mm_movepi64_pi64 (xmmSrc); - mmxAlpha = _mm_movepi64_pi64 (xmmAlpha); + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { - int w = width; - const uint32_t *pm = (uint32_t *)maskLine; - uint32_t *pd = (uint32_t *)dstLine; - - dstLine += dstStride; - maskLine += maskStride; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); - - while (w && (unsigned long)pd & 15) - { - m = *pm++; - - if (m) - { - d = *pd; - mmxMask = unpack_32_1x64 (m); - mmxDst = unpack_32_1x64 (d); - - *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDst)); - } - - pd++; - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)pd); - cachePrefetch ((__m128i*)pm); - - while (w >= 4) - { - /* fill cache 
line with next memory */ - cachePrefetchNext ((__m128i*)pd); - cachePrefetchNext ((__m128i*)pm); - - xmmMask = load128Unaligned ((__m128i*)pm); - - packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); - - /* if all bits in mask are zero, packCmp are equal to 0xffff */ - if (packCmp != 0xffff) - { - xmmDst = load128Aligned ((__m128i*)pd); - - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)pd, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - pd += 4; - pm += 4; - w -= 4; - } - - while (w) - { - m = *pm++; - - if (m) - { - d = *pd; - mmxMask = unpack_32_1x64 (m); - mmxDst = unpack_32_1x64 (d); - - *pd = pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDst)); - } - - pd++; - w--; - } + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + } + + pd++; + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); + + save_128_aligned ( + (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); + } + + pd++; + w--; + } } - _mm_empty(); + _mm_empty (); } - -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888x8x8888 +/* --------------------------------------------------------------------------- + * composite_over_n_8888_8888_ca */ static void -fbCompositeSrc_8888x8x8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - uint16_t w; - int dstStride, srcStride; - - __m128i xmmMask; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmAlphaLo, xmmAlphaHi; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, 
srcStride, srcLine, 1); - fbComposeGetSolid (pMask, mask, pDst->bits.format); - - xmmMask = createMask_16_128 (mask >> 24); - - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)src); - - while (w && (unsigned long)dst & 15) - { - uint32_t s = *src++; - uint32_t d = *dst; - - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expandAlpha_1x64 (ms); - __m64 dest = _mm_movepi64_pi64 (xmmMask); - __m64 alphaDst = unpack_32_1x64 (d); - - *dst++ = pack_1x64_32 (inOver_1x64 (&ms, - &alpha, - &dest, - &alphaDst)); +sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, d; + uint32_t *mask_line, m; + uint32_t pack_cmp; + int dst_stride, mask_stride; + + __m128i xmm_src, xmm_alpha; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); - w--; - } + if (src == 0) + return; - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)src); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)dst); - cachePrefetchNext ((__m128i*)src); + xmm_src = _mm_unpacklo_epi8 ( + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); - xmmSrc = load128Unaligned ((__m128i*)src); - xmmDst = load128Aligned ((__m128i*)dst); + while (height--) + { + int w = width; + const uint32_t *pm = (uint32_t *)mask_line; + uint32_t *pd = (uint32_t *)dst_line; + + dst_line += dst_stride; + mask_line += mask_stride; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); + + while (w && (unsigned long)pd & 15) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + pd++; + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)pd); + cache_prefetch ((__m128i*)pm); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)pd); + cache_prefetch_next ((__m128i*)pm); + + xmm_mask = load_128_unaligned ((__m128i*)pm); + + pack_cmp = + _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ + if (pack_cmp != 0xffff) + { + xmm_dst = load_128_aligned ((__m128i*)pd); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)pd, pack_2x128_128 
(xmm_dst_lo, xmm_dst_hi)); + } + + pd += 4; + pm += 4; + w -= 4; + } + + while (w) + { + m = *pm++; + + if (m) + { + d = *pd; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *pd = pack_1x64_32 ( + in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); + } + + pd++; + w--; + } + } - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); + _mm_empty (); +} - inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi); +/*--------------------------------------------------------------------- + * composite_over_8888_n_8888 + */ - save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); +static void +sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + uint16_t w; + int dst_stride, src_stride; - dst += 4; - src += 4; - w -= 4; - } + __m128i xmm_mask; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_alpha_lo, xmm_alpha_hi; - while (w) - { - uint32_t s = *src++; - uint32_t d = *dst; + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - __m64 ms = unpack_32_1x64 (s); - __m64 alpha = expandAlpha_1x64 (ms); - __m64 mask = _mm_movepi64_pi64 (xmmMask); - __m64 dest = unpack_32_1x64 (d); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); - *dst++ = pack_1x64_32 (inOver_1x64 (&ms, - &alpha, - &mask, - &dest)); + xmm_mask = create_mask_16_128 (mask >> 24); - w--; - } + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)src); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = *src++; + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 dest = _mm_movepi64_pi64 (xmm_mask); + __m64 alpha_dst = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); + + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)src); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)dst); + cache_prefetch_next ((__m128i*)src); + + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 4; + src += 4; + w -= 4; + } + + while (w) + { + uint32_t s = *src++; + uint32_t d = *dst; + + __m64 ms = unpack_32_1x64 (s); + __m64 alpha = expand_alpha_1x64 (ms); + __m64 mask = 
_mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&ms, &alpha, &mask, &dest)); + + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_x888xnx8888 +/* --------------------------------------------------------------------- + * composite_over_x888_n_8888 */ static void -fbCompositeSrc_x888xnx8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - uint32_t mask; - int dstStride, srcStride; - uint16_t w; - - __m128i xmmMask, xmmAlpha; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetSolid (pMask, mask, pDst->bits.format); - - xmmMask = createMask_16_128 (mask >> 24); - xmmAlpha = Mask00ff; - - while (height--) - { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)src); - - while (w && (unsigned long)dst & 15) - { - uint32_t s = (*src++) | 0xff000000; - uint32_t d = *dst; - - __m64 src = unpack_32_1x64 (s); - __m64 alpha = _mm_movepi64_pi64 (xmmAlpha); - __m64 mask = _mm_movepi64_pi64 (xmmMask); - __m64 dest = unpack_32_1x64 (d); - - *dst++ = pack_1x64_32 (inOver_1x64 (&src, - &alpha, - &mask, - &dest)); - - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)src); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)dst); - cachePrefetchNext ((__m128i*)src); - - xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000); - xmmDst = load128Aligned ((__m128i*)dst); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - inOver_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlpha, &xmmAlpha, &xmmMask, &xmmMask, &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - dst += 4; - src += 4; - w -= 4; +sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + int dst_stride, src_stride; + uint16_t w; - } + __m128i xmm_mask, xmm_alpha; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - while (w) - { - uint32_t s = (*src++) | 0xff000000; - uint32_t d = *dst; + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); - __m64 src = unpack_32_1x64 (s); - __m64 alpha = _mm_movepi64_pi64 (xmmAlpha); - __m64 mask = _mm_movepi64_pi64 (xmmMask); - __m64 dest = unpack_32_1x64 (d); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); - *dst++ 
= pack_1x64_32 (inOver_1x64 (&src, - &alpha, - &mask, - &dest)); + xmm_mask = create_mask_16_128 (mask >> 24); + xmm_alpha = mask_00ff; - w--; - } + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)src); + + while (w && (unsigned long)dst & 15) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m64 src = unpack_32_1x64 (s); + __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&src, &alpha, &mask, &dest)); + + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)src); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)dst); + cache_prefetch_next ((__m128i*)src); + + xmm_src = _mm_or_si128 ( + load_128_unaligned ((__m128i*)src), mask_ff000000); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha, &xmm_alpha, + &xmm_mask, &xmm_mask, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + dst += 4; + src += 4; + w -= 4; + + } + + while (w) + { + uint32_t s = (*src++) | 0xff000000; + uint32_t d = *dst; + + __m64 src = unpack_32_1x64 (s); + __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); + __m64 mask = _mm_movepi64_pi64 (xmm_mask); + __m64 dest = unpack_32_1x64 (d); + + *dst++ = pack_1x64_32 ( + in_over_1x64 (&src, &alpha, &mask, &dest)); + + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888x8888 +/* -------------------------------------------------------------------- + * composite_over_8888_8888 */ static void -fbCompositeSrc_8888x8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - int dstStride, srcStride; - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - - dst = dstLine; - src = srcLine; +sse2_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + int dst_stride, src_stride; + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + dst = dst_line; + src = src_line; while (height--) { - coreCombineOverUsse2 (dst, src, NULL, width); + core_combine_over_u_sse2 (dst, src, NULL, width); - dst += dstStride; - src += srcStride; + dst += dst_stride; + src += src_stride; } - _mm_empty(); + _mm_empty (); } -/* 
------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888x0565 +/* ------------------------------------------------------------------ + * composite_over_8888_0565 */ static force_inline uint16_t -fbCompositeSrc_8888x0565pixel (uint32_t src, uint16_t dst) +composite_over_8888_0565pixel (uint32_t src, uint16_t dst) { - __m64 ms; + __m64 ms; ms = unpack_32_1x64 (src); - return pack565_32_16( pack_1x64_32 (over_1x64 (ms, - expandAlpha_1x64 (ms), - expand565_16_1x64 (dst)))); + return pack_565_32_16 ( + pack_1x64_32 ( + over_1x64 ( + ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst)))); } static void -fbCompositeSrc_8888x0565sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint16_t *dstLine, *dst, d; - uint32_t *srcLine, *src, s; - int dstStride, srcStride; - uint16_t w; - - __m128i xmmAlphaLo, xmmAlphaHi; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; - - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +sse2_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint16_t w; + + __m128i xmm_alpha_lo, xmm_alpha_hi; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME @@ -3114,242 +3702,262 @@ fbCompositeSrc_8888x0565sse2 (pixman_implementation_t *imp, * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - src = srcLine; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - dstLine += dstStride; - srcLine += srcStride; - w = width; - - /* Align dst on a 16-byte boundary */ - while (w && - ((unsigned long)dst & 15)) - { - s = *src++; - d = *dst; - - *dst++ = fbCompositeSrc_8888x0565pixel (s, d); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - /* It's a 8 pixel loop */ - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - - /* I'm loading unaligned because I'm not sure about the address alignment. 
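- * (the head loop above realigned only dst; src may sit at any 4-byte
- * offset, and an aligned 128-bit load from it could fault)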
*/ - xmmSrc = load128Unaligned ((__m128i*) src); - xmmDst = load128Aligned ((__m128i*) dst); - - /* Unpacking */ - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); - - /* I'm loading next 4 pixels from memory before to optimze the memory read. */ - xmmSrc = load128Unaligned ((__m128i*) (src+4)); - - over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst0, &xmmDst1); - - /* Unpacking */ - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - expandAlpha_2x128 (xmmSrcLo, xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi); - - over_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmAlphaLo, &xmmAlphaHi, &xmmDst2, &xmmDst3); - - save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3)); - - w -= 8; - dst += 8; - src += 8; - } - - while (w--) - { - s = *src++; - d = *dst; - - *dst++ = fbCompositeSrc_8888x0565pixel (s, d); - } + dst = dst_line; + src = src_line; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Align dst on a 16-byte boundary */ + while (w && + ((unsigned long)dst & 15)) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + /* It's a 8 pixel loop */ + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + + /* I'm loading unaligned because I'm not sure + * about the address alignment. + */ + xmm_src = load_128_unaligned ((__m128i*) src); + xmm_dst = load_128_aligned ((__m128i*) dst); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + /* I'm loading next 4 pixels from memory + * before to optimze the memory read. 
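+ * (software pipelining: the load for the next iteration is issued
+ * early so it overlaps with the unpack/over arithmetic below)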
+ */ + xmm_src = load_128_unaligned ((__m128i*) (src + 4)); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst0, &xmm_dst1); + + /* Unpacking */ + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi); + + over_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_alpha_lo, &xmm_alpha_hi, + &xmm_dst2, &xmm_dst3); + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + src += 8; + } + + while (w--) + { + s = *src++; + d = *dst; + + *dst++ = composite_over_8888_0565pixel (s, d); + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8x8888 +/* ----------------------------------------------------------------- + * composite_over_n_8_8888 */ static void -fbCompositeSolidMask_nx8x8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; +sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; uint32_t m, d; - __m128i xmmSrc, xmmAlpha, xmmDef; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; + __m128i xmm_src, xmm_alpha, xmm_def; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest; + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - xmmDef = createMask_2x32_128 (src, src); - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - mmxSrc = _mm_movepi64_pi64 (xmmSrc); - mmxAlpha = _mm_movepi64_pi64 (xmmAlpha); + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - uint8_t m = *mask++; - - if (m) - { - d = *dst; - mmxMask = expandPixel_8_1x64 (m); - mmxDest = unpack_32_1x64 (d); - - *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest)); - } 
- - w--; - dst++; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - m = *((uint32_t*)mask); - - if (srca == 0xff && m == 0xffffffff) - { - save128Aligned ((__m128i*)dst, xmmDef); - } - else if (m) - { - xmmDst = load128Aligned ((__m128i*) dst); - xmmMask = unpack_32_1x128 (m); - xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); - - /* Unpacking */ - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - w -= 4; - dst += 4; - mask += 4; - } - - while (w) - { - uint8_t m = *mask++; - - if (m) - { - d = *dst; - mmxMask = expandPixel_8_1x64 (m); - mmxDest = unpack_32_1x64 (d); - - *dst = pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest)); - } - - w--; - dst++; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_dst = load_128_aligned ((__m128i*) dst); + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_pixel_8_1x64 (m); + mmx_dest = unpack_32_1x64 (d); + + *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, + &mmx_alpha, + &mmx_mask, + &mmx_dest)); + } + + w--; + dst++; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8x8888 +/* ---------------------------------------------------------------- + * composite_over_n_8_8888 */ pixman_bool_t -pixmanFillsse2 (uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t data) +pixman_fill_sse2 (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t data) { - uint32_t 
byte_width; - uint8_t *byte_line; + uint32_t byte_width; + uint8_t *byte_line; - __m128i xmmDef; + __m128i xmm_def; if (bpp == 16 && (data >> 16 != (data & 0xffff))) return FALSE; @@ -3359,433 +3967,459 @@ pixmanFillsse2 (uint32_t *bits, if (bpp == 16) { - stride = stride * (int) sizeof (uint32_t) / 2; - byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); - byte_width = 2 * width; - stride *= 2; + stride = stride * (int) sizeof (uint32_t) / 2; + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); + byte_width = 2 * width; + stride *= 2; } else { - stride = stride * (int) sizeof (uint32_t) / 4; - byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); - byte_width = 4 * width; - stride *= 4; + stride = stride * (int) sizeof (uint32_t) / 4; + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); + byte_width = 4 * width; + stride *= 4; } - cachePrefetch ((__m128i*)byte_line); - xmmDef = createMask_2x32_128 (data, data); + cache_prefetch ((__m128i*)byte_line); + xmm_def = create_mask_2x32_128 (data, data); while (height--) { - int w; - uint8_t *d = byte_line; - byte_line += stride; - w = byte_width; - - - cachePrefetchNext ((__m128i*)d); - - while (w >= 2 && ((unsigned long)d & 3)) - { - *(uint16_t *)d = data; - w -= 2; - d += 2; - } - - while (w >= 4 && ((unsigned long)d & 15)) - { - *(uint32_t *)d = data; - - w -= 4; - d += 4; - } - - cachePrefetchNext ((__m128i*)d); - - while (w >= 128) - { - cachePrefetch (((__m128i*)d) + 12); - - save128Aligned ((__m128i*)(d), xmmDef); - save128Aligned ((__m128i*)(d+16), xmmDef); - save128Aligned ((__m128i*)(d+32), xmmDef); - save128Aligned ((__m128i*)(d+48), xmmDef); - save128Aligned ((__m128i*)(d+64), xmmDef); - save128Aligned ((__m128i*)(d+80), xmmDef); - save128Aligned ((__m128i*)(d+96), xmmDef); - save128Aligned ((__m128i*)(d+112), xmmDef); - - d += 128; - w -= 128; - } - - if (w >= 64) - { - cachePrefetch (((__m128i*)d) + 8); - - save128Aligned ((__m128i*)(d), xmmDef); - save128Aligned ((__m128i*)(d+16), xmmDef); - save128Aligned ((__m128i*)(d+32), xmmDef); - save128Aligned ((__m128i*)(d+48), xmmDef); - - d += 64; - w -= 64; - } - - cachePrefetchNext ((__m128i*)d); - - if (w >= 32) - { - save128Aligned ((__m128i*)(d), xmmDef); - save128Aligned ((__m128i*)(d+16), xmmDef); - - d += 32; - w -= 32; - } - - if (w >= 16) - { - save128Aligned ((__m128i*)(d), xmmDef); - - d += 16; - w -= 16; - } - - cachePrefetchNext ((__m128i*)d); - - while (w >= 4) - { - *(uint32_t *)d = data; - - w -= 4; - d += 4; - } - - if (w >= 2) - { - *(uint16_t *)d = data; - w -= 2; - d += 2; - } + int w; + uint8_t *d = byte_line; + byte_line += stride; + w = byte_width; + + + cache_prefetch_next ((__m128i*)d); + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = data; + w -= 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = data; + + w -= 4; + d += 4; + } + + cache_prefetch_next ((__m128i*)d); + + while (w >= 128) + { + cache_prefetch (((__m128i*)d) + 12); + + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + save_128_aligned ((__m128i*)(d + 32), xmm_def); + save_128_aligned ((__m128i*)(d + 48), xmm_def); + save_128_aligned ((__m128i*)(d + 64), xmm_def); + save_128_aligned ((__m128i*)(d + 80), xmm_def); + save_128_aligned ((__m128i*)(d + 96), xmm_def); + save_128_aligned ((__m128i*)(d + 112), xmm_def); + + d += 128; + w -= 128; + } + + if (w >= 64) + { + cache_prefetch (((__m128i*)d) + 8); + + save_128_aligned ((__m128i*)(d), xmm_def); + 
save_128_aligned ((__m128i*)(d + 16), xmm_def); + save_128_aligned ((__m128i*)(d + 32), xmm_def); + save_128_aligned ((__m128i*)(d + 48), xmm_def); + + d += 64; + w -= 64; + } + + cache_prefetch_next ((__m128i*)d); + + if (w >= 32) + { + save_128_aligned ((__m128i*)(d), xmm_def); + save_128_aligned ((__m128i*)(d + 16), xmm_def); + + d += 32; + w -= 32; + } + + if (w >= 16) + { + save_128_aligned ((__m128i*)(d), xmm_def); + + d += 16; + w -= 16; + } + + cache_prefetch_next ((__m128i*)d); + + while (w >= 4) + { + *(uint32_t *)d = data; + + w -= 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = data; + w -= 2; + d += 2; + } } - _mm_empty(); + _mm_empty (); return TRUE; } static void -fbCompositeSolidMaskSrc_nx8x8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint32_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t m; - - __m128i xmmSrc, xmmDef; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + uint32_t m; + + __m128i xmm_src, xmm_def; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) { - pixmanFillsse2 (pDst->bits.bits, pDst->bits.rowstride, - PIXMAN_FORMAT_BPP (pDst->bits.format), - xDst, yDst, width, height, 0); - return; + pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (dst_image->bits.format), + dest_x, dest_y, width, height, 0); + return; } - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - xmmDef = createMask_2x32_128 (src, src); - xmmSrc = expandPixel_32_1x128 (src); + xmm_def = create_mask_2x32_128 (src, src); + xmm_src = expand_pixel_32_1x128 (src); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - uint8_t m = *mask++; - - if (m) - { - *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m))); - } - else - { - *dst = 0; - } - - w--; - dst++; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - m = *((uint32_t*)mask); - - if (srca == 0xff && m == 0xffffffff) - { - save128Aligned ((__m128i*)dst, xmmDef); - } - else if (m) - { - 
xmmMask = unpack_32_1x128 (m); - xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); - - /* Unpacking */ - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - pixMultiply_2x128 (&xmmSrc, &xmmSrc, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmMaskLo, xmmMaskHi)); - } - else - { - save128Aligned ((__m128i*)dst, _mm_setzero_si128()); - } - - w -= 4; - dst += 4; - mask += 4; - } - - while (w) - { - uint8_t m = *mask++; - - if (m) - { - *dst = pack_1x64_32 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmSrc), expandPixel_8_1x64 (m))); - } - else - { - *dst = 0; - } - - w--; - dst++; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x64_32 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + m = *((uint32_t*)mask); + + if (srca == 0xff && m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_def); + } + else if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_src, &xmm_src, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); + } + else + { + save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); + } + + w -= 4; + dst += 4; + mask += 4; + } + + while (w) + { + uint8_t m = *mask++; + + if (m) + { + *dst = pack_1x64_32 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); + } + else + { + *dst = 0; + } + + w--; + dst++; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8x0565 +/*----------------------------------------------------------------------- + * composite_over_n_8_0565 */ static void -fbCompositeSolidMask_nx8x0565sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t src, srca; - uint16_t *dstLine, *dst, d; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; +sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint16_t *dst_line, *dst, d; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; uint32_t m; - __m64 mmxSrc, 
mmxAlpha, mmxMask, mmxDest; + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; - __m128i xmmSrc, xmmAlpha; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - mmxSrc = _mm_movepi64_pi64 (xmmSrc); - mmxAlpha = _mm_movepi64_pi64 (xmmAlpha); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - m = *mask++; - - if (m) - { - d = *dst; - mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m)); - mmxDest = expand565_16_1x64 (d); - - *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest))); - } - - w--; - dst++; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - xmmDst = load128Aligned ((__m128i*) dst); - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - - m = *((uint32_t*)mask); - mask += 4; - - if (m) - { - xmmMask = unpack_32_1x128 (m); - xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); - - /* Unpacking */ - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1); - } - - m = *((uint32_t*)mask); - mask += 4; - - if (m) - { - xmmMask = unpack_32_1x128 (m); - xmmMask = _mm_unpacklo_epi8 (xmmMask, _mm_setzero_si128()); - - /* Unpacking */ - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - inOver_2x128 (&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3); - } - - save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3)); - - w -= 8; - dst += 8; - } - - while (w) - { - m = *mask++; - - if (m) - { - d = *dst; - mmxMask = expandAlphaRev_1x64 (unpack_32_1x64 (m)); - mmxDest = expand565_16_1x64 (d); - - *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest))); - } - - w--; - dst++; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + 
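+ /* head loop: single pixels via MMX until dst reaches a 16-byte
+ * boundary, so the 8-pixel loop below can use aligned 128-bit access */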
m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + xmm_dst = load_128_aligned ((__m128i*) dst); + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + m = *((uint32_t*)mask); + mask += 4; + + if (m) + { + xmm_mask = unpack_32_1x128 (m); + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); + + /* Unpacking */ + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + } + + while (w) + { + m = *mask++; + + if (m) + { + d = *dst; + mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888RevNPx0565 +/* ----------------------------------------------------------------------- + * composite_over_pixbuf_0565 */ static void -fbCompositeSrc_8888RevNPx0565sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint16_t *dstLine, *dst, d; - uint32_t *srcLine, *src, s; - int dstStride, srcStride; - uint16_t w; - uint32_t opaque, zero; +sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint16_t w; + uint32_t opaque, zero; __m64 ms; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, 
dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME @@ -3793,133 +4427,144 @@ fbCompositeSrc_8888RevNPx0565sse2 (pixman_implementation_t *imp, * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. */ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - s = *src++; - d = *dst; - - ms = unpack_32_1x64 (s); - - *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d)))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - - /* First round */ - xmmSrc = load128Unaligned((__m128i*)src); - xmmDst = load128Aligned ((__m128i*)dst); - - opaque = isOpaque (xmmSrc); - zero = isZero (xmmSrc); - - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - - /* preload next round*/ - xmmSrc = load128Unaligned((__m128i*)(src+4)); - - if (opaque) - { - invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1); - } - else if (!zero) - { - overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst0, &xmmDst1); - } - - /* Second round */ - opaque = isOpaque (xmmSrc); - zero = isZero (xmmSrc); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - - if (opaque) - { - invertColors_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3); - } - else if (zero) - { - overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDst2, &xmmDst3); - } - - save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3)); - - w -= 8; - src += 8; - dst += 8; - } - - while (w) - { - s = *src++; - d = *dst; - - ms = unpack_32_1x64 (s); - - *dst++ = pack565_32_16 (pack_1x64_32 (overRevNonPre_1x64(ms, expand565_16_1x64 (d)))); - w--; - } + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x64 (s); + + *dst++ = pack_565_32_16 ( + pack_1x64_32 ( + over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + + /* First round */ + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + /* preload next round*/ + xmm_src = load_128_unaligned ((__m128i*)(src + 4)); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst0, &xmm_dst1); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + 
&xmm_dst0, &xmm_dst1); + } + + /* Second round */ + opaque = is_opaque (xmm_src); + zero = is_zero (xmm_src); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + else if (!zero) + { + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + src += 8; + dst += 8; + } + + while (w) + { + s = *src++; + d = *dst; + + ms = unpack_32_1x64 (s); + + *dst++ = pack_565_32_16 ( + pack_1x64_32 ( + over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); + w--; + } } - _mm_empty(); + _mm_empty (); } -/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ - -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrc_8888RevNPx8888 +/* ------------------------------------------------------------------------- + * composite_over_pixbuf_8888 */ static void -fbCompositeSrc_8888RevNPx8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *dstLine, *dst, d; - uint32_t *srcLine, *src, s; - int dstStride, srcStride; - uint16_t w; - uint32_t opaque, zero; - - __m128i xmmSrcLo, xmmSrcHi; - __m128i xmmDstLo, xmmDstHi; - - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); +sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst, d; + uint32_t *src_line, *src, s; + int dst_stride, src_stride; + uint16_t w; + uint32_t opaque, zero; + + __m128i xmm_src_lo, xmm_src_hi; + __m128i xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME @@ -3927,1031 +4572,1103 @@ fbCompositeSrc_8888RevNPx8888sse2 (pixman_implementation_t *imp, * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. 
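 * (the disabled assert below records the pixbuf assumption that the
 * source and mask images share one drawable)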
*/ - assert (pSrc->pDrawable == pMask->pDrawable); + assert (src_image->drawable == mask_image->drawable); #endif while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w && (unsigned long)dst & 15) - { - s = *src++; - d = *dst; - - *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); - - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - - xmmSrcHi = load128Unaligned((__m128i*)src); - - opaque = isOpaque (xmmSrcHi); - zero = isZero (xmmSrcHi); - - unpack_128_2x128 (xmmSrcHi, &xmmSrcLo, &xmmSrcHi); - - if (opaque) - { - invertColors_2x128( xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - else if (!zero) - { - xmmDstHi = load128Aligned ((__m128i*)dst); - - unpack_128_2x128 (xmmDstHi, &xmmDstLo, &xmmDstHi); - - overRevNonPre_2x128 (xmmSrcLo, xmmSrcHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - w -= 4; - dst += 4; - src += 4; - } - - while (w) - { - s = *src++; - d = *dst; - - *dst++ = pack_1x64_32 (overRevNonPre_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); - - w--; - } + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w && (unsigned long)dst & 15) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x64_32 ( + over_rev_non_pre_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + + xmm_src_hi = load_128_unaligned ((__m128i*)src); + + opaque = is_opaque (xmm_src_hi); + zero = is_zero (xmm_src_hi); + + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); + + if (opaque) + { + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + else if (!zero) + { + xmm_dst_hi = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); + + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + w -= 4; + dst += 4; + src += 4; + } + + while (w) + { + s = *src++; + d = *dst; + + *dst++ = pack_1x64_32 ( + over_rev_non_pre_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + + w--; + } } - _mm_empty(); + _mm_empty (); } /* ------------------------------------------------------------------------------------------------- - * fbCompositeSolidMask_nx8888x0565C + * composite_over_n_8888_0565_ca */ static void -fbCompositeSolidMask_nx8888x0565Csse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t 
yDst, - int32_t width, - int32_t height) -{ - uint32_t src; - uint16_t *dstLine, *dst, d; - uint32_t *maskLine, *mask, m; - int dstStride, maskStride; +sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint16_t *dst_line, *dst, d; + uint32_t *mask_line, *mask, m; + int dst_stride, mask_stride; int w; - uint32_t packCmp; + uint32_t pack_cmp; - __m128i xmmSrc, xmmAlpha; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - __m128i xmmDst, xmmDst0, xmmDst1, xmmDst2, xmmDst3; + __m128i xmm_src, xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; - __m64 mmxSrc, mmxAlpha, mmxMask, mmxDest; + __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); if (src == 0) - return; + return; - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint32_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); - xmmSrc = expandPixel_32_1x128 (src); - xmmAlpha = expandAlpha_1x128 (xmmSrc); - mmxSrc = _mm_movepi64_pi64 (xmmSrc); - mmxAlpha = _mm_movepi64_pi64 (xmmAlpha); + xmm_src = expand_pixel_32_1x128 (src); + xmm_alpha = expand_alpha_1x128 (xmm_src); + mmx_src = _mm_movepi64_pi64 (xmm_src); + mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { - w = width; - mask = maskLine; - dst = dstLine; - maskLine += maskStride; - dstLine += dstStride; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && ((unsigned long)dst & 15)) - { - m = *(uint32_t *) mask; - - if (m) - { - d = *dst; - mmxMask = unpack_32_1x64 (m); - mmxDest = expand565_16_1x64 (d); - - *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest))); - } - - w--; - dst++; - mask++; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 8) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - /* First round */ - xmmMask = load128Unaligned((__m128i*)mask); - xmmDst = load128Aligned((__m128i*)dst); - - packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); - - unpack565_128_4x128 (xmmDst, &xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3); - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - /* preload next round*/ - xmmMask = load128Unaligned((__m128i*)(mask+4)); - /* preload next round*/ - - if (packCmp != 0xffff) - { - inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst0, &xmmDst1); - } - - /* Second round */ - packCmp = _mm_movemask_epi8 (_mm_cmpeq_epi32 (xmmMask, _mm_setzero_si128())); - - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - - if (packCmp != 0xffff) - { - inOver_2x128(&xmmSrc, &xmmSrc, &xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmDst2, &xmmDst3); - } - - save128Aligned ((__m128i*)dst, pack565_4x128_128 (&xmmDst0, &xmmDst1, &xmmDst2, &xmmDst3)); - - 
w -= 8; - dst += 8; - mask += 8; - } - - while (w) - { - m = *(uint32_t *) mask; - - if (m) - { - d = *dst; - mmxMask = unpack_32_1x64 (m); - mmxDest = expand565_16_1x64 (d); - - *dst = pack565_32_16 (pack_1x64_32 (inOver_1x64 (&mmxSrc, - &mmxAlpha, - &mmxMask, - &mmxDest))); - } - - w--; - dst++; - mask++; - } + w = width; + mask = mask_line; + dst = dst_line; + mask_line += mask_stride; + dst_line += dst_stride; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + m = *(uint32_t *) mask; + + if (m) + { + d = *dst; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + mask++; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 8) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + /* First round */ + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + unpack_565_128_4x128 (xmm_dst, + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + /* preload next round */ + xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); + + /* preload next round */ + if (pack_cmp != 0xffff) + { + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst0, &xmm_dst1); + } + + /* Second round */ + pack_cmp = _mm_movemask_epi8 ( + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + + if (pack_cmp != 0xffff) + { + in_over_2x128 (&xmm_src, &xmm_src, + &xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_dst2, &xmm_dst3); + } + + save_128_aligned ( + (__m128i*)dst, pack_565_4x128_128 ( + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); + + w -= 8; + dst += 8; + mask += 8; + } + + while (w) + { + m = *(uint32_t *) mask; + + if (m) + { + d = *dst; + mmx_mask = unpack_32_1x64 (m); + mmx_dest = expand565_16_1x64 (d); + + *dst = pack_565_32_16 ( + pack_1x64_32 ( + in_over_1x64 ( + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); + } + + w--; + dst++; + mask++; + } } _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeIn_nx8x8 +/* ----------------------------------------------------------------------- + * composite_in_n_8_8 */ static void -fbCompositeIn_nx8x8sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w, d, m; - uint32_t src; - uint8_t sa; - - __m128i xmmAlpha; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - - fbComposeGetSolid(pSrc, src, pDst->bits.format); +sse2_composite_in_n_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + 
pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w, d, m; + uint32_t src; + uint8_t sa; + + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; - if (sa == 0) - return; - xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src)); + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && ((unsigned long)dst & 15)) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w >= 16) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - xmmMask = load128Unaligned((__m128i*)mask); - xmmDst = load128Aligned((__m128i*)dst); - - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - pixMultiply_2x128 (&xmmMaskLo, &xmmMaskHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - mask += 16; - dst += 16; - w -= 16; - } - - while (w) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), + unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 16) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, 
&xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeIn_8x8 +/* --------------------------------------------------------------------------- + * composite_in_8_8 */ static void -fbCompositeIn_8x8sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int srcStride, dstStride; - uint16_t w; - uint32_t s, d; - - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); +sse2_composite_in_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int src_stride, dst_stride; + uint16_t w; + uint32_t s, d; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w && ((unsigned long)dst & 15)) - { - s = (uint32_t) *src++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - while (w >= 16) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - - xmmSrc = load128Unaligned((__m128i*)src); - xmmDst = load128Aligned((__m128i*)dst); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - pixMultiply_2x128 (&xmmSrcLo, &xmmSrcHi, &xmmDstLo, &xmmDstHi, &xmmDstLo, &xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - src += 16; - dst += 16; - w -= 16; - } - - while (w) - { - s = (uint32_t) *src++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (pixMultiply_1x64 (unpack_32_1x64 (s),unpack_32_1x64 (d))); - w--; - } + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = 
(uint8_t) pack_1x64_32 ( + pix_multiply_1x64 ( + unpack_32_1x64 (s), unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + while (w >= 16) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + + xmm_src = load_128_unaligned ((__m128i*)src); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, + &xmm_dst_lo, &xmm_dst_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + src += 16; + dst += 16; + w -= 16; + } + + while (w) + { + s = (uint32_t) *src++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); + w--; + } } _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrcAdd_8888x8x8 +/* ------------------------------------------------------------------------- + * composite_add_8888_8_8 */ static void -fbCompositeSrcAdd_8888x8x8sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *maskLine, *mask; - int dstStride, maskStride; - uint16_t w; - uint32_t src; - uint8_t sa; +sse2_composite_add_8888_8_8 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + uint16_t w; + uint32_t src; + uint8_t sa; uint32_t m, d; - __m128i xmmAlpha; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; + __m128i xmm_alpha; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); - fbComposeGetSolid(pSrc, src, pDst->bits.format); + src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; - if (sa == 0) - return; - xmmAlpha = expandAlpha_1x128 (expandPixel_32_1x128 (src)); + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); while (height--) { - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch ((__m128i*)dst); - - while (w && ((unsigned long)dst & 15)) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)mask); - cachePrefetch 
((__m128i*)dst); - - while (w >= 16) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)mask); - cachePrefetchNext ((__m128i*)dst); - - xmmMask = load128Unaligned((__m128i*)mask); - xmmDst = load128Aligned((__m128i*)dst); - - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - pixMultiply_2x128 (&xmmAlpha, &xmmAlpha, &xmmMaskLo, &xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - xmmDstLo = _mm_adds_epu16 (xmmMaskLo, xmmDstLo); - xmmDstHi = _mm_adds_epu16 (xmmMaskHi, xmmDstHi); - - save128Aligned ((__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - - mask += 16; - dst += 16; - w -= 16; - } - - while (w) - { - m = (uint32_t) *mask++; - d = (uint32_t) *dst; - - *dst++ = (uint8_t) pack_1x64_32 (_mm_adds_pu16 (pixMultiply_1x64 (_mm_movepi64_pi64 (xmmAlpha), unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - w--; - } + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w && ((unsigned long)dst & 15)) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + _mm_adds_pu16 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)mask); + cache_prefetch ((__m128i*)dst); + + while (w >= 16) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)mask); + cache_prefetch_next ((__m128i*)dst); + + xmm_mask = load_128_unaligned ((__m128i*)mask); + xmm_dst = load_128_aligned ((__m128i*)dst); + + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, + &xmm_mask_lo, &xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + + mask += 16; + dst += 16; + w -= 16; + } + + while (w) + { + m = (uint32_t) *mask++; + d = (uint32_t) *dst; + + *dst++ = (uint8_t) pack_1x64_32 ( + _mm_adds_pu16 ( + pix_multiply_1x64 ( + _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrcAdd_8000x8000 +/* ---------------------------------------------------------------------- + * composite_add_8000_8000 */ static void -fbCompositeSrcAdd_8000x8000sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint8_t *dstLine, *dst; - uint8_t *srcLine, *src; - int dstStride, srcStride; - uint16_t w; - uint16_t t; - - fbComposeGetStart (pSrc, xSrc, ySrc, uint8_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint8_t, dstStride, dstLine, 1); +sse2_composite_add_8000_8000 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t 
dest_y, + int32_t width, + int32_t height) +{ + uint8_t *dst_line, *dst; + uint8_t *src_line, *src; + int dst_stride, src_stride; + uint16_t w; + uint16_t t; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - src = srcLine; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - - dstLine += dstStride; - srcLine += srcStride; - w = width; - - /* Small head */ - while (w && (unsigned long)dst & 3) - { - t = (*dst) + (*src++); - *dst++ = t | (0 - (t >> 8)); - w--; - } - - coreCombineAddUsse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); - - /* Small tail */ - dst += w & 0xfffc; - src += w & 0xfffc; - - w &= 3; - - while (w) - { - t = (*dst) + (*src++); - *dst++ = t | (0 - (t >> 8)); - w--; - } + dst = dst_line; + src = src_line; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + + dst_line += dst_stride; + src_line += src_stride; + w = width; + + /* Small head */ + while (w && (unsigned long)dst & 3) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } + + core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); + + /* Small tail */ + dst += w & 0xfffc; + src += w & 0xfffc; + + w &= 3; + + while (w) + { + t = (*dst) + (*src++); + *dst++ = t | (0 - (t >> 8)); + w--; + } } - _mm_empty(); + _mm_empty (); } -/* ------------------------------------------------------------------------------------------------- - * fbCompositeSrcAdd_8888x8888 +/* --------------------------------------------------------------------- + * composite_add_8888_8888 */ static void -fbCompositeSrcAdd_8888x8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *dstLine, *dst; - uint32_t *srcLine, *src; - int dstStride, srcStride; - - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); +sse2_composite_add_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) { - dst = dstLine; - dstLine += dstStride; - src = srcLine; - srcLine += srcStride; + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; - coreCombineAddUsse2 (dst, src, NULL, width); + core_combine_add_u_sse2 (dst, src, NULL, width); } - _mm_empty(); + _mm_empty (); } /* ------------------------------------------------------------------------------------------------- - * fbCompositeCopyAreasse2 + * sse2_composite_copy_area */ static pixman_bool_t -pixmanBltsse2 (uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, 
int src_y, - int dst_x, int dst_y, - int width, int height) -{ - uint8_t * src_bytes; - uint8_t * dst_bytes; - int byte_width; +pixman_blt_sse2 (uint32_t *src_bits, + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; if (src_bpp != dst_bpp) - return FALSE; + return FALSE; if (src_bpp == 16) { - src_stride = src_stride * (int) sizeof (uint32_t) / 2; - dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; - src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); - byte_width = 2 * width; - src_stride *= 2; - dst_stride *= 2; + src_stride = src_stride * (int) sizeof (uint32_t) / 2; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; + src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; } else if (src_bpp == 32) { - src_stride = src_stride * (int) sizeof (uint32_t) / 4; - dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; - src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); - dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); - byte_width = 4 * width; - src_stride *= 4; - dst_stride *= 4; + src_stride = src_stride * (int) sizeof (uint32_t) / 4; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; } else { - return FALSE; + return FALSE; } - cachePrefetch ((__m128i*)src_bytes); - cachePrefetch ((__m128i*)dst_bytes); + cache_prefetch ((__m128i*)src_bytes); + cache_prefetch ((__m128i*)dst_bytes); while (height--) { - int w; - uint8_t *s = src_bytes; - uint8_t *d = dst_bytes; - src_bytes += src_stride; - dst_bytes += dst_stride; - w = byte_width; - - cachePrefetchNext ((__m128i*)s); - cachePrefetchNext ((__m128i*)d); - - while (w >= 2 && ((unsigned long)d & 3)) - { - *(uint16_t *)d = *(uint16_t *)s; - w -= 2; - s += 2; - d += 2; - } - - while (w >= 4 && ((unsigned long)d & 15)) - { - *(uint32_t *)d = *(uint32_t *)s; - - w -= 4; - s += 4; - d += 4; - } - - cachePrefetchNext ((__m128i*)s); - cachePrefetchNext ((__m128i*)d); - - while (w >= 64) - { - __m128i xmm0, xmm1, xmm2, xmm3; - - /* 128 bytes ahead */ - cachePrefetch (((__m128i*)s) + 8); - cachePrefetch (((__m128i*)d) + 8); - - xmm0 = load128Unaligned ((__m128i*)(s)); - xmm1 = load128Unaligned ((__m128i*)(s+16)); - xmm2 = load128Unaligned ((__m128i*)(s+32)); - xmm3 = load128Unaligned ((__m128i*)(s+48)); - - save128Aligned ((__m128i*)(d), xmm0); - save128Aligned ((__m128i*)(d+16), xmm1); - save128Aligned ((__m128i*)(d+32), xmm2); - save128Aligned ((__m128i*)(d+48), xmm3); - - s += 64; - d += 64; - w -= 64; - } - - cachePrefetchNext ((__m128i*)s); - cachePrefetchNext ((__m128i*)d); - - while (w >= 16) - { - save128Aligned ((__m128i*)d, load128Unaligned ((__m128i*)s) ); - - w -= 16; - d += 16; - s += 16; - } - - cachePrefetchNext ((__m128i*)s); - cachePrefetchNext ((__m128i*)d); - - while (w >= 4) - { - *(uint32_t *)d = *(uint32_t *)s; - - w -= 4; - s += 4; - d += 4; - } - - if (w >= 
2) - { - *(uint16_t *)d = *(uint16_t *)s; - w -= 2; - s += 2; - d += 2; - } + int w; + uint8_t *s = src_bytes; + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + cache_prefetch_next ((__m128i*)s); + cache_prefetch_next ((__m128i*)d); + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 15)) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + cache_prefetch_next ((__m128i*)s); + cache_prefetch_next ((__m128i*)d); + + while (w >= 64) + { + __m128i xmm0, xmm1, xmm2, xmm3; + + /* 128 bytes ahead */ + cache_prefetch (((__m128i*)s) + 8); + cache_prefetch (((__m128i*)d) + 8); + + xmm0 = load_128_unaligned ((__m128i*)(s)); + xmm1 = load_128_unaligned ((__m128i*)(s + 16)); + xmm2 = load_128_unaligned ((__m128i*)(s + 32)); + xmm3 = load_128_unaligned ((__m128i*)(s + 48)); + + save_128_aligned ((__m128i*)(d), xmm0); + save_128_aligned ((__m128i*)(d + 16), xmm1); + save_128_aligned ((__m128i*)(d + 32), xmm2); + save_128_aligned ((__m128i*)(d + 48), xmm3); + + s += 64; + d += 64; + w -= 64; + } + + cache_prefetch_next ((__m128i*)s); + cache_prefetch_next ((__m128i*)d); + + while (w >= 16) + { + save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); + + w -= 16; + d += 16; + s += 16; + } + + cache_prefetch_next ((__m128i*)s); + cache_prefetch_next ((__m128i*)d); + + while (w >= 4) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + + if (w >= 2) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } } - _mm_empty(); + _mm_empty (); return TRUE; } static void -fbCompositeCopyAreasse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - pixmanBltsse2 (pSrc->bits.bits, - pDst->bits.bits, - pSrc->bits.rowstride, - pDst->bits.rowstride, - PIXMAN_FORMAT_BPP (pSrc->bits.format), - PIXMAN_FORMAT_BPP (pDst->bits.format), - xSrc, ySrc, xDst, yDst, width, height); +sse2_composite_copy_area (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + pixman_blt_sse2 (src_image->bits.bits, + dst_image->bits.bits, + src_image->bits.rowstride, + dst_image->bits.rowstride, + PIXMAN_FORMAT_BPP (src_image->bits.format), + PIXMAN_FORMAT_BPP (dst_image->bits.format), + src_x, src_y, dest_x, dest_y, width, height); } #if 0 /* This code are buggy in MMX version, now the bug was translated to SSE2 version */ void -fbCompositeOver_x888x8x8888sse2 (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int32_t xSrc, - int32_t ySrc, - int32_t xMask, - int32_t yMask, - int32_t xDst, - int32_t yDst, - int32_t width, - int32_t height) -{ - uint32_t *src, *srcLine, s; - uint32_t *dst, *dstLine, d; - uint8_t *mask, *maskLine; - uint32_t m; - int srcStride, maskStride, dstStride; +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + 
int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, mask_stride, dst_stride; uint16_t w; - __m128i xmmSrc, xmmSrcLo, xmmSrcHi; - __m128i xmmDst, xmmDstLo, xmmDstHi; - __m128i xmmMask, xmmMaskLo, xmmMaskHi; + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - fbComposeGetStart (pMask, xMask, yMask, uint8_t, maskStride, maskLine, 1); - fbComposeGetStart (pSrc, xSrc, ySrc, uint32_t, srcStride, srcLine, 1); + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { - src = srcLine; - srcLine += srcStride; - dst = dstLine; - dstLine += dstStride; - mask = maskLine; - maskLine += maskStride; - - w = width; - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)mask); - - while (w && (unsigned long)dst & 15) - { - s = 0xff000000 | *src++; - m = (uint32_t) *mask++; - d = *dst; - - __m64 ms = unpack_32_1x64 (s); - - if (m != 0xff) - { - ms = inOver_1x64 (ms, - xMask00ff, - expandAlphaRev_1x64 (unpack_32_1x64 (m)), - unpack_32_1x64 (d)); - } - - *dst++ = pack_1x64_32 (ms); - w--; - } - - /* call prefetch hint to optimize cache load*/ - cachePrefetch ((__m128i*)src); - cachePrefetch ((__m128i*)dst); - cachePrefetch ((__m128i*)mask); - - while (w >= 4) - { - /* fill cache line with next memory */ - cachePrefetchNext ((__m128i*)src); - cachePrefetchNext ((__m128i*)dst); - cachePrefetchNext ((__m128i*)mask); - - m = *(uint32_t*) mask; - xmmSrc = _mm_or_si128 (load128Unaligned ((__m128i*)src), Maskff000000); - - if (m == 0xffffffff) - { - save128Aligned ((__m128i*)dst, xmmSrc); - } - else - { - xmmDst = load128Aligned ((__m128i*)dst); - - xmmMask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); - - unpack_128_2x128 (xmmSrc, &xmmSrcLo, &xmmSrcHi); - unpack_128_2x128 (xmmMask, &xmmMaskLo, &xmmMaskHi); - unpack_128_2x128 (xmmDst, &xmmDstLo, &xmmDstHi); - - expandAlphaRev_2x128 (xmmMaskLo, xmmMaskHi, &xmmMaskLo, &xmmMaskHi); - - inOver_2x128 (xmmSrcLo, xmmSrcHi, Mask00ff, Mask00ff, xmmMaskLo, xmmMaskHi, &xmmDstLo, &xmmDstHi); - - save128Aligned( (__m128i*)dst, pack_2x128_128 (xmmDstLo, xmmDstHi)); - } - - src += 4; - dst += 4; - mask += 4; - w -= 4; - } - - while (w) - { - m = (uint32_t) *mask++; - - if (m) - { - s = 0xff000000 | *src; - - if (m == 0xff) - { - *dst = s; - } - else - { - d = *dst; - - *dst = pack_1x64_32 (inOver_1x64 (unpack_32_1x64 (s), - xMask00ff, - expandAlphaRev_1x64 (unpack_32_1x64 (m)), - unpack_32_1x64 (d))); - } - - } - - src++; - dst++; - w--; - } + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)mask); + + while (w && (unsigned long)dst & 15) + { + s = 0xff000000 | *src++; + m = (uint32_t) *mask++; + d = *dst; + + __m64 ms = unpack_32_1x64 (s); + + if (m != 0xff) + { + ms = in_over_1x64 (ms, + 
mask_x00ff, + expand_alpha_rev_1x64 (unpack_32_1x64 (m)), + unpack_32_1x64 (d)); + } + + *dst++ = pack_1x64_32 (ms); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)mask); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + cache_prefetch_next ((__m128i*)mask); + + m = *(uint32_t*) mask; + xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); + + if (m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + xmm_mask = _mm_unpacklo_epi16 ( + unpack_32_1x128 (m), _mm_setzero_si128 ()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, + &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (xmm_src_lo, xmm_src_hi, + mask_00ff, mask_00ff, + xmm_mask_lo, xmm_mask_hi, + &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ( + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + m = (uint32_t) *mask++; + + if (m) + { + s = 0xff000000 | *src; + + if (m == 0xff) + { + *dst = s; + } + else + { + d = *dst; + + *dst = pack_1x64_32 ( + in_over_1x64 ( + unpack_32_1x64 (s), + mask_x00ff, + expand_alpha_rev_1x64 (unpack_32_1x64 (m)), + unpack_32_1x64 (d))); + } + + } + + src++; + dst++; + w--; + } } - _mm_empty(); + _mm_empty (); } + #endif -static const FastPathInfo sse2_fast_paths[] = -{ - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8x0565sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8x0565sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSolid_nx8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSolid_nx8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSolid_nx0565sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeSrc_8888x0565sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeSrc_8888x0565sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMask_nx8x8888sse2, 0 }, +static const pixman_fast_path_t sse2_fast_paths[] = +{ + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, 
sse2_composite_over_n_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 }, #if 0 /* FIXME: This code are buggy in MMX version, now the bug was translated to SSE2 version */ - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeOver_x888x8x8888sse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeOver_x888x8x8888sse2, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888, 0 }, #endif - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSrc_x888xnx8888sse2, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888x8x8888sse2, NEED_SOLID_MASK }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, 
fbCompositeSolidMask_nx8888x8888Csse2, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSolidMask_nx8888x0565Csse2, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSolidMask_nx8888x0565Csse2, NEED_COMPONENT_ALPHA }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, fbCompositeSrc_8888RevNPx8888sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, fbCompositeSrc_8888RevNPx0565sse2, NEED_PIXBUF }, - { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreasse2, 0 }, - { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreasse2, 0 }, - - { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeSrcAdd_8000x8000sse2, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeSrcAdd_8888x8888sse2, 0 }, - { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeSrcAdd_8888x8888sse2, 0 }, - { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeSrcAdd_8888x8x8sse2, 0 }, - - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, fbCompositeSolidMaskSrc_nx8x8888sse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, fbCompositeCopyAreasse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, fbCompositeCopyAreasse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreasse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreasse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, fbCompositeCopyAreasse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, fbCompositeCopyAreasse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, fbCompositeCopyAreasse2, 0 }, - { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, fbCompositeCopyAreasse2, 0 }, - - { PIXMAN_OP_IN, 
PIXMAN_a8, PIXMAN_null, PIXMAN_a8, fbCompositeIn_8x8sse2, 0 }, - { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, fbCompositeIn_nx8x8sse2, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, + { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, + + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, 
sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, + { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 }, + { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 }, + { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_8888_8_8, 0 }, + + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 }, + { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 }, + + { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 }, + { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 }, { PIXMAN_OP_NONE }, }; /* * Work around GCC bug causing crashes in Mozilla with SSE2 - * - * When using SSE2 intrinsics, gcc assumes that the stack is 16 byte - * aligned. Unfortunately some code, such as Mozilla and Mono contain - * code that aligns the stack to 4 bytes. + * + * When using -msse, gcc generates movdqa instructions assuming that + * the stack is 16 byte aligned. Unfortunately some applications, such + * as Mozilla and Mono, end up aligning the stack to 4 bytes, which + * causes the movdqa instructions to fail. * * The __force_align_arg_pointer__ makes gcc generate a prologue that * realigns the stack pointer to 16 bytes. 
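/*
 * For reference, each sse2_fast_paths row above keys a specialized
 * routine by (operator, source format, mask format, destination
 * format) plus a flags word such as NEED_SOLID_MASK. A simplified
 * sketch of the table scan, assuming the pixman_fast_path_t fields
 * mirror the table columns; the real _pixman_run_fast_path also
 * honors the flags and rejects transformed, repeating or
 * alpha-mapped images before using an entry:
 */
static const pixman_fast_path_t *
lookup_fast_path (const pixman_fast_path_t *paths,
                  pixman_op_t               op,
                  pixman_format_code_t      src_format,
                  pixman_format_code_t      mask_format,
                  pixman_format_code_t      dest_format)
{
    const pixman_fast_path_t *p;

    for (p = paths; p->op != PIXMAN_OP_NONE; p++)
    {
	if (p->op == op &&
	    p->src_format == src_format &&
	    p->mask_format == mask_format &&
	    p->dest_format == dest_format)
	{
	    return p;	/* first match wins; table order encodes priority */
	}
    }

    return NULL;	/* caller falls through to the delegate implementation */
}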
@@ -4961,56 +5678,63 @@ static const FastPathInfo sse2_fast_paths[] = * * See https://bugs.freedesktop.org/show_bug.cgi?id=15693 */ +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) __attribute__((__force_align_arg_pointer__)) +#endif static void sse2_composite (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *src, - pixman_image_t *mask, - pixman_image_t *dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { if (_pixman_run_fast_path (sse2_fast_paths, imp, - op, src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height)) + op, src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height)) { return; } _pixman_implementation_composite (imp->delegate, op, - src, mask, dest, - src_x, src_y, - mask_x, mask_y, - dest_x, dest_y, - width, height); + src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height); } +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) __attribute__((__force_align_arg_pointer__)) +#endif static pixman_bool_t sse2_blt (pixman_implementation_t *imp, - uint32_t *src_bits, - uint32_t *dst_bits, - int src_stride, - int dst_stride, - int src_bpp, - int dst_bpp, - int src_x, int src_y, - int dst_x, int dst_y, - int width, int height) -{ - if (!pixmanBltsse2 ( - src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, - src_x, src_y, dst_x, dst_y, width, height)) + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + if (!pixman_blt_sse2 ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height)) { return _pixman_implementation_blt ( @@ -5022,19 +5746,21 @@ sse2_blt (pixman_implementation_t *imp, return TRUE; } +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) __attribute__((__force_align_arg_pointer__)) +#endif static pixman_bool_t sse2_fill (pixman_implementation_t *imp, - uint32_t *bits, - int stride, - int bpp, - int x, - int y, - int width, - int height, - uint32_t xor) -{ - if (!pixmanFillsse2 (bits, stride, bpp, x, y, width, height, xor)) + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor)) { return _pixman_implementation_fill ( imp->delegate, bits, stride, bpp, x, y, width, height, xor); @@ -5043,72 +5769,75 @@ sse2_fill (pixman_implementation_t *imp, return TRUE; } +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) +__attribute__((__force_align_arg_pointer__)) +#endif pixman_implementation_t * -_pixman_implementation_create_sse2 (pixman_implementation_t *toplevel) +_pixman_implementation_create_sse2 (void) { - pixman_implementation_t *mmx = _pixman_implementation_create_mmx (NULL); - pixman_implementation_t *imp = _pixman_implementation_create (toplevel, mmx); + pixman_implementation_t *mmx = _pixman_implementation_create_mmx (); + pixman_implementation_t *imp = _pixman_implementation_create (mmx); /* SSE2 constants */ - Mask565r = createMask_2x32_128 
(0x00f80000, 0x00f80000); - Mask565g1 = createMask_2x32_128 (0x00070000, 0x00070000); - Mask565g2 = createMask_2x32_128 (0x000000e0, 0x000000e0); - Mask565b = createMask_2x32_128 (0x0000001f, 0x0000001f); - MaskRed = createMask_2x32_128 (0x00f80000, 0x00f80000); - MaskGreen = createMask_2x32_128 (0x0000fc00, 0x0000fc00); - MaskBlue = createMask_2x32_128 (0x000000f8, 0x000000f8); - Mask565FixRB = createMask_2x32_128 (0x00e000e0, 0x00e000e0); - Mask565FixG = createMask_2x32_128 (0x0000c000, 0x0000c000); - Mask0080 = createMask_16_128 (0x0080); - Mask00ff = createMask_16_128 (0x00ff); - Mask0101 = createMask_16_128 (0x0101); - Maskffff = createMask_16_128 (0xffff); - Maskff000000 = createMask_2x32_128 (0xff000000, 0xff000000); - MaskAlpha = createMask_2x32_128 (0x00ff0000, 0x00000000); - + mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); + mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); + mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); + mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); + mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); + mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); + mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); + mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); + mask_0080 = create_mask_16_128 (0x0080); + mask_00ff = create_mask_16_128 (0x00ff); + mask_0101 = create_mask_16_128 (0x0101); + mask_ffff = create_mask_16_128 (0xffff); + mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); + mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); + /* MMX constants */ - xMask565rgb = createMask_2x32_64 (0x000001f0, 0x003f001f); - xMask565Unpack = createMask_2x32_64 (0x00000084, 0x04100840); - - xMask0080 = createMask_16_64 (0x0080); - xMask00ff = createMask_16_64 (0x00ff); - xMask0101 = createMask_16_64 (0x0101); - xMaskAlpha = createMask_2x32_64 (0x00ff0000, 0x00000000); + mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f); + mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840); + + mask_x0080 = create_mask_16_64 (0x0080); + mask_x00ff = create_mask_16_64 (0x00ff); + mask_x0101 = create_mask_16_64 (0x0101); + mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000); - _mm_empty(); + _mm_empty (); /* Set up function pointers */ - + /* SSE code patch for fbcompose.c */ - imp->combine_32[PIXMAN_OP_OVER] = sse2CombineOverU; - imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseU; - imp->combine_32[PIXMAN_OP_IN] = sse2CombineInU; - imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseU; - imp->combine_32[PIXMAN_OP_OUT] = sse2CombineOutU; - imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseU; - imp->combine_32[PIXMAN_OP_ATOP] = sse2CombineAtopU; - imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseU; - imp->combine_32[PIXMAN_OP_XOR] = sse2CombineXorU; - imp->combine_32[PIXMAN_OP_ADD] = sse2CombineAddU; - - imp->combine_32[PIXMAN_OP_SATURATE] = sse2CombineSaturateU; - - imp->combine_32_ca[PIXMAN_OP_SRC] = sse2CombineSrcC; - imp->combine_32_ca[PIXMAN_OP_OVER] = sse2CombineOverC; - imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2CombineOverReverseC; - imp->combine_32_ca[PIXMAN_OP_IN] = sse2CombineInC; - imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2CombineInReverseC; - imp->combine_32_ca[PIXMAN_OP_OUT] = sse2CombineOutC; - imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2CombineOutReverseC; - imp->combine_32_ca[PIXMAN_OP_ATOP] = 
sse2CombineAtopC; - imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2CombineAtopReverseC; - imp->combine_32_ca[PIXMAN_OP_XOR] = sse2CombineXorC; - imp->combine_32_ca[PIXMAN_OP_ADD] = sse2CombineAddC; - + imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; + imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; + + imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; + + imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; + imp->composite = sse2_composite; imp->blt = sse2_blt; imp->fill = sse2_fill; - + return imp; } diff --git a/lib/pixman/pixman/pixman-timer.c b/lib/pixman/pixman/pixman-timer.c index c76264431..f5ae18e89 100644 --- a/lib/pixman/pixman/pixman-timer.c +++ b/lib/pixman/pixman/pixman-timer.c @@ -19,41 +19,48 @@ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#ifdef HAVE_CONFIG_H #include <config.h> +#endif + #include <stdlib.h> #include <stdio.h> #include "pixman-private.h" -static PixmanTimer *timers; +#ifdef PIXMAN_TIMERS + +static pixman_timer_t *timers; static void dump_timers (void) { - PixmanTimer *timer; + pixman_timer_t *timer; for (timer = timers; timer != NULL; timer = timer->next) { printf ("%s: total: %llu n: %llu avg: %f\n", - timer->name, - timer->total, - timer->n_times, - timer->total / (double)timer->n_times); + timer->name, + timer->total, + timer->n_times, + timer->total / (double)timer->n_times); } } void -pixman_timer_register (PixmanTimer *timer) +pixman_timer_register (pixman_timer_t *timer) { static int initialized; - int atexit(void (*function)(void)); + int atexit (void (*function)(void)); if (!initialized) { atexit (dump_timers); initialized = 1; } - + timer->next = timers; timers = timer; } + +#endif diff --git a/lib/pixman/pixman/pixman-transformed-accessors.c b/lib/pixman/pixman/pixman-transformed-accessors.c deleted file mode 100644 index 442ca2474..000000000 --- a/lib/pixman/pixman/pixman-transformed-accessors.c +++ /dev/null @@ -1,3 +0,0 @@ -#define PIXMAN_FB_ACCESSORS - -#include "pixman-transformed.c" diff --git a/lib/pixman/pixman/pixman-transformed.c b/lib/pixman/pixman/pixman-transformed.c deleted file mode 100644 index d721b35a2..000000000 --- a/lib/pixman/pixman/pixman-transformed.c +++ /dev/null @@ -1,510 +0,0 @@ -/* - * - * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc. 
- * 2005 Lars Knoll & Zack Rusin, Trolltech - * 2008 Aaron Plattner, NVIDIA Corporation - * - * Permission to use, copy, modify, distribute, and sell this software and its - * documentation for any purpose is hereby granted without fee, provided that - * the above copyright notice appear in all copies and that both that - * copyright notice and this permission notice appear in supporting - * documentation, and that the name of Keith Packard not be used in - * advertising or publicity pertaining to distribution of the software without - * specific, written prior permission. Keith Packard makes no - * representations about the suitability of this software for any purpose. It - * is provided "as is" without express or implied warranty. - * - * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS - * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY - * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN - * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING - * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS - * SOFTWARE. - */ - -#ifdef HAVE_CONFIG_H -#include <config.h> -#endif - -#include <stdlib.h> - -#include "pixman-private.h" - -#define Alpha(x) ((x) >> 24) -#define Red(x) (((x) >> 16) & 0xff) -#define Green(x) (((x) >> 8) & 0xff) -#define Blue(x) ((x) & 0xff) - -#define Alpha64(x) ((x) >> 48) -#define Red64(x) (((x) >> 32) & 0xffff) -#define Green64(x) (((x) >> 16) & 0xffff) -#define Blue64(x) ((x) & 0xffff) - -/* - * Fetch from region strategies - */ -typedef FASTCALL uint32_t (*fetchFromRegionProc)(bits_image_t *pict, int x, int y, uint32_t *buffer, fetchPixelProc32 fetch, pixman_box32_t *box); - -/* - * There are two properties we can make use of when fetching pixels - * - * (a) Is the source clip just the image itself? - * - * (b) Do we know the coordinates of the pixel to fetch are - * within the image boundaries; - * - * Source clips are almost never used, so the important case to optimize - * for is when src_clip is false. Since inside_bounds is statically known, - * the last part of the if statement will normally be optimized away. 
- */ -static force_inline uint32_t -do_fetch (bits_image_t *pict, int x, int y, fetchPixelProc32 fetch, - pixman_bool_t src_clip, - pixman_bool_t inside_bounds) -{ - if (src_clip) - { - if (pixman_region32_contains_point (pict->common.src_clip, x, y,NULL)) - return fetch (pict, x, y); - else - return 0; - } - else if (inside_bounds) - { - return fetch (pict, x, y); - } - else - { - if (x >= 0 && x < pict->width && y >= 0 && y < pict->height) - return fetch (pict, x, y); - else - return 0; - } -} - -/* - * Fetching Algorithms - */ -static inline uint32_t -fetch_nearest (bits_image_t *pict, - fetchPixelProc32 fetch, - pixman_bool_t affine, - pixman_repeat_t repeat, - pixman_bool_t has_src_clip, - const pixman_vector_t *v) -{ - if (!v->vector[2]) - { - return 0; - } - else - { - int x, y; - pixman_bool_t inside_bounds; - - if (!affine) - { - x = DIV(v->vector[0], v->vector[2]); - y = DIV(v->vector[1], v->vector[2]); - } - else - { - x = v->vector[0]>>16; - y = v->vector[1]>>16; - } - - switch (repeat) - { - case PIXMAN_REPEAT_NORMAL: - x = MOD (x, pict->width); - y = MOD (y, pict->height); - inside_bounds = TRUE; - break; - - case PIXMAN_REPEAT_PAD: - x = CLIP (x, 0, pict->width-1); - y = CLIP (y, 0, pict->height-1); - inside_bounds = TRUE; - break; - - case PIXMAN_REPEAT_REFLECT: - x = MOD (x, pict->width * 2); - if (x >= pict->width) - x = pict->width * 2 - x - 1; - y = MOD (y, pict->height * 2); - if (y >= pict->height) - y = pict->height * 2 - y - 1; - inside_bounds = TRUE; - break; - - case PIXMAN_REPEAT_NONE: - inside_bounds = FALSE; - break; - - default: - return 0; - } - - return do_fetch (pict, x, y, fetch, has_src_clip, inside_bounds); - } -} - -static inline uint32_t -fetch_bilinear (bits_image_t *pict, - fetchPixelProc32 fetch, - pixman_bool_t affine, - pixman_repeat_t repeat, - pixman_bool_t has_src_clip, - const pixman_vector_t *v) -{ - if (!v->vector[2]) - { - return 0; - } - else - { - int x1, x2, y1, y2, distx, idistx, disty, idisty; - uint32_t tl, tr, bl, br, r; - uint32_t ft, fb; - pixman_bool_t inside_bounds; - - if (!affine) - { - pixman_fixed_48_16_t div; - div = ((pixman_fixed_48_16_t)v->vector[0] << 16)/v->vector[2]; - x1 = div >> 16; - distx = ((pixman_fixed_t)div >> 8) & 0xff; - div = ((pixman_fixed_48_16_t)v->vector[1] << 16)/v->vector[2]; - y1 = div >> 16; - disty = ((pixman_fixed_t)div >> 8) & 0xff; - } - else - { - x1 = v->vector[0] >> 16; - distx = (v->vector[0] >> 8) & 0xff; - y1 = v->vector[1] >> 16; - disty = (v->vector[1] >> 8) & 0xff; - } - x2 = x1 + 1; - y2 = y1 + 1; - - idistx = 256 - distx; - idisty = 256 - disty; - - switch (repeat) - { - case PIXMAN_REPEAT_NORMAL: - x1 = MOD (x1, pict->width); - x2 = MOD (x2, pict->width); - y1 = MOD (y1, pict->height); - y2 = MOD (y2, pict->height); - inside_bounds = TRUE; - break; - - case PIXMAN_REPEAT_PAD: - x1 = CLIP (x1, 0, pict->width-1); - x2 = CLIP (x2, 0, pict->width-1); - y1 = CLIP (y1, 0, pict->height-1); - y2 = CLIP (y2, 0, pict->height-1); - inside_bounds = TRUE; - break; - - case PIXMAN_REPEAT_REFLECT: - x1 = MOD (x1, pict->width * 2); - if (x1 >= pict->width) - x1 = pict->width * 2 - x1 - 1; - x2 = MOD (x2, pict->width * 2); - if (x2 >= pict->width) - x2 = pict->width * 2 - x2 - 1; - y1 = MOD (y1, pict->height * 2); - if (y1 >= pict->height) - y1 = pict->height * 2 - y1 - 1; - y2 = MOD (y2, pict->height * 2); - if (y2 >= pict->height) - y2 = pict->height * 2 - y2 - 1; - inside_bounds = TRUE; - break; - - case PIXMAN_REPEAT_NONE: - inside_bounds = FALSE; - break; - - default: - return 0; - } - - tl = 
do_fetch(pict, x1, y1, fetch, has_src_clip, inside_bounds); - tr = do_fetch(pict, x2, y1, fetch, has_src_clip, inside_bounds); - bl = do_fetch(pict, x1, y2, fetch, has_src_clip, inside_bounds); - br = do_fetch(pict, x2, y2, fetch, has_src_clip, inside_bounds); - - ft = FbGet8(tl,0) * idistx + FbGet8(tr,0) * distx; - fb = FbGet8(bl,0) * idistx + FbGet8(br,0) * distx; - r = (((ft * idisty + fb * disty) >> 16) & 0xff); - ft = FbGet8(tl,8) * idistx + FbGet8(tr,8) * distx; - fb = FbGet8(bl,8) * idistx + FbGet8(br,8) * distx; - r |= (((ft * idisty + fb * disty) >> 8) & 0xff00); - ft = FbGet8(tl,16) * idistx + FbGet8(tr,16) * distx; - fb = FbGet8(bl,16) * idistx + FbGet8(br,16) * distx; - r |= (((ft * idisty + fb * disty)) & 0xff0000); - ft = FbGet8(tl,24) * idistx + FbGet8(tr,24) * distx; - fb = FbGet8(bl,24) * idistx + FbGet8(br,24) * distx; - r |= (((ft * idisty + fb * disty) << 8) & 0xff000000); - - return r; - } -} - -static void -fbFetchTransformed_Convolution(bits_image_t * pict, int width, uint32_t *buffer, uint32_t *mask, uint32_t maskBits, - pixman_bool_t affine, pixman_vector_t v, pixman_vector_t unit) -{ - fetchPixelProc32 fetch; - int i; - - pixman_fixed_t *params = pict->common.filter_params; - int32_t cwidth = pixman_fixed_to_int(params[0]); - int32_t cheight = pixman_fixed_to_int(params[1]); - int xoff = (params[0] - pixman_fixed_1) >> 1; - int yoff = (params[1] - pixman_fixed_1) >> 1; - fetch = ACCESS(pixman_fetchPixelProcForPicture32)(pict); - - params += 2; - for (i = 0; i < width; ++i) { - if (!mask || mask[i] & maskBits) - { - if (!v.vector[2]) { - *(buffer + i) = 0; - } else { - int x1, x2, y1, y2, x, y; - int32_t srtot, sgtot, sbtot, satot; - pixman_fixed_t *p = params; - - if (!affine) { - pixman_fixed_48_16_t tmp; - tmp = ((pixman_fixed_48_16_t)v.vector[0] << 16)/v.vector[2] - xoff; - x1 = pixman_fixed_to_int(tmp); - tmp = ((pixman_fixed_48_16_t)v.vector[1] << 16)/v.vector[2] - yoff; - y1 = pixman_fixed_to_int(tmp); - } else { - x1 = pixman_fixed_to_int(v.vector[0] - xoff); - y1 = pixman_fixed_to_int(v.vector[1] - yoff); - } - x2 = x1 + cwidth; - y2 = y1 + cheight; - - srtot = sgtot = sbtot = satot = 0; - - for (y = y1; y < y2; y++) { - int ty; - switch (pict->common.repeat) { - case PIXMAN_REPEAT_NORMAL: - ty = MOD (y, pict->height); - break; - case PIXMAN_REPEAT_PAD: - ty = CLIP (y, 0, pict->height-1); - break; - case PIXMAN_REPEAT_REFLECT: - ty = MOD (y, pict->height * 2); - if (ty >= pict->height) - ty = pict->height * 2 - ty - 1; - break; - default: - ty = y; - } - for (x = x1; x < x2; x++) { - if (*p) { - int tx; - switch (pict->common.repeat) { - case PIXMAN_REPEAT_NORMAL: - tx = MOD (x, pict->width); - break; - case PIXMAN_REPEAT_PAD: - tx = CLIP (x, 0, pict->width-1); - break; - case PIXMAN_REPEAT_REFLECT: - tx = MOD (x, pict->width * 2); - if (tx >= pict->width) - tx = pict->width * 2 - tx - 1; - break; - default: - tx = x; - } - if (pixman_region32_contains_point (pict->common.src_clip, tx, ty, NULL)) { - uint32_t c = fetch(pict, tx, ty); - - srtot += Red(c) * *p; - sgtot += Green(c) * *p; - sbtot += Blue(c) * *p; - satot += Alpha(c) * *p; - } - } - p++; - } - } - - satot >>= 16; - srtot >>= 16; - sgtot >>= 16; - sbtot >>= 16; - - if (satot < 0) satot = 0; else if (satot > 0xff) satot = 0xff; - if (srtot < 0) srtot = 0; else if (srtot > 0xff) srtot = 0xff; - if (sgtot < 0) sgtot = 0; else if (sgtot > 0xff) sgtot = 0xff; - if (sbtot < 0) sbtot = 0; else if (sbtot > 0xff) sbtot = 0xff; - - *(buffer + i) = ((satot << 24) | - (srtot << 16) | - (sgtot << 8) | - 
(sbtot )); - } - } - v.vector[0] += unit.vector[0]; - v.vector[1] += unit.vector[1]; - v.vector[2] += unit.vector[2]; - } -} - -static void -adjust (pixman_vector_t *v, pixman_vector_t *u, pixman_fixed_t adjustment) -{ - int delta_v = (adjustment * v->vector[2]) >> 16; - int delta_u = (adjustment * u->vector[2]) >> 16; - - v->vector[0] += delta_v; - v->vector[1] += delta_v; - - u->vector[0] += delta_u; - u->vector[1] += delta_u; -} - -void -ACCESS(fbFetchTransformed)(bits_image_t * pict, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, uint32_t maskBits) -{ - uint32_t *bits; - int32_t stride; - pixman_vector_t v; - pixman_vector_t unit; - pixman_bool_t affine = TRUE; - - bits = pict->bits; - stride = pict->rowstride; - - /* reference point is the center of the pixel */ - v.vector[0] = pixman_int_to_fixed(x) + pixman_fixed_1 / 2; - v.vector[1] = pixman_int_to_fixed(y) + pixman_fixed_1 / 2; - v.vector[2] = pixman_fixed_1; - - /* when using convolution filters or PIXMAN_REPEAT_PAD one might get here without a transform */ - if (pict->common.transform) - { - if (!pixman_transform_point_3d (pict->common.transform, &v)) - return; - unit.vector[0] = pict->common.transform->matrix[0][0]; - unit.vector[1] = pict->common.transform->matrix[1][0]; - unit.vector[2] = pict->common.transform->matrix[2][0]; - - affine = (v.vector[2] == pixman_fixed_1 && unit.vector[2] == 0); - } - else - { - unit.vector[0] = pixman_fixed_1; - unit.vector[1] = 0; - unit.vector[2] = 0; - } - - if (pict->common.filter == PIXMAN_FILTER_NEAREST || pict->common.filter == PIXMAN_FILTER_FAST) - { - fetchPixelProc32 fetch; - pixman_bool_t src_clip; - int i; - - /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */ - adjust (&v, &unit, - pixman_fixed_e); - - fetch = ACCESS(pixman_fetchPixelProcForPicture32)(pict); - - src_clip = pict->common.src_clip != &(pict->common.full_region); - - for ( i = 0; i < width; ++i) - { - if (!mask || mask[i] & maskBits) - *(buffer + i) = fetch_nearest (pict, fetch, affine, pict->common.repeat, src_clip, &v); - - v.vector[0] += unit.vector[0]; - v.vector[1] += unit.vector[1]; - v.vector[2] += unit.vector[2]; - } - } - else if (pict->common.filter == PIXMAN_FILTER_BILINEAR || - pict->common.filter == PIXMAN_FILTER_GOOD || - pict->common.filter == PIXMAN_FILTER_BEST) - { - pixman_bool_t src_clip; - fetchPixelProc32 fetch; - int i; - - /* Let the bilinear code pretend that pixels fall on integer coordinaters */ - adjust (&v, &unit, -(pixman_fixed_1 / 2)); - - fetch = ACCESS(pixman_fetchPixelProcForPicture32)(pict); - src_clip = pict->common.src_clip != &(pict->common.full_region); - - for (i = 0; i < width; ++i) - { - if (!mask || mask[i] & maskBits) - *(buffer + i) = fetch_bilinear (pict, fetch, affine, pict->common.repeat, src_clip, &v); - - v.vector[0] += unit.vector[0]; - v.vector[1] += unit.vector[1]; - v.vector[2] += unit.vector[2]; - } - } - else if (pict->common.filter == PIXMAN_FILTER_CONVOLUTION) - { - /* Round to closest integer, ensuring that 0.5 rounds to 0, not 1 */ - adjust (&v, &unit, - pixman_fixed_e); - - fbFetchTransformed_Convolution(pict, width, buffer, mask, maskBits, affine, v, unit); - } -} - -#define SCANLINE_BUFFER_LENGTH 2048 - -void -ACCESS(fbFetchExternalAlpha)(bits_image_t * pict, int x, int y, int width, - uint32_t *buffer, uint32_t *mask, - uint32_t maskBits) -{ - int i; - uint32_t _alpha_buffer[SCANLINE_BUFFER_LENGTH]; - uint32_t *alpha_buffer = _alpha_buffer; - - if (!pict->common.alpha_map) { - ACCESS(fbFetchTransformed) (pict, x, y, 
width, buffer, mask, maskBits); - return; - } - if (width > SCANLINE_BUFFER_LENGTH) - alpha_buffer = (uint32_t *) pixman_malloc_ab (width, sizeof(uint32_t)); - - ACCESS(fbFetchTransformed)(pict, x, y, width, buffer, mask, maskBits); - ACCESS(fbFetchTransformed)((bits_image_t *)pict->common.alpha_map, x - pict->common.alpha_origin.x, - y - pict->common.alpha_origin.y, width, - alpha_buffer, mask, maskBits); - for (i = 0; i < width; ++i) { - if (!mask || mask[i] & maskBits) - { - int a = alpha_buffer[i]>>24; - *(buffer + i) = (a << 24) - | (div_255(Red(*(buffer + i)) * a) << 16) - | (div_255(Green(*(buffer + i)) * a) << 8) - | (div_255(Blue(*(buffer + i)) * a)); - } - } - - if (alpha_buffer != _alpha_buffer) - free(alpha_buffer); -} diff --git a/lib/pixman/pixman/pixman-trap.c b/lib/pixman/pixman/pixman-trap.c index 28dacafcc..962cbb39e 100644 --- a/lib/pixman/pixman/pixman-trap.c +++ b/lib/pixman/pixman/pixman-trap.c @@ -27,30 +27,230 @@ #include <stdio.h> #include "pixman-private.h" -typedef uint32_t FbBits; +/* + * Compute the smallest value no less than y which is on a + * grid row + */ + +PIXMAN_EXPORT pixman_fixed_t +pixman_sample_ceil_y (pixman_fixed_t y, int n) +{ + pixman_fixed_t f = pixman_fixed_frac (y); + pixman_fixed_t i = pixman_fixed_floor (y); + + f = ((f + Y_FRAC_FIRST (n)) / STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) + + Y_FRAC_FIRST (n); + + if (f > Y_FRAC_LAST (n)) + { + if (pixman_fixed_to_int (i) == 0x7fff) + { + f = 0xffff; /* saturate */ + } + else + { + f = Y_FRAC_FIRST (n); + i += pixman_fixed_1; + } + } + return (i | f); +} + +/* + * Compute the largest value no greater than y which is on a + * grid row + */ +PIXMAN_EXPORT pixman_fixed_t +pixman_sample_floor_y (pixman_fixed_t y, + int n) +{ + pixman_fixed_t f = pixman_fixed_frac (y); + pixman_fixed_t i = pixman_fixed_floor (y); + + f = DIV (f - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) + + Y_FRAC_FIRST (n); + + if (f < Y_FRAC_FIRST (n)) + { + if (pixman_fixed_to_int (i) == 0x8000) + { + f = 0; /* saturate */ + } + else + { + f = Y_FRAC_LAST (n); + i -= pixman_fixed_1; + } + } + return (i | f); +} + +/* + * Step an edge by any amount (including negative values) + */ +PIXMAN_EXPORT void +pixman_edge_step (pixman_edge_t *e, + int n) +{ + pixman_fixed_48_16_t ne; + + e->x += n * e->stepx; + + ne = e->e + n * (pixman_fixed_48_16_t) e->dx; + + if (n >= 0) + { + if (ne > 0) + { + int nx = (ne + e->dy - 1) / e->dy; + e->e = ne - nx * (pixman_fixed_48_16_t) e->dy; + e->x += nx * e->signdx; + } + } + else + { + if (ne <= -e->dy) + { + int nx = (-ne) / e->dy; + e->e = ne + nx * (pixman_fixed_48_16_t) e->dy; + e->x -= nx * e->signdx; + } + } +} + +/* + * A private routine to initialize the multi-step + * elements of an edge structure + */ +static void +_pixman_edge_multi_init (pixman_edge_t * e, + int n, + pixman_fixed_t *stepx_p, + pixman_fixed_t *dx_p) +{ + pixman_fixed_t stepx; + pixman_fixed_48_16_t ne; + + ne = n * (pixman_fixed_48_16_t) e->dx; + stepx = n * e->stepx; + + if (ne > 0) + { + int nx = ne / e->dy; + ne -= nx * e->dy; + stepx += nx * e->signdx; + } + + *dx_p = ne; + *stepx_p = stepx; +} + +/* + * Initialize one edge structure given the line endpoints and a + * starting y value + */ +PIXMAN_EXPORT void +pixman_edge_init (pixman_edge_t *e, + int n, + pixman_fixed_t y_start, + pixman_fixed_t x_top, + pixman_fixed_t y_top, + pixman_fixed_t x_bot, + pixman_fixed_t y_bot) +{ + pixman_fixed_t dx, dy; + + e->x = x_top; + e->e = 0; + dx = x_bot - x_top; + dy = y_bot - y_top; + e->dy = dy; + e->dx = 0; + + if 
(dy) + { + if (dx >= 0) + { + e->signdx = 1; + e->stepx = dx / dy; + e->dx = dx % dy; + e->e = -dy; + } + else + { + e->signdx = -1; + e->stepx = -(-dx / dy); + e->dx = -dx % dy; + e->e = 0; + } + + _pixman_edge_multi_init (e, STEP_Y_SMALL (n), + &e->stepx_small, &e->dx_small); + + _pixman_edge_multi_init (e, STEP_Y_BIG (n), + &e->stepx_big, &e->dx_big); + } + pixman_edge_step (e, y_start - y_top); +} + +/* + * Initialize one edge structure given a line, starting y value + * and a pixel offset for the line + */ +PIXMAN_EXPORT void +pixman_line_fixed_edge_init (pixman_edge_t * e, + int n, + pixman_fixed_t y, + const pixman_line_fixed_t *line, + int x_off, + int y_off) +{ + pixman_fixed_t x_off_fixed = pixman_int_to_fixed (x_off); + pixman_fixed_t y_off_fixed = pixman_int_to_fixed (y_off); + const pixman_point_fixed_t *top, *bot; + + if (line->p1.y <= line->p2.y) + { + top = &line->p1; + bot = &line->p2; + } + else + { + top = &line->p2; + bot = &line->p1; + } + + pixman_edge_init (e, n, y, + top->x + x_off_fixed, + top->y + y_off_fixed, + bot->x + x_off_fixed, + bot->y + y_off_fixed); +} PIXMAN_EXPORT void -pixman_add_traps (pixman_image_t * image, - int16_t x_off, - int16_t y_off, - int ntrap, - pixman_trap_t *traps) +pixman_add_traps (pixman_image_t * image, + int16_t x_off, + int16_t y_off, + int ntrap, + pixman_trap_t * traps) { - int bpp; - int width; - int height; + int bpp; + int width; + int height; - pixman_fixed_t x_off_fixed; - pixman_fixed_t y_off_fixed; - pixman_edge_t l, r; - pixman_fixed_t t, b; + pixman_fixed_t x_off_fixed; + pixman_fixed_t y_off_fixed; + pixman_edge_t l, r; + pixman_fixed_t t, b; + _pixman_image_validate (image); + width = image->bits.width; height = image->bits.height; bpp = PIXMAN_FORMAT_BPP (image->bits.format); - - x_off_fixed = pixman_int_to_fixed(x_off); - y_off_fixed = pixman_int_to_fixed(y_off); + + x_off_fixed = pixman_int_to_fixed (x_off); + y_off_fixed = pixman_int_to_fixed (y_off); while (ntrap--) { @@ -58,83 +258,82 @@ pixman_add_traps (pixman_image_t * image, if (t < 0) t = 0; t = pixman_sample_ceil_y (t, bpp); - + b = traps->bot.y + y_off_fixed; if (pixman_fixed_to_int (b) >= height) b = pixman_int_to_fixed (height) - 1; b = pixman_sample_floor_y (b, bpp); - + if (b >= t) { /* initialize edge walkers */ pixman_edge_init (&l, bpp, t, - traps->top.l + x_off_fixed, - traps->top.y + y_off_fixed, - traps->bot.l + x_off_fixed, - traps->bot.y + y_off_fixed); - + traps->top.l + x_off_fixed, + traps->top.y + y_off_fixed, + traps->bot.l + x_off_fixed, + traps->bot.y + y_off_fixed); + pixman_edge_init (&r, bpp, t, - traps->top.r + x_off_fixed, - traps->top.y + y_off_fixed, - traps->bot.r + x_off_fixed, - traps->bot.y + y_off_fixed); - + traps->top.r + x_off_fixed, + traps->top.y + y_off_fixed, + traps->bot.r + x_off_fixed, + traps->bot.y + y_off_fixed); + pixman_rasterize_edges (image, &l, &r, t, b); } + traps++; } } +#if 0 static void dump_image (pixman_image_t *image, - const char *title) + const char * title) { int i, j; - + if (!image->type == BITS) - { printf ("%s is not a regular image\n", title); - } if (!image->bits.format == PIXMAN_a8) - { printf ("%s is not an alpha mask\n", title); - } printf ("\n\n\n%s: \n", title); - + for (i = 0; i < image->bits.height; ++i) { uint8_t *line = (uint8_t *)&(image->bits.bits[i * image->bits.rowstride]); - + for (j = 0; j < image->bits.width; ++j) - printf ("%c", line[j]? '#' : ' '); + printf ("%c", line[j] ? 
'#' : ' '); printf ("\n"); } } +#endif PIXMAN_EXPORT void -pixman_add_trapezoids (pixman_image_t *image, - int16_t x_off, - int y_off, - int ntraps, - const pixman_trapezoid_t *traps) +pixman_add_trapezoids (pixman_image_t * image, + int16_t x_off, + int y_off, + int ntraps, + const pixman_trapezoid_t *traps) { int i; #if 0 dump_image (image, "before"); #endif - + for (i = 0; i < ntraps; ++i) { const pixman_trapezoid_t *trap = &(traps[i]); - + if (!pixman_trapezoid_valid (trap)) continue; - + pixman_rasterize_trapezoid (image, trap, x_off, y_off); } @@ -144,21 +343,23 @@ pixman_add_trapezoids (pixman_image_t *image, } PIXMAN_EXPORT void -pixman_rasterize_trapezoid (pixman_image_t * image, - const pixman_trapezoid_t *trap, - int x_off, - int y_off) +pixman_rasterize_trapezoid (pixman_image_t * image, + const pixman_trapezoid_t *trap, + int x_off, + int y_off) { - int bpp; - int width; - int height; + int bpp; + int width; + int height; - pixman_fixed_t x_off_fixed; - pixman_fixed_t y_off_fixed; - pixman_edge_t l, r; - pixman_fixed_t t, b; + pixman_fixed_t x_off_fixed; + pixman_fixed_t y_off_fixed; + pixman_edge_t l, r; + pixman_fixed_t t, b; return_if_fail (image->type == BITS); + + _pixman_image_validate (image); if (!pixman_trapezoid_valid (trap)) return; @@ -166,9 +367,10 @@ pixman_rasterize_trapezoid (pixman_image_t * image, width = image->bits.width; height = image->bits.height; bpp = PIXMAN_FORMAT_BPP (image->bits.format); - - x_off_fixed = pixman_int_to_fixed(x_off); - y_off_fixed = pixman_int_to_fixed(y_off); + + x_off_fixed = pixman_int_to_fixed (x_off); + y_off_fixed = pixman_int_to_fixed (y_off); + t = trap->top + y_off_fixed; if (t < 0) t = 0; @@ -178,7 +380,7 @@ pixman_rasterize_trapezoid (pixman_image_t * image, if (pixman_fixed_to_int (b) >= height) b = pixman_int_to_fixed (height) - 1; b = pixman_sample_floor_y (b, bpp); - + if (b >= t) { /* initialize edge walkers */ @@ -188,97 +390,3 @@ pixman_rasterize_trapezoid (pixman_image_t * image, pixman_rasterize_edges (image, &l, &r, t, b); } } - -#if 0 -static int -_GreaterY (pixman_point_fixed_t *a, pixman_point_fixed_t *b) -{ - if (a->y == b->y) - return a->x > b->x; - return a->y > b->y; -} - -/* - * Note that the definition of this function is a bit odd because - * of the X coordinate space (y increasing downwards). 
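- *
- * Concretely, it forms the edge vectors ad = a - ref and bd = b - ref
- * and tests the sign of the 2-D cross product ad.x * bd.y - ad.y * bd.x,
- * widened to pixman_fixed_32_32_t so the product of two 16.16 values
- * cannot overflow.  Flipping the y axis, as X's screen coordinates do,
- * reverses which sign of the cross product means "clockwise", which is
- * the oddness referred to above.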
- */ -static int -_Clockwise (pixman_point_fixed_t *ref, pixman_point_fixed_t *a, pixman_point_fixed_t *b) -{ - pixman_point_fixed_t ad, bd; - - ad.x = a->x - ref->x; - ad.y = a->y - ref->y; - bd.x = b->x - ref->x; - bd.y = b->y - ref->y; - - return ((pixman_fixed_32_32_t) bd.y * ad.x - (pixman_fixed_32_32_t) ad.y * bd.x) < 0; -} - -/* FIXME -- this could be made more efficient */ -void -fbAddTriangles (pixman_image_t * pPicture, - int16_t x_off, - int16_t y_off, - int ntri, - xTriangle *tris) -{ - pixman_point_fixed_t *top, *left, *right, *tmp; - xTrapezoid trap; - - for (; ntri; ntri--, tris++) - { - top = &tris->p1; - left = &tris->p2; - right = &tris->p3; - if (_GreaterY (top, left)) { - tmp = left; left = top; top = tmp; - } - if (_GreaterY (top, right)) { - tmp = right; right = top; top = tmp; - } - if (_Clockwise (top, right, left)) { - tmp = right; right = left; left = tmp; - } - - /* - * Two cases: - * - * + + - * / \ / \ - * / \ / \ - * / + + \ - * / -- -- \ - * / -- -- \ - * / --- --- \ - * +-- --+ - */ - - trap.top = top->y; - trap.left.p1 = *top; - trap.left.p2 = *left; - trap.right.p1 = *top; - trap.right.p2 = *right; - if (right->y < left->y) - trap.bottom = right->y; - else - trap.bottom = left->y; - fbRasterizeTrapezoid (pPicture, &trap, x_off, y_off); - if (right->y < left->y) - { - trap.top = right->y; - trap.bottom = left->y; - trap.right.p1 = *right; - trap.right.p2 = *left; - } - else - { - trap.top = left->y; - trap.bottom = right->y; - trap.left.p1 = *left; - trap.left.p2 = *right; - } - fbRasterizeTrapezoid (pPicture, &trap, x_off, y_off); - } -} -#endif diff --git a/lib/pixman/pixman/pixman-utils.c b/lib/pixman/pixman/pixman-utils.c index ffb14445e..71282062c 100644 --- a/lib/pixman/pixman/pixman-utils.c +++ b/lib/pixman/pixman/pixman-utils.c @@ -1,5 +1,6 @@ /* * Copyright © 2000 SuSE, Inc. + * Copyright © 1999 Keith Packard * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that @@ -24,210 +25,233 @@ #ifdef HAVE_CONFIG_H #include <config.h> #endif - +#include <stdio.h> #include <stdlib.h> #include "pixman-private.h" /* - * Compute the smallest value no less than y which is on a - * grid row + * Computing composite region */ +#define BOUND(v) (int16_t) ((v) < INT16_MIN ? INT16_MIN : (v) > INT16_MAX ? 
INT16_MAX : (v)) -PIXMAN_EXPORT pixman_fixed_t -pixman_sample_ceil_y (pixman_fixed_t y, int n) +static inline pixman_bool_t +clip_general_image (pixman_region32_t * region, + pixman_region32_t * clip, + int dx, + int dy) { - pixman_fixed_t f = pixman_fixed_frac(y); - pixman_fixed_t i = pixman_fixed_floor(y); - - f = ((f + Y_FRAC_FIRST(n)) / STEP_Y_SMALL(n)) * STEP_Y_SMALL(n) + Y_FRAC_FIRST(n); - if (f > Y_FRAC_LAST(n)) + if (pixman_region32_n_rects (region) == 1 && + pixman_region32_n_rects (clip) == 1) { - if (pixman_fixed_to_int(i) == 0x7fff) + pixman_box32_t * rbox = pixman_region32_rectangles (region, NULL); + pixman_box32_t * cbox = pixman_region32_rectangles (clip, NULL); + int v; + + if (rbox->x1 < (v = cbox->x1 + dx)) + rbox->x1 = BOUND (v); + if (rbox->x2 > (v = cbox->x2 + dx)) + rbox->x2 = BOUND (v); + if (rbox->y1 < (v = cbox->y1 + dy)) + rbox->y1 = BOUND (v); + if (rbox->y2 > (v = cbox->y2 + dy)) + rbox->y2 = BOUND (v); + if (rbox->x1 >= rbox->x2 || + rbox->y1 >= rbox->y2) { - f = 0xffff; /* saturate */ - } else { - f = Y_FRAC_FIRST(n); - i += pixman_fixed_1; + pixman_region32_init (region); } } - return (i | f); + else if (!pixman_region32_not_empty (clip)) + { + return FALSE; + } + else + { + if (dx || dy) + pixman_region32_translate (region, -dx, -dy); + if (!pixman_region32_intersect (region, region, clip)) + return FALSE; + if (dx || dy) + pixman_region32_translate (region, dx, dy); + } + return pixman_region32_not_empty (region); } -#define _div(a,b) ((a) >= 0 ? (a) / (b) : -((-(a) + (b) - 1) / (b))) - -/* - * Compute the largest value no greater than y which is on a - * grid row - */ -PIXMAN_EXPORT pixman_fixed_t -pixman_sample_floor_y (pixman_fixed_t y, int n) +static inline pixman_bool_t +clip_source_image (pixman_region32_t * region, + pixman_image_t * image, + int dx, + int dy) { - pixman_fixed_t f = pixman_fixed_frac(y); - pixman_fixed_t i = pixman_fixed_floor (y); + /* Source clips are ignored, unless they are explicitly turned on + * and the clip in question was set by an X client. (Because if + * the clip was not set by a client, then it is a hierarchy + * clip and those should always be ignored for sources). + */ + if (!image->common.clip_sources || !image->common.client_clip) + return TRUE; - f = _div(f - Y_FRAC_FIRST(n), STEP_Y_SMALL(n)) * STEP_Y_SMALL(n) + Y_FRAC_FIRST(n); - if (f < Y_FRAC_FIRST(n)) - { - if (pixman_fixed_to_int(i) == 0x8000) - { - f = 0; /* saturate */ - } else { - f = Y_FRAC_LAST(n); - i -= pixman_fixed_1; - } - } - return (i | f); + return clip_general_image (region, + &image->common.clip_region, + dx, dy); } /* - * Step an edge by any amount (including negative values) + * returns FALSE if the final region is empty. Indistinguishable from + * an allocation failure, but rendering ignores those anyways. 
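+ *
+ * The intended call pattern, also used by _pixman_walk_composite_region()
+ * further down, treats FALSE as "nothing to composite":
+ *
+ *     pixman_region32_t region;
+ *
+ *     pixman_region32_init (&region);
+ *     if (pixman_compute_composite_region32 (&region, src, mask, dest,
+ *                                            src_x, src_y, mask_x, mask_y,
+ *                                            dest_x, dest_y, width, height))
+ *     {
+ *         ... composite each box of the region ...
+ *         pixman_region32_fini (&region);
+ *     }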
*/ -PIXMAN_EXPORT void -pixman_edge_step (pixman_edge_t *e, int n) +static pixman_bool_t +pixman_compute_composite_region32 (pixman_region32_t * region, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int16_t src_x, + int16_t src_y, + int16_t mask_x, + int16_t mask_y, + int16_t dest_x, + int16_t dest_y, + uint16_t width, + uint16_t height) { - pixman_fixed_48_16_t ne; + int v; + + region->extents.x1 = dest_x; + v = dest_x + width; + region->extents.x2 = BOUND (v); + region->extents.y1 = dest_y; + v = dest_y + height; + region->extents.y2 = BOUND (v); - e->x += n * e->stepx; + region->extents.x1 = MAX (region->extents.x1, 0); + region->extents.y1 = MAX (region->extents.y1, 0); + region->extents.x2 = MIN (region->extents.x2, dst_image->bits.width); + region->extents.y2 = MIN (region->extents.y2, dst_image->bits.height); - ne = e->e + n * (pixman_fixed_48_16_t) e->dx; + region->data = 0; - if (n >= 0) + /* Check for empty operation */ + if (region->extents.x1 >= region->extents.x2 || + region->extents.y1 >= region->extents.y2) { - if (ne > 0) + pixman_region32_init (region); + return FALSE; + } + + if (dst_image->common.have_clip_region) + { + if (!clip_general_image (region, &dst_image->common.clip_region, 0, 0)) { - int nx = (ne + e->dy - 1) / e->dy; - e->e = ne - nx * (pixman_fixed_48_16_t) e->dy; - e->x += nx * e->signdx; + pixman_region32_fini (region); + return FALSE; } } - else + + if (dst_image->common.alpha_map && dst_image->common.alpha_map->common.have_clip_region) { - if (ne <= -e->dy) + if (!clip_general_image (region, &dst_image->common.alpha_map->common.clip_region, + -dst_image->common.alpha_origin_x, + -dst_image->common.alpha_origin_y)) { - int nx = (-ne) / e->dy; - e->e = ne + nx * (pixman_fixed_48_16_t) e->dy; - e->x -= nx * e->signdx; + pixman_region32_fini (region); + return FALSE; } } -} - -/* - * A private routine to initialize the multi-step - * elements of an edge structure - */ -static void -_pixman_edge_multi_init (pixman_edge_t *e, int n, pixman_fixed_t *stepx_p, pixman_fixed_t *dx_p) -{ - pixman_fixed_t stepx; - pixman_fixed_48_16_t ne; - ne = n * (pixman_fixed_48_16_t) e->dx; - stepx = n * e->stepx; - if (ne > 0) + /* clip against src */ + if (src_image->common.have_clip_region) { - int nx = ne / e->dy; - ne -= nx * e->dy; - stepx += nx * e->signdx; + if (!clip_source_image (region, src_image, dest_x - src_x, dest_y - src_y)) + { + pixman_region32_fini (region); + return FALSE; + } } - *dx_p = ne; - *stepx_p = stepx; -} - -/* - * Initialize one edge structure given the line endpoints and a - * starting y value - */ -PIXMAN_EXPORT void -pixman_edge_init (pixman_edge_t *e, - int n, - pixman_fixed_t y_start, - pixman_fixed_t x_top, - pixman_fixed_t y_top, - pixman_fixed_t x_bot, - pixman_fixed_t y_bot) -{ - pixman_fixed_t dx, dy; - - e->x = x_top; - e->e = 0; - dx = x_bot - x_top; - dy = y_bot - y_top; - e->dy = dy; - e->dx = 0; - if (dy) + if (src_image->common.alpha_map && src_image->common.alpha_map->common.have_clip_region) { - if (dx >= 0) + if (!clip_source_image (region, (pixman_image_t *)src_image->common.alpha_map, + dest_x - (src_x - src_image->common.alpha_origin_x), + dest_y - (src_y - src_image->common.alpha_origin_y))) { - e->signdx = 1; - e->stepx = dx / dy; - e->dx = dx % dy; - e->e = -dy; + pixman_region32_fini (region); + return FALSE; } - else + } + /* clip against mask */ + if (mask_image && mask_image->common.have_clip_region) + { + if (!clip_source_image (region, mask_image, dest_x - mask_x, dest_y 
- mask_y)) { - e->signdx = -1; - e->stepx = -(-dx / dy); - e->dx = -dx % dy; - e->e = 0; + pixman_region32_fini (region); + return FALSE; + } + if (mask_image->common.alpha_map && mask_image->common.alpha_map->common.have_clip_region) + { + if (!clip_source_image (region, (pixman_image_t *)mask_image->common.alpha_map, + dest_x - (mask_x - mask_image->common.alpha_origin_x), + dest_y - (mask_y - mask_image->common.alpha_origin_y))) + { + pixman_region32_fini (region); + return FALSE; + } } - - _pixman_edge_multi_init (e, STEP_Y_SMALL(n), &e->stepx_small, &e->dx_small); - _pixman_edge_multi_init (e, STEP_Y_BIG(n), &e->stepx_big, &e->dx_big); } - pixman_edge_step (e, y_start - y_top); + + return TRUE; } -/* - * Initialize one edge structure given a line, starting y value - * and a pixel offset for the line - */ -PIXMAN_EXPORT void -pixman_line_fixed_edge_init (pixman_edge_t *e, - int n, - pixman_fixed_t y, - const pixman_line_fixed_t *line, - int x_off, - int y_off) +PIXMAN_EXPORT pixman_bool_t +pixman_compute_composite_region (pixman_region16_t * region, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int16_t src_x, + int16_t src_y, + int16_t mask_x, + int16_t mask_y, + int16_t dest_x, + int16_t dest_y, + uint16_t width, + uint16_t height) { - pixman_fixed_t x_off_fixed = pixman_int_to_fixed(x_off); - pixman_fixed_t y_off_fixed = pixman_int_to_fixed(y_off); - const pixman_point_fixed_t *top, *bot; + pixman_region32_t r32; + pixman_bool_t retval; - if (line->p1.y <= line->p2.y) - { - top = &line->p1; - bot = &line->p2; - } - else + pixman_region32_init (&r32); + + retval = pixman_compute_composite_region32 ( + &r32, src_image, mask_image, dst_image, + src_x, src_y, mask_x, mask_y, dest_x, dest_y, + width, height); + + if (retval) { - top = &line->p2; - bot = &line->p1; + if (!pixman_region16_copy_from_region32 (region, &r32)) + retval = FALSE; } - pixman_edge_init (e, n, y, - top->x + x_off_fixed, - top->y + y_off_fixed, - bot->x + x_off_fixed, - bot->y + y_off_fixed); + + pixman_region32_fini (&r32); + return retval; } pixman_bool_t pixman_multiply_overflows_int (unsigned int a, - unsigned int b) + unsigned int b) { return a >= INT32_MAX / b; } pixman_bool_t pixman_addition_overflows_int (unsigned int a, - unsigned int b) + unsigned int b) { return a > INT32_MAX - b; } void * -pixman_malloc_ab(unsigned int a, - unsigned int b) +pixman_malloc_ab (unsigned int a, + unsigned int b) { if (a >= INT32_MAX / b) return NULL; @@ -237,8 +261,8 @@ pixman_malloc_ab(unsigned int a, void * pixman_malloc_abc (unsigned int a, - unsigned int b, - unsigned int c) + unsigned int b, + unsigned int c) { if (a >= INT32_MAX / b) return NULL; @@ -248,286 +272,233 @@ pixman_malloc_abc (unsigned int a, return malloc (a * b * c); } - -/** - * pixman_version: - * - * Returns the version of the pixman library encoded in a single - * integer as per %PIXMAN_VERSION_ENCODE. The encoding ensures that - * later versions compare greater than earlier versions. - * - * A run-time comparison to check that pixman's version is greater than - * or equal to version X.Y.Z could be performed as follows: - * - * <informalexample><programlisting> - * if (pixman_version() >= PIXMAN_VERSION_ENCODE(X,Y,Z)) {...} - * </programlisting></informalexample> - * - * See also pixman_version_string() as well as the compile-time - * equivalents %PIXMAN_VERSION and %PIXMAN_VERSION_STRING. - * - * Return value: the encoded version. 
- **/ -PIXMAN_EXPORT int -pixman_version (void) +/* + * Helper routine to expand a color component from 0 < n <= 8 bits to 16 + * bits by replication. + */ +static inline uint64_t +expand16 (const uint8_t val, int nbits) { - return PIXMAN_VERSION; -} + /* Start out with the high bit of val in the high bit of result. */ + uint16_t result = (uint16_t)val << (16 - nbits); -/** - * pixman_version_string: - * - * Returns the version of the pixman library as a human-readable string - * of the form "X.Y.Z". - * - * See also pixman_version() as well as the compile-time equivalents - * %PIXMAN_VERSION_STRING and %PIXMAN_VERSION. - * - * Return value: a string containing the version. - **/ -PIXMAN_EXPORT const char* -pixman_version_string (void) -{ - return PIXMAN_VERSION_STRING; -} + if (nbits == 0) + return 0; -/** - * pixman_format_supported_destination: - * @format: A pixman_format_code_t format - * - * Return value: whether the provided format code is a supported - * format for a pixman surface used as a destination in - * rendering. - * - * Currently, all pixman_format_code_t values are supported - * except for the YUV formats. - **/ -PIXMAN_EXPORT pixman_bool_t -pixman_format_supported_destination (pixman_format_code_t format) -{ - switch (format) { - /* 32 bpp formats */ - case PIXMAN_a2b10g10r10: - case PIXMAN_x2b10g10r10: - case PIXMAN_a8r8g8b8: - case PIXMAN_x8r8g8b8: - case PIXMAN_a8b8g8r8: - case PIXMAN_x8b8g8r8: - case PIXMAN_b8g8r8a8: - case PIXMAN_b8g8r8x8: - case PIXMAN_r8g8b8: - case PIXMAN_b8g8r8: - case PIXMAN_r5g6b5: - case PIXMAN_b5g6r5: - /* 16 bpp formats */ - case PIXMAN_a1r5g5b5: - case PIXMAN_x1r5g5b5: - case PIXMAN_a1b5g5r5: - case PIXMAN_x1b5g5r5: - case PIXMAN_a4r4g4b4: - case PIXMAN_x4r4g4b4: - case PIXMAN_a4b4g4r4: - case PIXMAN_x4b4g4r4: - /* 8bpp formats */ - case PIXMAN_a8: - case PIXMAN_r3g3b2: - case PIXMAN_b2g3r3: - case PIXMAN_a2r2g2b2: - case PIXMAN_a2b2g2r2: - case PIXMAN_c8: - case PIXMAN_g8: - case PIXMAN_x4a4: - /* Collides with PIXMAN_c8 - case PIXMAN_x4c4: - */ - /* Collides with PIXMAN_g8 - case PIXMAN_x4g4: - */ - /* 4bpp formats */ - case PIXMAN_a4: - case PIXMAN_r1g2b1: - case PIXMAN_b1g2r1: - case PIXMAN_a1r1g1b1: - case PIXMAN_a1b1g1r1: - case PIXMAN_c4: - case PIXMAN_g4: - /* 1bpp formats */ - case PIXMAN_a1: - case PIXMAN_g1: - return TRUE; - - /* YUV formats */ - case PIXMAN_yuy2: - case PIXMAN_yv12: - default: - return FALSE; + /* Copy the bits in result, doubling the number of bits each time, until + * we fill all 16 bits. + */ + while (nbits < 16) + { + result |= result >> nbits; + nbits *= 2; } + + return result; } -/** - * pixman_format_supported_source: - * @format: A pixman_format_code_t format - * - * Return value: whether the provided format code is a supported - * format for a pixman surface used as a source in - * rendering. - * - * Currently, all pixman_format_code_t values are supported. - **/ -PIXMAN_EXPORT pixman_bool_t -pixman_format_supported_source (pixman_format_code_t format) +/* + * This function expands images from ARGB8 format to ARGB16. To preserve + * precision, it needs to know the original source format. For example, if the + * source was PIXMAN_x1r5g5b5 and the red component contained bits 12345, then + * the expanded value is 12345123. To correctly expand this to 16 bits, it + * should be 1234512345123451 and not 1234512312345123. 
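+ *
+ * The expand16() helper above achieves this by replication: the component
+ * is shifted into the top bits and then OR-ed with copies of itself at
+ * successively doubling shifts.  A worked example for a 5-bit component
+ * r = 10101:
+ *
+ *     1010100000000000    r << (16 - 5)
+ *     1010110101000000    after result |= result >> 5
+ *     1010110101101011    after result |= result >> 10, i.e. 10101 repeated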
+ */ +void +pixman_expand (uint64_t * dst, + const uint32_t * src, + pixman_format_code_t format, + int width) { - switch (format) { - /* 32 bpp formats */ - case PIXMAN_a2b10g10r10: - case PIXMAN_x2b10g10r10: - case PIXMAN_a8r8g8b8: - case PIXMAN_x8r8g8b8: - case PIXMAN_a8b8g8r8: - case PIXMAN_x8b8g8r8: - case PIXMAN_b8g8r8a8: - case PIXMAN_b8g8r8x8: - case PIXMAN_r8g8b8: - case PIXMAN_b8g8r8: - case PIXMAN_r5g6b5: - case PIXMAN_b5g6r5: - /* 16 bpp formats */ - case PIXMAN_a1r5g5b5: - case PIXMAN_x1r5g5b5: - case PIXMAN_a1b5g5r5: - case PIXMAN_x1b5g5r5: - case PIXMAN_a4r4g4b4: - case PIXMAN_x4r4g4b4: - case PIXMAN_a4b4g4r4: - case PIXMAN_x4b4g4r4: - /* 8bpp formats */ - case PIXMAN_a8: - case PIXMAN_r3g3b2: - case PIXMAN_b2g3r3: - case PIXMAN_a2r2g2b2: - case PIXMAN_a2b2g2r2: - case PIXMAN_c8: - case PIXMAN_g8: - case PIXMAN_x4a4: - /* Collides with PIXMAN_c8 - case PIXMAN_x4c4: - */ - /* Collides with PIXMAN_g8 - case PIXMAN_x4g4: - */ - /* 4bpp formats */ - case PIXMAN_a4: - case PIXMAN_r1g2b1: - case PIXMAN_b1g2r1: - case PIXMAN_a1r1g1b1: - case PIXMAN_a1b1g1r1: - case PIXMAN_c4: - case PIXMAN_g4: - /* 1bpp formats */ - case PIXMAN_a1: - case PIXMAN_g1: - /* YUV formats */ - case PIXMAN_yuy2: - case PIXMAN_yv12: - return TRUE; - - default: - return FALSE; + /* + * Determine the sizes of each component and the masks and shifts + * required to extract them from the source pixel. + */ + const int a_size = PIXMAN_FORMAT_A (format), + r_size = PIXMAN_FORMAT_R (format), + g_size = PIXMAN_FORMAT_G (format), + b_size = PIXMAN_FORMAT_B (format); + const int a_shift = 32 - a_size, + r_shift = 24 - r_size, + g_shift = 16 - g_size, + b_shift = 8 - b_size; + const uint8_t a_mask = ~(~0 << a_size), + r_mask = ~(~0 << r_size), + g_mask = ~(~0 << g_size), + b_mask = ~(~0 << b_size); + int i; + + /* Start at the end so that we can do the expansion in place + * when src == dst + */ + for (i = width - 1; i >= 0; i--) + { + const uint32_t pixel = src[i]; + const uint8_t a = (pixel >> a_shift) & a_mask, + r = (pixel >> r_shift) & r_mask, + g = (pixel >> g_shift) & g_mask, + b = (pixel >> b_shift) & b_mask; + const uint64_t a16 = a_size ? expand16 (a, a_size) : 0xffff, + r16 = expand16 (r, r_size), + g16 = expand16 (g, g_size), + b16 = expand16 (b, b_size); + + dst[i] = a16 << 48 | r16 << 32 | g16 << 16 | b16; } } +/* + * Contracting is easier than expanding. We just need to truncate the + * components. 
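+ * Each 16-bit component keeps only its high byte, so, for example, the
+ * 64-bit pixel 0x123456789abcdef0 contracts to 0x12569ade
+ * (a = 0x12, r = 0x56, g = 0x9a, b = 0xde).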
+ */ void -_pixman_walk_composite_region (pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height, - pixman_bool_t srcRepeat, - pixman_bool_t maskRepeat, - pixman_composite_func_t compositeRect) +pixman_contract (uint32_t * dst, + const uint64_t *src, + int width) { - int n; - const pixman_box32_t *pbox; - int w, h, w_this, h_this; - int x_msk, y_msk, x_src, y_src, x_dst, y_dst; - pixman_region32_t reg; - pixman_region32_t *region; - - pixman_region32_init (®); - if (!pixman_compute_composite_region32 (®, pSrc, pMask, pDst, - xSrc, ySrc, xMask, yMask, xDst, yDst, width, height)) + int i; + + /* Start at the beginning so that we can do the contraction in + * place when src == dst + */ + for (i = 0; i < width; i++) { - return; + const uint8_t a = src[i] >> 56, + r = src[i] >> 40, + g = src[i] >> 24, + b = src[i] >> 8; + + dst[i] = a << 24 | r << 16 | g << 8 | b; } +} - region = ® +static void +walk_region_internal (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int16_t src_x, + int16_t src_y, + int16_t mask_x, + int16_t mask_y, + int16_t dest_x, + int16_t dest_y, + uint16_t width, + uint16_t height, + pixman_bool_t src_repeat, + pixman_bool_t mask_repeat, + pixman_region32_t * region, + pixman_composite_func_t composite_rect) +{ + int n; + const pixman_box32_t *pbox; + int w, h, w_this, h_this; + int x_msk, y_msk, x_src, y_src, x_dst, y_dst; pbox = pixman_region32_rectangles (region, &n); while (n--) { h = pbox->y2 - pbox->y1; - y_src = pbox->y1 - yDst + ySrc; - y_msk = pbox->y1 - yDst + yMask; + y_src = pbox->y1 - dest_y + src_y; + y_msk = pbox->y1 - dest_y + mask_y; y_dst = pbox->y1; + while (h) { h_this = h; w = pbox->x2 - pbox->x1; - x_src = pbox->x1 - xDst + xSrc; - x_msk = pbox->x1 - xDst + xMask; + x_src = pbox->x1 - dest_x + src_x; + x_msk = pbox->x1 - dest_x + mask_x; x_dst = pbox->x1; - if (maskRepeat) + + if (mask_repeat) { - y_msk = MOD (y_msk, pMask->bits.height); - if (h_this > pMask->bits.height - y_msk) - h_this = pMask->bits.height - y_msk; + y_msk = MOD (y_msk, mask_image->bits.height); + if (h_this > mask_image->bits.height - y_msk) + h_this = mask_image->bits.height - y_msk; } - if (srcRepeat) + + if (src_repeat) { - y_src = MOD (y_src, pSrc->bits.height); - if (h_this > pSrc->bits.height - y_src) - h_this = pSrc->bits.height - y_src; + y_src = MOD (y_src, src_image->bits.height); + if (h_this > src_image->bits.height - y_src) + h_this = src_image->bits.height - y_src; } + while (w) { w_this = w; - if (maskRepeat) + + if (mask_repeat) { - x_msk = MOD (x_msk, pMask->bits.width); - if (w_this > pMask->bits.width - x_msk) - w_this = pMask->bits.width - x_msk; + x_msk = MOD (x_msk, mask_image->bits.width); + if (w_this > mask_image->bits.width - x_msk) + w_this = mask_image->bits.width - x_msk; } - if (srcRepeat) + + if (src_repeat) { - x_src = MOD (x_src, pSrc->bits.width); - if (w_this > pSrc->bits.width - x_src) - w_this = pSrc->bits.width - x_src; + x_src = MOD (x_src, src_image->bits.width); + if (w_this > src_image->bits.width - x_src) + w_this = src_image->bits.width - x_src; } - (*compositeRect) (imp, - op, pSrc, pMask, pDst, - x_src, y_src, x_msk, y_msk, x_dst, y_dst, - w_this, h_this); + + (*composite_rect) (imp, op, + src_image, mask_image, dst_image, + x_src, y_src, x_msk, 
y_msk, x_dst, y_dst, + w_this, h_this); w -= w_this; + x_src += w_this; x_msk += w_this; x_dst += w_this; } + h -= h_this; y_src += h_this; y_msk += h_this; y_dst += h_this; } + pbox++; } - pixman_region32_fini (®); +} + +void +_pixman_walk_composite_region (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int16_t src_x, + int16_t src_y, + int16_t mask_x, + int16_t mask_y, + int16_t dest_x, + int16_t dest_y, + uint16_t width, + uint16_t height, + pixman_composite_func_t composite_rect) +{ + pixman_region32_t region; + + pixman_region32_init (®ion); + + if (pixman_compute_composite_region32 ( + ®ion, src_image, mask_image, dst_image, + src_x, src_y, mask_x, mask_y, dest_x, dest_y, + width, height)) + { + walk_region_internal (imp, op, + src_image, mask_image, dst_image, + src_x, src_y, mask_x, mask_y, dest_x, dest_y, + width, height, FALSE, FALSE, + ®ion, + composite_rect); + + pixman_region32_fini (®ion); + } } static pixman_bool_t @@ -537,9 +508,9 @@ mask_is_solid (pixman_image_t *mask) return TRUE; if (mask->type == BITS && - mask->common.repeat == PIXMAN_REPEAT_NORMAL && - mask->bits.width == 1 && - mask->bits.height == 1) + mask->common.repeat == PIXMAN_REPEAT_NORMAL && + mask->bits.width == 1 && + mask->bits.height == 1) { return TRUE; } @@ -547,26 +518,28 @@ mask_is_solid (pixman_image_t *mask) return FALSE; } -static const FastPathInfo * -get_fast_path (const FastPathInfo *fast_paths, - pixman_op_t op, - pixman_image_t *pSrc, - pixman_image_t *pMask, - pixman_image_t *pDst, - pixman_bool_t is_pixbuf) +static const pixman_fast_path_t * +get_fast_path (const pixman_fast_path_t *fast_paths, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + pixman_bool_t is_pixbuf) { - const FastPathInfo *info; + const pixman_fast_path_t *info; for (info = fast_paths; info->op != PIXMAN_OP_NONE; info++) { - pixman_bool_t valid_src = FALSE; - pixman_bool_t valid_mask = FALSE; + pixman_bool_t valid_src = FALSE; + pixman_bool_t valid_mask = FALSE; if (info->op != op) continue; - if ((info->src_format == PIXMAN_solid && pixman_image_can_get_solid (pSrc)) || - (pSrc->type == BITS && info->src_format == pSrc->bits.format)) + if ((info->src_format == PIXMAN_solid && + _pixman_image_is_solid (src_image)) || + (src_image->type == BITS && + info->src_format == src_image->bits.format)) { valid_src = TRUE; } @@ -574,28 +547,29 @@ get_fast_path (const FastPathInfo *fast_paths, if (!valid_src) continue; - if ((info->mask_format == PIXMAN_null && !pMask) || - (pMask && pMask->type == BITS && info->mask_format == pMask->bits.format)) + if ((info->mask_format == PIXMAN_null && !mask_image) || + (mask_image && mask_image->type == BITS && + info->mask_format == mask_image->bits.format)) { valid_mask = TRUE; if (info->flags & NEED_SOLID_MASK) { - if (!pMask || !mask_is_solid (pMask)) + if (!mask_image || !mask_is_solid (mask_image)) valid_mask = FALSE; } if (info->flags & NEED_COMPONENT_ALPHA) { - if (!pMask || !pMask->common.component_alpha) + if (!mask_image || !mask_image->common.component_alpha) valid_mask = FALSE; } } if (!valid_mask) continue; - - if (info->dest_format != pDst->bits.format) + + if (info->dest_format != dst_image->bits.format) continue; if ((info->flags & NEED_PIXBUF) && !is_pixbuf) @@ -607,72 +581,142 @@ get_fast_path (const FastPathInfo *fast_paths, return NULL; } +static force_inline pixman_bool_t +image_covers (pixman_image_t *image, + 
pixman_box32_t *extents, + int x, + int y) +{ + if (image->common.type == BITS && + image->common.repeat == PIXMAN_REPEAT_NONE) + { + if (x > extents->x1 || y > extents->y1 || + x + image->bits.width < extents->x2 || + y + image->bits.height < extents->y2) + { + return FALSE; + } + } + + return TRUE; +} + +static force_inline pixman_bool_t +sources_cover (pixman_image_t *src, + pixman_image_t *mask, + pixman_box32_t *extents, + int src_x, + int src_y, + int mask_x, + int mask_y, + int dest_x, + int dest_y) +{ + if (!image_covers (src, extents, dest_x - src_x, dest_y - src_y)) + return FALSE; + + if (!mask) + return TRUE; + + if (!image_covers (mask, extents, dest_x - mask_x, dest_y - mask_y)) + return FALSE; + + return TRUE; +} + pixman_bool_t -_pixman_run_fast_path (const FastPathInfo *paths, - pixman_implementation_t *imp, - pixman_op_t op, - pixman_image_t *src, - pixman_image_t *mask, - pixman_image_t *dest, - int32_t src_x, - int32_t src_y, - int32_t mask_x, - int32_t mask_y, - int32_t dest_x, - int32_t dest_y, - int32_t width, - int32_t height) +_pixman_run_fast_path (const pixman_fast_path_t *paths, + pixman_implementation_t * imp, + pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) { pixman_composite_func_t func = NULL; - pixman_bool_t src_repeat = src->common.repeat == PIXMAN_REPEAT_NORMAL; - pixman_bool_t mask_repeat = mask && mask->common.repeat == PIXMAN_REPEAT_NORMAL; - - if ((src->type == BITS || pixman_image_can_get_solid (src)) && - (!mask || mask->type == BITS) - && !src->common.transform && !(mask && mask->common.transform) - && !(mask && mask->common.alpha_map) && !src->common.alpha_map && !dest->common.alpha_map - && (src->common.filter != PIXMAN_FILTER_CONVOLUTION) - && (src->common.repeat != PIXMAN_REPEAT_PAD) - && (src->common.repeat != PIXMAN_REPEAT_REFLECT) - && (!mask || (mask->common.filter != PIXMAN_FILTER_CONVOLUTION && - mask->common.repeat != PIXMAN_REPEAT_PAD && - mask->common.repeat != PIXMAN_REPEAT_REFLECT)) - && !src->common.read_func && !src->common.write_func - && !(mask && mask->common.read_func) - && !(mask && mask->common.write_func) - && !dest->common.read_func - && !dest->common.write_func) + pixman_bool_t src_repeat = + src->common.repeat == PIXMAN_REPEAT_NORMAL; + pixman_bool_t mask_repeat = + mask && mask->common.repeat == PIXMAN_REPEAT_NORMAL; + pixman_bool_t result; + pixman_bool_t has_fast_path; + + has_fast_path = !dest->common.alpha_map && + !dest->bits.read_func && + !dest->bits.write_func; + + if (has_fast_path) + { + has_fast_path = (src->type == BITS || _pixman_image_is_solid (src)) && + !src->common.transform && + !src->common.alpha_map && + src->common.filter != PIXMAN_FILTER_CONVOLUTION && + src->common.repeat != PIXMAN_REPEAT_PAD && + src->common.repeat != PIXMAN_REPEAT_REFLECT; + if (has_fast_path && src->type == BITS) + { + has_fast_path = !src->bits.read_func && + !src->bits.write_func && + !PIXMAN_FORMAT_IS_WIDE (src->bits.format); + } + } + + if (mask && has_fast_path) + { + has_fast_path = + mask->type == BITS && + !mask->common.transform && + !mask->common.alpha_map && + !mask->bits.read_func && + !mask->bits.write_func && + mask->common.filter != PIXMAN_FILTER_CONVOLUTION && + mask->common.repeat != PIXMAN_REPEAT_PAD && + mask->common.repeat != PIXMAN_REPEAT_REFLECT && + !PIXMAN_FORMAT_IS_WIDE (mask->bits.format); + } + + if (has_fast_path) { - 
const FastPathInfo *info; + const pixman_fast_path_t *info; pixman_bool_t pixbuf; pixbuf = - src && src->type == BITS && - mask && mask->type == BITS && - src->bits.bits == mask->bits.bits && - src_x == mask_x && - src_y == mask_y && - !mask->common.component_alpha && + src && src->type == BITS && + mask && mask->type == BITS && + src->bits.bits == mask->bits.bits && + src_x == mask_x && + src_y == mask_y && + !mask->common.component_alpha && !mask_repeat; - + info = get_fast_path (paths, op, src, mask, dest, pixbuf); if (info) { func = info->func; - + if (info->src_format == PIXMAN_solid) src_repeat = FALSE; - if (info->mask_format == PIXMAN_solid || info->flags & NEED_SOLID_MASK) + if (info->mask_format == PIXMAN_solid || + info->flags & NEED_SOLID_MASK) + { mask_repeat = FALSE; + } - if ((src_repeat && - src->bits.width == 1 && - src->bits.height == 1) || - (mask_repeat && - mask->bits.width == 1 && - mask->bits.height == 1)) + if ((src_repeat && + src->bits.width == 1 && + src->bits.height == 1) || + (mask_repeat && + mask->bits.width == 1 && + mask->bits.height == 1)) { /* If src or mask are repeating 1x1 images and src_repeat or * mask_repeat are still TRUE, it means the fast path we @@ -687,17 +731,107 @@ _pixman_run_fast_path (const FastPathInfo *paths, } } + result = FALSE; + if (func) { - _pixman_walk_composite_region (imp, op, - src, mask, dest, - src_x, src_y, mask_x, mask_y, - dest_x, dest_y, - width, height, - src_repeat, mask_repeat, - func); - return TRUE; + pixman_region32_t region; + pixman_region32_init (®ion); + + if (pixman_compute_composite_region32 ( + ®ion, src, mask, dest, + src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height)) + { + pixman_box32_t *extents = pixman_region32_extents (®ion); + + if (sources_cover ( + src, mask, extents, + src_x, src_y, mask_x, mask_y, dest_x, dest_y)) + { + walk_region_internal (imp, op, + src, mask, dest, + src_x, src_y, mask_x, mask_y, + dest_x, dest_y, + width, height, + src_repeat, mask_repeat, + ®ion, + func); + + result = TRUE; + } + + pixman_region32_fini (®ion); + } } - - return FALSE; + + return result; +} + +#define N_TMP_BOXES (16) + +pixman_bool_t +pixman_region16_copy_from_region32 (pixman_region16_t *dst, + pixman_region32_t *src) +{ + int n_boxes, i; + pixman_box32_t *boxes32; + pixman_box16_t *boxes16; + pixman_bool_t retval; + + boxes32 = pixman_region32_rectangles (src, &n_boxes); + + boxes16 = pixman_malloc_ab (n_boxes, sizeof (pixman_box16_t)); + + if (!boxes16) + return FALSE; + + for (i = 0; i < n_boxes; ++i) + { + boxes16[i].x1 = boxes32[i].x1; + boxes16[i].y1 = boxes32[i].y1; + boxes16[i].x2 = boxes32[i].x2; + boxes16[i].y2 = boxes32[i].y2; + } + + pixman_region_fini (dst); + retval = pixman_region_init_rects (dst, boxes16, n_boxes); + free (boxes16); + return retval; +} + +pixman_bool_t +pixman_region32_copy_from_region16 (pixman_region32_t *dst, + pixman_region16_t *src) +{ + int n_boxes, i; + pixman_box16_t *boxes16; + pixman_box32_t *boxes32; + pixman_box32_t tmp_boxes[N_TMP_BOXES]; + pixman_bool_t retval; + + boxes16 = pixman_region_rectangles (src, &n_boxes); + + if (n_boxes > N_TMP_BOXES) + boxes32 = pixman_malloc_ab (n_boxes, sizeof (pixman_box32_t)); + else + boxes32 = tmp_boxes; + + if (!boxes32) + return FALSE; + + for (i = 0; i < n_boxes; ++i) + { + boxes32[i].x1 = boxes16[i].x1; + boxes32[i].y1 = boxes16[i].y1; + boxes32[i].x2 = boxes16[i].x2; + boxes32[i].y2 = boxes16[i].y2; + } + + pixman_region32_fini (dst); + retval = pixman_region32_init_rects (dst, boxes32, n_boxes); + + if (boxes32 
!= tmp_boxes) + free (boxes32); + + return retval; } diff --git a/lib/pixman/pixman/pixman-vmx.c b/lib/pixman/pixman/pixman-vmx.c index e371f7f52..06325a7c0 100644 --- a/lib/pixman/pixman/pixman-vmx.c +++ b/lib/pixman/pixman/pixman-vmx.c @@ -33,27 +33,31 @@ #define AVV(x...) {x} static force_inline vector unsigned int -splat_alpha (vector unsigned int pix) { +splat_alpha (vector unsigned int pix) +{ return vec_perm (pix, pix, - (vector unsigned char)AVV(0x00,0x00,0x00,0x00, 0x04,0x04,0x04,0x04, - 0x08,0x08,0x08,0x08, 0x0C,0x0C,0x0C,0x0C)); + (vector unsigned char)AVV ( + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, + 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C)); } static force_inline vector unsigned int pix_multiply (vector unsigned int p, vector unsigned int a) { vector unsigned short hi, lo, mod; + /* unpack to short */ hi = (vector unsigned short) - vec_mergeh ((vector unsigned char)AVV(0), - (vector unsigned char)p); + vec_mergeh ((vector unsigned char)AVV (0), + (vector unsigned char)p); + mod = (vector unsigned short) - vec_mergeh ((vector unsigned char)AVV(0), - (vector unsigned char)a); + vec_mergeh ((vector unsigned char)AVV (0), + (vector unsigned char)a); hi = vec_mladd (hi, mod, (vector unsigned short) - AVV(0x0080,0x0080,0x0080,0x0080, - 0x0080,0x0080,0x0080,0x0080)); + AVV (0x0080, 0x0080, 0x0080, 0x0080, + 0x0080, 0x0080, 0x0080, 0x0080)); hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8))); @@ -61,15 +65,15 @@ pix_multiply (vector unsigned int p, vector unsigned int a) /* unpack to short */ lo = (vector unsigned short) - vec_mergel ((vector unsigned char)AVV(0), - (vector unsigned char)p); + vec_mergel ((vector unsigned char)AVV (0), + (vector unsigned char)p); mod = (vector unsigned short) - vec_mergel ((vector unsigned char)AVV(0), - (vector unsigned char)a); + vec_mergel ((vector unsigned char)AVV (0), + (vector unsigned char)a); lo = vec_mladd (lo, mod, (vector unsigned short) - AVV(0x0080,0x0080,0x0080,0x0080, - 0x0080,0x0080,0x0080,0x0080)); + AVV (0x0080, 0x0080, 0x0080, 0x0080, + 0x0080, 0x0080, 0x0080, 0x0080)); lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8))); @@ -82,63 +86,21 @@ static force_inline vector unsigned int pix_add (vector unsigned int a, vector unsigned int b) { return (vector unsigned int)vec_adds ((vector unsigned char)a, - (vector unsigned char)b); + (vector unsigned char)b); } static force_inline vector unsigned int -pix_add_mul (vector unsigned int x, vector unsigned int a, - vector unsigned int y, vector unsigned int b) +pix_add_mul (vector unsigned int x, + vector unsigned int a, + vector unsigned int y, + vector unsigned int b) { - vector unsigned short hi, lo, mod, hiy, loy, mody; + vector unsigned int t1, t2; - hi = (vector unsigned short) - vec_mergeh ((vector unsigned char)AVV(0), - (vector unsigned char)x); - mod = (vector unsigned short) - vec_mergeh ((vector unsigned char)AVV(0), - (vector unsigned char)a); - hiy = (vector unsigned short) - vec_mergeh ((vector unsigned char)AVV(0), - (vector unsigned char)y); - mody = (vector unsigned short) - vec_mergeh ((vector unsigned char)AVV(0), - (vector unsigned char)b); - - hi = vec_mladd (hi, mod, (vector unsigned short) - AVV(0x0080,0x0080,0x0080,0x0080, - 0x0080,0x0080,0x0080,0x0080)); + t1 = pix_multiply (x, a); + t2 = pix_multiply (y, b); - hi = vec_mladd (hiy, mody, hi); - - hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8))); - - hi = vec_sr (hi, vec_splat_u16 (8)); - - lo = (vector unsigned short) - vec_mergel ((vector unsigned char)AVV(0), - (vector unsigned char)x); - mod = 
(vector unsigned short) - vec_mergel ((vector unsigned char)AVV(0), - (vector unsigned char)a); - - loy = (vector unsigned short) - vec_mergel ((vector unsigned char)AVV(0), - (vector unsigned char)y); - mody = (vector unsigned short) - vec_mergel ((vector unsigned char)AVV(0), - (vector unsigned char)b); - - lo = vec_mladd (lo, mod, (vector unsigned short) - AVV(0x0080,0x0080,0x0080,0x0080, - 0x0080,0x0080,0x0080,0x0080)); - - lo = vec_mladd (loy, mody, lo); - - lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8))); - - lo = vec_sr (lo, vec_splat_u16 (8)); - - return (vector unsigned int)vec_packsu (hi, lo); + return pix_add (t1, t2); } static force_inline vector unsigned int @@ -146,1450 +108,1536 @@ negate (vector unsigned int src) { return vec_nor (src, src); } + /* dest*~srca + src */ static force_inline vector unsigned int -over (vector unsigned int src, vector unsigned int srca, +over (vector unsigned int src, + vector unsigned int srca, vector unsigned int dest) { vector unsigned char tmp = (vector unsigned char) - pix_multiply (dest, negate (srca)); + pix_multiply (dest, negate (srca)); + tmp = vec_adds ((vector unsigned char)src, tmp); return (vector unsigned int)tmp; } /* in == pix_multiply */ -#define in_over(src, srca, mask, dest) over (pix_multiply (src, mask),\ - pix_multiply (srca, mask), dest) +#define in_over(src, srca, mask, dest) \ + over (pix_multiply (src, mask), \ + pix_multiply (srca, mask), dest) -#define COMPUTE_SHIFT_MASK(source) \ +#define COMPUTE_SHIFT_MASK(source) \ source ## _mask = vec_lvsl (0, source); -#define COMPUTE_SHIFT_MASKS(dest, source) \ - dest ## _mask = vec_lvsl (0, dest); \ - source ## _mask = vec_lvsl (0, source); \ +#define COMPUTE_SHIFT_MASKS(dest, source) \ + dest ## _mask = vec_lvsl (0, dest); \ + source ## _mask = vec_lvsl (0, source); \ store_mask = vec_lvsr (0, dest); -#define COMPUTE_SHIFT_MASKC(dest, source, mask) \ - mask ## _mask = vec_lvsl (0, mask); \ - dest ## _mask = vec_lvsl (0, dest); \ - source ## _mask = vec_lvsl (0, source); \ +#define COMPUTE_SHIFT_MASKC(dest, source, mask) \ + mask ## _mask = vec_lvsl (0, mask); \ + dest ## _mask = vec_lvsl (0, dest); \ + source ## _mask = vec_lvsl (0, source); \ store_mask = vec_lvsr (0, dest); /* notice you have to declare temp vars... * Note: tmp3 and tmp4 must remain untouched! 
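 * (tmp3 and tmp4 hold the two aligned loads covering the unaligned
 * destination; STORE_VECTOR re-merges their edge bytes around the
 * computed pixels).  A combiner is therefore expected to declare the
 * full set, as the functions below do:
 *
 *     vector unsigned int vdest, vsrc;
 *     vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
 *                          dest_mask, src_mask, store_mask;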
*/ -#define LOAD_VECTORS(dest, source) \ - tmp1 = (typeof(tmp1))vec_ld(0, source); \ - tmp2 = (typeof(tmp2))vec_ld(15, source); \ - tmp3 = (typeof(tmp3))vec_ld(0, dest); \ - v ## source = (typeof(v ## source)) \ - vec_perm(tmp1, tmp2, source ## _mask); \ - tmp4 = (typeof(tmp4))vec_ld(15, dest); \ - v ## dest = (typeof(v ## dest)) \ - vec_perm(tmp3, tmp4, dest ## _mask); - -#define LOAD_VECTORSC(dest, source, mask) \ - tmp1 = (typeof(tmp1))vec_ld(0, source); \ - tmp2 = (typeof(tmp2))vec_ld(15, source); \ - tmp3 = (typeof(tmp3))vec_ld(0, dest); \ - v ## source = (typeof(v ## source)) \ - vec_perm(tmp1, tmp2, source ## _mask); \ - tmp4 = (typeof(tmp4))vec_ld(15, dest); \ - tmp1 = (typeof(tmp1))vec_ld(0, mask); \ - v ## dest = (typeof(v ## dest)) \ - vec_perm(tmp3, tmp4, dest ## _mask); \ - tmp2 = (typeof(tmp2))vec_ld(15, mask); \ - v ## mask = (typeof(v ## mask)) \ - vec_perm(tmp1, tmp2, mask ## _mask); - -#define LOAD_VECTORSM(dest, source, mask) \ - LOAD_VECTORSC(dest, source, mask) \ - v ## source = pix_multiply(v ## source, \ - splat_alpha (v ## mask)); - -#define STORE_VECTOR(dest) \ - edges = vec_perm (tmp4, tmp3, dest ## _mask); \ - tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \ - tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \ - vec_st ((vector unsigned int) tmp3, 15, dest ); \ - vec_st ((vector unsigned int) tmp1, 0, dest ); - -static FASTCALL void -vmxCombineOverUnomask (uint32_t *dest, const uint32_t *src, int width) +#define LOAD_VECTORS(dest, source) \ + tmp1 = (typeof(tmp1))vec_ld (0, source); \ + tmp2 = (typeof(tmp2))vec_ld (15, source); \ + tmp3 = (typeof(tmp3))vec_ld (0, dest); \ + v ## source = (typeof(v ## source)) \ + vec_perm (tmp1, tmp2, source ## _mask); \ + tmp4 = (typeof(tmp4))vec_ld (15, dest); \ + v ## dest = (typeof(v ## dest)) \ + vec_perm (tmp3, tmp4, dest ## _mask); + +#define LOAD_VECTORSC(dest, source, mask) \ + tmp1 = (typeof(tmp1))vec_ld (0, source); \ + tmp2 = (typeof(tmp2))vec_ld (15, source); \ + tmp3 = (typeof(tmp3))vec_ld (0, dest); \ + v ## source = (typeof(v ## source)) \ + vec_perm (tmp1, tmp2, source ## _mask); \ + tmp4 = (typeof(tmp4))vec_ld (15, dest); \ + tmp1 = (typeof(tmp1))vec_ld (0, mask); \ + v ## dest = (typeof(v ## dest)) \ + vec_perm (tmp3, tmp4, dest ## _mask); \ + tmp2 = (typeof(tmp2))vec_ld (15, mask); \ + v ## mask = (typeof(v ## mask)) \ + vec_perm (tmp1, tmp2, mask ## _mask); + +#define LOAD_VECTORSM(dest, source, mask) \ + LOAD_VECTORSC (dest, source, mask) \ + v ## source = pix_multiply (v ## source, \ + splat_alpha (v ## mask)); + +#define STORE_VECTOR(dest) \ + edges = vec_perm (tmp4, tmp3, dest ## _mask); \ + tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \ + tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \ + vec_st ((vector unsigned int) tmp3, 15, dest); \ + vec_st ((vector unsigned int) tmp1, 0, dest); + +static void +vmx_combine_over_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + for (i = width / 4; i > 0; i--) + { - LOAD_VECTORS(dest, src) + LOAD_VECTORS (dest, src); - vdest = over (vsrc, splat_alpha (vsrc), vdest); + vdest = over (vsrc, splat_alpha (vsrc), vdest); - 
STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t ia = Alpha (~s); + for (i = width % 4; --i >= 0;) + { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t ia = ALPHA_8 (~s); - FbByteMulAdd (d, ia, s); - dest[i] = d; + UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + + dest[i] = d; } } -static FASTCALL void -vmxCombineOverUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_over_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask); + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = over (vsrc, splat_alpha (vsrc), vdest); + vdest = over (vsrc, splat_alpha (vsrc), vdest); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t ia; + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t ia; - FbByteMul (s, m); + UN8x4_MUL_UN8 (s, m); - ia = Alpha (~s); + ia = ALPHA_8 (~s); - FbByteMulAdd (d, ia, s); - dest[i] = d; + UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s); + dest[i] = d; } } -static FASTCALL void -vmxCombineOverU(pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, - int width) +static void +vmx_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineOverUmask(dest, src, mask, width); + vmx_combine_over_u_mask (dest, src, mask, width); else - vmxCombineOverUnomask(dest, src, width); + vmx_combine_over_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineOverReverseUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_over_reverse_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + for (i = width / 4; i > 0; i--) + { - LOAD_VECTORS(dest, src) + LOAD_VECTORS (dest, src); - vdest = over (vdest, splat_alpha (vdest) , vsrc); + vdest = over (vdest, splat_alpha (vdest), vsrc); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t ia = Alpha (~dest[i]); + for (i = width % 4; --i >= 0;) + { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t ia = ALPHA_8 (~dest[i]); - FbByteMulAdd (s, ia, d); - dest[i] = s; + UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); + dest[i] = s; } } -static FASTCALL void 
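
Every combiner in this file bottoms out in the over() primitive defined earlier: per channel, dest' = src + dest * (255 - src_a) / 255, evaluated on premultiplied pixels. The same cleanup lets pix_add_mul collapse into two pix_multiply calls plus a saturating pix_add instead of the open-coded unpack/vec_mladd sequence it replaces. A minimal scalar restatement of one channel follows — illustrative only, not code from the patch; the bias-and-fold idiom is the standard rounding division by 255 that the vector code performs with vec_mladd against 0x0080 followed by the shift-and-add:

    #include <stdint.h>

    /* One premultiplied channel of OVER: dest * (255 - src_a) / 255 + src.
     * Adding 0x80 and folding the high byte back in rounds the division
     * by 255; for valid premultiplied input the final sum cannot exceed
     * 255 (the vector code uses a saturating vec_adds regardless). */
    static uint8_t
    over_channel (uint8_t src, uint8_t src_alpha, uint8_t dest)
    {
        uint16_t t = (uint16_t) dest * (255 - src_alpha) + 0x80;

        return (uint8_t) (src + ((t + (t >> 8)) >> 8));
    }
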
-vmxCombineOverReverseUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_over_reverse_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + for (i = width / 4; i > 0; i--) + { - LOAD_VECTORSM(dest, src, mask) + LOAD_VECTORSM (dest, src, mask); - vdest = over (vdest, splat_alpha (vdest) , vsrc); + vdest = over (vdest, splat_alpha (vdest), vsrc); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t ia = Alpha (~dest[i]); + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t ia = ALPHA_8 (~dest[i]); - FbByteMul (s, m); + UN8x4_MUL_UN8 (s, m); - FbByteMulAdd (s, ia, d); - dest[i] = s; + UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d); + dest[i] = s; } } -static FASTCALL void -vmxCombineOverReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, - const uint32_t *mask, int width) +static void +vmx_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineOverReverseUmask(dest, src, mask, width); + vmx_combine_over_reverse_u_mask (dest, src, mask, width); else - vmxCombineOverReverseUnomask(dest, src, width); + vmx_combine_over_reverse_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineInUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_in_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORS(dest, src) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORS (dest, src); - vdest = pix_multiply (vsrc, splat_alpha (vdest)); + vdest = pix_multiply (vsrc, splat_alpha (vdest)); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { + for (i = width % 4; --i >= 0;) + { + uint32_t s = src[i]; + uint32_t a = ALPHA_8 (dest[i]); - uint32_t s = src[i]; - uint32_t a = Alpha (dest[i]); - FbByteMul (s, a); - dest[i] = s; + UN8x4_MUL_UN8 (s, a); + dest[i] = s; } } -static FASTCALL void -vmxCombineInUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_in_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, 
mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = pix_multiply (vsrc, splat_alpha (vdest)); + vdest = pix_multiply (vsrc, splat_alpha (vdest)); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t s = src[i]; - uint32_t a = Alpha (dest[i]); + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t a = ALPHA_8 (dest[i]); - FbByteMul (s, m); + UN8x4_MUL_UN8 (s, m); + UN8x4_MUL_UN8 (s, a); - FbByteMul (s, a); - dest[i] = s; + dest[i] = s; } } -static FASTCALL void -vmxCombineInU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, - int width) +static void +vmx_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineInUmask(dest, src, mask, width); + vmx_combine_in_u_mask (dest, src, mask, width); else - vmxCombineInUnomask(dest, src, width); + vmx_combine_in_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineInReverseUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_in_reverse_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORS(dest, src) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORS (dest, src); - vdest = pix_multiply (vdest, splat_alpha (vsrc)); + vdest = pix_multiply (vdest, splat_alpha (vsrc)); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t d = dest[i]; - uint32_t a = Alpha (src[i]); - FbByteMul (d, a); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t d = dest[i]; + uint32_t a = ALPHA_8 (src[i]); + + UN8x4_MUL_UN8 (d, a); + + dest[i] = d; } } -static FASTCALL void -vmxCombineInReverseUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_in_reverse_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = pix_multiply (vdest, splat_alpha (vsrc)); + vdest = pix_multiply (vdest, splat_alpha (vsrc)); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t d = dest[i]; - uint32_t a = src[i]; + for (i = width % 4; --i >= 0;) 
+ { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t d = dest[i]; + uint32_t a = src[i]; - FbByteMul (a, m); + UN8x4_MUL_UN8 (a, m); + a = ALPHA_8 (a); + UN8x4_MUL_UN8 (d, a); - a = Alpha (a); - FbByteMul (d, a); - dest[i] = d; + dest[i] = d; } } -static FASTCALL void -vmxCombineInReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, - const uint32_t *mask, int width) +static void +vmx_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineInReverseUmask(dest, src, mask, width); + vmx_combine_in_reverse_u_mask (dest, src, mask, width); else - vmxCombineInReverseUnomask(dest, src, width); + vmx_combine_in_reverse_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineOutUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_out_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORS(dest, src) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORS (dest, src); - vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); + vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t s = src[i]; - uint32_t a = Alpha (~dest[i]); - FbByteMul (s, a); - dest[i] = s; + for (i = width % 4; --i >= 0;) + { + uint32_t s = src[i]; + uint32_t a = ALPHA_8 (~dest[i]); + + UN8x4_MUL_UN8 (s, a); + + dest[i] = s; } } -static FASTCALL void -vmxCombineOutUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_out_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); + vdest = pix_multiply (vsrc, splat_alpha (negate (vdest))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t s = src[i]; - uint32_t a = Alpha (~dest[i]); + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t a = ALPHA_8 (~dest[i]); - FbByteMul (s, m); + UN8x4_MUL_UN8 (s, m); + UN8x4_MUL_UN8 (s, a); - FbByteMul (s, a); - dest[i] = s; + dest[i] = s; } } -static FASTCALL void -vmxCombineOutU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, - int width) +static void +vmx_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const 
uint32_t * mask, + int width) { if (mask) - vmxCombineOutUmask(dest, src, mask, width); + vmx_combine_out_u_mask (dest, src, mask, width); else - vmxCombineOutUnomask(dest, src, width); + vmx_combine_out_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineOutReverseUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_out_reverse_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + for (i = width / 4; i > 0; i--) + { - LOAD_VECTORS(dest, src) + LOAD_VECTORS (dest, src); - vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); + vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t d = dest[i]; - uint32_t a = Alpha (~src[i]); - FbByteMul (d, a); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t d = dest[i]; + uint32_t a = ALPHA_8 (~src[i]); + + UN8x4_MUL_UN8 (d, a); + + dest[i] = d; } } -static FASTCALL void -vmxCombineOutReverseUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_out_reverse_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); + vdest = pix_multiply (vdest, splat_alpha (negate (vsrc))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t d = dest[i]; - uint32_t a = src[i]; + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t d = dest[i]; + uint32_t a = src[i]; - FbByteMul (a, m); + UN8x4_MUL_UN8 (a, m); + a = ALPHA_8 (~a); + UN8x4_MUL_UN8 (d, a); - a = Alpha (~a); - FbByteMul (d, a); - dest[i] = d; + dest[i] = d; } } -static FASTCALL void -vmxCombineOutReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineOutReverseUmask(dest, src, mask, width); + vmx_combine_out_reverse_u_mask (dest, src, mask, width); else - vmxCombineOutReverseUnomask(dest, src, width); + vmx_combine_out_reverse_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineAtopUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_atop_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector 
unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORS(dest, src) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORS (dest, src); - vdest = pix_add_mul (vsrc, splat_alpha (vdest), - vdest, splat_alpha (negate (vsrc))); + vdest = pix_add_mul (vsrc, splat_alpha (vdest), + vdest, splat_alpha (negate (vsrc))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t dest_a = Alpha (d); - uint32_t src_ia = Alpha (~s); + for (i = width % 4; --i >= 0;) + { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t dest_a = ALPHA_8 (d); + uint32_t src_ia = ALPHA_8 (~s); + + UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); - FbByteAddMul (s, dest_a, d, src_ia); - dest[i] = s; + dest[i] = s; } } -static FASTCALL void -vmxCombineAtopUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_atop_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = pix_add_mul (vsrc, splat_alpha (vdest), - vdest, splat_alpha (negate (vsrc))); + vdest = pix_add_mul (vsrc, splat_alpha (vdest), + vdest, splat_alpha (negate (vsrc))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t dest_a = Alpha (d); - uint32_t src_ia; + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t dest_a = ALPHA_8 (d); + uint32_t src_ia; + + UN8x4_MUL_UN8 (s, m); - FbByteMul (s, m); + src_ia = ALPHA_8 (~s); - src_ia = Alpha (~s); + UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia); - FbByteAddMul (s, dest_a, d, src_ia); - dest[i] = s; + dest[i] = s; } } -static FASTCALL void -vmxCombineAtopU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineAtopUmask(dest, src, mask, width); + vmx_combine_atop_u_mask (dest, src, mask, width); else - vmxCombineAtopUnomask(dest, src, width); + vmx_combine_atop_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineAtopReverseUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_atop_reverse_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - 
dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORS(dest, src) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORS (dest, src); - vdest = pix_add_mul (vdest, splat_alpha (vsrc), - vsrc, splat_alpha (negate (vdest))); + vdest = pix_add_mul (vdest, splat_alpha (vsrc), + vsrc, splat_alpha (negate (vdest))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t src_a = Alpha (s); - uint32_t dest_ia = Alpha (~d); + for (i = width % 4; --i >= 0;) + { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t src_a = ALPHA_8 (s); + uint32_t dest_ia = ALPHA_8 (~d); + + UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); - FbByteAddMul (s, dest_ia, d, src_a); - dest[i] = s; + dest[i] = s; } } -static FASTCALL void -vmxCombineAtopReverseUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_atop_reverse_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = pix_add_mul (vdest, splat_alpha (vsrc), - vsrc, splat_alpha (negate (vdest))); + vdest = pix_add_mul (vdest, splat_alpha (vsrc), + vsrc, splat_alpha (negate (vdest))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t src_a; - uint32_t dest_ia = Alpha (~d); + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t src_a; + uint32_t dest_ia = ALPHA_8 (~d); + + UN8x4_MUL_UN8 (s, m); - FbByteMul (s, m); + src_a = ALPHA_8 (s); - src_a = Alpha (s); + UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a); - FbByteAddMul (s, dest_ia, d, src_a); - dest[i] = s; + dest[i] = s; } } -static FASTCALL void -vmxCombineAtopReverseU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineAtopReverseUmask(dest, src, mask, width); + vmx_combine_atop_reverse_u_mask (dest, src, mask, width); else - vmxCombineAtopReverseUnomask(dest, src, width); + vmx_combine_atop_reverse_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineXorUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_xor_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, 
src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORS (dest, src) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORS (dest, src); - vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), - vdest, splat_alpha (negate (vsrc))); + vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), + vdest, splat_alpha (negate (vsrc))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t src_ia = Alpha (~s); - uint32_t dest_ia = Alpha (~d); + for (i = width % 4; --i >= 0;) + { + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t src_ia = ALPHA_8 (~s); + uint32_t dest_ia = ALPHA_8 (~d); + + UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); - FbByteAddMul (s, dest_ia, d, src_ia); - dest[i] = s; + dest[i] = s; } } -static FASTCALL void -vmxCombineXorUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_xor_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), - vdest, splat_alpha (negate (vsrc))); + vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)), + vdest, splat_alpha (negate (vsrc))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t src_ia; - uint32_t dest_ia = Alpha (~d); + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t src_ia; + uint32_t dest_ia = ALPHA_8 (~d); + + UN8x4_MUL_UN8 (s, m); - FbByteMul (s, m); + src_ia = ALPHA_8 (~s); - src_ia = Alpha (~s); + UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia); - FbByteAddMul (s, dest_ia, d, src_ia); - dest[i] = s; + dest[i] = s; } } -static FASTCALL void -vmxCombineXorU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineXorUmask(dest, src, mask, width); + vmx_combine_xor_u_mask (dest, src, mask, width); else - vmxCombineXorUnomask(dest, src, width); + vmx_combine_xor_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineAddUnomask (uint32_t *dest, const uint32_t *src, int width) +static void +vmx_combine_add_u_no_mask (uint32_t * dest, + const uint32_t *src, + int width) { int i; - vector unsigned int vdest, vsrc; + vector unsigned int vdest, vsrc; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, store_mask; + dest_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKS(dest, src) + 
COMPUTE_SHIFT_MASKS (dest, src); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORS(dest, src) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORS (dest, src); - vdest = pix_add (vsrc, vdest); + vdest = pix_add (vsrc, vdest); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t s = src[i]; - uint32_t d = dest[i]; - FbByteAdd (d, s); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t s = src[i]; + uint32_t d = dest[i]; + + UN8x4_ADD_UN8x4 (d, s); + + dest[i] = d; } } -static FASTCALL void -vmxCombineAddUmask (uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_add_u_mask (uint32_t * dest, + const uint32_t *src, + const uint32_t *mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, src_mask, mask_mask, store_mask; + dest_mask, src_mask, mask_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSM(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSM (dest, src, mask); - vdest = pix_add (vsrc, vdest); + vdest = pix_add (vsrc, vdest); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t m = Alpha (mask[i]); - uint32_t s = src[i]; - uint32_t d = dest[i]; + for (i = width % 4; --i >= 0;) + { + uint32_t m = ALPHA_8 (mask[i]); + uint32_t s = src[i]; + uint32_t d = dest[i]; - FbByteMul (s, m); + UN8x4_MUL_UN8 (s, m); + UN8x4_ADD_UN8x4 (d, s); - FbByteAdd (d, s); - dest[i] = d; + dest[i] = d; } } -static FASTCALL void -vmxCombineAddU (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, - const uint32_t *src, - const uint32_t *mask, - int width) +static void +vmx_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { if (mask) - vmxCombineAddUmask(dest, src, mask, width); + vmx_combine_add_u_mask (dest, src, mask, width); else - vmxCombineAddUnomask(dest, src, width); + vmx_combine_add_u_no_mask (dest, src, width); } -static FASTCALL void -vmxCombineSrcC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask); - /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + COMPUTE_SHIFT_MASKC (dest, src, mask); - LOAD_VECTORSC(dest, src, mask) + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - vdest = pix_multiply (vsrc, vmask); + vdest = pix_multiply (vsrc, vmask); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - mask+=4; - src+=4; - dest+=4; + mask += 4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - 
FbByteMulC (s, a); - dest[i] = s; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + + UN8x4_MUL_UN8x4 (s, a); + + dest[i] = s; } } -static FASTCALL void -vmxCombineOverC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask); - /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + COMPUTE_SHIFT_MASKC (dest, src, mask); - LOAD_VECTORSC(dest, src, mask) + /* printf ("%s\n",__PRETTY_FUNCTION__); */ + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest); + vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - mask+=4; - src+=4; - dest+=4; + mask += 4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; - FbByteMulC (s, a); - FbByteMulAddC (d, ~a, s); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = ALPHA_8 (s); + + UN8x4_MUL_UN8x4 (s, a); + UN8x4_MUL_UN8 (a, sa); + UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s); + + dest[i] = d; } } -static FASTCALL void -vmxCombineOverReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask); - /* printf("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + COMPUTE_SHIFT_MASKC (dest, src, mask); - LOAD_VECTORSC (dest, src, mask) + /* printf("%s\n",__PRETTY_FUNCTION__); */ + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask)); + vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask)); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - mask+=4; - src+=4; - dest+=4; + mask += 4; + src += 4; + dest += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t da = Alpha (d); - FbByteMulC (s, a); - FbByteMulAddC (s, ~da, d); - dest[i] = s; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t ida = ALPHA_8 (~d); + + UN8x4_MUL_UN8x4 (s, a); + UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d); + + dest[i] = s; } } -static FASTCALL void -vmxCombineInC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const 
uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSC(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest)); + vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest)); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t da = Alpha (dest[i]); - FbByteMul (s, a); - FbByteMul (s, da); - dest[i] = s; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t da = ALPHA_8 (dest[i]); + + UN8x4_MUL_UN8x4 (s, a); + UN8x4_MUL_UN8 (s, da); + + dest[i] = s; } } -static FASTCALL void -vmxCombineInReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + for (i = width / 4; i > 0; i--) + { - LOAD_VECTORSC(dest, src, mask) + LOAD_VECTORSC (dest, src, mask); - vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc))); + vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t d = dest[i]; - uint32_t sa = Alpha (src[i]); - FbByteMul (a, sa); - FbByteMulC (d, a); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t d = dest[i]; + uint32_t sa = ALPHA_8 (src[i]); + + UN8x4_MUL_UN8 (a, sa); + UN8x4_MUL_UN8x4 (d, a); + + dest[i] = d; } } -static FASTCALL void -vmxCombineOutC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSC(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest)); + vdest = pix_multiply ( + pix_multiply 
(vsrc, vmask), splat_alpha (negate (vdest))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t da = Alpha (~d); - FbByteMulC (s, a); - FbByteMulC (s, da); - dest[i] = s; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t da = ALPHA_8 (~d); + + UN8x4_MUL_UN8x4 (s, a); + UN8x4_MUL_UN8 (s, da); + + dest[i] = s; } } -static FASTCALL void -vmxCombineOutReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSC(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - vdest = pix_multiply (vdest, - negate (pix_multiply (vmask, splat_alpha (vsrc)))); + vdest = pix_multiply ( + vdest, negate (pix_multiply (vmask, splat_alpha (vsrc)))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t sa = Alpha (s); - FbByteMulC (a, sa); - FbByteMulC (d, ~a); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = ALPHA_8 (s); + + UN8x4_MUL_UN8 (a, sa); + UN8x4_MUL_UN8x4 (d, ~a); + + dest[i] = d; } } -static FASTCALL void -vmxCombineAtopC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask, vsrca; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); + + vsrca = splat_alpha (vsrc); - LOAD_VECTORSC(dest, src, mask) + vsrc = pix_multiply (vsrc, vmask); + vmask = pix_multiply (vmask, vsrca); - vdest = pix_add_mul (pix_multiply (vsrc, vmask), splat_alpha (vdest), - vdest, - negate (pix_multiply (vmask, - splat_alpha (vmask)))); + vdest = pix_add_mul (vsrc, splat_alpha (vdest), + negate (vmask), vdest); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t sa = Alpha (s); - uint32_t da = Alpha (d); - - FbByteMulC (s, a); - FbByteMul (a, 
sa); - FbByteAddMulC (d, ~a, s, da); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = ALPHA_8 (s); + uint32_t da = ALPHA_8 (d); + + UN8x4_MUL_UN8x4 (s, a); + UN8x4_MUL_UN8 (a, sa); + UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); + + dest[i] = d; } } -static FASTCALL void -vmxCombineAtopReverseC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSC(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - vdest = pix_add_mul (vdest, - pix_multiply (vmask, splat_alpha (vsrc)), - pix_multiply (vsrc, vmask), - negate (splat_alpha (vdest))); + vdest = pix_add_mul (vdest, + pix_multiply (vmask, splat_alpha (vsrc)), + pix_multiply (vsrc, vmask), + negate (splat_alpha (vdest))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t sa = Alpha (s); - uint32_t da = Alpha (d); - - FbByteMulC (s, a); - FbByteMul (a, sa); - FbByteAddMulC (d, a, s, ~da); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = ALPHA_8 (s); + uint32_t da = ALPHA_8 (~d); + + UN8x4_MUL_UN8x4 (s, a); + UN8x4_MUL_UN8 (a, sa); + UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da); + + dest[i] = d; } } -static FASTCALL void -vmxCombineXorC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSC(dest, src, mask) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - vdest = pix_add_mul (vdest, - negate (pix_multiply (vmask, splat_alpha (vsrc))), - pix_multiply (vsrc, vmask), - negate (splat_alpha (vdest))); + vdest = pix_add_mul (vdest, + negate (pix_multiply (vmask, splat_alpha (vsrc))), + pix_multiply (vsrc, vmask), + negate (splat_alpha (vdest))); - STORE_VECTOR(dest) + STORE_VECTOR (dest); - src+=4; - dest+=4; - mask+=4; + src += 4; + dest += 4; + mask += 4; } - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; - uint32_t sa = Alpha (s); - uint32_t da = Alpha (d); - - FbByteMulC (s, a); - FbByteMul (a, sa); - 
FbByteAddMulC (d, ~a, s, ~da); - dest[i] = d; + for (i = width % 4; --i >= 0;) + { + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; + uint32_t sa = ALPHA_8 (s); + uint32_t da = ALPHA_8 (~d); + + UN8x4_MUL_UN8x4 (s, a); + UN8x4_MUL_UN8 (a, sa); + UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da); + + dest[i] = d; } } -static FASTCALL void -vmxCombineAddC (pixman_implementation_t *imp, pixman_op_t op, - uint32_t *dest, const uint32_t *src, const uint32_t *mask, int width) +static void +vmx_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) { int i; - vector unsigned int vdest, vsrc, vmask; + vector unsigned int vdest, vsrc, vmask; vector unsigned char tmp1, tmp2, tmp3, tmp4, edges, - dest_mask, mask_mask, src_mask, store_mask; + dest_mask, mask_mask, src_mask, store_mask; - COMPUTE_SHIFT_MASKC(dest, src, mask) + COMPUTE_SHIFT_MASKC (dest, src, mask); /* printf ("%s\n",__PRETTY_FUNCTION__); */ - for (i = width/4; i > 0; i--) { - - LOAD_VECTORSC(dest, src, mask) - - vdest = pix_add (pix_multiply (vsrc, vmask), vdest); - - STORE_VECTOR(dest) + for (i = width / 4; i > 0; i--) + { + LOAD_VECTORSC (dest, src, mask); - src+=4; - dest+=4; - mask+=4; - } + vdest = pix_add (pix_multiply (vsrc, vmask), vdest); - for (i = width%4; --i >=0;) { - uint32_t a = mask[i]; - uint32_t s = src[i]; - uint32_t d = dest[i]; + STORE_VECTOR (dest); - FbByteMulC (s, a); - FbByteAdd (s, d); - dest[i] = s; + src += 4; + dest += 4; + mask += 4; } -} - - -#if 0 -void -fbCompositeSolid_nx8888vmx (pixman_operator_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src; - uint32_t *dstLine, *dst; - int dstStride; - - fbComposeGetSolid (pSrc, pDst, src); - - if (src >> 24 == 0) - return; - fbComposeGetStart (pDst, xDst, yDst, uint32_t, dstStride, dstLine, 1); - - while (height--) + for (i = width % 4; --i >= 0;) { - dst = dstLine; - dstLine += dstStride; - /* XXX vmxCombineOverU (dst, src, width); */ - } -} + uint32_t a = mask[i]; + uint32_t s = src[i]; + uint32_t d = dest[i]; -void -fbCompositeSolid_nx0565vmx (pixman_operator_t op, - pixman_image_t * pSrc, - pixman_image_t * pMask, - pixman_image_t * pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, - uint16_t width, - uint16_t height) -{ - uint32_t src; - uint16_t *dstLine, *dst; - uint16_t w; - int dstStride; - - fbComposeGetSolid (pSrc, pDst, src); - - if (src >> 24 == 0) - return; + UN8x4_MUL_UN8x4 (s, a); + UN8x4_ADD_UN8x4 (s, d); - fbComposeGetStart (pDst, xDst, yDst, uint16_t, dstStride, dstLine, 1); - - while (height--) - { - dst = dstLine; - dstLine += dstStride; - vmxCombineOverU565(dst, src, width); + dest[i] = s; } } -static const FastPathInfo vmx_fast_path_array[] = -{ - { PIXMAN_OP_NONE }, -}; - -const FastPathInfo *const vmx_fast_paths = vmx_fast_path_array; - -#endif - pixman_implementation_t * -_pixman_implementation_create_vmx (pixman_implementation_t *toplevel) +_pixman_implementation_create_vmx (void) { - pixman_implementation_t *fast = _pixman_implementation_create_fast_path (NULL); - pixman_implementation_t *imp = _pixman_implementation_create (toplevel, fast); + pixman_implementation_t *fast = _pixman_implementation_create_fast_path (); + pixman_implementation_t *imp = _pixman_implementation_create (fast); /* 
Set up function pointers */ - - /* SSE code patch for fbcompose.c */ - imp->combine_32[PIXMAN_OP_OVER] = vmxCombineOverU; - imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseU; - imp->combine_32[PIXMAN_OP_IN] = vmxCombineInU; - imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseU; - imp->combine_32[PIXMAN_OP_OUT] = vmxCombineOutU; - imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseU; - imp->combine_32[PIXMAN_OP_ATOP] = vmxCombineAtopU; - imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseU; - imp->combine_32[PIXMAN_OP_XOR] = vmxCombineXorU; - - imp->combine_32[PIXMAN_OP_ADD] = vmxCombineAddU; - - imp->combine_32_ca[PIXMAN_OP_SRC] = vmxCombineSrcC; - imp->combine_32_ca[PIXMAN_OP_OVER] = vmxCombineOverC; - imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmxCombineOverReverseC; - imp->combine_32_ca[PIXMAN_OP_IN] = vmxCombineInC; - imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmxCombineInReverseC; - imp->combine_32_ca[PIXMAN_OP_OUT] = vmxCombineOutC; - imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmxCombineOutReverseC; - imp->combine_32_ca[PIXMAN_OP_ATOP] = vmxCombineAtopC; - imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmxCombineAtopReverseC; - imp->combine_32_ca[PIXMAN_OP_XOR] = vmxCombineXorC; - imp->combine_32_ca[PIXMAN_OP_ADD] = vmxCombineAddC; - + + imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u; + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u; + imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u; + imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u; + imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u; + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u; + imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u; + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u; + imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u; + + imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u; + + imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca; + imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca; + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca; + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca; + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca; + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca; + imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca; + imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca; + return imp; } - diff --git a/lib/pixman/pixman/pixman-x64-mmx-emulation.h b/lib/pixman/pixman/pixman-x64-mmx-emulation.h new file mode 100644 index 000000000..378019cf2 --- /dev/null +++ b/lib/pixman/pixman/pixman-x64-mmx-emulation.h @@ -0,0 +1,263 @@ +#ifndef MMX_X64_H_INCLUDED +#define MMX_X64_H_INCLUDED + +/* Implementation of x64 MMX substitition functions, before + * pixman is reimplemented not to use __m64 type on Visual C++ + * + * Copyright (C)2009 by George Yohng + * Released in public domain. 
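
The emulation header that follows exists because 64-bit Visual C++ provides no MMX intrinsics at all, so each __m64 operation is re-expressed as ordinary 64-bit integer arithmetic, mostly SWAR (SIMD within a register). The saturating byte add is the densest of these tricks; restated as a self-contained function with the same logic as the _m_paddusb shim below (renamed here for illustration):

    #include <stdint.h>

    /* Eight byte-wise saturating adds in two 64-bit additions.  Even and
     * odd bytes are widened into alternating 16-bit lanes, added, and any
     * lane that carried into bit 8 is forced to 0xFF before the lanes are
     * re-interleaved. */
    static uint64_t
    paddusb_swar (uint64_t a, uint64_t b)
    {
        uint64_t even = (a & 0x00FF00FF00FF00FFULL) +
                        (b & 0x00FF00FF00FF00FFULL);
        uint64_t odd  = ((a >> 8) & 0x00FF00FF00FF00FFULL) +
                        ((b >> 8) & 0x00FF00FF00FF00FFULL);

        /* lanes whose sum overflowed a byte saturate to 0xFF */
        even |= ((even & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF;
        odd  |= ((odd  & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF;

        return (even & 0x00FF00FF00FF00FFULL) |
               ((odd & 0x00FF00FF00FF00FFULL) << 8);
    }

Splitting even and odd bytes into alternating 16-bit lanes leaves headroom for the carry bit, so all eight saturations are detected without branching.
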
+ */ + +#include <intrin.h> + +#define M64C(a) (*(const __m64 *)(&a)) +#define M64U(a) (*(const unsigned long long *)(&a)) + +__inline __m64 +_m_from_int (int a) +{ + long long i64 = a; + + return M64C (i64); +} + +__inline __m64 +_mm_setzero_si64 () +{ + long long i64 = 0; + + return M64C (i64); +} + +__inline __m64 +_mm_set_pi32 (int i1, int i0) +{ + unsigned long long i64 = ((unsigned)i0) + (((unsigned long long)(unsigned)i1) << 32); + + return M64C (i64); +} + +__inline void +_m_empty () +{ +} + +__inline __m64 +_mm_set1_pi16 (short w) +{ + unsigned long long i64 = ((unsigned long long)(unsigned short)(w)) * 0x0001000100010001ULL; + + return M64C (i64); +} + +__inline int +_m_to_int (__m64 m) +{ + return m.m64_i32[0]; +} + +__inline __m64 +_mm_movepi64_pi64 (__m128i a) +{ + return M64C (a.m128i_i64[0]); +} + +__inline __m64 +_m_pand (__m64 a, __m64 b) +{ + unsigned long long i64 = M64U (a) & M64U (b); + + return M64C (i64); +} + +__inline __m64 +_m_por (__m64 a, __m64 b) +{ + unsigned long long i64 = M64U (a) | M64U (b); + + return M64C (i64); +} + +__inline __m64 +_m_pxor (__m64 a, __m64 b) +{ + unsigned long long i64 = M64U (a) ^ M64U (b); + + return M64C (i64); +} + +__inline __m64 +_m_pmulhuw (__m64 a, __m64 b) /* unoptimized */ +{ + unsigned short d[4] = + { + (unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]) >> 16), + (unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]) >> 16), + (unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]) >> 16), + (unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]) >> 16) + }; + + return M64C (d[0]); +} + +__inline __m64 +_m_pmullw2 (__m64 a, __m64 b) /* unoptimized */ +{ + unsigned short d[4] = + { + (unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0])), + (unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1])), + (unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2])), + (unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3])) + }; + + return M64C (d[0]); +} + +__inline __m64 +_m_pmullw (__m64 a, __m64 b) /* unoptimized */ +{ + unsigned long long x = + ((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]))) + + (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]))) << 16) + + (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]))) << 32) + + (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]))) << 48); + + return M64C (x); +} + +__inline __m64 +_m_paddusb (__m64 a, __m64 b) /* unoptimized */ +{ + unsigned long long x = (M64U (a) & 0x00FF00FF00FF00FFULL) + + (M64U (b) & 0x00FF00FF00FF00FFULL); + + unsigned long long y = ((M64U (a) >> 8) & 0x00FF00FF00FF00FFULL) + + ((M64U (b) >> 8) & 0x00FF00FF00FF00FFULL); + + x |= ((x & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF; + y |= ((y & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF; + + x = (x & 0x00FF00FF00FF00FFULL) | ((y & 0x00FF00FF00FF00FFULL) << 8); + + return M64C (x); +} + +__inline __m64 +_m_paddusw (__m64 a, __m64 b) /* unoptimized */ +{ + unsigned long long x = (M64U (a) & 0x0000FFFF0000FFFFULL) + + (M64U (b) & 0x0000FFFF0000FFFFULL); + + unsigned long long y = ((M64U (a) >> 16) & 0x0000FFFF0000FFFFULL) + + ((M64U (b) >> 16) & 0x0000FFFF0000FFFFULL); + + x |= ((x & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF; + y |= ((y & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF; + + x = (x & 0x0000FFFF0000FFFFULL) | ((y & 0x0000FFFF0000FFFFULL) << 16); + + return M64C (x); +} + +__inline __m64 +_m_pshufw (__m64 a, int n) /* unoptimized */ +{ + unsigned short d[4] = + { + a.m64_u16[n & 3], + 
a.m64_u16[(n >> 2) & 3], + a.m64_u16[(n >> 4) & 3], + a.m64_u16[(n >> 6) & 3] + }; + + return M64C (d[0]); +} + +__inline unsigned char +sat16 (unsigned short d) +{ + if (d > 0xFF) return 0xFF; + else return d & 0xFF; +} + +__inline __m64 +_m_packuswb (__m64 m1, __m64 m2) /* unoptimized */ +{ + unsigned char d[8] = + { + sat16 (m1.m64_u16[0]), + sat16 (m1.m64_u16[1]), + sat16 (m1.m64_u16[2]), + sat16 (m1.m64_u16[3]), + sat16 (m2.m64_u16[0]), + sat16 (m2.m64_u16[1]), + sat16 (m2.m64_u16[2]), + sat16 (m2.m64_u16[3]) + }; + + return M64C (d[0]); +} + +__inline __m64 _m_punpcklbw (__m64 m1, __m64 m2) /* unoptimized */ +{ + unsigned char d[8] = + { + m1.m64_u8[0], + m2.m64_u8[0], + m1.m64_u8[1], + m2.m64_u8[1], + m1.m64_u8[2], + m2.m64_u8[2], + m1.m64_u8[3], + m2.m64_u8[3], + }; + + return M64C (d[0]); +} + +__inline __m64 _m_punpckhbw (__m64 m1, __m64 m2) /* unoptimized */ +{ + unsigned char d[8] = + { + m1.m64_u8[4], + m2.m64_u8[4], + m1.m64_u8[5], + m2.m64_u8[5], + m1.m64_u8[6], + m2.m64_u8[6], + m1.m64_u8[7], + m2.m64_u8[7], + }; + + return M64C (d[0]); +} + +__inline __m64 _m_psrlwi (__m64 a, int n) /* unoptimized */ +{ + unsigned short d[4] = + { + a.m64_u16[0] >> n, + a.m64_u16[1] >> n, + a.m64_u16[2] >> n, + a.m64_u16[3] >> n + }; + + return M64C (d[0]); +} + +__inline __m64 _m_psrlqi (__m64 m, int n) +{ + unsigned long long x = M64U (m) >> n; + + return M64C (x); +} + +__inline __m64 _m_psllqi (__m64 m, int n) +{ + unsigned long long x = M64U (m) << n; + + return M64C (x); +} + +#endif /* MMX_X64_H_INCLUDED */ diff --git a/lib/pixman/pixman/pixman.c b/lib/pixman/pixman/pixman.c new file mode 100644 index 000000000..0edd967cf --- /dev/null +++ b/lib/pixman/pixman/pixman.c @@ -0,0 +1,543 @@ +/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */ +/* + * Copyright © 2000 SuSE, Inc. + * Copyright © 2007 Red Hat, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software and its + * documentation for any purpose is hereby granted without fee, provided that + * the above copyright notice appear in all copies and that both that + * copyright notice and this permission notice appear in supporting + * documentation, and that the name of SuSE not be used in advertising or + * publicity pertaining to distribution of the software without specific, + * written prior permission. SuSE makes no representations about the + * suitability of this software for any purpose. It is provided "as is" + * without express or implied warranty. + * + * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE + * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Author: Keith Packard, SuSE, Inc. 
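+ * + * This file holds the top-level entry points: pixman_image_composite() validates its images, strength-reduces the operator when the source or destination is opaque, applies the clip-region workaround for old X servers, and then dispatches to the implementation returned by _pixman_choose_implementation(); pixman_blt() and pixman_fill() dispatch directly.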
+ */ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif +#include "pixman-private.h" + +/* + * Operator optimizations based on source or destination opacity + */ +typedef struct +{ + pixman_op_t op; + pixman_op_t op_src_dst_opaque; + pixman_op_t op_src_opaque; + pixman_op_t op_dst_opaque; +} optimized_operator_info_t; + +static const optimized_operator_info_t optimized_operators[] = +{ + /* Input Operator SRC&DST Opaque SRC Opaque DST Opaque */ + { PIXMAN_OP_OVER, PIXMAN_OP_SRC, PIXMAN_OP_SRC, PIXMAN_OP_OVER }, + { PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_DST, PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_DST }, + { PIXMAN_OP_IN, PIXMAN_OP_SRC, PIXMAN_OP_IN, PIXMAN_OP_SRC }, + { PIXMAN_OP_IN_REVERSE, PIXMAN_OP_DST, PIXMAN_OP_DST, PIXMAN_OP_IN_REVERSE }, + { PIXMAN_OP_OUT, PIXMAN_OP_CLEAR, PIXMAN_OP_OUT, PIXMAN_OP_CLEAR }, + { PIXMAN_OP_OUT_REVERSE, PIXMAN_OP_CLEAR, PIXMAN_OP_CLEAR, PIXMAN_OP_OUT_REVERSE }, + { PIXMAN_OP_ATOP, PIXMAN_OP_SRC, PIXMAN_OP_IN, PIXMAN_OP_OVER }, + { PIXMAN_OP_ATOP_REVERSE, PIXMAN_OP_DST, PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_IN_REVERSE }, + { PIXMAN_OP_XOR, PIXMAN_OP_CLEAR, PIXMAN_OP_OUT, PIXMAN_OP_OUT_REVERSE }, + { PIXMAN_OP_SATURATE, PIXMAN_OP_DST, PIXMAN_OP_OVER_REVERSE, PIXMAN_OP_DST }, + { PIXMAN_OP_NONE } +}; + +static pixman_implementation_t *imp; + +/* + * Check if the current operator could be optimized + */ +static const optimized_operator_info_t* +pixman_operator_can_be_optimized (pixman_op_t op) +{ + const optimized_operator_info_t *info; + + for (info = optimized_operators; info->op != PIXMAN_OP_NONE; info++) + { + if (info->op == op) + return info; + } + return NULL; +} + +/* + * Optimize the current operator based on opacity of source or destination + * The output operator should be mathematically equivalent to the source. + */ +static pixman_op_t +pixman_optimize_operator (pixman_op_t op, + pixman_image_t *src_image, + pixman_image_t *mask_image, + pixman_image_t *dst_image) +{ + pixman_bool_t is_source_opaque; + pixman_bool_t is_dest_opaque; + const optimized_operator_info_t *info = pixman_operator_can_be_optimized (op); + + if (!info || mask_image) + return op; + + is_source_opaque = _pixman_image_is_opaque (src_image); + is_dest_opaque = _pixman_image_is_opaque (dst_image); + + if (is_source_opaque == FALSE && is_dest_opaque == FALSE) + return op; + + if (is_source_opaque && is_dest_opaque) + return info->op_src_dst_opaque; + else if (is_source_opaque) + return info->op_src_opaque; + else if (is_dest_opaque) + return info->op_dst_opaque; + + return op; + +} + +static void +apply_workaround (pixman_image_t *image, + int16_t * x, + int16_t * y, + uint32_t ** save_bits, + int * save_dx, + int * save_dy) +{ + /* Some X servers generate images that point to the + * wrong place in memory, but then set the clip region + * to point to the right place. Because of an old bug + * in pixman, this would actually work. 
+ * + * Here we try and undo the damage + */ + int bpp = PIXMAN_FORMAT_BPP (image->bits.format) / 8; + pixman_box32_t *extents; + uint8_t *t; + int dx, dy; + + extents = pixman_region32_extents (&(image->common.clip_region)); + dx = extents->x1; + dy = extents->y1; + + *save_bits = image->bits.bits; + + *x -= dx; + *y -= dy; + pixman_region32_translate (&(image->common.clip_region), -dx, -dy); + + t = (uint8_t *)image->bits.bits; + t += dy * image->bits.rowstride * 4 + dx * bpp; + image->bits.bits = (uint32_t *)t; + + *save_dx = dx; + *save_dy = dy; +} + +static void +unapply_workaround (pixman_image_t *image, uint32_t *bits, int dx, int dy) +{ + image->bits.bits = bits; + pixman_region32_translate (&image->common.clip_region, dx, dy); +} + +PIXMAN_EXPORT void +pixman_image_composite (pixman_op_t op, + pixman_image_t * src, + pixman_image_t * mask, + pixman_image_t * dest, + int16_t src_x, + int16_t src_y, + int16_t mask_x, + int16_t mask_y, + int16_t dest_x, + int16_t dest_y, + uint16_t width, + uint16_t height) +{ + uint32_t *src_bits; + int src_dx, src_dy; + uint32_t *mask_bits; + int mask_dx, mask_dy; + uint32_t *dest_bits; + int dest_dx, dest_dy; + + _pixman_image_validate (src); + if (mask) + _pixman_image_validate (mask); + _pixman_image_validate (dest); + + /* + * Check if we can replace our operator by a simpler one + * if the src or dest are opaque. The output operator should be + * mathematically equivalent to the source. + */ + op = pixman_optimize_operator(op, src, mask, dest); + if (op == PIXMAN_OP_DST || + op == PIXMAN_OP_CONJOINT_DST || + op == PIXMAN_OP_DISJOINT_DST) + { + return; + } + + if (!imp) + imp = _pixman_choose_implementation (); + + if (src->common.need_workaround) + apply_workaround (src, &src_x, &src_y, &src_bits, &src_dx, &src_dy); + if (mask && mask->common.need_workaround) + apply_workaround (mask, &mask_x, &mask_y, &mask_bits, &mask_dx, &mask_dy); + if (dest->common.need_workaround) + apply_workaround (dest, &dest_x, &dest_y, &dest_bits, &dest_dx, &dest_dy); + + _pixman_implementation_composite (imp, op, + src, mask, dest, + src_x, src_y, + mask_x, mask_y, + dest_x, dest_y, + width, height); + + if (src->common.need_workaround) + unapply_workaround (src, src_bits, src_dx, src_dy); + if (mask && mask->common.need_workaround) + unapply_workaround (mask, mask_bits, mask_dx, mask_dy); + if (dest->common.need_workaround) + unapply_workaround (dest, dest_bits, dest_dx, dest_dy); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_blt (uint32_t *src_bits, + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + if (!imp) + imp = _pixman_choose_implementation (); + + return _pixman_implementation_blt (imp, src_bits, dst_bits, src_stride, dst_stride, + src_bpp, dst_bpp, + src_x, src_y, + dst_x, dst_y, + width, height); +} + +PIXMAN_EXPORT pixman_bool_t +pixman_fill (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!imp) + imp = _pixman_choose_implementation (); + + return _pixman_implementation_fill (imp, bits, stride, bpp, x, y, width, height, xor); +} + +static uint32_t +color_to_uint32 (const pixman_color_t *color) +{ + return + (color->alpha >> 8 << 24) | + (color->red >> 8 << 16) | + (color->green & 0xff00) | + (color->blue >> 8); +} + +static pixman_bool_t +color_to_pixel (pixman_color_t * color, + uint32_t * pixel, + pixman_format_code_t format) +{ + uint32_t c = color_to_uint32 (color); + + if 
(!(format == PIXMAN_a8r8g8b8 || + format == PIXMAN_x8r8g8b8 || + format == PIXMAN_a8b8g8r8 || + format == PIXMAN_x8b8g8r8 || + format == PIXMAN_b8g8r8a8 || + format == PIXMAN_b8g8r8x8 || + format == PIXMAN_r5g6b5 || + format == PIXMAN_b5g6r5 || + format == PIXMAN_a8)) + { + return FALSE; + } + + if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_ABGR) + { + c = ((c & 0xff000000) >> 0) | + ((c & 0x00ff0000) >> 16) | + ((c & 0x0000ff00) >> 0) | + ((c & 0x000000ff) << 16); + } + if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_BGRA) + { + c = ((c & 0xff000000) >> 24) | + ((c & 0x00ff0000) >> 8) | + ((c & 0x0000ff00) << 8) | + ((c & 0x000000ff) << 24); + } + + if (format == PIXMAN_a8) + c = c >> 24; + else if (format == PIXMAN_r5g6b5 || + format == PIXMAN_b5g6r5) + c = CONVERT_8888_TO_0565 (c); + +#if 0 + printf ("color: %x %x %x %x\n", color->alpha, color->red, color->green, color->blue); + printf ("pixel: %x\n", c); +#endif + + *pixel = c; + return TRUE; +} + +PIXMAN_EXPORT pixman_bool_t +pixman_image_fill_rectangles (pixman_op_t op, + pixman_image_t * dest, + pixman_color_t * color, + int n_rects, + const pixman_rectangle16_t *rects) +{ + pixman_image_t *solid; + pixman_color_t c; + int i; + + _pixman_image_validate (dest); + + if (color->alpha == 0xffff) + { + if (op == PIXMAN_OP_OVER) + op = PIXMAN_OP_SRC; + } + + if (op == PIXMAN_OP_CLEAR) + { + c.red = 0; + c.green = 0; + c.blue = 0; + c.alpha = 0; + + color = &c; + + op = PIXMAN_OP_SRC; + } + + if (op == PIXMAN_OP_SRC) + { + uint32_t pixel; + + if (color_to_pixel (color, &pixel, dest->bits.format)) + { + for (i = 0; i < n_rects; ++i) + { + pixman_region32_t fill_region; + int n_boxes, j; + pixman_box32_t *boxes; + + pixman_region32_init_rect (&fill_region, rects[i].x, rects[i].y, rects[i].width, rects[i].height); + + if (dest->common.have_clip_region) + { + if (!pixman_region32_intersect (&fill_region, + &fill_region, + &dest->common.clip_region)) + return FALSE; + } + + boxes = pixman_region32_rectangles (&fill_region, &n_boxes); + for (j = 0; j < n_boxes; ++j) + { + const pixman_box32_t *box = &(boxes[j]); + pixman_fill (dest->bits.bits, dest->bits.rowstride, PIXMAN_FORMAT_BPP (dest->bits.format), + box->x1, box->y1, box->x2 - box->x1, box->y2 - box->y1, + pixel); + } + + pixman_region32_fini (&fill_region); + } + return TRUE; + } + } + + solid = pixman_image_create_solid_fill (color); + if (!solid) + return FALSE; + + for (i = 0; i < n_rects; ++i) + { + const pixman_rectangle16_t *rect = &(rects[i]); + + pixman_image_composite (op, solid, NULL, dest, + 0, 0, 0, 0, + rect->x, rect->y, + rect->width, rect->height); + } + + pixman_image_unref (solid); + + return TRUE; +} + +/** + * pixman_version: + * + * Returns the version of the pixman library encoded in a single + * integer as per %PIXMAN_VERSION_ENCODE. The encoding ensures that + * later versions compare greater than earlier versions. + * + * A run-time comparison to check that pixman's version is greater than + * or equal to version X.Y.Z could be performed as follows: + * + * <informalexample><programlisting> + * if (pixman_version() >= PIXMAN_VERSION_ENCODE(X,Y,Z)) {...} + * </programlisting></informalexample> + * + * See also pixman_version_string() as well as the compile-time + * equivalents %PIXMAN_VERSION and %PIXMAN_VERSION_STRING. + * + * Return value: the encoded version. 
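+ * + * (PIXMAN_VERSION_ENCODE (major, minor, micro) evaluates to + * major * 10000 + minor * 100 + micro, so this 0.16.6 release is + * encoded as 1606.)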
+ **/ +PIXMAN_EXPORT int +pixman_version (void) +{ + return PIXMAN_VERSION; +} + +/** + * pixman_version_string: + * + * Returns the version of the pixman library as a human-readable string + * of the form "X.Y.Z". + * + * See also pixman_version() as well as the compile-time equivalents + * %PIXMAN_VERSION_STRING and %PIXMAN_VERSION. + * + * Return value: a string containing the version. + **/ +PIXMAN_EXPORT const char* +pixman_version_string (void) +{ + return PIXMAN_VERSION_STRING; +} + +/** + * pixman_format_supported_source: + * @format: A pixman_format_code_t format + * + * Return value: whether the provided format code is a supported + * format for a pixman surface used as a source in + * rendering. + * + * Currently, all pixman_format_code_t values are supported. + **/ +PIXMAN_EXPORT pixman_bool_t +pixman_format_supported_source (pixman_format_code_t format) +{ + switch (format) + { + /* 32 bpp formats */ + case PIXMAN_a2b10g10r10: + case PIXMAN_x2b10g10r10: + case PIXMAN_a2r10g10b10: + case PIXMAN_x2r10g10b10: + case PIXMAN_a8r8g8b8: + case PIXMAN_x8r8g8b8: + case PIXMAN_a8b8g8r8: + case PIXMAN_x8b8g8r8: + case PIXMAN_b8g8r8a8: + case PIXMAN_b8g8r8x8: + case PIXMAN_r8g8b8: + case PIXMAN_b8g8r8: + case PIXMAN_r5g6b5: + case PIXMAN_b5g6r5: + /* 16 bpp formats */ + case PIXMAN_a1r5g5b5: + case PIXMAN_x1r5g5b5: + case PIXMAN_a1b5g5r5: + case PIXMAN_x1b5g5r5: + case PIXMAN_a4r4g4b4: + case PIXMAN_x4r4g4b4: + case PIXMAN_a4b4g4r4: + case PIXMAN_x4b4g4r4: + /* 8bpp formats */ + case PIXMAN_a8: + case PIXMAN_r3g3b2: + case PIXMAN_b2g3r3: + case PIXMAN_a2r2g2b2: + case PIXMAN_a2b2g2r2: + case PIXMAN_c8: + case PIXMAN_g8: + case PIXMAN_x4a4: + /* Collides with PIXMAN_c8 + case PIXMAN_x4c4: + */ + /* Collides with PIXMAN_g8 + case PIXMAN_x4g4: + */ + /* 4bpp formats */ + case PIXMAN_a4: + case PIXMAN_r1g2b1: + case PIXMAN_b1g2r1: + case PIXMAN_a1r1g1b1: + case PIXMAN_a1b1g1r1: + case PIXMAN_c4: + case PIXMAN_g4: + /* 1bpp formats */ + case PIXMAN_a1: + case PIXMAN_g1: + /* YUV formats */ + case PIXMAN_yuy2: + case PIXMAN_yv12: + return TRUE; + + default: + return FALSE; + } +} + +/** + * pixman_format_supported_destination: + * @format: A pixman_format_code_t format + * + * Return value: whether the provided format code is a supported + * format for a pixman surface used as a destination in + * rendering. + * + * Currently, all pixman_format_code_t values are supported + * except for the YUV formats. 
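+ * + * A caller that renders into an arbitrary format can use this to fall + * back to a format that is always writable, for example: + * + * <informalexample><programlisting> + * if (!pixman_format_supported_destination (format)) + *     format = PIXMAN_a8r8g8b8; + * </programlisting></informalexample>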
+ **/ +PIXMAN_EXPORT pixman_bool_t +pixman_format_supported_destination (pixman_format_code_t format) +{ + /* YUV formats cannot be written to at the moment */ + if (format == PIXMAN_yuy2 || format == PIXMAN_yv12) + return FALSE; + + return pixman_format_supported_source (format); +} + diff --git a/lib/pixman/pixman/pixman.h b/lib/pixman/pixman/pixman.h index 29c054a6f..5b90a0c8d 100644 --- a/lib/pixman/pixman/pixman.h +++ b/lib/pixman/pixman/pixman.h @@ -166,147 +166,96 @@ struct pixman_transform /* forward declaration (sorry) */ struct pixman_box16; -void -pixman_transform_init_identity(struct pixman_transform *matrix); - -pixman_bool_t -pixman_transform_point_3d (const struct pixman_transform *transform, - struct pixman_vector *vector); - -pixman_bool_t -pixman_transform_point(const struct pixman_transform *transform, - struct pixman_vector *vector); - -pixman_bool_t -pixman_transform_multiply (struct pixman_transform *dst, - const struct pixman_transform *l, - const struct pixman_transform *r); - -void -pixman_transform_init_scale (struct pixman_transform *t, - pixman_fixed_t sx, - pixman_fixed_t sy); - -pixman_bool_t -pixman_transform_scale(struct pixman_transform *forward, - struct pixman_transform *reverse, - pixman_fixed_t sx, pixman_fixed_t sy); - -void -pixman_transform_init_rotate(struct pixman_transform *t, - pixman_fixed_t cos, - pixman_fixed_t sin); - -pixman_bool_t -pixman_transform_rotate(struct pixman_transform *forward, - struct pixman_transform *reverse, - pixman_fixed_t c, pixman_fixed_t s); - -void -pixman_transform_init_translate(struct pixman_transform *t, - pixman_fixed_t tx, pixman_fixed_t ty); - - -pixman_bool_t -pixman_transform_translate(struct pixman_transform *forward, - struct pixman_transform *reverse, - pixman_fixed_t tx, pixman_fixed_t ty); - -pixman_bool_t -pixman_transform_bounds(const struct pixman_transform *matrix, - struct pixman_box16 *b); - - -pixman_bool_t -pixman_transform_invert (struct pixman_transform *dst, - const struct pixman_transform *src); - -pixman_bool_t -pixman_transform_is_identity(const struct pixman_transform *t); - -pixman_bool_t -pixman_transform_is_scale(const struct pixman_transform *t); - -pixman_bool_t -pixman_transform_is_int_translate(const struct pixman_transform *t); - -pixman_bool_t -pixman_transform_is_inverse (const struct pixman_transform *a, - const struct pixman_transform *b); - +void pixman_transform_init_identity (struct pixman_transform *matrix); +pixman_bool_t pixman_transform_point_3d (const struct pixman_transform *transform, + struct pixman_vector *vector); +pixman_bool_t pixman_transform_point (const struct pixman_transform *transform, + struct pixman_vector *vector); +pixman_bool_t pixman_transform_multiply (struct pixman_transform *dst, + const struct pixman_transform *l, + const struct pixman_transform *r); +void pixman_transform_init_scale (struct pixman_transform *t, + pixman_fixed_t sx, + pixman_fixed_t sy); +pixman_bool_t pixman_transform_scale (struct pixman_transform *forward, + struct pixman_transform *reverse, + pixman_fixed_t sx, + pixman_fixed_t sy); +void pixman_transform_init_rotate (struct pixman_transform *t, + pixman_fixed_t cos, + pixman_fixed_t sin); +pixman_bool_t pixman_transform_rotate (struct pixman_transform *forward, + struct pixman_transform *reverse, + pixman_fixed_t c, + pixman_fixed_t s); +void pixman_transform_init_translate (struct pixman_transform *t, + pixman_fixed_t tx, + pixman_fixed_t ty); +pixman_bool_t pixman_transform_translate (struct pixman_transform *forward, + 
struct pixman_transform *reverse, + pixman_fixed_t tx, + pixman_fixed_t ty); +pixman_bool_t pixman_transform_bounds (const struct pixman_transform *matrix, + struct pixman_box16 *b); +pixman_bool_t pixman_transform_invert (struct pixman_transform *dst, + const struct pixman_transform *src); +pixman_bool_t pixman_transform_is_identity (const struct pixman_transform *t); +pixman_bool_t pixman_transform_is_scale (const struct pixman_transform *t); +pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t); +pixman_bool_t pixman_transform_is_inverse (const struct pixman_transform *a, + const struct pixman_transform *b); /* * Floating point matrices */ -struct pixman_f_vector { +struct pixman_f_vector +{ double v[3]; }; -struct pixman_f_transform { +struct pixman_f_transform +{ double m[3][3]; }; -pixman_bool_t -pixman_transform_from_pixman_f_transform (struct pixman_transform *t, - const struct pixman_f_transform *ft); - -void -pixman_f_transform_from_pixman_transform (struct pixman_f_transform *ft, - const struct pixman_transform *t); - -pixman_bool_t -pixman_transform_from_pixman_f_transform (struct pixman_transform *t, - const struct pixman_f_transform *ft); - -pixman_bool_t -pixman_f_transform_invert (struct pixman_f_transform *dst, - const struct pixman_f_transform *src); - -pixman_bool_t -pixman_f_transform_point (const struct pixman_f_transform *t, - struct pixman_f_vector *v); - -void -pixman_f_transform_point_3d (const struct pixman_f_transform *t, - struct pixman_f_vector *v); - +pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform *t, + const struct pixman_f_transform *ft); +void pixman_f_transform_from_pixman_transform (struct pixman_f_transform *ft, + const struct pixman_transform *t); +pixman_bool_t pixman_f_transform_invert (struct pixman_f_transform *dst, + const struct pixman_f_transform *src); +pixman_bool_t pixman_f_transform_point (const struct pixman_f_transform *t, + struct pixman_f_vector *v); +void pixman_f_transform_point_3d (const struct pixman_f_transform *t, + struct pixman_f_vector *v); +void pixman_f_transform_multiply (struct pixman_f_transform *dst, + const struct pixman_f_transform *l, + const struct pixman_f_transform *r); +void pixman_f_transform_init_scale (struct pixman_f_transform *t, + double sx, + double sy); +pixman_bool_t pixman_f_transform_scale (struct pixman_f_transform *forward, + struct pixman_f_transform *reverse, + double sx, + double sy); +void pixman_f_transform_init_rotate (struct pixman_f_transform *t, + double cos, + double sin); +pixman_bool_t pixman_f_transform_rotate (struct pixman_f_transform *forward, + struct pixman_f_transform *reverse, + double c, + double s); +void pixman_f_transform_init_translate (struct pixman_f_transform *t, + double tx, + double ty); +pixman_bool_t pixman_f_transform_translate (struct pixman_f_transform *forward, + struct pixman_f_transform *reverse, + double tx, + double ty); +pixman_bool_t pixman_f_transform_bounds (const struct pixman_f_transform *t, + struct pixman_box16 *b); +void pixman_f_transform_init_identity (struct pixman_f_transform *t); -void -pixman_f_transform_multiply (struct pixman_f_transform *dst, - const struct pixman_f_transform *l, - const struct pixman_f_transform *r); - -void -pixman_f_transform_init_scale (struct pixman_f_transform *t, double sx, double sy); - -pixman_bool_t -pixman_f_transform_scale (struct pixman_f_transform *forward, - struct pixman_f_transform *reverse, - double sx, double sy); - -void 
-pixman_f_transform_init_rotate (struct pixman_f_transform *t, double cos, double sin); - -pixman_bool_t -pixman_f_transform_rotate (struct pixman_f_transform *forward, - struct pixman_f_transform *reverse, - double c, double s); - -void -pixman_f_transform_init_translate (struct pixman_f_transform *t, double tx, double ty); - -pixman_bool_t -pixman_f_transform_translate (struct pixman_f_transform *forward, - struct pixman_f_transform *reverse, - double tx, double ty); - -pixman_bool_t -pixman_f_transform_bounds (const struct pixman_f_transform *t, struct pixman_box16 *b); - -void -pixman_f_transform_init_identity (struct pixman_f_transform *t); - -/* Don't blame me, blame XRender */ typedef enum { PIXMAN_REPEAT_NONE, @@ -368,6 +317,22 @@ typedef enum PIXMAN_OP_CONJOINT_ATOP_REVERSE = 0x2a, PIXMAN_OP_CONJOINT_XOR = 0x2b, + PIXMAN_OP_MULTIPLY = 0x30, + PIXMAN_OP_SCREEN = 0x31, + PIXMAN_OP_OVERLAY = 0x32, + PIXMAN_OP_DARKEN = 0x33, + PIXMAN_OP_LIGHTEN = 0x34, + PIXMAN_OP_COLOR_DODGE = 0x35, + PIXMAN_OP_COLOR_BURN = 0x36, + PIXMAN_OP_HARD_LIGHT = 0x37, + PIXMAN_OP_SOFT_LIGHT = 0x38, + PIXMAN_OP_DIFFERENCE = 0x39, + PIXMAN_OP_EXCLUSION = 0x3a, + PIXMAN_OP_HSL_HUE = 0x3b, + PIXMAN_OP_HSL_SATURATION = 0x3c, + PIXMAN_OP_HSL_COLOR = 0x3d, + PIXMAN_OP_HSL_LUMINOSITY = 0x3e, + PIXMAN_OP_NONE, PIXMAN_OP_LAST = PIXMAN_OP_NONE } pixman_op_t; @@ -388,8 +353,8 @@ struct pixman_region16_data { struct pixman_rectangle16 { - int16_t x, y; - uint16_t width, height; + int16_t x, y; + uint16_t width, height; }; struct pixman_box16 @@ -400,7 +365,7 @@ struct pixman_box16 struct pixman_region16 { pixman_box16_t extents; - pixman_region16_data_t *data; + pixman_region16_data_t *data; }; typedef enum @@ -410,70 +375,69 @@ typedef enum PIXMAN_REGION_PART } pixman_region_overlap_t; -/* This function exists only to make it possible to preserve the X ABI - it should - * go away at first opportunity. +/* This function exists only to make it possible to preserve + * the X ABI - it should go away at first opportunity. 
*/ -void pixman_region_set_static_pointers (pixman_box16_t *empty_box, - pixman_region16_data_t *empty_data, - pixman_region16_data_t *broken_data); - +void pixman_region_set_static_pointers (pixman_box16_t *empty_box, + pixman_region16_data_t *empty_data, + pixman_region16_data_t *broken_data); /* creation/destruction */ -void pixman_region_init (pixman_region16_t *region); -void pixman_region_init_rect (pixman_region16_t *region, - int x, - int y, - unsigned int width, - unsigned int height); -pixman_bool_t pixman_region_init_rects (pixman_region16_t *region, - pixman_box16_t *boxes, - int count); -void pixman_region_init_with_extents (pixman_region16_t *region, - pixman_box16_t *extents); -void pixman_region_fini (pixman_region16_t *region); +void pixman_region_init (pixman_region16_t *region); +void pixman_region_init_rect (pixman_region16_t *region, + int x, + int y, + unsigned int width, + unsigned int height); +pixman_bool_t pixman_region_init_rects (pixman_region16_t *region, + pixman_box16_t *boxes, + int count); +void pixman_region_init_with_extents (pixman_region16_t *region, + pixman_box16_t *extents); +void pixman_region_fini (pixman_region16_t *region); -/* manipulation */ -void pixman_region_translate (pixman_region16_t *region, - int x, - int y); -pixman_bool_t pixman_region_copy (pixman_region16_t *dest, - pixman_region16_t *source); -pixman_bool_t pixman_region_intersect (pixman_region16_t *newReg, - pixman_region16_t *reg1, - pixman_region16_t *reg2); -pixman_bool_t pixman_region_union (pixman_region16_t *newReg, - pixman_region16_t *reg1, - pixman_region16_t *reg2); -pixman_bool_t pixman_region_union_rect (pixman_region16_t *dest, - pixman_region16_t *source, - int x, - int y, - unsigned int width, - unsigned int height); -pixman_bool_t pixman_region_subtract (pixman_region16_t *regD, - pixman_region16_t *regM, - pixman_region16_t *regS); -pixman_bool_t pixman_region_inverse (pixman_region16_t *newReg, - pixman_region16_t *reg1, - pixman_box16_t *invRect); -pixman_bool_t pixman_region_contains_point (pixman_region16_t *region, - int x, - int y, - pixman_box16_t *box); -pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *pixman_region16_t, - pixman_box16_t *prect); -pixman_bool_t pixman_region_not_empty (pixman_region16_t *region); -pixman_box16_t * pixman_region_extents (pixman_region16_t *region); -int pixman_region_n_rects (pixman_region16_t *region); -pixman_box16_t * pixman_region_rectangles (pixman_region16_t *region, - int *n_rects); -pixman_bool_t pixman_region_equal (pixman_region16_t *region1, - pixman_region16_t *region2); -pixman_bool_t pixman_region_selfcheck (pixman_region16_t *region); -void pixman_region_reset (pixman_region16_t *region, - pixman_box16_t *box); +/* manipulation */ +void pixman_region_translate (pixman_region16_t *region, + int x, + int y); +pixman_bool_t pixman_region_copy (pixman_region16_t *dest, + pixman_region16_t *source); +pixman_bool_t pixman_region_intersect (pixman_region16_t *new_reg, + pixman_region16_t *reg1, + pixman_region16_t *reg2); +pixman_bool_t pixman_region_union (pixman_region16_t *new_reg, + pixman_region16_t *reg1, + pixman_region16_t *reg2); +pixman_bool_t pixman_region_union_rect (pixman_region16_t *dest, + pixman_region16_t *source, + int x, + int y, + unsigned int width, + unsigned int height); +pixman_bool_t pixman_region_subtract (pixman_region16_t *reg_d, + pixman_region16_t *reg_m, + pixman_region16_t *reg_s); +pixman_bool_t pixman_region_inverse (pixman_region16_t *new_reg, + 
pixman_region16_t *reg1, + pixman_box16_t *inv_rect); +pixman_bool_t pixman_region_contains_point (pixman_region16_t *region, + int x, + int y, + pixman_box16_t *box); +pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *pixman_region16_t, + pixman_box16_t *prect); +pixman_bool_t pixman_region_not_empty (pixman_region16_t *region); +pixman_box16_t * pixman_region_extents (pixman_region16_t *region); +int pixman_region_n_rects (pixman_region16_t *region); +pixman_box16_t * pixman_region_rectangles (pixman_region16_t *region, + int *n_rects); +pixman_bool_t pixman_region_equal (pixman_region16_t *region1, + pixman_region16_t *region2); +pixman_bool_t pixman_region_selfcheck (pixman_region16_t *region); +void pixman_region_reset (pixman_region16_t *region, + pixman_box16_t *box); /* * 32 bit regions */ @@ -526,10 +490,10 @@ void pixman_region32_translate (pixman_region32_t *r int y); pixman_bool_t pixman_region32_copy (pixman_region32_t *dest, pixman_region32_t *source); -pixman_bool_t pixman_region32_intersect (pixman_region32_t *newReg, +pixman_bool_t pixman_region32_intersect (pixman_region32_t *new_reg, pixman_region32_t *reg1, pixman_region32_t *reg2); -pixman_bool_t pixman_region32_union (pixman_region32_t *newReg, +pixman_bool_t pixman_region32_union (pixman_region32_t *new_reg, pixman_region32_t *reg1, pixman_region32_t *reg2); pixman_bool_t pixman_region32_union_rect (pixman_region32_t *dest, @@ -538,12 +502,12 @@ pixman_bool_t pixman_region32_union_rect (pixman_region32_t *d int y, unsigned int width, unsigned int height); -pixman_bool_t pixman_region32_subtract (pixman_region32_t *regD, - pixman_region32_t *regM, - pixman_region32_t *regS); -pixman_bool_t pixman_region32_inverse (pixman_region32_t *newReg, +pixman_bool_t pixman_region32_subtract (pixman_region32_t *reg_d, + pixman_region32_t *reg_m, + pixman_region32_t *reg_s); +pixman_bool_t pixman_region32_inverse (pixman_region32_t *new_reg, pixman_region32_t *reg1, - pixman_box32_t *invRect); + pixman_box32_t *inv_rect); pixman_bool_t pixman_region32_contains_point (pixman_region32_t *region, int x, int y, @@ -597,6 +561,8 @@ typedef struct pixman_gradient_stop pixman_gradient_stop_t; typedef uint32_t (* pixman_read_memory_func_t) (const void *src, int size); typedef void (* pixman_write_memory_func_t) (void *dst, uint32_t value, int size); +typedef void (* pixman_image_destroy_func_t) (pixman_image_t *image, void *data); + struct pixman_gradient_stop { pixman_fixed_t x; pixman_color_t color; @@ -657,65 +623,67 @@ struct pixman_indexed /* 32bpp formats */ typedef enum { - PIXMAN_a8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8), - PIXMAN_x8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8), - PIXMAN_a8b8g8r8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8), - PIXMAN_x8b8g8r8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8), - PIXMAN_b8g8r8a8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8), - PIXMAN_b8g8r8x8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8), + PIXMAN_a8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8), + PIXMAN_x8r8g8b8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8), + PIXMAN_a8b8g8r8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8), + PIXMAN_x8b8g8r8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8), + PIXMAN_b8g8r8a8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8), + PIXMAN_b8g8r8x8 = PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8), + PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10), + PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10), PIXMAN_x2b10g10r10 = 
PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,10,10,10), PIXMAN_a2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,2,10,10,10), /* 24bpp formats */ - PIXMAN_r8g8b8 = PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8), - PIXMAN_b8g8r8 = PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8), + PIXMAN_r8g8b8 = PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8), + PIXMAN_b8g8r8 = PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8), /* 16bpp formats */ - PIXMAN_r5g6b5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5), - PIXMAN_b5g6r5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5), + PIXMAN_r5g6b5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5), + PIXMAN_b5g6r5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5), - PIXMAN_a1r5g5b5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5), - PIXMAN_x1r5g5b5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5), - PIXMAN_a1b5g5r5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5), - PIXMAN_x1b5g5r5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5), - PIXMAN_a4r4g4b4 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4), - PIXMAN_x4r4g4b4 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4), - PIXMAN_a4b4g4r4 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4), - PIXMAN_x4b4g4r4 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4), + PIXMAN_a1r5g5b5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5), + PIXMAN_x1r5g5b5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5), + PIXMAN_a1b5g5r5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5), + PIXMAN_x1b5g5r5 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5), + PIXMAN_a4r4g4b4 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4), + PIXMAN_x4r4g4b4 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4), + PIXMAN_a4b4g4r4 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4), + PIXMAN_x4b4g4r4 = PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4), /* 8bpp formats */ - PIXMAN_a8 = PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0), - PIXMAN_r3g3b2 = PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2), - PIXMAN_b2g3r3 = PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2), - PIXMAN_a2r2g2b2 = PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2), - PIXMAN_a2b2g2r2 = PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2), + PIXMAN_a8 = PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0), + PIXMAN_r3g3b2 = PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2), + PIXMAN_b2g3r3 = PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2), + PIXMAN_a2r2g2b2 = PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2), + PIXMAN_a2b2g2r2 = PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2), - PIXMAN_c8 = PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0), - PIXMAN_g8 = PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0), + PIXMAN_c8 = PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0), + PIXMAN_g8 = PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0), - PIXMAN_x4a4 = PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0), + PIXMAN_x4a4 = PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0), - PIXMAN_x4c4 = PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0), - PIXMAN_x4g4 = PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0), + PIXMAN_x4c4 = PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0), + PIXMAN_x4g4 = PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0), /* 4bpp formats */ - PIXMAN_a4 = PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0), - PIXMAN_r1g2b1 = PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1), - PIXMAN_b1g2r1 = PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1), - PIXMAN_a1r1g1b1 = PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1), - PIXMAN_a1b1g1r1 = PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1), + PIXMAN_a4 = PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0), + PIXMAN_r1g2b1 = PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1), + PIXMAN_b1g2r1 = PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1), + PIXMAN_a1r1g1b1 = PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1), + PIXMAN_a1b1g1r1 = PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1), - PIXMAN_c4 = 
PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0), - PIXMAN_g4 = PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0), + PIXMAN_c4 = PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0), + PIXMAN_g4 = PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0), /* 1bpp formats */ - PIXMAN_a1 = PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0), + PIXMAN_a1 = PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0), - PIXMAN_g1 = PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0), + PIXMAN_g1 = PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0), /* YUV formats */ - PIXMAN_yuy2 = PIXMAN_FORMAT(16,PIXMAN_TYPE_YUY2,0,0,0,0), - PIXMAN_yv12 = PIXMAN_FORMAT(12,PIXMAN_TYPE_YV12,0,0,0,0) + PIXMAN_yuy2 = PIXMAN_FORMAT(16,PIXMAN_TYPE_YUY2,0,0,0,0), + PIXMAN_yv12 = PIXMAN_FORMAT(12,PIXMAN_TYPE_YV12,0,0,0,0) } pixman_format_code_t; /* Querying supported format values. */ @@ -748,6 +716,9 @@ pixman_image_t *pixman_image_create_bits (pixman_format_code_t pixman_image_t *pixman_image_ref (pixman_image_t *image); pixman_bool_t pixman_image_unref (pixman_image_t *image); +void pixman_image_set_destroy_function (pixman_image_t *image, + pixman_image_destroy_func_t function, + void *data); /* Set properties */ pixman_bool_t pixman_image_set_clip_region (pixman_image_t *image, @@ -789,16 +760,16 @@ pixman_bool_t pixman_image_fill_rectangles (pixman_op_t op, const pixman_rectangle16_t *rects); /* Composite */ -pixman_bool_t pixman_compute_composite_region (pixman_region16_t *pRegion, - pixman_image_t *pSrc, - pixman_image_t *pMask, - pixman_image_t *pDst, - int16_t xSrc, - int16_t ySrc, - int16_t xMask, - int16_t yMask, - int16_t xDst, - int16_t yDst, +pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region, + pixman_image_t *src_image, + pixman_image_t *mask_image, + pixman_image_t *dst_image, + int16_t src_x, + int16_t src_y, + int16_t mask_x, + int16_t mask_y, + int16_t dest_x, + int16_t dest_y, uint16_t width, uint16_t height); void pixman_image_composite (pixman_op_t op, @@ -814,6 +785,20 @@ void pixman_image_composite (pixman_op_t op, uint16_t width, uint16_t height); +/* Old X servers rely on out-of-bounds accesses when they are asked + * to composite with a window as the source. They create a pixman image + * pointing to some bogus position in memory, but then they set a clip + * region to the position where the actual bits are. + * + * Due to a bug in old versions of pixman, where it would not clip + * against the image bounds when a clip region was set, this would + * actually work. So by default we allow certain out-of-bound access + * to happen unless explicitly disabled. + * + * Fixed X servers should call this function to disable the workaround. 
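+ * + * A fixed server needs to make this call only once, before doing any + * compositing: + * + *     pixman_disable_out_of_bounds_workaround ();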
+ */ +void pixman_disable_out_of_bounds_workaround (void); + /* * Trapezoids */ @@ -831,26 +816,26 @@ struct pixman_edge { pixman_fixed_t x; pixman_fixed_t e; - pixman_fixed_t stepx; - pixman_fixed_t signdx; - pixman_fixed_t dy; - pixman_fixed_t dx; - - pixman_fixed_t stepx_small; - pixman_fixed_t stepx_big; - pixman_fixed_t dx_small; - pixman_fixed_t dx_big; + pixman_fixed_t stepx; + pixman_fixed_t signdx; + pixman_fixed_t dy; + pixman_fixed_t dx; + + pixman_fixed_t stepx_small; + pixman_fixed_t stepx_big; + pixman_fixed_t dx_small; + pixman_fixed_t dx_big; }; struct pixman_trapezoid { - pixman_fixed_t top, bottom; + pixman_fixed_t top, bottom; pixman_line_fixed_t left, right; }; /* whether 't' is a well defined not obviously empty trapezoid */ -#define pixman_trapezoid_valid(t) \ +#define pixman_trapezoid_valid(t) \ ((t)->left.p1.y != (t)->left.p2.y && \ (t)->right.p1.y != (t)->right.p2.y && \ (int) ((t)->bottom - (t)->top) > 0) @@ -904,5 +889,4 @@ void pixman_rasterize_trapezoid (pixman_image_t *image, int x_off, int y_off); - #endif /* PIXMAN_H__ */ diff --git a/lib/pixman/test/Makefile.am b/lib/pixman/test/Makefile.am index be76dc814..c56f62de7 100644 --- a/lib/pixman/test/Makefile.am +++ b/lib/pixman/test/Makefile.am @@ -4,13 +4,19 @@ INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman TESTPROGRAMS = \ region-test \ scaling-test \ + blitters-test \ fetch-test \ + oob-test \ + window-test \ trap-crasher fetch_test_LDADD = $(TEST_LDADD) region_test_LDADD = $(TEST_LDADD) scaling_test_LDADD = $(TEST_LDADD) +blitters_test_LDADD = $(TEST_LDADD) trap_crasher_LDADD = $(TEST_LDADD) +oob_test_LDADD = $(TEST_LDADD) +window_test_LDADD = $(TEST_LDADD) # GTK using test programs @@ -24,10 +30,10 @@ TESTPROGRAMS += \ composite-test \ gradient-test \ alpha-test \ + screen-test \ + convolution-test \ trap-test -noinst_PROGRAMS = $(TESTPROGRAMS) - INCLUDES += $(GTK_CFLAGS) gradient_test_LDADD = $(GTK_LDADD) @@ -48,5 +54,13 @@ clip_in_SOURCES = clip-in.c utils.c utils.h trap_test_LDADD = $(GTK_LDADD) trap_test_SOURCES = trap-test.c utils.c utils.h +screen_test_LDADD = $(GTK_LDADD) +screen_test_SOURCES = screen-test.c utils.c utils.h + +convolution_test_LDADD = $(GTK_LDADD) +convolution_test_SOURCES = convolution-test.c utils.c utils.h + endif +noinst_PROGRAMS = $(TESTPROGRAMS) + diff --git a/lib/pixman/test/Makefile.in b/lib/pixman/test/Makefile.in index 265f5d0df..f270165db 100644 --- a/lib/pixman/test/Makefile.in +++ b/lib/pixman/test/Makefile.in @@ -42,10 +42,12 @@ host_triplet = @host@ @HAVE_GTK_TRUE@ composite-test \ @HAVE_GTK_TRUE@ gradient-test \ @HAVE_GTK_TRUE@ alpha-test \ +@HAVE_GTK_TRUE@ screen-test \ +@HAVE_GTK_TRUE@ convolution-test \ @HAVE_GTK_TRUE@ trap-test -@HAVE_GTK_TRUE@noinst_PROGRAMS = $(am__EXEEXT_2) @HAVE_GTK_TRUE@am__append_2 = $(GTK_CFLAGS) +noinst_PROGRAMS = $(am__EXEEXT_2) subdir = test DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 @@ -57,9 +59,11 @@ CONFIG_HEADER = $(top_builddir)/config.h CONFIG_CLEAN_FILES = @HAVE_GTK_TRUE@am__EXEEXT_1 = clip-test$(EXEEXT) clip-in$(EXEEXT) \ @HAVE_GTK_TRUE@ composite-test$(EXEEXT) gradient-test$(EXEEXT) \ -@HAVE_GTK_TRUE@ alpha-test$(EXEEXT) trap-test$(EXEEXT) +@HAVE_GTK_TRUE@ alpha-test$(EXEEXT) screen-test$(EXEEXT) \ +@HAVE_GTK_TRUE@ convolution-test$(EXEEXT) trap-test$(EXEEXT) am__EXEEXT_2 = region-test$(EXEEXT) scaling-test$(EXEEXT) \ - fetch-test$(EXEEXT) trap-crasher$(EXEEXT) $(am__EXEEXT_1) + blitters-test$(EXEEXT) fetch-test$(EXEEXT) oob-test$(EXEEXT) \ + window-test$(EXEEXT) 
trap-crasher$(EXEEXT) $(am__EXEEXT_1) PROGRAMS = $(noinst_PROGRAMS) am__alpha_test_SOURCES_DIST = alpha-test.c utils.c utils.h @HAVE_GTK_TRUE@am_alpha_test_OBJECTS = alpha-test.$(OBJEXT) \ @@ -70,6 +74,9 @@ am__DEPENDENCIES_2 = @HAVE_GTK_TRUE@am__DEPENDENCIES_3 = $(am__DEPENDENCIES_1) \ @HAVE_GTK_TRUE@ $(am__DEPENDENCIES_2) @HAVE_GTK_TRUE@alpha_test_DEPENDENCIES = $(am__DEPENDENCIES_3) +blitters_test_SOURCES = blitters-test.c +blitters_test_OBJECTS = blitters-test.$(OBJEXT) +blitters_test_DEPENDENCIES = $(am__DEPENDENCIES_1) am__clip_in_SOURCES_DIST = clip-in.c utils.c utils.h @HAVE_GTK_TRUE@am_clip_in_OBJECTS = clip-in.$(OBJEXT) utils.$(OBJEXT) clip_in_OBJECTS = $(am_clip_in_OBJECTS) @@ -84,6 +91,11 @@ am__composite_test_SOURCES_DIST = composite-test.c utils.c utils.h @HAVE_GTK_TRUE@ utils.$(OBJEXT) composite_test_OBJECTS = $(am_composite_test_OBJECTS) @HAVE_GTK_TRUE@composite_test_DEPENDENCIES = $(am__DEPENDENCIES_3) +am__convolution_test_SOURCES_DIST = convolution-test.c utils.c utils.h +@HAVE_GTK_TRUE@am_convolution_test_OBJECTS = \ +@HAVE_GTK_TRUE@ convolution-test.$(OBJEXT) utils.$(OBJEXT) +convolution_test_OBJECTS = $(am_convolution_test_OBJECTS) +@HAVE_GTK_TRUE@convolution_test_DEPENDENCIES = $(am__DEPENDENCIES_3) fetch_test_SOURCES = fetch-test.c fetch_test_OBJECTS = fetch-test.$(OBJEXT) fetch_test_DEPENDENCIES = $(am__DEPENDENCIES_1) @@ -92,12 +104,20 @@ am__gradient_test_SOURCES_DIST = gradient-test.c utils.c utils.h @HAVE_GTK_TRUE@ utils.$(OBJEXT) gradient_test_OBJECTS = $(am_gradient_test_OBJECTS) @HAVE_GTK_TRUE@gradient_test_DEPENDENCIES = $(am__DEPENDENCIES_3) +oob_test_SOURCES = oob-test.c +oob_test_OBJECTS = oob-test.$(OBJEXT) +oob_test_DEPENDENCIES = $(am__DEPENDENCIES_1) region_test_SOURCES = region-test.c region_test_OBJECTS = region-test.$(OBJEXT) region_test_DEPENDENCIES = $(am__DEPENDENCIES_1) scaling_test_SOURCES = scaling-test.c scaling_test_OBJECTS = scaling-test.$(OBJEXT) scaling_test_DEPENDENCIES = $(am__DEPENDENCIES_1) +am__screen_test_SOURCES_DIST = screen-test.c utils.c utils.h +@HAVE_GTK_TRUE@am_screen_test_OBJECTS = screen-test.$(OBJEXT) \ +@HAVE_GTK_TRUE@ utils.$(OBJEXT) +screen_test_OBJECTS = $(am_screen_test_OBJECTS) +@HAVE_GTK_TRUE@screen_test_DEPENDENCIES = $(am__DEPENDENCIES_3) trap_crasher_SOURCES = trap-crasher.c trap_crasher_OBJECTS = trap-crasher.$(OBJEXT) trap_crasher_DEPENDENCIES = $(am__DEPENDENCIES_1) @@ -106,6 +126,9 @@ am__trap_test_SOURCES_DIST = trap-test.c utils.c utils.h @HAVE_GTK_TRUE@ utils.$(OBJEXT) trap_test_OBJECTS = $(am_trap_test_OBJECTS) @HAVE_GTK_TRUE@trap_test_DEPENDENCIES = $(am__DEPENDENCIES_3) +window_test_SOURCES = window-test.c +window_test_OBJECTS = window-test.$(OBJEXT) +window_test_DEPENDENCIES = $(am__DEPENDENCIES_1) DEFAULT_INCLUDES = -I. 
-I$(srcdir) -I$(top_builddir) depcomp = $(SHELL) $(top_srcdir)/depcomp am__depfiles_maybe = depfiles @@ -117,15 +140,19 @@ LTCOMPILE = $(LIBTOOL) --tag=CC --mode=compile $(CC) $(DEFS) \ CCLD = $(CC) LINK = $(LIBTOOL) --tag=CC --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ $(AM_LDFLAGS) $(LDFLAGS) -o $@ -SOURCES = $(alpha_test_SOURCES) $(clip_in_SOURCES) \ - $(clip_test_SOURCES) $(composite_test_SOURCES) fetch-test.c \ - $(gradient_test_SOURCES) region-test.c scaling-test.c \ - trap-crasher.c $(trap_test_SOURCES) -DIST_SOURCES = $(am__alpha_test_SOURCES_DIST) \ +SOURCES = $(alpha_test_SOURCES) blitters-test.c $(clip_in_SOURCES) \ + $(clip_test_SOURCES) $(composite_test_SOURCES) \ + $(convolution_test_SOURCES) fetch-test.c \ + $(gradient_test_SOURCES) oob-test.c region-test.c \ + scaling-test.c $(screen_test_SOURCES) trap-crasher.c \ + $(trap_test_SOURCES) window-test.c +DIST_SOURCES = $(am__alpha_test_SOURCES_DIST) blitters-test.c \ $(am__clip_in_SOURCES_DIST) $(am__clip_test_SOURCES_DIST) \ - $(am__composite_test_SOURCES_DIST) fetch-test.c \ - $(am__gradient_test_SOURCES_DIST) region-test.c scaling-test.c \ - trap-crasher.c $(am__trap_test_SOURCES_DIST) + $(am__composite_test_SOURCES_DIST) \ + $(am__convolution_test_SOURCES_DIST) fetch-test.c \ + $(am__gradient_test_SOURCES_DIST) oob-test.c region-test.c \ + scaling-test.c $(am__screen_test_SOURCES_DIST) trap-crasher.c \ + $(am__trap_test_SOURCES_DIST) window-test.c ETAGS = etags CTAGS = ctags DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) @@ -192,6 +219,7 @@ PACKAGE_TARNAME = @PACKAGE_TARNAME@ PACKAGE_VERSION = @PACKAGE_VERSION@ PATH_SEPARATOR = @PATH_SEPARATOR@ PERL = @PERL@ +PIXMAN_TIMERS = @PIXMAN_TIMERS@ PIXMAN_VERSION_MAJOR = @PIXMAN_VERSION_MAJOR@ PIXMAN_VERSION_MICRO = @PIXMAN_VERSION_MICRO@ PIXMAN_VERSION_MINOR = @PIXMAN_VERSION_MINOR@ @@ -267,12 +295,15 @@ target_alias = @target_alias@ TEST_LDADD = $(top_builddir)/pixman/libpixman-1.la INCLUDES = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman \ $(am__append_2) -TESTPROGRAMS = region-test scaling-test fetch-test trap-crasher \ - $(am__append_1) +TESTPROGRAMS = region-test scaling-test blitters-test fetch-test \ + oob-test window-test trap-crasher $(am__append_1) fetch_test_LDADD = $(TEST_LDADD) region_test_LDADD = $(TEST_LDADD) scaling_test_LDADD = $(TEST_LDADD) +blitters_test_LDADD = $(TEST_LDADD) trap_crasher_LDADD = $(TEST_LDADD) +oob_test_LDADD = $(TEST_LDADD) +window_test_LDADD = $(TEST_LDADD) # GTK using test programs @HAVE_GTK_TRUE@GTK_LDADD = $(TEST_LDADD) $(GTK_LIBS) @@ -288,6 +319,10 @@ trap_crasher_LDADD = $(TEST_LDADD) @HAVE_GTK_TRUE@clip_in_SOURCES = clip-in.c utils.c utils.h @HAVE_GTK_TRUE@trap_test_LDADD = $(GTK_LDADD) @HAVE_GTK_TRUE@trap_test_SOURCES = trap-test.c utils.c utils.h +@HAVE_GTK_TRUE@screen_test_LDADD = $(GTK_LDADD) +@HAVE_GTK_TRUE@screen_test_SOURCES = screen-test.c utils.c utils.h +@HAVE_GTK_TRUE@convolution_test_LDADD = $(GTK_LDADD) +@HAVE_GTK_TRUE@convolution_test_SOURCES = convolution-test.c utils.c utils.h all: all-am .SUFFIXES: @@ -331,6 +366,9 @@ clean-noinstPROGRAMS: alpha-test$(EXEEXT): $(alpha_test_OBJECTS) $(alpha_test_DEPENDENCIES) @rm -f alpha-test$(EXEEXT) $(LINK) $(alpha_test_LDFLAGS) $(alpha_test_OBJECTS) $(alpha_test_LDADD) $(LIBS) +blitters-test$(EXEEXT): $(blitters_test_OBJECTS) $(blitters_test_DEPENDENCIES) + @rm -f blitters-test$(EXEEXT) + $(LINK) $(blitters_test_LDFLAGS) $(blitters_test_OBJECTS) $(blitters_test_LDADD) $(LIBS) clip-in$(EXEEXT): $(clip_in_OBJECTS) $(clip_in_DEPENDENCIES) @rm -f clip-in$(EXEEXT) 
$(LINK) $(clip_in_LDFLAGS) $(clip_in_OBJECTS) $(clip_in_LDADD) $(LIBS) @@ -340,24 +378,36 @@ clip-test$(EXEEXT): $(clip_test_OBJECTS) $(clip_test_DEPENDENCIES) composite-test$(EXEEXT): $(composite_test_OBJECTS) $(composite_test_DEPENDENCIES) @rm -f composite-test$(EXEEXT) $(LINK) $(composite_test_LDFLAGS) $(composite_test_OBJECTS) $(composite_test_LDADD) $(LIBS) +convolution-test$(EXEEXT): $(convolution_test_OBJECTS) $(convolution_test_DEPENDENCIES) + @rm -f convolution-test$(EXEEXT) + $(LINK) $(convolution_test_LDFLAGS) $(convolution_test_OBJECTS) $(convolution_test_LDADD) $(LIBS) fetch-test$(EXEEXT): $(fetch_test_OBJECTS) $(fetch_test_DEPENDENCIES) @rm -f fetch-test$(EXEEXT) $(LINK) $(fetch_test_LDFLAGS) $(fetch_test_OBJECTS) $(fetch_test_LDADD) $(LIBS) gradient-test$(EXEEXT): $(gradient_test_OBJECTS) $(gradient_test_DEPENDENCIES) @rm -f gradient-test$(EXEEXT) $(LINK) $(gradient_test_LDFLAGS) $(gradient_test_OBJECTS) $(gradient_test_LDADD) $(LIBS) +oob-test$(EXEEXT): $(oob_test_OBJECTS) $(oob_test_DEPENDENCIES) + @rm -f oob-test$(EXEEXT) + $(LINK) $(oob_test_LDFLAGS) $(oob_test_OBJECTS) $(oob_test_LDADD) $(LIBS) region-test$(EXEEXT): $(region_test_OBJECTS) $(region_test_DEPENDENCIES) @rm -f region-test$(EXEEXT) $(LINK) $(region_test_LDFLAGS) $(region_test_OBJECTS) $(region_test_LDADD) $(LIBS) scaling-test$(EXEEXT): $(scaling_test_OBJECTS) $(scaling_test_DEPENDENCIES) @rm -f scaling-test$(EXEEXT) $(LINK) $(scaling_test_LDFLAGS) $(scaling_test_OBJECTS) $(scaling_test_LDADD) $(LIBS) +screen-test$(EXEEXT): $(screen_test_OBJECTS) $(screen_test_DEPENDENCIES) + @rm -f screen-test$(EXEEXT) + $(LINK) $(screen_test_LDFLAGS) $(screen_test_OBJECTS) $(screen_test_LDADD) $(LIBS) trap-crasher$(EXEEXT): $(trap_crasher_OBJECTS) $(trap_crasher_DEPENDENCIES) @rm -f trap-crasher$(EXEEXT) $(LINK) $(trap_crasher_LDFLAGS) $(trap_crasher_OBJECTS) $(trap_crasher_LDADD) $(LIBS) trap-test$(EXEEXT): $(trap_test_OBJECTS) $(trap_test_DEPENDENCIES) @rm -f trap-test$(EXEEXT) $(LINK) $(trap_test_LDFLAGS) $(trap_test_OBJECTS) $(trap_test_LDADD) $(LIBS) +window-test$(EXEEXT): $(window_test_OBJECTS) $(window_test_DEPENDENCIES) + @rm -f window-test$(EXEEXT) + $(LINK) $(window_test_LDFLAGS) $(window_test_OBJECTS) $(window_test_LDADD) $(LIBS) mostlyclean-compile: -rm -f *.$(OBJEXT) @@ -366,16 +416,21 @@ distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alpha-test.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/blitters-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/clip-in.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/clip-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/composite-test.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/convolution-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fetch-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gradient-test.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oob-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/region-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scaling-test.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/screen-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/trap-crasher.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/trap-test.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utils.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/window-test.Po@am__quote@ .c.o: 
@am__fastdepCC_TRUE@ if $(COMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \ diff --git a/lib/pixman/test/blitters-test.c b/lib/pixman/test/blitters-test.c new file mode 100644 index 000000000..d5201e541 --- /dev/null +++ b/lib/pixman/test/blitters-test.c @@ -0,0 +1,655 @@ +/* + * Test program, which stresses the use of different color formats and + * compositing operations. + * + * Just run it without any command line arguments, and it will report either + * "blitters test passed" - everything is ok + * "blitters test failed!" - there is some problem + * + * In the case of failure, finding the problem involves the following steps: + * 1. Get the reference 'blitters-test' binary. It makes sense to disable all + * the cpu specific optimizations in pixman and also configure it with + * '--disable-shared' option. Those who are paranoid can also tweak the + * sources to disable all fastpath functions. The resulting binary + * can be renamed to something like 'blitters-test.ref'. + * 2. Compile the buggy binary (also with the '--disable-shared' option). + * 3. Run 'ruby blitters-test-bisect.rb ./blitters-test.ref ./blitters-test' + * 4. Look at the information about failed case (destination buffer content + * will be shown) and try to figure out what is wrong. Loading + * test program in gdb, specifying failed test number in the command + * line with '-' character prepended and setting breakpoint on + * 'pixman_image_composite' function can provide detailed information + * about function arguments + */ +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <config.h> +#include "pixman.h" + +/* A primitive pseudorandom number generator, taken from POSIX.1-2001 example */ + +static uint32_t lcg_seed; + +static inline uint32_t +lcg_rand (void) +{ + lcg_seed = lcg_seed * 1103515245 + 12345; + return ((uint32_t)(lcg_seed / 65536) % 32768); +} + +static inline void +lcg_srand (uint32_t seed) +{ + lcg_seed = seed; +} + +static inline uint32_t +lcg_rand_n (int max) +{ + return lcg_rand () % max; +} + +static void * +aligned_malloc (size_t align, size_t size) +{ + void *result; + +#ifdef HAVE_POSIX_MEMALIGN + posix_memalign (&result, align, size); +#else + result = malloc (size); +#endif + + return result; +} + +/*----------------------------------------------------------------------------*\ + * CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29. + * + * This program generates the CRC-32 values for the files named in the + * command-line arguments. These are the same CRC-32 values used by GZIP, + * PKZIP, and ZMODEM. The Crc32_ComputeBuf () can also be detached and + * used independently. + * + * THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE. + * + * Based on the byte-oriented implementation "File Verification Using CRC" + * by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67. + * + * v1.0.0: original release. + * v1.0.1: fixed printf formats. + * v1.0.2: fixed something else. + * v1.0.3: replaced CRC constant table by generator function. + * v1.0.4: reformatted code, made ANSI C. 1994-12-05. + * v2.0.0: rewrote to use memory buffer & static table, 2006-04-29. +\*----------------------------------------------------------------------------*/ + +/*----------------------------------------------------------------------------*\ + * NAME: + * Crc32_ComputeBuf () - computes the CRC-32 value of a memory buffer + * DESCRIPTION: + * Computes or accumulates the CRC-32 value for a memory buffer. 
+ * The 'inCrc32' gives a previously accumulated CRC-32 value to allow + * a CRC to be generated for multiple sequential buffer-fuls of data. + * The 'inCrc32' for the first buffer must be zero. + * ARGUMENTS: + * inCrc32 - accumulated CRC-32 value, must be 0 on first call + * buf - buffer to compute CRC-32 value for + * bufLen - number of bytes in buffer + * RETURNS: + * crc32 - computed CRC-32 value + * ERRORS: + * (no errors are possible) +\*----------------------------------------------------------------------------*/ + +static uint32_t +compute_crc32 (uint32_t in_crc32, + const void *buf, + size_t buf_len) +{ + static const uint32_t crc_table[256] = { + 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, + 0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, + 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2, + 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, + 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, + 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, + 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C, + 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, + 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, + 0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, + 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106, + 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, + 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, + 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, + 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950, + 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, + 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, + 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, + 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA, + 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, + 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, + 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, + 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84, + 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, + 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, + 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, + 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E, + 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, + 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, + 0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, + 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28, + 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, + 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, + 0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, + 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242, + 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, + 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, + 0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, + 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, + 
0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, + 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, + 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, + 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D + }; + + uint32_t crc32; + unsigned char * byte_buf; + size_t i; + + /* accumulate crc32 for buffer */ + crc32 = in_crc32 ^ 0xFFFFFFFF; + byte_buf = (unsigned char*) buf; + + for (i = 0; i < buf_len; i++) + crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF]; + + return (crc32 ^ 0xFFFFFFFF); +} + +/* perform endian conversion of pixel data */ +static void +image_endian_swap (pixman_image_t *img, int bpp) +{ + int stride = pixman_image_get_stride (img); + uint32_t *data = pixman_image_get_data (img); + int height = pixman_image_get_height (img); + int i, j; + + /* swap bytes only on big endian systems */ + volatile uint16_t endian_check_var = 0x1234; + if (*(volatile uint8_t *)&endian_check_var != 0x12) + return; + + for (i = 0; i < height; i++) + { + uint8_t *line_data = (uint8_t *)data + stride * i; + /* swap bytes only for 16, 24 and 32 bpp for now */ + switch (bpp) + { + case 1: + for (j = 0; j < stride; j++) + { + line_data[j] = + ((line_data[j] & 0x80) >> 7) | + ((line_data[j] & 0x40) >> 5) | + ((line_data[j] & 0x20) >> 3) | + ((line_data[j] & 0x10) >> 1) | + ((line_data[j] & 0x08) << 1) | + ((line_data[j] & 0x04) << 3) | + ((line_data[j] & 0x02) << 5) | + ((line_data[j] & 0x01) << 7); + } + break; + case 4: + for (j = 0; j < stride; j++) + { + line_data[j] = (line_data[j] >> 4) | (line_data[j] << 4); + } + break; + case 16: + for (j = 0; j + 2 <= stride; j += 2) + { + char t1 = line_data[j + 0]; + char t2 = line_data[j + 1]; + + line_data[j + 1] = t1; + line_data[j + 0] = t2; + } + break; + case 24: + for (j = 0; j + 3 <= stride; j += 3) + { + char t1 = line_data[j + 0]; + char t2 = line_data[j + 1]; + char t3 = line_data[j + 2]; + + line_data[j + 2] = t1; + line_data[j + 1] = t2; + line_data[j + 0] = t3; + } + break; + case 32: + for (j = 0; j + 4 <= stride; j += 4) + { + char t1 = line_data[j + 0]; + char t2 = line_data[j + 1]; + char t3 = line_data[j + 2]; + char t4 = line_data[j + 3]; + + line_data[j + 3] = t1; + line_data[j + 2] = t2; + line_data[j + 1] = t3; + line_data[j + 0] = t4; + } + break; + default: + break; + } + } +} + +/* Create random image for testing purposes */ +static pixman_image_t * +create_random_image (pixman_format_code_t *allowed_formats, + int max_width, + int max_height, + int max_extra_stride, + pixman_format_code_t *used_fmt) +{ + int n = 0, i, width, height, stride; + pixman_format_code_t fmt; + uint32_t *buf; + pixman_image_t *img; + + while (allowed_formats[n] != -1) + n++; + fmt = allowed_formats[lcg_rand_n (n)]; + width = lcg_rand_n (max_width) + 1; + height = lcg_rand_n (max_height) + 1; + stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 + + lcg_rand_n (max_extra_stride + 1); + stride = (stride + 3) & ~3; + + /* do the allocation */ + buf = aligned_malloc (64, stride * height); + + /* initialize image with random data */ + for (i = 0; i < stride * height; i++) + { + /* generation is biased to having more 0 or 255 bytes as + * they are more likely to be special-cased in code + */ + *((uint8_t *)buf + i) = lcg_rand_n (4) ? lcg_rand_n (256) : + (lcg_rand_n (2) ? 
0 : 255);
+    }
+
+    img = pixman_image_create_bits (fmt, width, height, buf, stride);
+
+    image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
+
+    if (used_fmt) *used_fmt = fmt;
+    return img;
+}
+
+/* Free random image, and optionally update crc32 based on its data */
+static uint32_t
+free_random_image (uint32_t initcrc,
+		   pixman_image_t *img,
+		   pixman_format_code_t fmt)
+{
+    uint32_t crc32 = 0;
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+
+    if (fmt != -1)
+    {
+	/* mask unused 'x' part */
+	if (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt) &&
+	    PIXMAN_FORMAT_DEPTH (fmt) != 0)
+	{
+	    int i;
+	    uint32_t *data = pixman_image_get_data (img);
+	    uint32_t mask = (1 << PIXMAN_FORMAT_DEPTH (fmt)) - 1;
+
+	    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA)
+		mask <<= (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt));
+
+	    for (i = 0; i < 32; i++)
+		mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt));
+
+	    for (i = 0; i < stride * height / 4; i++)
+		data[i] &= mask;
+	}
+
+	/* swap endianness in order to provide identical results on both big
+	 * and little endian systems
+	 */
+	image_endian_swap (img, PIXMAN_FORMAT_BPP (fmt));
+	crc32 = compute_crc32 (initcrc, data, stride * height);
+    }
+
+    pixman_image_unref (img);
+    free (data);
+
+    return crc32;
+}
+
+static pixman_op_t op_list[] = {
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+#if 0 /* these use floating point math and are not always bitexact on different platforms */
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+#endif
+};
+
+static pixman_format_code_t img_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_r3g3b2,
+    PIXMAN_a8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_a8,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+#if 0 /* using these crashes the test */
+    PIXMAN_c8,
+    PIXMAN_g8,
+
PIXMAN_x4c4, + PIXMAN_x4g4, + PIXMAN_c4, + PIXMAN_g4, + PIXMAN_g1, +#endif + PIXMAN_x4a4, + PIXMAN_a4, + PIXMAN_r1g2b1, + PIXMAN_b1g2r1, + PIXMAN_a1r1g1b1, + PIXMAN_a1b1g1r1, + PIXMAN_a1, + -1 +}; + +static pixman_format_code_t mask_fmt_list[] = { + PIXMAN_a8r8g8b8, + PIXMAN_a8, + PIXMAN_a4, + PIXMAN_a1, + -1 +}; + + +/* + * Composite operation with pseudorandom images + */ +uint32_t +test_composite (uint32_t initcrc, int testnum, int verbose) +{ + int i; + pixman_image_t *src_img = NULL; + pixman_image_t *dst_img = NULL; + pixman_image_t *mask_img = NULL; + int src_width, src_height; + int dst_width, dst_height; + int src_stride, dst_stride; + int src_x, src_y; + int dst_x, dst_y; + int w, h; + int op; + pixman_format_code_t src_fmt, dst_fmt, mask_fmt; + uint32_t *dstbuf; + uint32_t crc32; + int max_width, max_height, max_extra_stride; + + max_width = max_height = 24 + testnum / 10000; + max_extra_stride = 4 + testnum / 1000000; + + if (max_width > 256) + max_width = 256; + + if (max_height > 16) + max_height = 16; + + if (max_extra_stride > 8) + max_extra_stride = 8; + + lcg_srand (testnum); + + op = op_list[lcg_rand_n (sizeof (op_list) / sizeof (op_list[0]))]; + + if (lcg_rand_n (8)) + { + /* normal image */ + src_img = create_random_image (img_fmt_list, max_width, max_height, + max_extra_stride, &src_fmt); + } + else + { + /* solid case */ + src_img = create_random_image (img_fmt_list, 1, 1, + max_extra_stride, &src_fmt); + + pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL); + } + + dst_img = create_random_image (img_fmt_list, max_width, max_height, + max_extra_stride, &dst_fmt); + + mask_img = NULL; + mask_fmt = -1; + + if (lcg_rand_n (2)) + { + if (lcg_rand_n (2)) + { + mask_img = create_random_image (mask_fmt_list, max_width, max_height, + max_extra_stride, &mask_fmt); + } + else + { + /* solid case */ + mask_img = create_random_image (mask_fmt_list, 1, 1, + max_extra_stride, &mask_fmt); + pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL); + } + + if (lcg_rand_n (2)) + pixman_image_set_component_alpha (mask_img, 1); + } + + src_width = pixman_image_get_width (src_img); + src_height = pixman_image_get_height (src_img); + src_stride = pixman_image_get_stride (src_img); + + dst_width = pixman_image_get_width (dst_img); + dst_height = pixman_image_get_height (dst_img); + dst_stride = pixman_image_get_stride (dst_img); + + dstbuf = pixman_image_get_data (dst_img); + + src_x = lcg_rand_n (src_width); + src_y = lcg_rand_n (src_height); + dst_x = lcg_rand_n (dst_width); + dst_y = lcg_rand_n (dst_height); + + w = lcg_rand_n (dst_width - dst_x + 1); + h = lcg_rand_n (dst_height - dst_y + 1); + + if (verbose) + { + printf ("op=%d, src_fmt=%08X, dst_fmt=%08X, mask_fmt=%08X\n", + op, src_fmt, dst_fmt, mask_fmt); + printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n", + src_width, src_height, dst_width, dst_height); + printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n", + src_x, src_y, dst_x, dst_y); + printf ("src_stride=%d, dst_stride=%d\n", + src_stride, dst_stride); + printf ("w=%d, h=%d\n", w, h); + } + + pixman_image_composite (op, src_img, mask_img, dst_img, + src_x, src_y, src_x, src_y, dst_x, dst_y, w, h); + + if (verbose) + { + int j; + + printf ("---\n"); + for (i = 0; i < dst_height; i++) + { + for (j = 0; j < dst_stride; j++) + { + if (j == (dst_width * PIXMAN_FORMAT_BPP (dst_fmt) + 7) / 8) + printf ("| "); + + printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j)); + } + printf ("\n"); + } + printf ("---\n"); + } + + free_random_image (initcrc, 
src_img, -1); + crc32 = free_random_image (initcrc, dst_img, dst_fmt); + + if (mask_img) + free_random_image (initcrc, mask_img, -1); + + return crc32; +} + +int +main (int argc, char *argv[]) +{ + int i, n1 = 1, n2 = 0; + uint32_t crc = 0; + int verbose = getenv ("VERBOSE") != NULL; + + if (argc >= 3) + { + n1 = atoi (argv[1]); + n2 = atoi (argv[2]); + } + else if (argc >= 2) + { + n2 = atoi (argv[1]); + } + else + { + n1 = 1; + n2 = 2000000; + } + + if (n2 < 0) + { + crc = test_composite (0, abs (n2), 1); + printf ("crc32=%08X\n", crc); + } + else + { + for (i = n1; i <= n2; i++) + { + crc = test_composite (crc, i, 0); + + if (verbose) + printf ("%d: %08X\n", i, crc); + } + printf ("crc32=%08X\n", crc); + + if (n2 == 2000000) + { + /* Predefined value for running with all the fastpath functions + disabled. It needs to be updated every time when changes are + introduced to this program or behavior of pixman changes! */ + if (crc == 0x06D8EDB6) + { + printf ("blitters test passed\n"); + } + else + { + printf ("blitters test failed!\n"); + return 1; + } + } + } + return 0; +} diff --git a/lib/pixman/test/clip-test.c b/lib/pixman/test/clip-test.c index 90310f415..900013718 100644 --- a/lib/pixman/test/clip-test.c +++ b/lib/pixman/test/clip-test.c @@ -71,6 +71,7 @@ main (int argc, char **argv) pixman_region32_init_rect (&clip_region, 50, 0, 100, 200); pixman_image_set_clip_region32 (src_img, &clip_region); pixman_image_set_source_clipping (src_img, TRUE); + pixman_image_set_has_client_clip (src_img, TRUE); pixman_image_set_transform (src_img, &trans); pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL); diff --git a/lib/pixman/test/composite-test.c b/lib/pixman/test/composite-test.c index 393e15d8f..49e0220a4 100644 --- a/lib/pixman/test/composite-test.c +++ b/lib/pixman/test/composite-test.c @@ -1,10 +1,49 @@ +#include <gtk/gtk.h> #include <stdlib.h> #include <stdio.h> #include "pixman.h" #include "utils.h" -#define WIDTH 100 -#define HEIGHT 100 +#define WIDTH 60 +#define HEIGHT 60 + +typedef struct { + const char *name; + pixman_op_t op; +} operator_t; + +static const operator_t operators[] = { + { "CLEAR", PIXMAN_OP_CLEAR }, + { "SRC", PIXMAN_OP_SRC }, + { "DST", PIXMAN_OP_DST }, + { "OVER", PIXMAN_OP_OVER }, + { "OVER_REVERSE", PIXMAN_OP_OVER_REVERSE }, + { "IN", PIXMAN_OP_IN }, + { "IN_REVERSE", PIXMAN_OP_IN_REVERSE }, + { "OUT", PIXMAN_OP_OUT }, + { "OUT_REVERSE", PIXMAN_OP_OUT_REVERSE }, + { "ATOP", PIXMAN_OP_ATOP }, + { "ATOP_REVERSE", PIXMAN_OP_ATOP_REVERSE }, + { "XOR", PIXMAN_OP_XOR }, + { "ADD", PIXMAN_OP_ADD }, + { "SATURATE", PIXMAN_OP_SATURATE }, + + { "MULTIPLY", PIXMAN_OP_MULTIPLY }, + { "SCREEN", PIXMAN_OP_SCREEN }, + { "OVERLAY", PIXMAN_OP_OVERLAY }, + { "DARKEN", PIXMAN_OP_DARKEN }, + { "LIGHTEN", PIXMAN_OP_LIGHTEN }, + { "COLOR_DODGE", PIXMAN_OP_COLOR_DODGE }, + { "COLOR_BURN", PIXMAN_OP_COLOR_BURN }, + { "HARD_LIGHT", PIXMAN_OP_HARD_LIGHT }, + { "SOFT_LIGHT", PIXMAN_OP_SOFT_LIGHT }, + { "DIFFERENCE", PIXMAN_OP_DIFFERENCE }, + { "EXCLUSION", PIXMAN_OP_EXCLUSION }, + { "HSL_HUE", PIXMAN_OP_HSL_HUE }, + { "HSL_SATURATION", PIXMAN_OP_HSL_SATURATION }, + { "HSL_COLOR", PIXMAN_OP_HSL_COLOR }, + { "HSL_LUMINOSITY", PIXMAN_OP_HSL_LUMINOSITY }, +}; static uint32_t reader (const void *src, int size) @@ -44,40 +83,107 @@ writer (void *src, uint32_t value, int size) int main (int argc, char **argv) { - uint32_t *src = malloc (WIDTH * HEIGHT * 4); +#define d2f pixman_double_to_fixed + + GtkWidget *window, *swindow; + GtkWidget *table; uint32_t *dest = malloc (WIDTH * HEIGHT * 4); 
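+    /* The rewritten demo composites a six-stop linear gradient over a
+     * flat 0x7f6f6f00 background once per entry in the operators[]
+     * table above, and packs the resulting swatches into a scrolled
+     * GTK grid. */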
+ uint32_t *src = malloc (WIDTH * HEIGHT * 4); pixman_image_t *src_img; pixman_image_t *dest_img; + pixman_point_fixed_t p1 = { -10 << 0, 0 }; + pixman_point_fixed_t p2 = { WIDTH << 16, (HEIGHT - 10) << 16 }; + uint16_t full = 0xcfff; + uint16_t low = 0x5000; + uint16_t alpha = 0xffff; + pixman_gradient_stop_t stops[6] = + { + { d2f (0.0), { full, low, low, alpha } }, + { d2f (0.25), { full, full, low, alpha } }, + { d2f (0.4), { low, full, low, alpha } }, + { d2f (0.5), { low, full, full, alpha } }, + { d2f (0.8), { low, low, full, alpha } }, + { d2f (1.0), { full, low, full, alpha } }, + }; + + int i; - for (i = 0; i < WIDTH * HEIGHT; ++i) - src[i] = 0x7f7f0000; /* red */ + gtk_init (&argc, &argv); - for (i = 0; i < WIDTH * HEIGHT; ++i) - dest[i] = 0x7f00007f; /* blue */ + window = gtk_window_new (GTK_WINDOW_TOPLEVEL); + + gtk_window_set_default_size (window, 800, 600); - src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, - WIDTH, HEIGHT, - src, - WIDTH * 4); + g_signal_connect (window, "delete-event", + G_CALLBACK (gtk_main_quit), + NULL); + table = gtk_table_new (G_N_ELEMENTS (operators) / 6, 6, TRUE); + + src_img = pixman_image_create_linear_gradient (&p1, &p2, stops, + sizeof (stops) / sizeof (stops[0])); + pixman_image_set_repeat (src_img, PIXMAN_REPEAT_PAD); + dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4); - - pixman_image_set_accessors (src_img, reader, writer); pixman_image_set_accessors (dest_img, reader, writer); - - pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img, - 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); - show_image (dest_img); - + for (i = 0; i < G_N_ELEMENTS (operators); ++i) + { + GtkWidget *image; + GdkPixbuf *pixbuf; + GtkWidget *vbox; + GtkWidget *label; + int j, k; + + vbox = gtk_vbox_new (FALSE, 0); + + label = gtk_label_new (operators[i].name); + gtk_box_pack_start (GTK_BOX (vbox), label, FALSE, FALSE, 6); + gtk_widget_show (label); + + for (j = 0; j < HEIGHT; ++j) + { + for (k = 0; k < WIDTH; ++k) + dest[j * WIDTH + k] = 0x7f6f6f00; + } + pixman_image_composite (operators[i].op, src_img, NULL, dest_img, + 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + pixbuf = pixbuf_from_argb32 (pixman_image_get_data (dest_img), TRUE, + WIDTH, HEIGHT, WIDTH * 4); + image = gtk_image_new_from_pixbuf (pixbuf); + gtk_box_pack_start (GTK_BOX (vbox), image, FALSE, FALSE, 0); + gtk_widget_show (image); + + gtk_table_attach_defaults (GTK_TABLE (table), vbox, + i % 6, (i % 6) + 1, i / 6, (i / 6) + 1); + gtk_widget_show (vbox); + + g_object_unref (pixbuf); + } + pixman_image_unref (src_img); - pixman_image_unref (dest_img); free (src); + pixman_image_unref (dest_img); free (dest); + + swindow = gtk_scrolled_window_new (NULL, NULL); + gtk_scrolled_window_set_policy (GTK_SCROLLED_WINDOW (swindow), + GTK_POLICY_AUTOMATIC, + GTK_POLICY_AUTOMATIC); + gtk_scrolled_window_add_with_viewport (GTK_SCROLLED_WINDOW (swindow), table); + gtk_widget_show (table); + + gtk_container_add (GTK_CONTAINER (window), swindow); + gtk_widget_show (swindow); + + gtk_widget_show (window); + + gtk_main (); + return 0; } diff --git a/lib/pixman/test/convolution-test.c b/lib/pixman/test/convolution-test.c new file mode 100644 index 000000000..8609d38a0 --- /dev/null +++ b/lib/pixman/test/convolution-test.c @@ -0,0 +1,47 @@ +#include <stdio.h> +#include <stdlib.h> +#include "pixman.h" +#include "utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 200 +#define HEIGHT 200 + +#define d2f pixman_double_to_fixed + + uint32_t *src = malloc (WIDTH * HEIGHT * 4); + uint32_t 
*mask = malloc (WIDTH * HEIGHT * 4); + uint32_t *dest = malloc (WIDTH * HEIGHT * 4); + pixman_fixed_t convolution[] = + { + d2f (3), d2f (3), + d2f (0.5), d2f (0.5), d2f (0.5), + d2f (0.5), d2f (0.5), d2f (0.5), + d2f (0.5), d2f (0.5), d2f (0.5), + }; + pixman_image_t *simg, *mimg, *dimg; + + int i; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + { + src[i] = 0x7f007f00; + mask[i] = (i % 256) * 0x01000000; + dest[i] = 0; + } + + simg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src, WIDTH * 4); + mimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, mask, WIDTH * 4); + dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4); + + pixman_image_set_filter (mimg, PIXMAN_FILTER_CONVOLUTION, + convolution, 11); + + pixman_image_composite (PIXMAN_OP_OVER, simg, mimg, dimg, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT); + + show_image (dimg); + + return 0; +} diff --git a/lib/pixman/test/fetch-test.c b/lib/pixman/test/fetch-test.c index c41f1a63e..6306a4c42 100644 --- a/lib/pixman/test/fetch-test.c +++ b/lib/pixman/test/fetch-test.c @@ -2,6 +2,7 @@ #include <stdlib.h> #include <stdio.h> #include "pixman.h" +#include <config.h> #define SIZE 1024 @@ -34,8 +35,13 @@ testcase_t testcases[] = { .format = PIXMAN_g1, .width = 8, .height = 2, .stride = 4, +#ifdef WORDS_BIGENDIAN + .src = { 0xaa000000, + 0x55000000 }, +#else .src = { 0x00000055, 0x000000aa }, +#endif .dst = { 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff }, .indexed = &mono_pallete, @@ -51,14 +57,24 @@ testcase_t testcases[] = { 0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, }, }, #endif + /* FIXME: make this work on big endian */ { .format = PIXMAN_yv12, .width = 8, .height = 2, .stride = 8, +#ifdef WORDS_BIGENDIAN + .src = { 0x00ff00ff, 0x00ff00ff, + 0xff00ff00, 0xff00ff00, + 0x80ff8000, + 0x800080ff + }, +#else .src = { 0xff00ff00, 0xff00ff00, 0x00ff00ff, 0x00ff00ff, 0x0080ff80, - 0xff800080}, + 0xff800080 + }, +#endif .dst = { 0xff000000, 0xffffffff, 0xffb80000, 0xffffe113, 0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff, diff --git a/lib/pixman/test/oob-test.c b/lib/pixman/test/oob-test.c new file mode 100644 index 000000000..4f9e5a244 --- /dev/null +++ b/lib/pixman/test/oob-test.c @@ -0,0 +1,101 @@ +#include <stdio.h> +#include <stdlib.h> +#include "pixman.h" + +typedef struct +{ + int width; + int height; + int stride; + pixman_format_code_t format; + +} image_info_t; + +typedef struct +{ + pixman_op_t op; + + image_info_t src; + image_info_t dest; + + int src_x; + int src_y; + int dest_x; + int dest_y; + int width; + int height; +} composite_info_t; + +const composite_info_t info[] = +{ + { + PIXMAN_OP_SRC, + { 3, 6, 16, PIXMAN_a8r8g8b8 }, + { 5, 7, 20, PIXMAN_x8r8g8b8 }, + 1, 8, + 1, -1, + 1, 8 + }, + { + PIXMAN_OP_SRC, + { 7, 5, 36, PIXMAN_a8r8g8b8 }, + { 6, 5, 28, PIXMAN_x8r8g8b8 }, + 8, 5, + 5, 3, + 1, 2 + }, + { + PIXMAN_OP_OVER, + { 10, 10, 40, PIXMAN_a2b10g10r10 }, + { 10, 10, 40, PIXMAN_a2b10g10r10 }, + 0, 0, + 0, 0, + 10, 10 + }, + { + PIXMAN_OP_OVER, + { 10, 10, 40, PIXMAN_x2b10g10r10 }, + { 10, 10, 40, PIXMAN_x2b10g10r10 }, + 0, 0, + 0, 0, + 10, 10 + }, +}; + +static pixman_image_t * +make_image (const image_info_t *info) +{ + char *data = malloc (info->stride * info->height); + int i; + + for (i = 0; i < info->height * info->stride; ++i) + data[i] = (i % 255) ^ (((i % 16) << 4) | (i & 0xf0)); + + return pixman_image_create_bits (info->format, 
info->width, info->height, (uint32_t *)data, info->stride); +} + +static void +test_composite (const composite_info_t *info) +{ + pixman_image_t *src = make_image (&info->src); + pixman_image_t *dest = make_image (&info->dest); + + pixman_image_composite (PIXMAN_OP_SRC, src, NULL, dest, + info->src_x, info->src_y, + 0, 0, + info->dest_x, info->dest_y, + info->width, info->height); +} + + + +int +main (int argc, char **argv) +{ + int i; + + for (i = 0; i < sizeof (info) / sizeof (info[0]); ++i) + test_composite (&info[i]); + + return 0; +} diff --git a/lib/pixman/test/region-test.c b/lib/pixman/test/region-test.c index e214e9b89..3568969f1 100644 --- a/lib/pixman/test/region-test.c +++ b/lib/pixman/test/region-test.c @@ -3,21 +3,76 @@ #include <stdio.h> #include "pixman.h" -/* This used to go into an infinite loop before pixman-region.c - * was fixed to not use explict "short" variables - */ int main () { pixman_region32_t r1; pixman_region32_t r2; pixman_region32_t r3; + pixman_box32_t boxes[] = { + { 10, 10, 20, 20 }, + { 30, 30, 30, 40 }, + { 50, 45, 60, 44 }, + }; + pixman_box32_t boxes2[] = { + { 2, 6, 7, 6 }, + { 4, 1, 6, 7 }, + }; + pixman_box32_t boxes3[] = { + { 2, 6, 7, 6 }, + { 4, 1, 6, 1 }, + }; + int i; + pixman_box32_t *b; + /* This used to go into an infinite loop before pixman-region.c + * was fixed to not use explict "short" variables + */ pixman_region32_init_rect (&r1, 0, 0, 20, 64000); pixman_region32_init_rect (&r2, 0, 0, 20, 64000); pixman_region32_init_rect (&r3, 0, 0, 20, 64000); pixman_region32_subtract (&r1, &r2, &r3); -} + /* This would produce a region containing an empty + * rectangle in it. Such regions are considered malformed, + * but using an empty rectangle for initialization should + * work. + */ + pixman_region32_init_rects (&r1, boxes, 3); + + b = pixman_region32_rectangles (&r1, &i); + + assert (i == 1); + + while (i--) + { + assert (b[i].x1 < b[i].x2); + assert (b[i].y1 < b[i].y2); + } + + /* This would produce a rectangle containing the bounding box + * of the two rectangles. The correct result is to eliminate + * the broken rectangle. + */ + pixman_region32_init_rects (&r1, boxes2, 2); + + b = pixman_region32_rectangles (&r1, &i); + + assert (i == 1); + + assert (b[0].x1 == 4); + assert (b[0].y1 == 1); + assert (b[0].x2 == 6); + assert (b[0].y2 == 7); + + /* This should produce an empty region */ + pixman_region32_init_rects (&r1, boxes3, 2); + + b = pixman_region32_rectangles (&r1, &i); + + assert (i == 0); + + return 0; +} diff --git a/lib/pixman/test/scaling-test.c b/lib/pixman/test/scaling-test.c index c85908ddc..8899c594f 100644 --- a/lib/pixman/test/scaling-test.c +++ b/lib/pixman/test/scaling-test.c @@ -29,115 +29,192 @@ static uint32_t lcg_seed; -uint32_t lcg_rand(void) +uint32_t +lcg_rand (void) { lcg_seed = lcg_seed * 1103515245 + 12345; return ((uint32_t)(lcg_seed / 65536) % 32768); } -void lcg_srand(uint32_t seed) +void +lcg_srand (uint32_t seed) { lcg_seed = seed; } -uint32_t lcg_rand_n(int max) +uint32_t +lcg_rand_n (int max) { - return lcg_rand() % max; + return lcg_rand () % max; } /*----------------------------------------------------------------------------*\ - * CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29. - * - * This program generates the CRC-32 values for the files named in the - * command-line arguments. These are the same CRC-32 values used by GZIP, - * PKZIP, and ZMODEM. The Crc32_ComputeBuf() can also be detached and - * used independently. - * - * THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE. 
- * - * Based on the byte-oriented implementation "File Verification Using CRC" - * by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67. - * - * v1.0.0: original release. - * v1.0.1: fixed printf formats. - * v1.0.2: fixed something else. - * v1.0.3: replaced CRC constant table by generator function. - * v1.0.4: reformatted code, made ANSI C. 1994-12-05. - * v2.0.0: rewrote to use memory buffer & static table, 2006-04-29. +* CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29. +* +* This program generates the CRC-32 values for the files named in the +* command-line arguments. These are the same CRC-32 values used by GZIP, +* PKZIP, and ZMODEM. The compute_crc32() can also be detached and +* used independently. +* +* THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE. +* +* Based on the byte-oriented implementation "File Verification Using CRC" +* by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67. +* +* v1.0.0: original release. +* v1.0.1: fixed printf formats. +* v1.0.2: fixed something else. +* v1.0.3: replaced CRC constant table by generator function. +* v1.0.4: reformatted code, made ANSI C. 1994-12-05. +* v2.0.0: rewrote to use memory buffer & static table, 2006-04-29. \*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*\ - * NAME: - * Crc32_ComputeBuf() - computes the CRC-32 value of a memory buffer - * DESCRIPTION: - * Computes or accumulates the CRC-32 value for a memory buffer. - * The 'inCrc32' gives a previously accumulated CRC-32 value to allow - * a CRC to be generated for multiple sequential buffer-fuls of data. - * The 'inCrc32' for the first buffer must be zero. - * ARGUMENTS: - * inCrc32 - accumulated CRC-32 value, must be 0 on first call - * buf - buffer to compute CRC-32 value for - * bufLen - number of bytes in buffer - * RETURNS: - * crc32 - computed CRC-32 value - * ERRORS: - * (no errors are possible) +* NAME: +* compute_crc32() - computes the CRC-32 value of a memory buffer +* DESCRIPTION: +* Computes or accumulates the CRC-32 value for a memory buffer. +* The 'in_crc32' gives a previously accumulated CRC-32 value to allow +* a CRC to be generated for multiple sequential buffer-fuls of data. +* The 'in_crc32' for the first buffer must be zero. 
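+* (This is the same public-domain routine as in blitters-test.c; the
+* rename from Crc32_ComputeBuf () to compute_crc32 () only tracks the
+* new coding style.)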
+* ARGUMENTS: +* in_crc32 - accumulated CRC-32 value, must be 0 on first call +* buf - buffer to compute CRC-32 value for +* buf_len - number of bytes in buffer +* RETURNS: +* crc32 - computed CRC-32 value +* ERRORS: +* (no errors are possible) \*----------------------------------------------------------------------------*/ -static uint32_t Crc32_ComputeBuf( uint32_t inCrc32, const void *buf, - size_t bufLen ) +static uint32_t +compute_crc32 (uint32_t in_crc32, + const void *buf, + size_t buf_len) { - static const uint32_t crcTable[256] = { - 0x00000000,0x77073096,0xEE0E612C,0x990951BA,0x076DC419,0x706AF48F,0xE963A535, - 0x9E6495A3,0x0EDB8832,0x79DCB8A4,0xE0D5E91E,0x97D2D988,0x09B64C2B,0x7EB17CBD, - 0xE7B82D07,0x90BF1D91,0x1DB71064,0x6AB020F2,0xF3B97148,0x84BE41DE,0x1ADAD47D, - 0x6DDDE4EB,0xF4D4B551,0x83D385C7,0x136C9856,0x646BA8C0,0xFD62F97A,0x8A65C9EC, - 0x14015C4F,0x63066CD9,0xFA0F3D63,0x8D080DF5,0x3B6E20C8,0x4C69105E,0xD56041E4, - 0xA2677172,0x3C03E4D1,0x4B04D447,0xD20D85FD,0xA50AB56B,0x35B5A8FA,0x42B2986C, - 0xDBBBC9D6,0xACBCF940,0x32D86CE3,0x45DF5C75,0xDCD60DCF,0xABD13D59,0x26D930AC, - 0x51DE003A,0xC8D75180,0xBFD06116,0x21B4F4B5,0x56B3C423,0xCFBA9599,0xB8BDA50F, - 0x2802B89E,0x5F058808,0xC60CD9B2,0xB10BE924,0x2F6F7C87,0x58684C11,0xC1611DAB, - 0xB6662D3D,0x76DC4190,0x01DB7106,0x98D220BC,0xEFD5102A,0x71B18589,0x06B6B51F, - 0x9FBFE4A5,0xE8B8D433,0x7807C9A2,0x0F00F934,0x9609A88E,0xE10E9818,0x7F6A0DBB, - 0x086D3D2D,0x91646C97,0xE6635C01,0x6B6B51F4,0x1C6C6162,0x856530D8,0xF262004E, - 0x6C0695ED,0x1B01A57B,0x8208F4C1,0xF50FC457,0x65B0D9C6,0x12B7E950,0x8BBEB8EA, - 0xFCB9887C,0x62DD1DDF,0x15DA2D49,0x8CD37CF3,0xFBD44C65,0x4DB26158,0x3AB551CE, - 0xA3BC0074,0xD4BB30E2,0x4ADFA541,0x3DD895D7,0xA4D1C46D,0xD3D6F4FB,0x4369E96A, - 0x346ED9FC,0xAD678846,0xDA60B8D0,0x44042D73,0x33031DE5,0xAA0A4C5F,0xDD0D7CC9, - 0x5005713C,0x270241AA,0xBE0B1010,0xC90C2086,0x5768B525,0x206F85B3,0xB966D409, - 0xCE61E49F,0x5EDEF90E,0x29D9C998,0xB0D09822,0xC7D7A8B4,0x59B33D17,0x2EB40D81, - 0xB7BD5C3B,0xC0BA6CAD,0xEDB88320,0x9ABFB3B6,0x03B6E20C,0x74B1D29A,0xEAD54739, - 0x9DD277AF,0x04DB2615,0x73DC1683,0xE3630B12,0x94643B84,0x0D6D6A3E,0x7A6A5AA8, - 0xE40ECF0B,0x9309FF9D,0x0A00AE27,0x7D079EB1,0xF00F9344,0x8708A3D2,0x1E01F268, - 0x6906C2FE,0xF762575D,0x806567CB,0x196C3671,0x6E6B06E7,0xFED41B76,0x89D32BE0, - 0x10DA7A5A,0x67DD4ACC,0xF9B9DF6F,0x8EBEEFF9,0x17B7BE43,0x60B08ED5,0xD6D6A3E8, - 0xA1D1937E,0x38D8C2C4,0x4FDFF252,0xD1BB67F1,0xA6BC5767,0x3FB506DD,0x48B2364B, - 0xD80D2BDA,0xAF0A1B4C,0x36034AF6,0x41047A60,0xDF60EFC3,0xA867DF55,0x316E8EEF, - 0x4669BE79,0xCB61B38C,0xBC66831A,0x256FD2A0,0x5268E236,0xCC0C7795,0xBB0B4703, - 0x220216B9,0x5505262F,0xC5BA3BBE,0xB2BD0B28,0x2BB45A92,0x5CB36A04,0xC2D7FFA7, - 0xB5D0CF31,0x2CD99E8B,0x5BDEAE1D,0x9B64C2B0,0xEC63F226,0x756AA39C,0x026D930A, - 0x9C0906A9,0xEB0E363F,0x72076785,0x05005713,0x95BF4A82,0xE2B87A14,0x7BB12BAE, - 0x0CB61B38,0x92D28E9B,0xE5D5BE0D,0x7CDCEFB7,0x0BDBDF21,0x86D3D2D4,0xF1D4E242, - 0x68DDB3F8,0x1FDA836E,0x81BE16CD,0xF6B9265B,0x6FB077E1,0x18B74777,0x88085AE6, - 0xFF0F6A70,0x66063BCA,0x11010B5C,0x8F659EFF,0xF862AE69,0x616BFFD3,0x166CCF45, - 0xA00AE278,0xD70DD2EE,0x4E048354,0x3903B3C2,0xA7672661,0xD06016F7,0x4969474D, - 0x3E6E77DB,0xAED16A4A,0xD9D65ADC,0x40DF0B66,0x37D83BF0,0xA9BCAE53,0xDEBB9EC5, - 0x47B2CF7F,0x30B5FFE9,0xBDBDF21C,0xCABAC28A,0x53B39330,0x24B4A3A6,0xBAD03605, - 0xCDD70693,0x54DE5729,0x23D967BF,0xB3667A2E,0xC4614AB8,0x5D681B02,0x2A6F2B94, - 0xB40BBE37,0xC30C8EA1,0x5A05DF1B,0x2D02EF8D }; - uint32_t crc32; - unsigned char *byteBuf; - size_t i; + static const 
uint32_t crc_table[256] = { + 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, + 0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, + 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2, + 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, + 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, + 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, + 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C, + 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, + 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, + 0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, + 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106, + 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, + 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, + 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, + 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950, + 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, + 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, + 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, + 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA, + 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, + 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, + 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, + 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84, + 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, + 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, + 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, + 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E, + 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, + 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, + 0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, + 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28, + 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, + 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, + 0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, + 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242, + 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, + 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, + 0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, + 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, + 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, + 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, + 0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, + 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D + }; + + uint32_t crc32; + unsigned char * byte_buf; + size_t i; /** accumulate crc32 for buffer **/ - crc32 = inCrc32 ^ 0xFFFFFFFF; - byteBuf = (unsigned char*) buf; - for (i=0; i < bufLen; i++) { - crc32 = (crc32 >> 8) ^ crcTable[ (crc32 ^ byteBuf[i]) & 0xFF ]; - } - return( crc32 ^ 0xFFFFFFFF ); + crc32 = in_crc32 ^ 0xFFFFFFFF; + byte_buf = (unsigned char*) buf; + 
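+    /* Each step folds one message byte into the running CRC: index the
+     * table with the low byte of (crc ^ byte) and shift eight bits out;
+     * the table encodes the reflected CRC-32 polynomial 0xEDB88320. */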
+ for (i = 0; i < buf_len; i++) + crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF]; + + return (crc32 ^ 0xFFFFFFFF); } +/* perform endian conversion of pixel data */ +static void +image_endian_swap (pixman_image_t *img, + int bpp) +{ + int stride = pixman_image_get_stride (img); + uint32_t *data = pixman_image_get_data (img); + int height = pixman_image_get_height (img); + int i, j; + + /* swap bytes only on big endian systems */ + volatile uint16_t endian_check_var = 0x1234; + if (*(volatile uint8_t *)&endian_check_var != 0x12) + return; + + for (i = 0; i < height; i++) + { + char *line_data = (char *)data + stride * i; + + /* swap bytes only for 16, 24 and 32 bpp for now */ + switch (bpp) + { + case 16: + for (j = 0; j + 2 <= stride; j += 2) + { + char t1 = line_data[j + 0]; + char t2 = line_data[j + 1]; + line_data[j + 1] = t1; + line_data[j + 0] = t2; + } + break; + + case 24: + for (j = 0; j + 3 <= stride; j += 3) + { + char t1 = line_data[j + 0]; + char t2 = line_data[j + 1]; + char t3 = line_data[j + 2]; + line_data[j + 2] = t1; + line_data[j + 1] = t2; + line_data[j + 0] = t3; + } + break; + + case 32: + for (j = 0; j + 4 <= stride; j += 4) + { + char t1 = line_data[j + 0]; + char t2 = line_data[j + 1]; + char t3 = line_data[j + 2]; + char t4 = line_data[j + 3]; + line_data[j + 3] = t1; + line_data[j + 2] = t2; + line_data[j + 1] = t3; + line_data[j + 0] = t4; + } + break; + + default: + break; + } + } +} #define MAX_SRC_WIDTH 10 #define MAX_SRC_HEIGHT 10 @@ -148,194 +225,250 @@ static uint32_t Crc32_ComputeBuf( uint32_t inCrc32, const void *buf, /* * Composite operation with pseudorandom images */ -uint32_t test_composite(uint32_t initcrc, int testnum, int verbose) +uint32_t +test_composite (uint32_t initcrc, + int testnum, + int verbose) { - int i; - pixman_image_t *src_img; - pixman_image_t *dst_img; + int i; + pixman_image_t * src_img; + pixman_image_t * dst_img; pixman_transform_t transform; - pixman_region16_t clip; - int src_width, src_height; - int dst_width, dst_height; - int src_stride, dst_stride; - int src_x, src_y; - int dst_x, dst_y; - int src_bpp; - int dst_bpp; - int w, h; - int scale_x = 32768, scale_y = 32768; - int op; - int repeat = 0; - int src_fmt, dst_fmt; - uint32_t *srcbuf; - uint32_t *dstbuf; - uint32_t crc32; - - lcg_srand(testnum); - - src_bpp = (lcg_rand_n(2) == 0) ? 2 : 4; - dst_bpp = (lcg_rand_n(2) == 0) ? 2 : 4; - op = (lcg_rand_n(2) == 0) ? 
PIXMAN_OP_SRC : PIXMAN_OP_OVER; - - src_width = lcg_rand_n(MAX_SRC_WIDTH) + 1; - src_height = lcg_rand_n(MAX_SRC_HEIGHT) + 1; - dst_width = lcg_rand_n(MAX_DST_WIDTH) + 1; - dst_height = lcg_rand_n(MAX_DST_HEIGHT) + 1; - src_stride = src_width * src_bpp + lcg_rand_n(MAX_STRIDE) * src_bpp; - dst_stride = dst_width * dst_bpp + lcg_rand_n(MAX_STRIDE) * dst_bpp; - if (src_stride & 3) src_stride += 2; - if (dst_stride & 3) dst_stride += 2; - - src_x = -(src_width / 4) + lcg_rand_n(src_width * 3 / 2); - src_y = -(src_height / 4) + lcg_rand_n(src_height * 3 / 2); - dst_x = -(dst_width / 4) + lcg_rand_n(dst_width * 3 / 2); - dst_y = -(dst_height / 4) + lcg_rand_n(dst_height * 3 / 2); - w = lcg_rand_n(dst_width * 3 / 2 - dst_x); - h = lcg_rand_n(dst_height * 3 / 2 - dst_y); - - srcbuf = (uint32_t *)malloc(src_stride * src_height); - dstbuf = (uint32_t *)malloc(dst_stride * dst_height); + pixman_region16_t clip; + int src_width, src_height; + int dst_width, dst_height; + int src_stride, dst_stride; + int src_x, src_y; + int dst_x, dst_y; + int src_bpp; + int dst_bpp; + int w, h; + int scale_x = 32768, scale_y = 32768; + int op; + int repeat = 0; + int src_fmt, dst_fmt; + uint32_t * srcbuf; + uint32_t * dstbuf; + uint32_t crc32; + + lcg_srand (testnum); + + src_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; + dst_bpp = (lcg_rand_n (2) == 0) ? 2 : 4; + op = (lcg_rand_n (2) == 0) ? PIXMAN_OP_SRC : PIXMAN_OP_OVER; + + src_width = lcg_rand_n (MAX_SRC_WIDTH) + 1; + src_height = lcg_rand_n (MAX_SRC_HEIGHT) + 1; + dst_width = lcg_rand_n (MAX_DST_WIDTH) + 1; + dst_height = lcg_rand_n (MAX_DST_HEIGHT) + 1; + src_stride = src_width * src_bpp + lcg_rand_n (MAX_STRIDE) * src_bpp; + dst_stride = dst_width * dst_bpp + lcg_rand_n (MAX_STRIDE) * dst_bpp; + + if (src_stride & 3) + src_stride += 2; + + if (dst_stride & 3) + dst_stride += 2; + + src_x = -(src_width / 4) + lcg_rand_n (src_width * 3 / 2); + src_y = -(src_height / 4) + lcg_rand_n (src_height * 3 / 2); + dst_x = -(dst_width / 4) + lcg_rand_n (dst_width * 3 / 2); + dst_y = -(dst_height / 4) + lcg_rand_n (dst_height * 3 / 2); + w = lcg_rand_n (dst_width * 3 / 2 - dst_x); + h = lcg_rand_n (dst_height * 3 / 2 - dst_y); + + srcbuf = (uint32_t *)malloc (src_stride * src_height); + dstbuf = (uint32_t *)malloc (dst_stride * dst_height); + for (i = 0; i < src_stride * src_height; i++) - *((uint8_t *)srcbuf + i) = lcg_rand_n(256); + *((uint8_t *)srcbuf + i) = lcg_rand_n (256); + for (i = 0; i < dst_stride * dst_height; i++) - *((uint8_t *)dstbuf + i) = lcg_rand_n(256); + *((uint8_t *)dstbuf + i) = lcg_rand_n (256); - src_fmt = src_bpp == 4 ? (lcg_rand_n(2) == 0 ? - PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; + src_fmt = src_bpp == 4 ? (lcg_rand_n (2) == 0 ? + PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; - dst_fmt = dst_bpp == 4 ? (lcg_rand_n(2) == 0 ? - PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; + dst_fmt = dst_bpp == 4 ? (lcg_rand_n (2) == 0 ? 
+ PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5; - src_img = pixman_image_create_bits( + src_img = pixman_image_create_bits ( src_fmt, src_width, src_height, srcbuf, src_stride); - dst_img = pixman_image_create_bits( + dst_img = pixman_image_create_bits ( dst_fmt, dst_width, dst_height, dstbuf, dst_stride); - if (lcg_rand_n(8) > 0) { - scale_x = 32768 + lcg_rand_n(65536); - scale_y = 32768 + lcg_rand_n(65536); - pixman_transform_init_scale(&transform, scale_x, scale_y); - pixman_image_set_transform(src_img, &transform); + image_endian_swap (src_img, src_bpp * 8); + image_endian_swap (dst_img, dst_bpp * 8); + + if (lcg_rand_n (8) > 0) + { + scale_x = 32768 + lcg_rand_n (65536); + scale_y = 32768 + lcg_rand_n (65536); + pixman_transform_init_scale (&transform, scale_x, scale_y); + pixman_image_set_transform (src_img, &transform); } - switch (lcg_rand_n(4)) { - case 0: repeat = PIXMAN_REPEAT_NONE; break; - case 1: repeat = PIXMAN_REPEAT_NORMAL; break; - case 2: repeat = PIXMAN_REPEAT_PAD; break; - case 3: repeat = PIXMAN_REPEAT_REFLECT; break; + switch (lcg_rand_n (4)) + { + case 0: + repeat = PIXMAN_REPEAT_NONE; + break; + + case 1: + repeat = PIXMAN_REPEAT_NORMAL; + break; + + case 2: + repeat = PIXMAN_REPEAT_PAD; + break; + + case 3: + repeat = PIXMAN_REPEAT_REFLECT; + break; } - pixman_image_set_repeat(src_img, repeat); - - if (verbose) { - printf("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt); - printf("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n", - op, scale_x, scale_y, repeat); - printf("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n", - src_width, src_height, dst_width, dst_height); - printf("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n", - src_x, src_y, dst_x, dst_y); - printf("w=%d, h=%d\n", w, h); + pixman_image_set_repeat (src_img, repeat); + + if (verbose) + { + printf ("src_fmt=%08X, dst_fmt=%08X\n", src_fmt, dst_fmt); + printf ("op=%d, scale_x=%d, scale_y=%d, repeat=%d\n", + op, scale_x, scale_y, repeat); + printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n", + src_width, src_height, dst_width, dst_height); + printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n", + src_x, src_y, dst_x, dst_y); + printf ("w=%d, h=%d\n", w, h); } - if (lcg_rand_n(8) == 0) { - pixman_box16_t clip_boxes[2]; - int n = lcg_rand_n(2) + 1; - for (i = 0; i < n; i++) { - clip_boxes[i].x1 = lcg_rand_n(src_width); - clip_boxes[i].y1 = lcg_rand_n(src_height); - clip_boxes[i].x2 = clip_boxes[i].x1 + lcg_rand_n(src_width - clip_boxes[i].x1); - clip_boxes[i].y2 = clip_boxes[i].y1 + lcg_rand_n(src_height - clip_boxes[i].y1); - if (verbose) { - printf("source clip box: [%d,%d-%d,%d]\n", - clip_boxes[i].x1, clip_boxes[i].y1, - clip_boxes[i].x2, clip_boxes[i].y2); - } - } - pixman_region_init_rects(&clip, clip_boxes, n); - pixman_image_set_clip_region(src_img, &clip); - pixman_image_set_source_clipping(src_img, 1); - pixman_region_fini(&clip); + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (src_width); + clip_boxes[i].y1 = lcg_rand_n (src_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (src_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (src_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("source clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (src_img, &clip); + 
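+	/* Source clipping (enabled below) makes the clip region above
+	 * also restrict reads from src_img; by default a clip region
+	 * only affects an image used as a destination. */
+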
pixman_image_set_source_clipping (src_img, 1); + pixman_region_fini (&clip); } - if (lcg_rand_n(8) == 0) { - pixman_box16_t clip_boxes[2]; - int n = lcg_rand_n(2) + 1; - for (i = 0; i < n; i++) { - clip_boxes[i].x1 = lcg_rand_n(dst_width); - clip_boxes[i].y1 = lcg_rand_n(dst_height); - clip_boxes[i].x2 = clip_boxes[i].x1 + lcg_rand_n(dst_width - clip_boxes[i].x1); - clip_boxes[i].y2 = clip_boxes[i].y1 + lcg_rand_n(dst_height - clip_boxes[i].y1); - if (verbose) { - printf("destination clip box: [%d,%d-%d,%d]\n", - clip_boxes[i].x1, clip_boxes[i].y1, - clip_boxes[i].x2, clip_boxes[i].y2); - } - } - pixman_region_init_rects(&clip, clip_boxes, n); - pixman_image_set_clip_region(dst_img, &clip); - pixman_region_fini(&clip); + if (lcg_rand_n (8) == 0) + { + pixman_box16_t clip_boxes[2]; + int n = lcg_rand_n (2) + 1; + for (i = 0; i < n; i++) + { + clip_boxes[i].x1 = lcg_rand_n (dst_width); + clip_boxes[i].y1 = lcg_rand_n (dst_height); + clip_boxes[i].x2 = + clip_boxes[i].x1 + lcg_rand_n (dst_width - clip_boxes[i].x1); + clip_boxes[i].y2 = + clip_boxes[i].y1 + lcg_rand_n (dst_height - clip_boxes[i].y1); + + if (verbose) + { + printf ("destination clip box: [%d,%d-%d,%d]\n", + clip_boxes[i].x1, clip_boxes[i].y1, + clip_boxes[i].x2, clip_boxes[i].y2); + } + } + pixman_region_init_rects (&clip, clip_boxes, n); + pixman_image_set_clip_region (dst_img, &clip); + pixman_region_fini (&clip); } pixman_image_composite (op, src_img, NULL, dst_img, src_x, src_y, 0, 0, dst_x, dst_y, w, h); - if (dst_fmt == PIXMAN_x8r8g8b8) { - /* ignore unused part */ - for (i = 0; i < dst_stride * dst_height / 4; i++) - dstbuf[i] &= 0xFFFFFF; + if (dst_fmt == PIXMAN_x8r8g8b8) + { + /* ignore unused part */ + for (i = 0; i < dst_stride * dst_height / 4; i++) + dstbuf[i] &= 0xFFFFFF; } - if (verbose) { - int j; - for (i = 0; i < dst_height; i++) { - for (j = 0; j < dst_stride; j++) { - printf("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j)); - } - printf("\n"); - } + image_endian_swap (dst_img, dst_bpp * 8); + + if (verbose) + { + int j; + + for (i = 0; i < dst_height; i++) + { + for (j = 0; j < dst_stride; j++) + printf ("%02X ", *((uint8_t *)dstbuf + i * dst_stride + j)); + + printf ("\n"); + } } pixman_image_unref (src_img); pixman_image_unref (dst_img); - crc32 = Crc32_ComputeBuf(initcrc, dstbuf, dst_stride * dst_height); - free(srcbuf); - free(dstbuf); + crc32 = compute_crc32 (initcrc, dstbuf, dst_stride * dst_height); + free (srcbuf); + free (dstbuf); return crc32; } -int main(int argc, char *argv[]) +int +main (int argc, char *argv[]) { - int i, n = 0; + int i, n = 0; uint32_t crc = 0; + pixman_disable_out_of_bounds_workaround (); + if (argc >= 2) - n = atoi(argv[1]); + n = atoi (argv[1]); if (n == 0) n = 3000000; - if (n < 0) { - crc = test_composite(0, -n, 1); - printf("crc32=%08X\n", crc); + if (n < 0) + { + crc = test_composite (0, -n, 1); + printf ("crc32=%08X\n", crc); } - else { - for (i = 1; i <= n; i++) - { - crc = test_composite(crc, i, 0); - } - printf("crc32=%08X\n", crc); -#ifdef LITTLE_ENDIAN - if (n == 3000000) { - /* predefined value for running with all the fastpath functions disabled */ - /* it needs to be updated every time changes are introduced to this program! 
*/ - if (crc == 0xC950E5BB) { - printf("scaling test passed\n"); - } else { - printf("scaling test failed!\n"); - } - } -#endif + else + { + for (i = 1; i <= n; i++) + crc = test_composite (crc, i, 0); + + printf ("crc32=%08X\n", crc); + + if (n == 3000000) + { + /* predefined value for running with all the fastpath functions disabled */ + /* it needs to be updated every time changes are introduced to this program! */ + + if (crc == 0x0B633CF4) + { + printf ("scaling test passed\n"); + } + else + { + printf ("scaling test failed!\n"); + return 1; + } + } } + return 0; } diff --git a/lib/pixman/test/screen-test.c b/lib/pixman/test/screen-test.c new file mode 100644 index 000000000..5e02eee08 --- /dev/null +++ b/lib/pixman/test/screen-test.c @@ -0,0 +1,44 @@ +#include <stdio.h> +#include <stdlib.h> +#include "pixman.h" +#include "utils.h" + +int +main (int argc, char **argv) +{ +#define WIDTH 40 +#define HEIGHT 40 + + uint32_t *src1 = malloc (WIDTH * HEIGHT * 4); + uint32_t *src2 = malloc (WIDTH * HEIGHT * 4); + uint32_t *src3 = malloc (WIDTH * HEIGHT * 4); + uint32_t *dest = malloc (3 * WIDTH * 2 * HEIGHT * 4); + pixman_image_t *simg1, *simg2, *simg3, *dimg; + + int i; + + for (i = 0; i < WIDTH * HEIGHT; ++i) + { + src1[i] = 0x7ff00000; + src2[i] = 0x7f00ff00; + src3[i] = 0x7f0000ff; + } + + for (i = 0; i < 3 * WIDTH * 2 * HEIGHT; ++i) + { + dest[i] = 0x0; + } + + simg1 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src1, WIDTH * 4); + simg2 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src2, WIDTH * 4); + simg3 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src3, WIDTH * 4); + dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, 3 * WIDTH, 2 * HEIGHT, dest, 3 * WIDTH * 4); + + pixman_image_composite (PIXMAN_OP_SCREEN, simg1, NULL, dimg, 0, 0, 0, 0, WIDTH, HEIGHT / 4, WIDTH, HEIGHT); + pixman_image_composite (PIXMAN_OP_SCREEN, simg2, NULL, dimg, 0, 0, 0, 0, (WIDTH/2), HEIGHT / 4 + HEIGHT / 2, WIDTH, HEIGHT); + pixman_image_composite (PIXMAN_OP_SCREEN, simg3, NULL, dimg, 0, 0, 0, 0, (4 * WIDTH) / 3, HEIGHT, WIDTH, HEIGHT); + + show_image (dimg); + + return 0; +} diff --git a/lib/pixman/test/window-test.c b/lib/pixman/test/window-test.c new file mode 100644 index 000000000..bbaa3e211 --- /dev/null +++ b/lib/pixman/test/window-test.c @@ -0,0 +1,173 @@ +#include <stdio.h> +#include <stdlib.h> +#include <config.h> +#include "pixman.h" +#include "pixman-private.h" + +#define FALSE 0 +#define TRUE 1 + +/* Randomly decide between 32 and 16 bit + * + * Allocate bits with random width, stride and height + * + * Then make up some random offset (dx, dy) + * + * Then make an image with those values. + * + * Do this for both source and destination + * + * Composite them together using OVER. + * + * The bits in the source and the destination should have + * recognizable colors so that the result can be verified. + * + * Ie., walk the bits and verify that they have been composited. 
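+ *
+ * (Concretely, the "bogus X server translations" simulated below move
+ * the destination bits pointer back by (dy * stride + dx) pixels and
+ * install a clip region offset to (dx, dy), mimicking the
+ * drawable-relative pointers the server used to hand to pixman.)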
+ */ + +static int +get_rand (int bound) +{ + return rand () % bound; +} + +static pixman_image_t * +make_image (int width, int height, pixman_bool_t src, int *rx, int *ry) +{ + pixman_format_code_t format; + pixman_image_t *image; + pixman_region32_t region; + uint8_t *bits; + int stride; + int bpp; + int dx, dy; + int i, j; + + if (src) + format = PIXMAN_a8r8g8b8; + else + format = PIXMAN_r5g6b5; + + bpp = PIXMAN_FORMAT_BPP (format) / 8; + + stride = width + get_rand (width); + stride += (stride & 1); /* Make it an even number */ + + bits = malloc (height * stride * bpp); + + for (j = 0; j < height; ++j) + { + for (i = 0; i < width; ++i) + { + uint8_t *pixel = bits + (stride * j + i) * bpp; + + if (src) + *(uint32_t *)pixel = 0x7f00007f; + else + *(uint16_t *)pixel = 0xf100; + } + } + + dx = dy = 0; + + dx = get_rand (500); + dy = get_rand (500); + + if (!src) + { + /* Now simulate the bogus X server translations */ + bits -= (dy * stride + dx) * bpp; + } + + image = pixman_image_create_bits ( + format, width, height, (uint32_t *)bits, stride * bpp); + + if (!src) + { + /* And add the bogus clip region */ + pixman_region32_init_rect (®ion, dx, dy, dx + width, dy + height); + + pixman_image_set_clip_region32 (image, ®ion); + } + + pixman_image_set_source_clipping (image, TRUE); + + if (src) + { + pixman_transform_t trans; + + pixman_transform_init_identity (&trans); + + pixman_transform_translate (&trans, + NULL, + - pixman_int_to_fixed (width / 2), + - pixman_int_to_fixed (height / 2)); + + pixman_transform_scale (&trans, + NULL, + pixman_double_to_fixed (0.5), + pixman_double_to_fixed (0.5)); + + pixman_transform_translate (&trans, + NULL, + pixman_int_to_fixed (width / 2), + pixman_int_to_fixed (height / 2)); + + pixman_image_set_transform (image, &trans); + pixman_image_set_filter (image, PIXMAN_FILTER_BILINEAR, NULL, 0); + pixman_image_set_repeat (image, PIXMAN_REPEAT_PAD); + } + + if (!src) + { + *rx = dx; + *ry = dy; + } + else + { + *rx = *ry = 0; + } + + return image; +} + +int +main () +{ + pixman_image_t *src, *dest; + int src_x, src_y, dest_x, dest_y; + int i, j; + int width = get_rand (500); + int height = get_rand (500); + + src = make_image (width, height, TRUE, &src_x, &src_y); + dest = make_image (width, height, FALSE, &dest_x, &dest_y); + + pixman_image_composite ( + PIXMAN_OP_OVER, src, NULL, dest, + src_x, src_y, + -1, -1, + dest_x, dest_y, + width, height); + + for (i = 0; i < height; ++i) + { + for (j = 0; j < width; ++j) + { + uint8_t *bits = (uint8_t *)dest->bits.bits; + int bpp = PIXMAN_FORMAT_BPP (dest->bits.format) / 8; + int stride = dest->bits.rowstride * 4; + + uint8_t *pixel = + bits + (i + dest_y) * stride + (j + dest_x) * bpp; + + if (*(uint16_t *)pixel != 0x788f) + { + printf ("bad pixel %x\n", *(uint16_t *)pixel); + assert (*(uint16_t *)pixel == 0x788f); + } + } + } + + return 0; +} |