144 files changed, 72083 insertions, 0 deletions
diff --git a/src/pixman/.gitignore b/src/pixman/.gitignore
new file mode 100644
index 0000000..0f11496
--- /dev/null
+++ b/src/pixman/.gitignore
@@ -0,0 +1,89 @@
+Makefile
+Makefile.in
+.deps
+.libs
+.msg
+*.pc
+*.lo
+*.la
+*.a
+*.o
+*~
+aclocal.m4
+autom4te.cache
+compile
+config.guess
+config.log
+config.status
+config.sub
+configure
+depcomp
+install-sh
+libtool
+ltmain.sh
+missing
+stamp-h?
+config.h
+config.h.in
+.*.swp
+demos/alpha-test
+demos/checkerboard
+demos/clip-in
+demos/clip-test
+demos/composite-test
+demos/conical-test
+demos/convolution-test
+demos/gradient-test
+demos/linear-gradient
+demos/quad2quad
+demos/radial-test
+demos/scale
+demos/screen-test
+demos/srgb-test
+demos/srgb-trap-test
+demos/trap-test
+demos/tri-test
+pixman/pixman-srgb.c
+pixman/pixman-version.h
+test/a1-trap-test
+test/affine-test
+test/alpha-loop
+test/alphamap
+test/alpha-test
+test/blitters-test
+test/clip-in
+test/clip-test
+test/combiner-test
+test/composite
+test/composite-test
+test/composite-traps-test
+test/convolution-test
+test/fetch-test
+test/glyph-test
+test/gradient-crash-test
+test/gradient-test
+test/infinite-loop
+test/lowlevel-blt-bench
+test/oob-test
+test/pdf-op-test
+test/prng-test
+test/radial-perf-test
+test/region-contains-test
+test/region-test
+test/region-translate
+test/region-translate-test
+test/rotate-test
+test/scaling-crash-test
+test/scaling-helpers-test
+test/scaling-test
+test/screen-test
+test/stress-test
+test/trap-crasher
+test/trap-test
+test/window-test
+*.pdb
+*.dll
+*.lib
+*.ilk
+*.obj
+*.exe
diff --git a/src/pixman/AUTHORS b/src/pixman/AUTHORS
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/pixman/AUTHORS
diff --git a/src/pixman/CODING_STYLE b/src/pixman/CODING_STYLE
new file mode 100644
index 0000000..9f5171d
--- /dev/null
+++ b/src/pixman/CODING_STYLE
@@ -0,0 +1,199 @@
+Pixman coding style.
+====================
+
+The pixman coding style is close to cairo's with one exception: braces
+go on their own line, rather than on the line of the if/while/for:
+
+	if (condition)
+	{
+	    do_something();
+	    do_something_else();
+	}
+
+not
+
+	if (condition) {
+	    do_something();
+	    do_something_else();
+        }
+
+
+
+Indentation
+===========
+
+Each new level is indented four spaces:
+
+	if (condition)
+	    do_something();
+
+This may be achieved with space characters or with a combination of
+tab characters and space characters. Tab characters are interpreted as
+
+	Advance to the next column which is a multiple of 8.
+
+
+Names
+=====
+
+In all names, words are separated with underscores. Do not use
+CamelCase for any names.
+
+Macros have ALL_CAPITAL_NAMES
+
+Type names are in lower case and end with "_t". For example
+pixman_image_t.
+
+Labels, functions and variables have lower case names.
+
+
+Braces
+======
+
+Braces always go on their own line:
+
+	if (condition)
+	{
+	    do_this ();
+	    do_that ();
+	}
+	else
+	{
+	    do_the_other ();
+	}
+
+Rules for braces and substatements of if/while/for/do:
+
+* If a substatement spans multiple lines, then there must be braces
+  around it.
+
+* If the condition of an if/while/for spans multiple lines, then 
+  braces must be used for the substatements.
+
+* If one substatement of an if statement has braces, then the other
+  must too.
+
+* Otherwise, don't add braces.
+
+
+Comments
+========
+
+For comments either like this:
+
+        /* One line comment */
+
+or like this:
+
+	/* This is a multi-line comment
+	 *
+         * It extends over multiple lines
+	 */
+
+Generally comments should say things that aren't clear from the code
+itself. If too many comments say obvious things, then people will just
+stop reading all comments, including the good ones.
+
+
+Whitespace
+==========
+
+* Put a single space after commas
+
+* Put spaces around arithmetic operators such a +, -, *, /:
+
+        y * stride + x
+
+        x / unit_x
+
+* Do not put spaces after the address-of operator, the * when used as
+  a pointer derefernce or the ! and ~ operators:
+
+     &foo;
+
+     ~0x00000000
+
+     !condition
+
+     *result = 100
+
+* Break up long lines (> ~80 characters) and use whitespace to align
+  things nicely. This is one way:
+
+  	 some_very_long_function name (
+	 	implementation, op, src, mask, dest, 
+		src_x, src_y, mask_x, mask_y, dest_x, dest_y,
+		width, height);
+
+  This is another:
+
+        some_very_long_function_name (implementation, op,
+                                      src, mask, dest,
+				      src_x, src_y,
+				      mask_x, mask_y,
+				      dest_x, dest_y,
+				      width, height);
+
+* Separate logically distinct chunks with a single newline. This
+  obviously applies between functions, but also applies within a
+  function or block or structure definition.
+
+* Use a newline after a block of variable declarations.
+
+* Use a single space before a left parenthesis, except where the
+  standard will not allow it, (eg. when defining a parameterized macro).
+
+* Don't eliminate newlines just because things would still fit on one
+  line. This breaks the expected visual structure of the code making
+  it much harder to read and understand:
+
+	if (condition) foo (); else bar ();	/* Yuck! */
+
+
+Function Definitions
+====================
+
+Function definitions should take the following form:
+
+	void
+	my_function (int argument)
+	{
+	    do_my_things ();
+	}
+
+If all the parameters to a function fit naturally on one line, format
+them that way. Otherwise, put one argument on each line, adding
+whitespace so that the parameter names are aligned with each other.
+
+I.e., do either this:
+
+        void
+        short_arguments (const char *str, int x, int y, int z)
+        {
+        }
+
+or this:
+
+	void
+	long_arguments (const char *char_star_arg,
+			int	    int_arg,
+			double	   *double_star_arg,
+			double	    double_arg)
+	{
+	}
+
+
+Mode lines
+==========
+
+Given the rules above, what is the best way to simplify one's life as
+a code monkey? Get your editor to do most of the tedious work of
+beautifying your code!
+
+As a reward for reading this far, here are some mode lines for the more
+popular editors:
+/*
+ * vim:sw=4:sts=4:ts=8:tw=78:fo=tcroq:cindent:cino=\:0,(0
+ * vim:isk=a-z,A-Z,48-57,_,.,-,>
+ */
+
diff --git a/src/pixman/COPYING b/src/pixman/COPYING
new file mode 100644
index 0000000..6168dea
--- /dev/null
+++ b/src/pixman/COPYING
@@ -0,0 +1,42 @@
+The following is the MIT license, agreed upon by most contributors.
+Copyright holders of new code should use this license statement where
+possible. They may also add themselves to the list below.
+
+/*
+ * Copyright 1987, 1988, 1989, 1998  The Open Group
+ * Copyright 1987, 1988, 1989 Digital Equipment Corporation
+ * Copyright 1999, 2004, 2008 Keith Packard
+ * Copyright 2000 SuSE, Inc.
+ * Copyright 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright 2004, 2005, 2007, 2008, 2009, 2010 Red Hat, Inc.
+ * Copyright 2004 Nicholas Miell
+ * Copyright 2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright 2005 Trolltech AS
+ * Copyright 2007 Luca Barbato
+ * Copyright 2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright 2008 Rodrigo Kumpera
+ * Copyright 2008 André Tupinambá
+ * Copyright 2008 Mozilla Corporation
+ * Copyright 2008 Frederic Plourde
+ * Copyright 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2009, 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
diff --git a/src/pixman/ChangeLog b/src/pixman/ChangeLog
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/pixman/ChangeLog
diff --git a/src/pixman/INSTALL b/src/pixman/INSTALL
new file mode 100644
index 0000000..5458714
--- /dev/null
+++ b/src/pixman/INSTALL
@@ -0,0 +1,234 @@
+Installation Instructions
+*************************
+
+Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
+2006 Free Software Foundation, Inc.
+
+This file is free documentation; the Free Software Foundation gives
+unlimited permission to copy, distribute and modify it.
+
+Basic Installation
+==================
+
+Briefly, the shell commands `./configure; make; make install' should
+configure, build, and install this package.  The following
+more-detailed instructions are generic; see the `README' file for
+instructions specific to this package.
+
+   The `configure' shell script attempts to guess correct values for
+various system-dependent variables used during compilation.  It uses
+those values to create a `Makefile' in each directory of the package.
+It may also create one or more `.h' files containing system-dependent
+definitions.  Finally, it creates a shell script `config.status' that
+you can run in the future to recreate the current configuration, and a
+file `config.log' containing compiler output (useful mainly for
+debugging `configure').
+
+   It can also use an optional file (typically called `config.cache'
+and enabled with `--cache-file=config.cache' or simply `-C') that saves
+the results of its tests to speed up reconfiguring.  Caching is
+disabled by default to prevent problems with accidental use of stale
+cache files.
+
+   If you need to do unusual things to compile the package, please try
+to figure out how `configure' could check whether to do them, and mail
+diffs or instructions to the address given in the `README' so they can
+be considered for the next release.  If you are using the cache, and at
+some point `config.cache' contains results you don't want to keep, you
+may remove or edit it.
+
+   The file `configure.ac' (or `configure.in') is used to create
+`configure' by a program called `autoconf'.  You need `configure.ac' if
+you want to change it or regenerate `configure' using a newer version
+of `autoconf'.
+
+The simplest way to compile this package is:
+
+  1. `cd' to the directory containing the package's source code and type
+     `./configure' to configure the package for your system.
+
+     Running `configure' might take a while.  While running, it prints
+     some messages telling which features it is checking for.
+
+  2. Type `make' to compile the package.
+
+  3. Optionally, type `make check' to run any self-tests that come with
+     the package.
+
+  4. Type `make install' to install the programs and any data files and
+     documentation.
+
+  5. You can remove the program binaries and object files from the
+     source code directory by typing `make clean'.  To also remove the
+     files that `configure' created (so you can compile the package for
+     a different kind of computer), type `make distclean'.  There is
+     also a `make maintainer-clean' target, but that is intended mainly
+     for the package's developers.  If you use it, you may have to get
+     all sorts of other programs in order to regenerate files that came
+     with the distribution.
+
+Compilers and Options
+=====================
+
+Some systems require unusual options for compilation or linking that the
+`configure' script does not know about.  Run `./configure --help' for
+details on some of the pertinent environment variables.
+
+   You can give `configure' initial values for configuration parameters
+by setting variables in the command line or in the environment.  Here
+is an example:
+
+     ./configure CC=c99 CFLAGS=-g LIBS=-lposix
+
+   *Note Defining Variables::, for more details.
+
+Compiling For Multiple Architectures
+====================================
+
+You can compile the package for more than one kind of computer at the
+same time, by placing the object files for each architecture in their
+own directory.  To do this, you can use GNU `make'.  `cd' to the
+directory where you want the object files and executables to go and run
+the `configure' script.  `configure' automatically checks for the
+source code in the directory that `configure' is in and in `..'.
+
+   With a non-GNU `make', it is safer to compile the package for one
+architecture at a time in the source code directory.  After you have
+installed the package for one architecture, use `make distclean' before
+reconfiguring for another architecture.
+
+Installation Names
+==================
+
+By default, `make install' installs the package's commands under
+`/usr/local/bin', include files under `/usr/local/include', etc.  You
+can specify an installation prefix other than `/usr/local' by giving
+`configure' the option `--prefix=PREFIX'.
+
+   You can specify separate installation prefixes for
+architecture-specific files and architecture-independent files.  If you
+pass the option `--exec-prefix=PREFIX' to `configure', the package uses
+PREFIX as the prefix for installing programs and libraries.
+Documentation and other data files still use the regular prefix.
+
+   In addition, if you use an unusual directory layout you can give
+options like `--bindir=DIR' to specify different values for particular
+kinds of files.  Run `configure --help' for a list of the directories
+you can set and what kinds of files go in them.
+
+   If the package supports it, you can cause programs to be installed
+with an extra prefix or suffix on their names by giving `configure' the
+option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
+
+Optional Features
+=================
+
+Some packages pay attention to `--enable-FEATURE' options to
+`configure', where FEATURE indicates an optional part of the package.
+They may also pay attention to `--with-PACKAGE' options, where PACKAGE
+is something like `gnu-as' or `x' (for the X Window System).  The
+`README' should mention any `--enable-' and `--with-' options that the
+package recognizes.
+
+   For packages that use the X Window System, `configure' can usually
+find the X include and library files automatically, but if it doesn't,
+you can use the `configure' options `--x-includes=DIR' and
+`--x-libraries=DIR' to specify their locations.
+
+Specifying the System Type
+==========================
+
+There may be some features `configure' cannot figure out automatically,
+but needs to determine by the type of machine the package will run on.
+Usually, assuming the package is built to be run on the _same_
+architectures, `configure' can figure that out, but if it prints a
+message saying it cannot guess the machine type, give it the
+`--build=TYPE' option.  TYPE can either be a short name for the system
+type, such as `sun4', or a canonical name which has the form:
+
+     CPU-COMPANY-SYSTEM
+
+where SYSTEM can have one of these forms:
+
+     OS KERNEL-OS
+
+   See the file `config.sub' for the possible values of each field.  If
+`config.sub' isn't included in this package, then this package doesn't
+need to know the machine type.
+
+   If you are _building_ compiler tools for cross-compiling, you should
+use the option `--target=TYPE' to select the type of system they will
+produce code for.
+
+   If you want to _use_ a cross compiler, that generates code for a
+platform different from the build platform, you should specify the
+"host" platform (i.e., that on which the generated programs will
+eventually be run) with `--host=TYPE'.
+
+Sharing Defaults
+================
+
+If you want to set default values for `configure' scripts to share, you
+can create a site shell script called `config.site' that gives default
+values for variables like `CC', `cache_file', and `prefix'.
+`configure' looks for `PREFIX/share/config.site' if it exists, then
+`PREFIX/etc/config.site' if it exists.  Or, you can set the
+`CONFIG_SITE' environment variable to the location of the site script.
+A warning: not all `configure' scripts look for a site script.
+
+Defining Variables
+==================
+
+Variables not defined in a site shell script can be set in the
+environment passed to `configure'.  However, some packages may run
+configure again during the build, and the customized values of these
+variables may be lost.  In order to avoid this problem, you should set
+them in the `configure' command line, using `VAR=value'.  For example:
+
+     ./configure CC=/usr/local2/bin/gcc
+
+causes the specified `gcc' to be used as the C compiler (unless it is
+overridden in the site shell script).
+
+Unfortunately, this technique does not work for `CONFIG_SHELL' due to
+an Autoconf bug.  Until the bug is fixed you can use this workaround:
+
+     CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
+
+`configure' Invocation
+======================
+
+`configure' recognizes the following options to control how it operates.
+
+`--help'
+`-h'
+     Print a summary of the options to `configure', and exit.
+
+`--version'
+`-V'
+     Print the version of Autoconf used to generate the `configure'
+     script, and exit.
+
+`--cache-file=FILE'
+     Enable the cache: use and save the results of the tests in FILE,
+     traditionally `config.cache'.  FILE defaults to `/dev/null' to
+     disable caching.
+
+`--config-cache'
+`-C'
+     Alias for `--cache-file=config.cache'.
+
+`--quiet'
+`--silent'
+`-q'
+     Do not print messages saying which checks are being made.  To
+     suppress all normal output, redirect it to `/dev/null' (any error
+     messages will still be shown).
+
+`--srcdir=DIR'
+     Look for the package's source code in directory DIR.  Usually
+     `configure' can determine that directory automatically.
+
+`configure' also accepts some other, not widely useful, options.  Run
+`configure --help' for more details.
+
diff --git a/src/pixman/Makefile.am b/src/pixman/Makefile.am
new file mode 100644
index 0000000..5137c9e
--- /dev/null
+++ b/src/pixman/Makefile.am
@@ -0,0 +1,137 @@
+SUBDIRS = pixman demos test
+
+pkgconfigdir=$(libdir)/pkgconfig
+pkgconfig_DATA=pixman-1.pc
+
+$(pkgconfig_DATA): pixman-1.pc.in
+
+snapshot:
+	distdir="$(distdir)-`date '+%Y%m%d'`"; \
+	test -d "$(srcdir)/.git" && distdir=$$distdir-`cd "$(srcdir)" && git rev-parse HEAD | cut -c 1-6`; \
+	$(MAKE) $(AM_MAKEFLAGS) distdir="$$distdir" dist
+
+GPGKEY=3892336E
+USERNAME=$$USER
+RELEASE_OR_SNAPSHOT = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo release; else echo snapshot; fi)
+RELEASE_CAIRO_HOST =	$(USERNAME)@cairographics.org
+RELEASE_CAIRO_DIR =	/srv/cairo.freedesktop.org/www/$(RELEASE_OR_SNAPSHOT)s
+RELEASE_CAIRO_URL = 	http://cairographics.org/$(RELEASE_OR_SNAPSHOT)s
+RELEASE_XORG_URL =	http://xorg.freedesktop.org/archive/individual/lib
+RELEASE_XORG_HOST =	$(USERNAME)@xorg.freedesktop.org
+RELEASE_XORG_DIR =	/srv/xorg.freedesktop.org/archive/individual/lib
+RELEASE_ANNOUNCE_LIST = cairo-announce@cairographics.org, xorg-announce@lists.freedesktop.org, pixman@lists.freedesktop.org
+
+EXTRA_DIST =				\
+	Makefile.win32			\
+	Makefile.win32.common
+
+tar_gz = $(PACKAGE)-$(VERSION).tar.gz
+tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2
+
+sha1_tgz = $(tar_gz).sha1
+md5_tgz = $(tar_gz).md5
+
+sha1_tbz2 = $(tar_bz2).sha1
+md5_tbz2 = $(tar_bz2).md5
+
+gpg_file = $(sha1_tgz).asc
+
+$(sha1_tgz): $(tar_gz)
+	sha1sum $^ > $@
+
+$(md5_tgz): $(tar_gz)
+	md5sum $^ > $@
+
+$(sha1_tbz2): $(tar_bz2)
+	sha1sum $^ > $@
+
+$(md5_tbz2): $(tar_bz2)
+	md5sum $^ > $@
+
+$(gpg_file): $(sha1_tgz)
+	@echo "Please enter your GPG password to sign the checksum."
+	gpg --armor --sign $^ 
+
+HASHFILES = $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(md5_tbz2)
+
+release-verify-newer:
+	@echo -n "Checking that no $(VERSION) release already exists at $(RELEASE_XORG_HOST)..."
+	@ssh $(RELEASE_XORG_HOST) test ! -e $(RELEASE_XORG_DIR)/$(tar_gz) \
+		|| (echo "Ouch." && echo "Found: $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)/$(tar_gz)" \
+		&& echo "Refusing to try to generate a new release of the same name." \
+		&& false)
+	@ssh $(RELEASE_CAIRO_HOST) test ! -e $(RELEASE_CAIRO_DIR)/$(tar_gz) \
+		|| (echo "Ouch." && echo "Found: $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)/$(tar_gz)" \
+		&& echo "Refusing to try to generate a new release of the same name." \
+		&& false)
+	@echo "Good."
+
+release-remove-old:
+	$(RM) $(tar_gz) $(tar_bz2) $(HASHFILES) $(gpg_file)
+
+ensure-prev:
+	@if [[ "$(PREV)" == "" ]]; then							\
+		echo ""							          &&	\
+		echo "You must set the PREV variable on the make command line to" &&	\
+		echo "the last version."				  	  &&	\
+		echo ""								  &&	\
+		echo "For example:"						  &&	\
+		echo "      make PREV=0.7.3"				  	  &&	\
+		echo ""								  &&	\
+		false;									\
+	fi
+
+release-check: ensure-prev release-verify-newer release-remove-old distcheck
+
+release-tag:
+	git tag -u $(GPGKEY) -m "$(PACKAGE) $(VERSION) release" $(PACKAGE)-$(VERSION)
+
+release-upload: release-check $(tar_gz) $(tar_bz2) $(sha1_tgz) $(sha1_tbz2) $(md5_tgz) $(gpg_file)
+	scp $(tar_gz) $(sha1_tgz) $(gpg_file) $(RELEASE_CAIRO_HOST):$(RELEASE_CAIRO_DIR)
+	scp $(tar_gz) $(tar_bz2) $(RELEASE_XORG_HOST):$(RELEASE_XORG_DIR)
+	ssh $(RELEASE_CAIRO_HOST) "rm -f $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-[0-9]* && ln -s $(tar_gz) $(RELEASE_CAIRO_DIR)/LATEST-$(PACKAGE)-$(VERSION)"
+
+RELEASE_TYPE = $$(if test "x$(PIXMAN_VERSION_MINOR)" = "x$$(echo "$(PIXMAN_VERSION_MINOR)/2*2" | bc)" ; then echo "stable release in the" ; else echo "development snapshot leading up to a stable"; fi)
+
+release-publish-message: $(HASHFILES) ensure-prev
+	@echo "Please follow the instructions in RELEASING to push stuff out and"
+	@echo "send out the announcement mails.  Here is the excerpt you need:"
+	@echo ""
+	@echo "Lists:  $(RELEASE_ANNOUNCE_LIST)"
+	@echo "Subject: [ANNOUNCE] $(PACKAGE) release $(VERSION) now available"
+	@echo "============================== CUT HERE =============================="
+	@echo "A new $(PACKAGE) release $(VERSION) is now available. This is a $(RELEASE_TYPE)"
+	@echo ""
+	@echo "tar.gz:"
+	@echo "	$(RELEASE_CAIRO_URL)/$(tar_gz)"
+	@echo "	$(RELEASE_XORG_URL)/$(tar_gz)"
+	@echo ""
+	@echo "tar.bz2:"
+	@echo "	$(RELEASE_XORG_URL)/$(tar_bz2)"
+	@echo ""
+	@echo "Hashes:"
+	@echo -n "	MD5:  "
+	@cat $(md5_tgz)
+	@echo -n "	MD5:  "
+	@cat $(md5_tbz2)
+	@echo -n "	SHA1: "
+	@cat $(sha1_tgz)
+	@echo -n "	SHA1: "
+	@cat $(sha1_tbz2)
+	@echo ""
+	@echo "GPG signature:"
+	@echo "	$(RELEASE_CAIRO_URL)/$(gpg_file)"
+	@echo "	(signed by`gpg --list-keys $(GPGKEY) | grep uid | cut -b4- | tr -s " "`)"
+	@echo ""
+	@echo "Git:"
+	@echo "	git://git.freedesktop.org/git/pixman"
+	@echo "	tag: $(PACKAGE)-$(VERSION)"
+	@echo ""
+	@echo "Log:"
+	@git log --no-merges "$(PACKAGE)-$(PREV)".."$(PACKAGE)-$(VERSION)" | git shortlog | awk '{ printf "\t"; print ; }' | cut -b1-80
+	@echo "============================== CUT HERE =============================="
+	@echo ""
+
+release-publish: release-upload release-tag release-publish-message
+
+.PHONY: release-upload release-publish release-publish-message release-tag
diff --git a/src/pixman/Makefile.win32 b/src/pixman/Makefile.win32
new file mode 100644
index 0000000..c3ca3bc
--- /dev/null
+++ b/src/pixman/Makefile.win32
@@ -0,0 +1,25 @@
+default: all
+
+top_srcdir = .
+include $(top_srcdir)/Makefile.win32.common
+
+all: pixman test
+
+pixman:
+	@$(MAKE) -C pixman -f Makefile.win32
+
+test:
+	@$(MAKE) -C test -f Makefile.win32
+
+clean_r:
+	@$(MAKE) -C pixman -f Makefile.win32 clean
+	@$(MAKE) -C test   -f Makefile.win32 clean
+
+check:
+	@$(MAKE) -C test -f Makefile.win32 check
+
+
+clean: clean_r
+
+
+.PHONY: all pixman test clean check
diff --git a/src/pixman/Makefile.win32.common b/src/pixman/Makefile.win32.common
new file mode 100644
index 0000000..777f94c
--- /dev/null
+++ b/src/pixman/Makefile.win32.common
@@ -0,0 +1,56 @@
+LIBRARY = pixman-1
+
+CC = cl
+LD = link
+AR = lib
+PERL = perl
+
+ifeq ($(top_builddir),)
+top_builddir = $(top_srcdir)
+endif
+
+CFG_VAR = $(CFG)
+ifeq ($(CFG_VAR),)
+CFG_VAR = release
+endif
+
+ifeq ($(CFG_VAR),debug)
+CFG_CFLAGS  = -MDd -Od -Zi
+CFG_LDFLAGS = -DEBUG
+else
+CFG_CFLAGS  = -MD -O2
+CFG_LDFLAGS =
+endif
+
+# Package definitions, to be used instead of those provided in config.h
+PKG_CFLAGS  = -DPACKAGE=$(LIBRARY) -DPACKAGE_VERSION="" -DPACKAGE_BUGREPORT=""
+
+BASE_CFLAGS = -nologo -I. -I$(top_srcdir) -I$(top_srcdir)/pixman
+
+PIXMAN_CFLAGS  = $(BASE_CFLAGS) $(PKG_CFLAGS) $(CFG_CFLAGS) $(CFLAGS)
+PIXMAN_LDFLAGS = -nologo $(CFG_LDFLAGS) $(LDFLAGS)
+PIXMAN_ARFLAGS = -nologo $(LDFLAGS)
+
+
+inform:
+ifneq ($(CFG),release)
+ifneq ($(CFG),debug)
+ifneq ($(CFG),)
+	@echo "Invalid specified configuration option: "$(CFG)"."
+	@echo
+	@echo "Possible choices for configuration are 'release' and 'debug'"
+	@exit 1
+endif
+	@echo "Using default RELEASE configuration... (use CFG=release or CFG=debug)"
+endif
+endif
+
+
+$(CFG_VAR)/%.obj: %.c $(libpixman_headers)
+	@mkdir -p $(CFG_VAR)
+	@$(CC) -c $(PIXMAN_CFLAGS) -Fo"$@" $<
+
+clean: inform
+	@$(RM) $(CFG_VAR)/*.{exe,ilk,lib,obj,pdb} $(BUILT_SOURCES) || exit 0
+
+.PHONY: inform clean
diff --git a/src/pixman/NEWS b/src/pixman/NEWS
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/pixman/NEWS
diff --git a/src/pixman/README b/src/pixman/README
new file mode 100644
index 0000000..6d8cfd8
--- /dev/null
+++ b/src/pixman/README
@@ -0,0 +1,116 @@
+Pixman is a library that provides low-level pixel manipulation
+features such as image compositing and trapezoid rasterization.
+
+Questions, bug reports and patches should be directed to the pixman
+mailing list:
+
+        http://lists.freedesktop.org/mailman/listinfo/pixman
+
+You can also file bugs at
+
+        https://bugs.freedesktop.org/enter_bug.cgi?product=pixman
+
+For real time discussions about pixman, feel free to join the IRC
+channels #cairo and #xorg-devel on the FreeNode IRC network.
+
+
+Contributing
+------------
+
+In order to contribute to pixman, you will need a working knowledge of
+the git version control system. For a quick getting started guide,
+there is the "Everyday Git With 20 Commands Or So guide"
+
+        http://www.kernel.org/pub/software/scm/git/docs/everyday.html
+
+from the Git homepage. For more in depth git documentation, see the
+resources on the Git community documentation page:
+
+        http://git-scm.com/documentation
+
+Pixman uses the infrastructure from the freedesktop.org umbrella
+project. For instructions about how to use the git service on
+freedesktop.org, see:
+
+        http://www.freedesktop.org/wiki/Infrastructure/git/Developers
+
+The Pixman master repository can be found at:
+
+	git://anongit.freedesktop.org/git/pixman
+
+and browsed on the web here:
+
+	http://cgit.freedesktop.org/pixman/
+
+
+Sending patches
+---------------
+
+The general workflow for sending patches is to first make sure that
+git can send mail on your system. Then, 
+
+ - create a branch off of master in your local git repository
+
+ - make your changes as one or more commits
+
+ - use the 
+
+        git send-email
+
+   command to send the patch series to pixman@lists.freedesktop.org.
+
+In order for your patches to be accepted, please consider the
+following guidelines:
+
+ - This link:
+
+        http://www.kernel.org/pub/software/scm/git/docs/user-manual.html#patch-series
+
+   describes how what a good patch series is, and to create one with
+   git.
+
+ - At each point in the series, pixman should compile and the test
+   suite should pass.
+
+   The exception here is if you are changing the test suite to
+   demonstrate a bug. In this case, make one commit that makes the
+   test suite fail due to the bug, and then another commit that fixes
+   the bug.
+
+   You can run the test suite with 
+
+        make check
+
+   It will take around two minutes to run on a modern PC.
+
+ - Follow the coding style described in the CODING_STYLE file
+
+ - For bug fixes, include an update to the test suite to make sure
+   the bug doesn't reappear.
+
+ - For new features, add tests of the feature to the test
+   suite. Also, add a program demonstrating the new feature to the
+   demos/ directory.
+
+ - Write descriptive commit messages. Useful information to include:
+        - Benchmark results, before and after
+	- Description of the bug that was fixed
+	- Detailed rationale for any new API
+	- Alternative approaches that were rejected (and why they
+          don't work)
+	- If review comments were incorporated, a brief version
+          history describing what those changes were.
+
+ - For big patch series, send an introductory email with an overall
+   description of the patch series, including benchmarks and
+   motivation. Each commit message should still be descriptive and
+   include enough information to understand why this particular commit
+   was necessary.
+
+Pixman has high standards for code quality and so almost everybody
+should expect to have the first versions of their patches rejected.
+
+If you think that the reviewers are wrong about something, or that the
+guidelines above are wrong, feel free to discuss the issue on the
+list. The purpose of the guidelines and code review is to ensure high
+code quality; it is not an exercise in compliance.
diff --git a/src/pixman/RELEASING b/src/pixman/RELEASING
new file mode 100644
index 0000000..657857d
--- /dev/null
+++ b/src/pixman/RELEASING
@@ -0,0 +1,59 @@
+Here are the steps to follow to create a new pixman release:
+
+1) Ensure that there are no uncommitted changes or unpushed commits,
+   and that you are up to date with the latest commits in the central
+   repository. Here are a couple of useful commands:
+
+	git diff			(no output)
+	
+	git status			(should report "nothing to commit")
+
+	git log master...origin		(no output; note: *3* dots)
+
+2) Increment pixman_(major|minor|micro) in configure.ac according to
+   the directions in that file.
+
+3) Make sure that new version works, including
+
+	- make distcheck passes
+
+	- the X server still works with the new pixman version
+	  installed
+
+	- the cairo test suite hasn't gained any new failures compared
+	  to last pixman version.
+
+4) Use "git commit" to record the changes made in step 2 and 3.
+
+5) Generate and publish the tar files by running 
+
+	make PREV=<last version> GPGKEY=<your gpg key id> release-publish
+
+   If your freedesktop user name is different from your local one,
+   then also set the variable USER to your freedesktop user name.
+
+6) Run 
+
+	make release-publish-message
+
+   to generate a draft release announcement. Edit it as appropriate and
+   send it to 
+
+	cairo-announce@cairographics.org
+
+	pixman@lists.freedesktop.org
+
+	xorg-announce@lists.freedesktop.org
+
+7) Increment pixman_micro to the next larger (odd) number in
+   configure.ac. Commit this change, and push all commits created
+   during this process using
+
+	git push
+	git push --tags
+
+   You must use "--tags" here; otherwise the new tag will not
+   be pushed out.
+
+8) Change the topic of the #cairo IRC channel on freenode to advertise
+   the new version.
diff --git a/src/pixman/autogen.sh b/src/pixman/autogen.sh
new file mode 100755
index 0000000..fc34bd5
--- /dev/null
+++ b/src/pixman/autogen.sh
@@ -0,0 +1,14 @@
+#! /bin/sh
+
+srcdir=`dirname $0`
+test -z "$srcdir" && srcdir=.
+
+ORIGDIR=`pwd`
+cd $srcdir
+
+autoreconf -v --install || exit 1
+cd $ORIGDIR || exit $?
+
+if test -z "$NOCONFIGURE"; then
+    $srcdir/configure "$@"
+fi
diff --git a/src/pixman/configure.ac b/src/pixman/configure.ac
new file mode 100644
index 0000000..245827b
--- /dev/null
+++ b/src/pixman/configure.ac
@@ -0,0 +1,1131 @@
+dnl  Copyright 2005 Red Hat, Inc.
+dnl 
+dnl  Permission to use, copy, modify, distribute, and sell this software and its
+dnl  documentation for any purpose is hereby granted without fee, provided that
+dnl  the above copyright notice appear in all copies and that both that
+dnl  copyright notice and this permission notice appear in supporting
+dnl  documentation, and that the name of Red Hat not be used in
+dnl  advertising or publicity pertaining to distribution of the software without
+dnl  specific, written prior permission.  Red Hat makes no
+dnl  representations about the suitability of this software for any purpose.  It
+dnl  is provided "as is" without express or implied warranty.
+dnl 
+dnl  RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+dnl  INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+dnl  EVENT SHALL RED HAT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+dnl  CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+dnl  DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+dnl  TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+dnl  PERFORMANCE OF THIS SOFTWARE.
+dnl
+dnl Process this file with autoconf to create configure.
+
+AC_PREREQ([2.57])
+
+#   Pixman versioning scheme
+#
+#   - The version in git has an odd MICRO version number
+#
+#   - Released versions, both development and stable, have an
+#     even MICRO version number
+#
+#   - Released development versions have an odd MINOR number
+#
+#   - Released stable versions have an even MINOR number
+#
+#   - Versions that break ABI must have a new MAJOR number
+#
+#   - If you break the ABI, then at least this must be done:
+#
+#        - increment MAJOR
+#
+#        - In the first development release where you break ABI, find
+#          all instances of "pixman-n" and change them to pixman-(n+1)
+#
+#          This needs to be done at least in 
+#                    configure.ac
+#                    all Makefile.am's
+#                    pixman-n.pc.in
+#
+#      This ensures that binary incompatible versions can be installed
+#      in parallel.  See http://www106.pair.com/rhp/parallel.html for
+#      more information
+#
+
+m4_define([pixman_major], 0)
+m4_define([pixman_minor], 32)
+m4_define([pixman_micro], 6)
+
+m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
+
+AC_INIT(pixman, pixman_version, [pixman@lists.freedesktop.org], pixman)
+AM_INIT_AUTOMAKE([foreign dist-bzip2])
+
+# Suppress verbose compile lines
+m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
+
+AC_CONFIG_HEADERS(config.h)
+
+AC_CANONICAL_HOST
+
+test_CFLAGS=${CFLAGS+set} # We may override autoconf default CFLAGS.
+
+AC_PROG_CC
+AM_PROG_AS
+AC_PROG_LIBTOOL
+AC_CHECK_FUNCS([getisax])
+AC_C_BIGENDIAN
+AC_C_INLINE
+
+dnl PIXMAN_LINK_WITH_ENV(env-setup, program, true-action, false-action)
+dnl
+dnl Compiles and links the given program in the environment setup by env-setup
+dnl and executes true-action on success and false-action on failure.
+AC_DEFUN([PIXMAN_LINK_WITH_ENV],[dnl
+	save_CFLAGS="$CFLAGS"
+	save_LDFLAGS="$LDFLAGS"
+	save_LIBS="$LIBS"
+	CFLAGS=""
+	LDFLAGS=""
+	LIBS=""
+	$1
+	CFLAGS="$save_CFLAGS $CFLAGS"
+	LDFLAGS="$save_LDFLAGS $LDFLAGS"
+	LIBS="$save_LIBS $LIBS"
+	AC_LINK_IFELSE(
+		[AC_LANG_SOURCE([$2])],
+		[pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=yes],
+		[pixman_cc_stderr=`test -f conftest.err && cat conftest.err`
+		 pixman_cc_flag=no])
+
+	if test "x$pixman_cc_stderr" != "x"; then
+		pixman_cc_flag=no
+	fi
+
+	if test "x$pixman_cc_flag" = "xyes"; then
+		ifelse([$3], , :, [$3])
+	else
+		ifelse([$4], , :, [$4])
+	fi
+	CFLAGS="$save_CFLAGS"
+	LDFLAGS="$save_LDFLAGS"
+	LIBS="$save_LIBS"
+])
+
+dnl Find a -Werror for catching warnings.
+WERROR=
+for w in -Werror -errwarn; do
+    if test "z$WERROR" = "z"; then
+        AC_MSG_CHECKING([whether the compiler supports $w])
+        PIXMAN_LINK_WITH_ENV(
+		[CFLAGS=$w],
+		[int main(int c, char **v) { (void)c; (void)v; return 0; }],
+		[WERROR=$w; yesno=yes], [yesno=no])
+	AC_MSG_RESULT($yesno)
+    fi
+done
+
+dnl PIXMAN_CHECK_CFLAG(flag, [program])
+dnl  Adds flag to CFLAGS if the given program links without warnings or errors.
+AC_DEFUN([PIXMAN_CHECK_CFLAG], [dnl
+	AC_MSG_CHECKING([whether the compiler supports $1])
+	PIXMAN_LINK_WITH_ENV(
+		[CFLAGS="$WERROR $1"],
+		[$2
+		 int main(int c, char **v) { (void)c; (void)v; return 0; }
+		],
+		[_yesno=yes],
+		[_yesno=no])
+	if test "x$_yesno" = xyes; then
+	   CFLAGS="$CFLAGS $1"
+	fi
+	AC_MSG_RESULT($_yesno)
+])
+
+AC_CHECK_SIZEOF(long)
+
+# Checks for Sun Studio compilers
+AC_CHECK_DECL([__SUNPRO_C], [SUNCC="yes"], [SUNCC="no"])
+AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], [AMD64_ABI="no"])
+
+# Default CFLAGS to -O -g rather than just the -g from AC_PROG_CC
+# if we're using Sun Studio and neither the user nor a config.site
+# has set CFLAGS.
+if test $SUNCC = yes &&			\
+   test "x$test_CFLAGS" = "x" &&	\
+   test "$CFLAGS" = "-g"
+then
+  CFLAGS="-O -g"
+fi
+
+CFLAGS="$CFLAGS -Wno-unused-local-typedefs -Wno-unused-const-variable"
+CFLAGS="$CFLAGS -Wno-attributes -Wno-deprecated -Wno-unused-but-set-variable"
+
+# 
+# We ignore pixman_major in the version here because the major version should
+# always be encoded in the actual library name. Ie., the soname is:
+#
+#      pixman-$(pixman_major).0.minor.micro
+#
+m4_define([lt_current], [pixman_minor])
+m4_define([lt_revision], [pixman_micro])
+m4_define([lt_age], [pixman_minor])
+
+LT_VERSION_INFO="lt_current:lt_revision:lt_age"
+
+PIXMAN_VERSION_MAJOR=pixman_major()
+AC_SUBST(PIXMAN_VERSION_MAJOR)
+PIXMAN_VERSION_MINOR=pixman_minor()
+AC_SUBST(PIXMAN_VERSION_MINOR)
+PIXMAN_VERSION_MICRO=pixman_micro()
+AC_SUBST(PIXMAN_VERSION_MICRO)
+
+AC_SUBST(LT_VERSION_INFO)
+
+# Check for dependencies
+
+PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
+PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
+
+dnl =========================================================================
+dnl OpenMP for the test suite?
+dnl
+
+# Check for OpenMP support only when autoconf support that (require autoconf >=2.62)
+OPENMP_CFLAGS=
+m4_ifdef([AC_OPENMP], [AC_OPENMP])
+
+if test "x$enable_openmp" = "xyes" && test "x$ac_cv_prog_c_openmp" = "xunsupported" ; then
+  AC_MSG_WARN([OpenMP support requested but found unsupported])
+fi
+
+dnl May not fail to link without -Wall -Werror added
+dnl So try to link only when openmp is supported
+dnl ac_cv_prog_c_openmp is not defined when --disable-openmp is used
+if test "x$ac_cv_prog_c_openmp" != "xunsupported" && test "x$ac_cv_prog_c_openmp" != "x"; then
+  m4_define([openmp_test_program],[dnl
+  #include <stdio.h>
+
+  extern unsigned int lcg_seed;
+  #pragma omp threadprivate(lcg_seed)
+  unsigned int lcg_seed;
+
+  unsigned function(unsigned a, unsigned b)
+  {
+	lcg_seed ^= b;
+	return ((a + b) ^ a ) + lcg_seed;
+  }
+
+  int main(int argc, char **argv)
+  {
+	int i;
+	int n1 = 0, n2 = argc;
+	unsigned checksum = 0;
+	int verbose = argv != NULL;
+	unsigned (*test_function)(unsigned, unsigned);
+	test_function = function;
+	#pragma omp parallel for reduction(+:checksum) default(none) \
+					shared(n1, n2, test_function, verbose)
+	for (i = n1; i < n2; i++)
+	{
+		unsigned crc = test_function (i, 0);
+		if (verbose)
+			printf ("%d: %08X\n", i, crc);
+		checksum += crc;
+	}
+	printf("%u\n", checksum);
+	return 0;
+  }
+  ])
+
+  PIXMAN_LINK_WITH_ENV(
+	[CFLAGS="$OPENMP_CFLAGS" LDFLAGS="$OPENMP_CFLAGS"],
+	[openmp_test_program],
+	[have_openmp=yes],
+	[have_openmp=no])
+  if test "x$have_openmp" = "xyes" ; then
+    AC_DEFINE(USE_OPENMP, 1, [use OpenMP in the test suite])
+  fi
+fi
+AC_SUBST(OPENMP_CFLAGS)
+
+dnl =========================================================================
+dnl -fvisibility stuff
+
+PIXMAN_CHECK_CFLAG([-fvisibility=hidden], [dnl
+#if defined(__GNUC__) && (__GNUC__ >= 4)
+#ifdef _WIN32
+#error Have -fvisibility but it is ignored and generates a warning
+#endif
+#else
+#error Need GCC 4.0 for visibility
+#endif
+])
+
+PIXMAN_CHECK_CFLAG([-xldscope=hidden], [dnl
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+#else
+#error Need Sun Studio 8 for visibility
+#endif
+])
+
+dnl ===========================================================================
+dnl Check for Loongson Multimedia Instructions
+
+if test "x$LS_CFLAGS" = "x" ; then
+    LS_CFLAGS="-march=loongson2f"
+fi
+
+have_loongson_mmi=no
+AC_MSG_CHECKING(whether to use Loongson MMI assembler)
+
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir"
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#ifndef __mips_loongson_vector_rev
+#error "Loongson Multimedia Instructions are only available on Loongson"
+#endif
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 4.4 for Loongson MMI compilation"
+#endif
+#include "pixman/loongson-mmintrin.h"
+int main () {
+    union {
+        __m64 v;
+        char c[8];
+    } a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+    int b = 4;
+    __m64 c = _mm_srli_pi16 (a.v, b);
+    return 0;
+}]])], have_loongson_mmi=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(loongson-mmi,
+   [AC_HELP_STRING([--disable-loongson-mmi],
+                   [disable Loongson MMI fast paths])],
+   [enable_loongson_mmi=$enableval], [enable_loongson_mmi=auto])
+
+if test $enable_loongson_mmi = no ; then
+   have_loongson_mmi=disabled
+fi
+
+if test $have_loongson_mmi = yes ; then
+   AC_DEFINE(USE_LOONGSON_MMI, 1, [use Loongson Multimedia Instructions])
+else
+   LS_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_loongson_mmi)
+if test $enable_loongson_mmi = yes && test $have_loongson_mmi = no ; then
+   AC_MSG_ERROR([Loongson MMI not detected])
+fi
+
+AM_CONDITIONAL(USE_LOONGSON_MMI, test $have_loongson_mmi = yes)
+
+dnl ===========================================================================
+dnl Check for MMX
+
+if test "x$MMX_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # Sun Studio doesn't have an -xarch=mmx flag, so we have to use sse
+      # but if we're building 64-bit, mmx & sse support is on by default and
+      # -xarch=sse throws an error instead
+      if test "$AMD64_ABI" = "no" ; then
+         MMX_CFLAGS="-xarch=sse"
+      fi
+   else
+      MMX_CFLAGS="-mmmx -Winline"
+   fi
+fi
+
+have_mmx_intrinsics=no
+AC_MSG_CHECKING(whether to use MMX intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$MMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 3.4 for MMX intrinsics"
+#endif
+#include <mmintrin.h>
+int main () {
+    __m64 v = _mm_cvtsi32_si64 (1);
+    __m64 w;
+
+    /* Some versions of clang will choke on K */
+    asm ("pshufw %2, %1, %0\n\t"
+        : "=y" (w)
+        : "y" (v), "K" (5)
+    );
+
+    /* Some versions of clang will choke on this */
+    asm ("pmulhuw %1, %0\n\t"
+	: "+y" (w)
+	: "y" (v)
+    );
+
+    return _mm_cvtsi64_si32 (v);
+}]])], have_mmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(mmx,
+   [AC_HELP_STRING([--disable-mmx],
+                   [disable x86 MMX fast paths])],
+   [enable_mmx=$enableval], [enable_mmx=auto])
+
+if test $enable_mmx = no ; then
+   have_mmx_intrinsics=disabled
+fi
+
+if test $have_mmx_intrinsics = yes ; then
+   AC_DEFINE(USE_X86_MMX, 1, [use x86 MMX compiler intrinsics])
+else
+   MMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_mmx_intrinsics)
+if test $enable_mmx = yes && test $have_mmx_intrinsics = no ; then
+   AC_MSG_ERROR([x86 MMX intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_X86_MMX, test $have_mmx_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Check for SSE2
+
+if test "x$SSE2_CFLAGS" = "x" ; then
+   if test "x$SUNCC" = "xyes"; then
+      # SSE2 is enabled by default in the Sun Studio 64-bit environment
+      if test "$AMD64_ABI" = "no" ; then
+         SSE2_CFLAGS="-xarch=sse2"
+      fi
+   else
+      SSE2_CFLAGS="-msse2 -Winline"
+   fi
+fi
+
+have_sse2_intrinsics=no
+AC_MSG_CHECKING(whether to use SSE2 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSE2_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2))
+#   if !defined(__amd64__) && !defined(__x86_64__)
+#      error "Need GCC >= 4.2 for SSE2 intrinsics on x86"
+#   endif
+#endif
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+int main () {
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+	c = _mm_xor_si128 (a, b);
+    return 0;
+}]])], have_sse2_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(sse2,
+   [AC_HELP_STRING([--disable-sse2],
+                   [disable SSE2 fast paths])],
+   [enable_sse2=$enableval], [enable_sse2=auto])
+
+if test $enable_sse2 = no ; then
+   have_sse2_intrinsics=disabled
+fi
+
+if test $have_sse2_intrinsics = yes ; then
+   AC_DEFINE(USE_SSE2, 1, [use SSE2 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_sse2_intrinsics)
+if test $enable_sse2 = yes && test $have_sse2_intrinsics = no ; then
+   AC_MSG_ERROR([SSE2 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSE2, test $have_sse2_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Check for SSSE3
+
+if test "x$SSSE3_CFLAGS" = "x" ; then
+    SSSE3_CFLAGS="-mssse3 -Winline"
+fi
+
+have_ssse3_intrinsics=no
+AC_MSG_CHECKING(whether to use SSSE3 intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$SSSE3_CFLAGS $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+int main () {
+    __m128i a = _mm_set1_epi32 (0), b = _mm_set1_epi32 (0), c;
+    c = _mm_maddubs_epi16 (a, b);
+    return 0;
+}]])], have_ssse3_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(ssse3,
+   [AC_HELP_STRING([--disable-ssse3],
+                   [disable SSSE3 fast paths])],
+   [enable_ssse3=$enableval], [enable_ssse3=auto])
+
+if test $enable_ssse3 = no ; then
+   have_ssse3_intrinsics=disabled
+fi
+
+if test $have_ssse3_intrinsics = yes ; then
+   AC_DEFINE(USE_SSSE3, 1, [use SSSE3 compiler intrinsics])
+fi
+
+AC_MSG_RESULT($have_ssse3_intrinsics)
+if test $enable_ssse3 = yes && test $have_ssse3_intrinsics = no ; then
+   AC_MSG_ERROR([SSSE3 intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_SSSE3, test $have_ssse3_intrinsics = yes)
+
+dnl ===========================================================================
+dnl Other special flags needed when building code using MMX or SSE instructions
+case $host_os in
+   solaris*)
+      # When building 32-bit binaries, apply a mapfile to ensure that the
+      # binaries aren't flagged as only able to run on MMX+SSE capable CPUs
+      # since they check at runtime before using those instructions.
+      # Not all linkers grok the mapfile format so we check for that first.
+      if test "$AMD64_ABI" = "no" ; then
+	 use_hwcap_mapfile=no
+	 AC_MSG_CHECKING(whether to use a hardware capability map file)
+	 hwcap_save_LDFLAGS="$LDFLAGS"
+	 HWCAP_LDFLAGS='-Wl,-M,$(srcdir)/solaris-hwcap.mapfile'
+	 LDFLAGS="$LDFLAGS -Wl,-M,pixman/solaris-hwcap.mapfile"
+	 AC_LINK_IFELSE([AC_LANG_SOURCE([[int main() { return 0; }]])],
+			use_hwcap_mapfile=yes,
+			HWCAP_LDFLAGS="")
+	 LDFLAGS="$hwcap_save_LDFLAGS"
+	 AC_MSG_RESULT($use_hwcap_mapfile)
+      fi
+      if test "x$MMX_LDFLAGS" = "x" ; then
+         MMX_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
+      if test "x$SSE2_LDFLAGS" = "x" ; then
+	 SSE2_LDFLAGS="$HWCAP_LDFLAGS"
+      fi
+      ;;
+esac
+
+AC_SUBST(LS_CFLAGS)
+AC_SUBST(IWMMXT_CFLAGS)
+AC_SUBST(MMX_CFLAGS)
+AC_SUBST(MMX_LDFLAGS)
+AC_SUBST(SSE2_CFLAGS)
+AC_SUBST(SSE2_LDFLAGS)
+AC_SUBST(SSSE3_CFLAGS)
+
+dnl ===========================================================================
+dnl Check for VMX/Altivec
+if test -n "`$CC -v 2>&1 | grep version | grep Apple`"; then
+    VMX_CFLAGS="-faltivec"
+else
+    VMX_CFLAGS="-maltivec -mabi=altivec"
+fi
+
+have_vmx_intrinsics=no
+AC_MSG_CHECKING(whether to use VMX/Altivec intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$VMX_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 3 || (__GNUC__ == 3 && __GNUC_MINOR__ < 4))
+#error "Need GCC >= 3.4 for sane altivec support"
+#endif
+#include <altivec.h>
+int main () {
+    vector unsigned int v = vec_splat_u32 (1);
+    v = vec_sub (v, v);
+    return 0;
+}]])], have_vmx_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(vmx,
+   [AC_HELP_STRING([--disable-vmx],
+                   [disable VMX fast paths])],
+   [enable_vmx=$enableval], [enable_vmx=auto])
+
+if test $enable_vmx = no ; then
+   have_vmx_intrinsics=disabled
+fi
+
+if test $have_vmx_intrinsics = yes ; then
+   AC_DEFINE(USE_VMX, 1, [use VMX compiler intrinsics])
+else
+   VMX_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_vmx_intrinsics)
+if test $enable_vmx = yes && test $have_vmx_intrinsics = no ; then
+   AC_MSG_ERROR([VMX intrinsics not detected])
+fi
+
+AC_SUBST(VMX_CFLAGS)
+
+AM_CONDITIONAL(USE_VMX, test $have_vmx_intrinsics = yes)
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports ARM SIMD instructions
+have_arm_simd=no
+AC_MSG_CHECKING(whether to use ARM SIMD assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.arch armv6
+.object_arch armv4
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+uqadd8 r0, r0, r0]])], have_arm_simd=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-simd,
+   [AC_HELP_STRING([--disable-arm-simd],
+                   [disable ARM SIMD fast paths])],
+   [enable_arm_simd=$enableval], [enable_arm_simd=auto])
+
+if test $enable_arm_simd = no ; then
+   have_arm_simd=disabled
+fi
+
+if test $have_arm_simd = yes ; then
+   AC_DEFINE(USE_ARM_SIMD, 1, [use ARM SIMD assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_SIMD, test $have_arm_simd = yes)
+
+AC_MSG_RESULT($have_arm_simd)
+if test $enable_arm_simd = yes && test $have_arm_simd = no ; then
+   AC_MSG_ERROR([ARM SIMD intrinsics not detected])
+fi
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports NEON instructions
+have_arm_neon=no
+AC_MSG_CHECKING(whether to use ARM NEON assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-x assembler-with-cpp $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.arm
+.altmacro
+#ifndef __ARM_EABI__
+#error EABI is required (to be sure that calling conventions are compatible)
+#endif
+pld [r0]
+vmovn.u16 d0, q0]])], have_arm_neon=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(arm-neon,
+   [AC_HELP_STRING([--disable-arm-neon],
+                   [disable ARM NEON fast paths])],
+   [enable_arm_neon=$enableval], [enable_arm_neon=auto])
+
+if test $enable_arm_neon = no ; then
+   have_arm_neon=disabled
+fi
+
+if test $have_arm_neon = yes ; then
+   AC_DEFINE(USE_ARM_NEON, 1, [use ARM NEON assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_ARM_NEON, test $have_arm_neon = yes)
+
+AC_MSG_RESULT($have_arm_neon)
+if test $enable_arm_neon = yes && test $have_arm_neon = no ; then
+   AC_MSG_ERROR([ARM NEON intrinsics not detected])
+fi
+
+dnl ===========================================================================
+dnl Check for IWMMXT
+
+AC_ARG_ENABLE(arm-iwmmxt,
+   [AC_HELP_STRING([--disable-arm-iwmmxt],
+                   [disable ARM IWMMXT fast paths])],
+   [enable_iwmmxt=$enableval], [enable_iwmmxt=auto])
+
+AC_ARG_ENABLE(arm-iwmmxt2,
+   [AC_HELP_STRING([--disable-arm-iwmmxt2],
+                   [build ARM IWMMXT fast paths with -march=iwmmxt instead of -march=iwmmxt2])],
+   [enable_iwmmxt2=$enableval], [enable_iwmmxt2=auto])
+
+if test "x$IWMMXT_CFLAGS" = "x" ; then
+   IWMMXT_CFLAGS="-flax-vector-conversions -Winline -march=iwmmxt"
+   if test $enable_iwmmxt2 != no ; then
+      IWMMXT_CFLAGS="${IWMMXT_CFLAGS}2"
+   fi
+fi
+
+have_iwmmxt_intrinsics=no
+AC_MSG_CHECKING(whether to use ARM IWMMXT intrinsics)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="$CFLAGS $IWMMXT_CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#ifndef __arm__
+#error "IWMMXT is only available on ARM"
+#endif
+#ifndef __IWMMXT__
+#error "IWMMXT not enabled (with -march=iwmmxt)"
+#endif
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8))
+#error "Need GCC >= 4.8 for IWMMXT intrinsics"
+#endif
+#include <mmintrin.h>
+int main () {
+	union {
+		__m64 v;
+		char c[8];
+	} a = { .c = {1, 2, 3, 4, 5, 6, 7, 8} };
+	int b = 4;
+	__m64 c = _mm_srli_si64 (a.v, b);
+}]])], have_iwmmxt_intrinsics=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+if test $enable_iwmmxt = no ; then
+   have_iwmmxt_intrinsics=disabled
+fi
+
+if test $have_iwmmxt_intrinsics = yes ; then
+   AC_DEFINE(USE_ARM_IWMMXT, 1, [use ARM IWMMXT compiler intrinsics])
+else
+   IWMMXT_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_iwmmxt_intrinsics)
+if test $enable_iwmmxt = yes && test $have_iwmmxt_intrinsics = no ; then
+   AC_MSG_ERROR([IWMMXT intrinsics not detected])
+fi
+
+AM_CONDITIONAL(USE_ARM_IWMMXT, test $have_iwmmxt_intrinsics = yes)
+
+dnl ==========================================================================
+dnl Check if assembler is gas compatible and supports MIPS DSPr2 instructions
+
+have_mips_dspr2=no
+AC_MSG_CHECKING(whether to use MIPS DSPr2 assembler)
+xserver_save_CFLAGS=$CFLAGS
+CFLAGS="-mdspr2 $CFLAGS"
+
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if !(defined(__mips__) &&  __mips_isa_rev >= 2)
+#error MIPS DSPr2 is currently only available on MIPS32r2 platforms.
+#endif
+int
+main ()
+{
+    int c = 0, a = 0, b = 0;
+    __asm__ __volatile__ (
+        "precr.qb.ph %[c], %[a], %[b]          \n\t"
+        : [c] "=r" (c)
+        : [a] "r" (a), [b] "r" (b)
+    );
+    return c;
+}]])], have_mips_dspr2=yes)
+CFLAGS=$xserver_save_CFLAGS
+
+AC_ARG_ENABLE(mips-dspr2,
+   [AC_HELP_STRING([--disable-mips-dspr2],
+                   [disable MIPS DSPr2 fast paths])],
+   [enable_mips_dspr2=$enableval], [enable_mips_dspr2=auto])
+
+if test $enable_mips_dspr2 = no ; then
+   have_mips_dspr2=disabled
+fi
+
+if test $have_mips_dspr2 = yes ; then
+   AC_DEFINE(USE_MIPS_DSPR2, 1, [use MIPS DSPr2 assembly optimizations])
+fi
+
+AM_CONDITIONAL(USE_MIPS_DSPR2, test $have_mips_dspr2 = yes)
+
+AC_MSG_RESULT($have_mips_dspr2)
+if test $enable_mips_dspr2 = yes && test $have_mips_dspr2 = no ; then
+   AC_MSG_ERROR([MIPS DSPr2 instructions not detected])
+fi
+
+dnl =========================================================================================
+dnl Check for GNU-style inline assembly support
+
+have_gcc_inline_asm=no
+AC_MSG_CHECKING(whether to use GNU-style inline assembler)
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+int main () {
+    /* Most modern architectures have a NOP instruction, so this is a fairly generic test. */
+	asm volatile ( "\tnop\n" : : : "cc", "memory" );
+    return 0;
+}]])], have_gcc_inline_asm=yes)
+
+AC_ARG_ENABLE(gcc-inline-asm,
+   [AC_HELP_STRING([--disable-gcc-inline-asm],
+                   [disable GNU-style inline assembler])],
+   [enable_gcc_inline_asm=$enableval], [enable_gcc_inline_asm=auto])
+
+if test $enable_gcc_inline_asm = no ; then
+   have_gcc_inline_asm=disabled
+fi
+
+if test $have_gcc_inline_asm = yes ; then
+   AC_DEFINE(USE_GCC_INLINE_ASM, 1, [use GNU-style inline assembler])
+fi
+
+AC_MSG_RESULT($have_gcc_inline_asm)
+if test $enable_gcc_inline_asm = yes && test $have_gcc_inline_asm = no ; then
+   AC_MSG_ERROR([GNU-style inline assembler not detected])
+fi
+
+AM_CONDITIONAL(USE_GCC_INLINE_ASM, test $have_gcc_inline_asm = yes)
+
+dnl ==============================================
+dnl Static test programs
+
+AC_ARG_ENABLE(static-testprogs,
+   [AC_HELP_STRING([--enable-static-testprogs],
+		   [build test programs as static binaries [default=no]])],
+   [enable_static_testprogs=$enableval], [enable_static_testprogs=no])
+
+TESTPROGS_EXTRA_LDFLAGS=
+if test "x$enable_static_testprogs" = "xyes" ; then
+   TESTPROGS_EXTRA_LDFLAGS="-all-static"
+fi
+AC_SUBST(TESTPROGS_EXTRA_LDFLAGS)
+
+dnl ==============================================
+dnl Timers
+
+AC_ARG_ENABLE(timers,
+   [AC_HELP_STRING([--enable-timers],
+		   [enable TIMER_BEGIN and TIMER_END macros [default=no]])],
+   [enable_timers=$enableval], [enable_timers=no])
+
+if test $enable_timers = yes ; then 
+   AC_DEFINE(PIXMAN_TIMERS, 1, [enable TIMER_BEGIN/TIMER_END macros])
+fi
+AC_SUBST(PIXMAN_TIMERS)
+
+dnl ===================================
+dnl GTK+
+
+AC_ARG_ENABLE(gtk,
+   [AC_HELP_STRING([--enable-gtk],
+                   [enable tests using GTK+ [default=auto]])],
+   [enable_gtk=$enableval], [enable_gtk=auto])
+
+PKG_PROG_PKG_CONFIG
+
+if test $enable_gtk = yes ; then
+   AC_CHECK_LIB([pixman-1], [pixman_version_string])
+   PKG_CHECK_MODULES(GTK, [gtk+-2.0 >= 2.16 pixman-1])
+fi
+
+if test $enable_gtk = auto ; then
+   AC_CHECK_LIB([pixman-1], [pixman_version_string], [enable_gtk=auto], [enable_gtk=no])
+fi
+
+if test $enable_gtk = auto ; then
+   PKG_CHECK_MODULES(GTK, [gtk+-2.0 >= 2.16 pixman-1], [enable_gtk=yes], [enable_gtk=no])
+fi
+
+AM_CONDITIONAL(HAVE_GTK, [test "x$enable_gtk" = xyes])
+
+AC_SUBST(GTK_CFLAGS)
+AC_SUBST(GTK_LIBS)
+
+dnl =====================================
+dnl posix_memalign, sigaction, alarm, gettimeofday
+
+AC_CHECK_FUNC(posix_memalign, have_posix_memalign=yes, have_posix_memalign=no)
+if test x$have_posix_memalign = xyes; then
+   AC_DEFINE(HAVE_POSIX_MEMALIGN, 1, [Whether we have posix_memalign()])
+fi
+
+AC_CHECK_FUNC(sigaction, have_sigaction=yes, have_sigaction=no)
+if test x$have_sigaction = xyes; then
+   AC_DEFINE(HAVE_SIGACTION, 1, [Whether we have sigaction()])
+fi
+
+AC_CHECK_FUNC(alarm, have_alarm=yes, have_alarm=no)
+if test x$have_alarm = xyes; then
+   AC_DEFINE(HAVE_ALARM, 1, [Whether we have alarm()])
+fi
+
+AC_CHECK_HEADER([sys/mman.h],
+   [AC_DEFINE(HAVE_SYS_MMAN_H, [1], [Define to 1 if we have <sys/mman.h>])])
+
+AC_CHECK_FUNC(mmap, have_mmap=yes, have_mmap=no)
+if test x$have_mmap = xyes; then
+   AC_DEFINE(HAVE_MMAP, 1, [Whether we have mmap()])
+fi
+
+AC_CHECK_FUNC(mprotect, have_mprotect=yes, have_mprotect=no)
+if test x$have_mprotect = xyes; then
+   AC_DEFINE(HAVE_MPROTECT, 1, [Whether we have mprotect()])
+fi
+
+AC_CHECK_FUNC(getpagesize, have_getpagesize=yes, have_getpagesize=no)
+if test x$have_getpagesize = xyes; then
+   AC_DEFINE(HAVE_GETPAGESIZE, 1, [Whether we have getpagesize()])
+fi
+
+AC_CHECK_HEADER([fenv.h],
+   [AC_DEFINE(HAVE_FENV_H, [1], [Define to 1 if we have <fenv.h>])])
+
+AC_CHECK_LIB(m, feenableexcept, have_feenableexcept=yes, have_feenableexcept=no)
+if test x$have_feenableexcept = xyes; then
+   AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
+fi
+
+AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
+AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
+if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
+   AC_DEFINE(HAVE_GETTIMEOFDAY, 1, [Whether we have gettimeofday()])
+fi
+
+dnl =====================================
+dnl Check for missing sqrtf() as, e.g., for Solaris 9
+
+AC_SEARCH_LIBS([sqrtf], [m], [],
+               [AC_DEFINE([sqrtf], [sqrt],
+                          [Define to sqrt if you do not have the `sqrtf' function.])])
+
+dnl =====================================
+dnl Thread local storage
+
+AC_MSG_CHECKING(for thread local storage (TLS) support)
+AC_CACHE_VAL(ac_cv_tls, [
+    ac_cv_tls=none
+    keywords="__thread __declspec(thread)"
+    for kw in $keywords ; do
+        AC_TRY_COMPILE([
+#if defined(__MINGW32__) && !(__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#error This MinGW version has broken __thread support
+#endif
+#ifdef __OpenBSD__
+#error OpenBSD has broken __thread support
+#endif
+
+int $kw test;], [], [ac_cv_tls=$kw; break])
+    done
+])
+AC_MSG_RESULT($ac_cv_tls)
+
+if test "$ac_cv_tls" != "none"; then
+    AC_DEFINE_UNQUOTED([TLS], $ac_cv_tls, [The compiler supported TLS storage class])
+fi
+
+dnl
+dnl posix tls
+dnl
+
+m4_define([pthread_test_program],AC_LANG_SOURCE([[dnl
+#include <stdlib.h>
+#include <pthread.h>
+
+static pthread_once_t once_control = PTHREAD_ONCE_INIT;
+static pthread_key_t key;
+
+static void
+make_key (void)
+{
+    pthread_key_create (&key, NULL);
+}
+
+int
+main ()
+{
+    void *value = NULL;
+
+    if (pthread_once (&once_control, make_key) != 0)
+    {
+	value = NULL;
+    }
+    else
+    {
+	value = pthread_getspecific (key);
+	if (!value)
+	{
+	    value = malloc (100);
+	    pthread_setspecific (key, value);
+	}
+    }
+    return 0;
+}
+]]))
+
+AC_DEFUN([PIXMAN_CHECK_PTHREAD],[dnl
+    if test "z$support_for_pthreads" != "zyes"; then
+	PIXMAN_LINK_WITH_ENV(
+		[$1], [pthread_test_program],
+		[PTHREAD_CFLAGS="$CFLAGS"
+		 PTHREAD_LIBS="$LIBS"
+		 PTHREAD_LDFLAGS="$LDFLAGS"
+		 support_for_pthreads=yes])
+    fi
+])
+
+support_for_pthreads=no
+
+AC_MSG_CHECKING(for pthreads)
+
+PIXMAN_CHECK_PTHREAD([CFLAGS="-pthread"; LDFLAGS="-pthread"])
+PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LIBS="-lpthread"])
+PIXMAN_CHECK_PTHREAD([CFLAGS="-D_REENTRANT"; LDFLAGS="-lroot"])
+    
+if test $support_for_pthreads = yes; then
+    AC_DEFINE([HAVE_PTHREADS], [], [Whether pthreads is supported])
+    if test $ac_cv_tls = none ; then
+        CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
+    fi
+fi
+
+AC_MSG_RESULT($support_for_pthreads)
+
+AC_SUBST(TOOLCHAIN_SUPPORTS__THREAD)
+AC_SUBST(HAVE_PTHREADS)
+AC_SUBST(PTHREAD_LDFLAGS)
+AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(PTHREAD_CFLAGS)
+
+dnl =====================================
+dnl __attribute__((constructor))
+
+support_for_attribute_constructor=no
+
+AC_MSG_CHECKING(for __attribute__((constructor)))
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7))
+/* attribute 'constructor' is supported since gcc 2.7, but some compilers
+ * may only pretend to be gcc, so let's try to actually use it
+ */
+static int x = 1;
+static void __attribute__((constructor)) constructor_function () { x = 0; }
+int main (void) { return x; }
+#else
+#error not gcc or gcc version is older than 2.7
+#endif
+]])], support_for_attribute_constructor=yes)
+
+if test x$support_for_attribute_constructor = xyes; then
+   AC_DEFINE([TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR],
+             [],[Whether the tool chain supports __attribute__((constructor))])
+fi
+
+AC_MSG_RESULT($support_for_attribute_constructor)
+AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
+
+dnl =====================================
+dnl __float128
+
+support_for_float128=no
+
+AC_MSG_CHECKING(for __float128)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+__float128 a = 1.0Q, b = 2.0Q; int main (void) { return a + b; }
+]])], support_for_float128=yes)
+
+if test x$support_for_float128 = xyes; then
+   AC_DEFINE([HAVE_FLOAT128], [], [Whether the tool chain supports __float128])
+fi
+
+AC_MSG_RESULT($support_for_float128)
+
+dnl =====================================
+dnl __builtin_clz
+
+support_for_builtin_clz=no
+
+AC_MSG_CHECKING(for __builtin_clz)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+unsigned int x = 11; int main (void) { return __builtin_clz(x); }
+]])], support_for_builtin_clz=yes)
+
+if test x$support_for_builtin_clz = xyes; then
+   AC_DEFINE([HAVE_BUILTIN_CLZ], [], [Whether the compiler supports __builtin_clz])
+fi
+
+AC_MSG_RESULT($support_for_builtin_clz)
+
+dnl =====================================
+dnl GCC vector extensions
+
+support_for_gcc_vector_extensions=no
+
+AC_MSG_CHECKING(for GCC vector extensions)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+unsigned int __attribute__ ((vector_size(16))) e, a, b;
+int main (void) { e = a - ((b << 27) + (b >> (32 - 27))) + 1; return e[0]; }
+]])], support_for_gcc_vector_extensions=yes)
+
+if test x$support_for_gcc_vector_extensions = xyes; then
+   AC_DEFINE([HAVE_GCC_VECTOR_EXTENSIONS], [],
+             [Whether the compiler supports GCC vector extensions])
+fi
+
+AC_MSG_RESULT($support_for_gcc_vector_extensions)
+
+dnl ==================
+dnl libpng
+
+AC_ARG_ENABLE(libpng, AS_HELP_STRING([--enable-libpng], [Build support for libpng (default: auto)]),
+                      [have_libpng=$enableval], [have_libpng=auto])
+
+case x$have_libpng in
+	xyes) PKG_CHECK_MODULES(PNG, [libpng]) ;;
+	xno) ;;
+	*) PKG_CHECK_MODULES(PNG, [libpng], have_libpng=yes, have_libpng=no) ;;
+esac
+
+if test x$have_libpng = xyes; then
+    AC_DEFINE([HAVE_LIBPNG], [1], [Whether we have libpng])
+fi
+
+AC_SUBST(HAVE_LIBPNG)
+
+AC_OUTPUT([pixman-1.pc
+           pixman-1-uninstalled.pc
+           Makefile
+	   pixman/Makefile
+	   pixman/pixman-version.h
+	   demos/Makefile
+	   test/Makefile])
+
+m4_if(m4_eval(pixman_minor % 2), [1], [
+   echo
+   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+   echo
+   echo "      Thanks for testing this development snapshot of pixman. Please"
+   echo "      report any problems you find, either by sending email to "
+   echo
+   echo "          pixman@lists.freedesktop.org"
+   echo
+   echo "      or by filing a bug at "
+   echo
+   echo "          https://bugs.freedesktop.org/enter_bug.cgi?product=pixman "
+   echo
+   echo "      If you are looking for a stable release of pixman, please note "
+   echo "      that stable releases have _even_ minor version numbers. Ie., "
+   echo "      pixman-0.]m4_eval(pixman_minor & ~1)[.x are stable releases, whereas pixman-$PIXMAN_VERSION_MAJOR.$PIXMAN_VERSION_MINOR.$PIXMAN_VERSION_MICRO is a "
+   echo "      development snapshot that may contain bugs and experimental "
+   echo "      features. "
+   echo 
+   echo "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"
+   echo
+])
diff --git a/src/pixman/demos/Makefile.am b/src/pixman/demos/Makefile.am
new file mode 100644
index 0000000..e04743d
--- /dev/null
+++ b/src/pixman/demos/Makefile.am
@@ -0,0 +1,52 @@
+EXTRA_DIST = parrot.c parrot.jpg scale.ui
+
+if HAVE_GTK
+
+AM_CFLAGS = $(OPENMP_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS)
+
+LDADD = $(top_builddir)/pixman/libpixman-1.la -lm $(GTK_LIBS) $(PNG_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(GTK_CFLAGS) $(PNG_CFLAGS)
+
+GTK_UTILS = gtk-utils.c gtk-utils.h ../test/utils.c ../test/utils.h \
+            ../test/utils-prng.c ../test/utils-prng.h
+
+DEMOS =				\
+	clip-test		\
+	clip-in			\
+	composite-test		\
+	gradient-test		\
+	radial-test		\
+	linear-gradient		\
+	conical-test		\
+	alpha-test		\
+	screen-test		\
+	convolution-test	\
+	trap-test		\
+	tri-test		\
+	quad2quad		\
+	checkerboard		\
+	srgb-trap-test		\
+	srgb-test		\
+	scale
+
+gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
+alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
+composite_test_SOURCES = composite-test.c $(GTK_UTILS)
+clip_test_SOURCES = clip-test.c $(GTK_UTILS)
+clip_in_SOURCES = clip-in.c $(GTK_UTILS)
+trap_test_SOURCES = trap-test.c $(GTK_UTILS)
+screen_test_SOURCES = screen-test.c $(GTK_UTILS)
+convolution_test_SOURCES = convolution-test.c $(GTK_UTILS)
+radial_test_SOURCES = radial-test.c $(GTK_UTILS)
+linear_gradient_SOURCES = linear-gradient.c $(GTK_UTILS)
+conical_test_SOURCES = conical-test.c $(GTK_UTILS)
+tri_test_SOURCES = tri-test.c $(GTK_UTILS)
+checkerboard_SOURCES = checkerboard.c $(GTK_UTILS)
+srgb_test_SOURCES = srgb-test.c $(GTK_UTILS)
+srgb_trap_test_SOURCES = srgb-trap-test.c $(GTK_UTILS)
+scale_SOURCES = scale.c $(GTK_UTILS)
+
+noinst_PROGRAMS = $(DEMOS)
+
+endif
diff --git a/src/pixman/demos/alpha-test.c b/src/pixman/demos/alpha-test.c
new file mode 100644
index 0000000..54e30fa
--- /dev/null
+++ b/src/pixman/demos/alpha-test.c
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    
+    uint32_t *alpha = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *grad_img;
+    pixman_image_t *alpha_img;
+    pixman_image_t *dest_img;
+    pixman_image_t *src_img;
+    int i;
+    pixman_gradient_stop_t stops[2] =
+	{
+	    { pixman_int_to_fixed (0), { 0x0000, 0x0000, 0x0000, 0x0000 } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0x0000, 0x1111, 0xffff } }
+	};
+    pixman_point_fixed_t p1 = { pixman_double_to_fixed (0), 0 };
+    pixman_point_fixed_t p2 = { pixman_double_to_fixed (WIDTH),
+				pixman_int_to_fixed (0) };
+#if 0
+    pixman_transform_t trans = {
+	{ { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+	}
+    };
+#else
+    pixman_transform_t trans = {
+	{ { pixman_fixed_1, 0, 0 },
+	  { 0, pixman_fixed_1, 0 },
+	  { 0, 0, pixman_fixed_1 } }
+    };
+#endif
+
+#if 0
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+#endif
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	alpha[i] = 0x4f00004f; /* pale blue */
+    
+    alpha_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT, 
+					  alpha,
+					 WIDTH * 4);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	dest[i] = 0xffffff00;		/* yellow */
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT, 
+					 dest,
+					 WIDTH * 4);
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	src[i] = 0xffff0000;
+
+    src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					WIDTH, HEIGHT,
+					src,
+					WIDTH * 4);
+    
+#if 0
+    c_inner.x = pixman_double_to_fixed (50.0);
+    c_inner.y = pixman_double_to_fixed (50.0);
+    c_outer.x = pixman_double_to_fixed (50.0);
+    c_outer.y = pixman_double_to_fixed (50.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+    
+    grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+						    stops, 2);
+#endif
+#if 0
+    grad_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+						    stops, 2);
+    grad_img = pixman_image_create_linear_gradient (&c_inner, &c_outer,
+						   r_inner, r_outer,
+						   stops, 2);
+#endif
+    
+    grad_img = pixman_image_create_linear_gradient  (&p1, &p2,
+						    stops, 2);
+
+    pixman_image_set_transform (grad_img, &trans);
+    pixman_image_set_repeat (grad_img, PIXMAN_REPEAT_PAD);
+    
+    pixman_image_composite (PIXMAN_OP_OVER, grad_img, NULL, alpha_img,
+			    0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+
+    pixman_image_set_alpha_map (src_img, alpha_img, 10, 10);
+    
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+			    0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+    
+    printf ("0, 0: %x\n", dest[0]);
+    printf ("10, 10: %x\n", dest[10 * 10 + 10]);
+    printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]);
+    
+    show_image (dest_img);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (grad_img);
+    pixman_image_unref (alpha_img);
+    free (dest);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/checkerboard.c b/src/pixman/demos/checkerboard.c
new file mode 100644
index 0000000..449fedb
--- /dev/null
+++ b/src/pixman/demos/checkerboard.c
@@ -0,0 +1,71 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 400
+#define TILE_SIZE 25
+
+    pixman_image_t *checkerboard;
+    pixman_image_t *destination;
+#define D2F(d) (pixman_double_to_fixed(d))
+    pixman_transform_t trans = { {
+	    { D2F (-1.96830), D2F (-1.82250), D2F (512.12250)},
+	    { D2F (0.00000), D2F (-7.29000), D2F (1458.00000)},
+	    { D2F (0.00000), D2F (-0.00911), D2F (0.59231)},
+	}};
+    int i, j;
+
+    checkerboard = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					     WIDTH, HEIGHT,
+					     NULL, 0);
+
+    destination = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					    WIDTH, HEIGHT,
+					    NULL, 0);
+
+    for (i = 0; i < HEIGHT / TILE_SIZE; ++i)
+    {
+	for (j = 0; j < WIDTH / TILE_SIZE; ++j)
+	{
+	    double u = (double)(j + 1) / (WIDTH / TILE_SIZE);
+	    double v = (double)(i + 1) / (HEIGHT / TILE_SIZE);
+	    pixman_color_t black = { 0, 0, 0, 0xffff };
+	    pixman_color_t white = {
+		v * 0xffff,
+		u * 0xffff,
+		(1 - (double)u) * 0xffff,
+		0xffff };
+	    pixman_color_t *c;
+	    pixman_image_t *fill;
+
+	    if ((j & 1) != (i & 1))
+		c = &black;
+	    else
+		c = &white;
+
+	    fill = pixman_image_create_solid_fill (c);
+
+	    pixman_image_composite (PIXMAN_OP_SRC, fill, NULL, checkerboard,
+				    0, 0, 0, 0, j * TILE_SIZE, i * TILE_SIZE,
+				    TILE_SIZE, TILE_SIZE);
+	}
+    }
+
+    pixman_image_set_transform (checkerboard, &trans);
+    pixman_image_set_filter (checkerboard, PIXMAN_FILTER_BEST, NULL, 0);
+    pixman_image_set_repeat (checkerboard, PIXMAN_REPEAT_NONE);
+
+    pixman_image_composite (PIXMAN_OP_SRC,
+			    checkerboard, NULL, destination,
+			    0, 0, 0, 0, 0, 0,
+			    WIDTH, HEIGHT);
+
+    show_image (destination);
+
+    return 0;
+}
diff --git a/src/pixman/demos/clip-in.c b/src/pixman/demos/clip-in.c
new file mode 100644
index 0000000..5157981
--- /dev/null
+++ b/src/pixman/demos/clip-in.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+/* This test demonstrates that clipping is done totally different depending
+ * on whether the source is transformed or not.
+ */
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define SMALL 25
+    
+    uint32_t *sbits = malloc (SMALL * SMALL * 4);
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    pixman_transform_t trans = {
+    {
+	{ pixman_double_to_fixed (1.0), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.1), },
+	{ pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.1), },
+	{ pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) }
+    } };
+	  
+    pixman_image_t *src_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, SMALL, SMALL, sbits, 4 * SMALL);
+    pixman_image_t *dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, 4 * WIDTH);
+
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    memset (sbits, 0x00, SMALL * SMALL * 4);
+
+    pixman_image_composite (PIXMAN_OP_IN,
+			    src_img, NULL, dest_img,
+			    0, 0, 0, 0, SMALL, SMALL, 200, 200);
+    
+    pixman_image_set_transform (src_img, &trans);
+    
+    pixman_image_composite (PIXMAN_OP_IN,
+			    src_img, NULL, dest_img,
+			    0, 0, 0, 0, SMALL * 2, SMALL * 2, 200, 200);
+    
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (bits);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/clip-test.c b/src/pixman/demos/clip-test.c
new file mode 100644
index 0000000..aa0df44
--- /dev/null
+++ b/src/pixman/demos/clip-test.c
@@ -0,0 +1,97 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+#define WIDTH 200
+#define HEIGHT 200
+    
+static pixman_image_t *
+create_solid_bits (uint32_t pixel)
+{
+    uint32_t *pixels = malloc (WIDTH * HEIGHT * 4);
+    int i;
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	pixels[i] = pixel;
+
+    return pixman_image_create_bits (PIXMAN_a8r8g8b8,
+				     WIDTH, HEIGHT, 
+				     pixels,
+				     WIDTH * 4);
+}
+
+int
+main (int argc, char **argv)
+{
+    pixman_image_t *gradient_img;
+    pixman_image_t *src_img, *dst_img;
+    pixman_gradient_stop_t stops[2] =
+	{
+	    { pixman_int_to_fixed (0), { 0xffff, 0x0000, 0x0000, 0xffff } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0xffff, 0x0000, 0xffff } }
+	};
+#if 0
+    pixman_point_fixed_t p1 = { 0, 0 };
+    pixman_point_fixed_t p2 = { pixman_int_to_fixed (WIDTH),
+				pixman_int_to_fixed (HEIGHT) };
+#endif
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+    pixman_region32_t clip_region;
+    pixman_transform_t trans = {
+	{ { pixman_double_to_fixed (1.3), pixman_double_to_fixed (0), pixman_double_to_fixed (-0.5), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (-0.5), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (1.0) } 
+	}
+    };
+    
+    src_img = create_solid_bits (0xff0000ff);
+    
+    c_inner.x = pixman_double_to_fixed (100.0);
+    c_inner.y = pixman_double_to_fixed (100.0);
+    c_outer.x = pixman_double_to_fixed (100.0);
+    c_outer.y = pixman_double_to_fixed (100.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (100.0);
+    
+    gradient_img = pixman_image_create_radial_gradient (&c_inner, &c_outer,
+							r_inner, r_outer,
+							stops, 2);
+
+#if 0
+    gradient_img = pixman_image_create_linear_gradient  (&p1, &p2,
+							 stops, 2);
+    
+#endif
+
+    pixman_image_composite (PIXMAN_OP_OVER, gradient_img, NULL, src_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    
+    pixman_region32_init_rect (&clip_region, 50, 0, 100, 200);
+    pixman_image_set_clip_region32 (src_img, &clip_region);
+    pixman_image_set_source_clipping (src_img, TRUE);
+    pixman_image_set_has_client_clip (src_img, TRUE);
+    pixman_image_set_transform (src_img, &trans);
+    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+    
+    dst_img = create_solid_bits (0xffff0000);
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dst_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    
+
+#if 0
+    printf ("0, 0: %x\n", src[0]);
+    printf ("10, 10: %x\n", src[10 * 10 + 10]);
+    printf ("w, h: %x\n", src[(HEIGHT - 1) * 100 + (WIDTH - 1)]);
+#endif
+    
+    show_image (dst_img);
+    
+    pixman_image_unref (gradient_img);
+    pixman_image_unref (src_img);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/composite-test.c b/src/pixman/demos/composite-test.c
new file mode 100644
index 0000000..8213e2f
--- /dev/null
+++ b/src/pixman/demos/composite-test.c
@@ -0,0 +1,186 @@
+#include <gtk/gtk.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+#include "parrot.c"
+
+#define WIDTH	80
+#define HEIGHT	80
+
+typedef struct {
+    const char *name;
+    pixman_op_t op;
+} operator_t;
+
+static const operator_t operators[] = {
+    { "CLEAR",		PIXMAN_OP_CLEAR },
+    { "SRC",		PIXMAN_OP_SRC },
+    { "DST",		PIXMAN_OP_DST },
+    { "OVER",		PIXMAN_OP_OVER },
+    { "OVER_REVERSE",	PIXMAN_OP_OVER_REVERSE },
+    { "IN",		PIXMAN_OP_IN },
+    { "IN_REVERSE",	PIXMAN_OP_IN_REVERSE },
+    { "OUT",		PIXMAN_OP_OUT },
+    { "OUT_REVERSE",	PIXMAN_OP_OUT_REVERSE },
+    { "ATOP",		PIXMAN_OP_ATOP },
+    { "ATOP_REVERSE",	PIXMAN_OP_ATOP_REVERSE },
+    { "XOR",		PIXMAN_OP_XOR },
+    { "ADD",		PIXMAN_OP_ADD },
+    { "SATURATE",	PIXMAN_OP_SATURATE },
+
+    { "MULTIPLY",	PIXMAN_OP_MULTIPLY },
+    { "SCREEN",		PIXMAN_OP_SCREEN },
+    { "OVERLAY",	PIXMAN_OP_OVERLAY },
+    { "DARKEN",		PIXMAN_OP_DARKEN },
+    { "LIGHTEN",	PIXMAN_OP_LIGHTEN },
+    { "COLOR_DODGE",	PIXMAN_OP_COLOR_DODGE },
+    { "COLOR_BURN",	PIXMAN_OP_COLOR_BURN },
+    { "HARD_LIGHT",	PIXMAN_OP_HARD_LIGHT },
+    { "SOFT_LIGHT",	PIXMAN_OP_SOFT_LIGHT },
+    { "DIFFERENCE",	PIXMAN_OP_DIFFERENCE },
+    { "EXCLUSION",	PIXMAN_OP_EXCLUSION },
+    { "HSL_HUE",	PIXMAN_OP_HSL_HUE },
+    { "HSL_SATURATION",	PIXMAN_OP_HSL_SATURATION },
+    { "HSL_COLOR",	PIXMAN_OP_HSL_COLOR },
+    { "HSL_LUMINOSITY",	PIXMAN_OP_HSL_LUMINOSITY },
+};
+
+static uint32_t
+reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+	return *(uint8_t *)src;
+    case 2:
+	return *(uint16_t *)src;
+    case 4:
+	return *(uint32_t *)src;
+    default:
+	g_assert_not_reached();
+    }
+}
+
+static void
+writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+	*(uint8_t *)src = value;
+	break;
+
+    case 2:
+	*(uint16_t *)src = value;
+	break;
+
+    case 4:
+	*(uint32_t *)src = value;
+	break;
+
+    default:
+        break;
+    }
+}
+
+int
+main (int argc, char **argv)
+{
+#define d2f pixman_double_to_fixed
+
+    GtkWidget *window, *swindow;
+    GtkWidget *table;
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *gradient, *parrot;
+    pixman_image_t *dest_img;
+    pixman_point_fixed_t p1 = { -10 << 16, 10 << 16 };
+    pixman_point_fixed_t p2 = { (WIDTH + 10) << 16, (HEIGHT - 10) << 16 };
+    uint16_t alpha = 0xdddd;
+    pixman_gradient_stop_t stops[6] =
+    {
+	{ d2f (0.0), { 0xf2f2, 0x8787, 0x7d7d, alpha } },
+	{ d2f (0.22), { 0xf3f3, 0xeaea, 0x8383, alpha } },
+	{ d2f (0.42), { 0x6b6b, 0xc0c0, 0x7777, alpha } },
+	{ d2f (0.57), { 0x4b4b, 0xc9c9, 0xf5f5, alpha } },
+	{ d2f (0.75), { 0x6a6a, 0x7f7f, 0xbebe, alpha } },
+	{ d2f (1.0), { 0xeded, 0x8282, 0xb0b0, alpha } },
+    };
+
+    int i;
+
+    gtk_init (&argc, &argv);
+
+    window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+
+    gtk_window_set_default_size (GTK_WINDOW (window), 800, 600);
+
+    g_signal_connect (window, "delete-event",
+		      G_CALLBACK (gtk_main_quit),
+		      NULL);
+    table = gtk_table_new (G_N_ELEMENTS (operators) / 6, 6, TRUE);
+
+    gradient = pixman_image_create_linear_gradient (&p1, &p2, stops, G_N_ELEMENTS (stops));
+    parrot = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, (uint32_t *)parrot_bits, WIDTH * 4);
+
+    pixman_image_set_repeat (gradient, PIXMAN_REPEAT_PAD);
+
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT,
+					 NULL,
+					 WIDTH * 4);
+    pixman_image_set_accessors (dest_img, reader, writer);
+
+    for (i = 0; i < G_N_ELEMENTS (operators); ++i)
+    {
+	GtkWidget *image;
+	GdkPixbuf *pixbuf;
+	GtkWidget *vbox;
+	GtkWidget *label;
+
+	vbox = gtk_vbox_new (FALSE, 0);
+
+	label = gtk_label_new (operators[i].name);
+	gtk_box_pack_start (GTK_BOX (vbox), label, FALSE, FALSE, 6);
+	gtk_widget_show (label);
+
+	pixman_image_composite (PIXMAN_OP_SRC, gradient, NULL, dest_img,
+				0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+	pixman_image_composite (operators[i].op, parrot, NULL, dest_img,
+				0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+	pixbuf = pixbuf_from_argb32 (pixman_image_get_data (dest_img),
+				     WIDTH, HEIGHT, WIDTH * 4);
+	image = gtk_image_new_from_pixbuf (pixbuf);
+	gtk_box_pack_start (GTK_BOX (vbox), image, FALSE, FALSE, 0);
+	gtk_widget_show (image);
+
+	gtk_table_attach_defaults (GTK_TABLE (table), vbox,
+				   i % 6, (i % 6) + 1, i / 6, (i / 6) + 1);
+	gtk_widget_show (vbox);
+
+	g_object_unref (pixbuf);
+    }
+
+    pixman_image_unref (gradient);
+    free (src);
+    pixman_image_unref (dest_img);
+    free (dest);
+
+    swindow = gtk_scrolled_window_new (NULL, NULL);
+    gtk_scrolled_window_set_policy (GTK_SCROLLED_WINDOW (swindow),
+				    GTK_POLICY_AUTOMATIC,
+				    GTK_POLICY_AUTOMATIC);
+
+    gtk_scrolled_window_add_with_viewport (GTK_SCROLLED_WINDOW (swindow), table);
+    gtk_widget_show (table);
+
+    gtk_container_add (GTK_CONTAINER (window), swindow);
+    gtk_widget_show (swindow);
+
+    gtk_widget_show (window);
+
+    gtk_main ();
+
+    return 0;
+}
diff --git a/src/pixman/demos/conical-test.c b/src/pixman/demos/conical-test.c
new file mode 100644
index 0000000..6b32430
--- /dev/null
+++ b/src/pixman/demos/conical-test.c
@@ -0,0 +1,100 @@
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+#define SIZE 128
+#define GRADIENTS_PER_ROW 7
+#define NUM_ROWS ((NUM_GRADIENTS + GRADIENTS_PER_ROW - 1) / GRADIENTS_PER_ROW)
+#define WIDTH (SIZE * GRADIENTS_PER_ROW)
+#define HEIGHT (SIZE * NUM_ROWS)
+#define NUM_GRADIENTS 35
+
+#define double_to_color(x)					\
+    (((uint32_t) ((x)*65536)) - (((uint32_t) ((x)*65536)) >> 16))
+
+#define PIXMAN_STOP(offset,r,g,b,a)		\
+    { pixman_double_to_fixed (offset),		\
+	{					\
+	    double_to_color (r),		\
+		double_to_color (g),		\
+		double_to_color (b),		\
+		double_to_color (a)		\
+	}					\
+    }
+
+
+static const pixman_gradient_stop_t stops[] = {
+    PIXMAN_STOP (0.25,       1, 0, 0, 0.7),
+    PIXMAN_STOP (0.5,        1, 1, 0, 0.7),
+    PIXMAN_STOP (0.75,       0, 1, 0, 0.7),
+    PIXMAN_STOP (1.0,        0, 0, 1, 0.7)
+};
+
+#define NUM_STOPS (sizeof (stops) / sizeof (stops[0]))
+
+static pixman_image_t *
+create_conical (int index)
+{
+    pixman_point_fixed_t c;
+    double angle;
+
+    c.x = pixman_double_to_fixed (0);
+    c.y = pixman_double_to_fixed (0);
+
+    angle = (0.5 / NUM_GRADIENTS + index / (double)NUM_GRADIENTS) * 720 - 180;
+
+    return pixman_image_create_conical_gradient (
+	&c, pixman_double_to_fixed (angle), stops, NUM_STOPS);
+}
+
+int
+main (int argc, char **argv)
+{
+    pixman_transform_t transform;
+    pixman_image_t *src_img, *dest_img;
+    int i;
+
+    enable_divbyzero_exceptions ();
+
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT,
+					 NULL, 0);
+ 
+    draw_checkerboard (dest_img, 25, 0xffaaaaaa, 0xff888888);
+
+    pixman_transform_init_identity (&transform);
+
+    pixman_transform_translate (NULL, &transform,
+				pixman_double_to_fixed (0.5),
+				pixman_double_to_fixed (0.5));
+
+    pixman_transform_scale (NULL, &transform,
+			    pixman_double_to_fixed (SIZE),
+			    pixman_double_to_fixed (SIZE));
+    pixman_transform_translate (NULL, &transform,
+				pixman_double_to_fixed (0.5),
+				pixman_double_to_fixed (0.5));
+
+    for (i = 0; i < NUM_GRADIENTS; i++)
+    {
+	int column = i % GRADIENTS_PER_ROW;
+	int row = i / GRADIENTS_PER_ROW;
+
+	src_img = create_conical (i); 
+	pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+   
+	pixman_image_set_transform (src_img, &transform);
+	
+	pixman_image_composite32 (
+	    PIXMAN_OP_OVER, src_img, NULL,dest_img,
+	    0, 0, 0, 0, column * SIZE, row * SIZE,
+	    SIZE, SIZE);
+	
+	pixman_image_unref (src_img);
+    }
+
+    show_image (dest_img);
+
+    pixman_image_unref (dest_img);
+
+    return 0;
+}
diff --git a/src/pixman/demos/convolution-test.c b/src/pixman/demos/convolution-test.c
new file mode 100644
index 0000000..da284af
--- /dev/null
+++ b/src/pixman/demos/convolution-test.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define d2f pixman_double_to_fixed
+    
+    uint32_t *src = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mask = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_fixed_t convolution[] =
+    {
+	d2f (3), d2f (3),
+	d2f (0.5), d2f (0.5), d2f (0.5),
+	d2f (0.5), d2f (0.5), d2f (0.5),
+	d2f (0.5), d2f (0.5), d2f (0.5),
+    };
+    pixman_image_t *simg, *mimg, *dimg;
+
+    int i;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+    {
+	src[i] = 0x7f007f00;
+	mask[i] = (i % 256) * 0x01000000;
+	dest[i] = 0;
+    }
+
+    simg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src, WIDTH * 4);
+    mimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, mask, WIDTH * 4);
+    dimg = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4);
+
+    pixman_image_set_filter (mimg, PIXMAN_FILTER_CONVOLUTION,
+			     convolution, 11);
+
+    pixman_image_composite (PIXMAN_OP_OVER, simg, mimg, dimg, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    show_image (dimg);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/gradient-test.c b/src/pixman/demos/gradient-test.c
new file mode 100644
index 0000000..e68f69a
--- /dev/null
+++ b/src/pixman/demos/gradient-test.c
@@ -0,0 +1,92 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    int i;
+    pixman_gradient_stop_t stops[2] =
+	{
+	    { pixman_int_to_fixed (0), { 0x0000, 0x0000, 0xffff, 0xffff } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0xffff } }
+	};
+    pixman_point_fixed_t p1 = { pixman_double_to_fixed (50), 0 };
+    pixman_point_fixed_t p2 = { pixman_double_to_fixed (200), 0 };
+#if 0
+    pixman_transform_t trans = {
+	{ { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+	  { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+	}
+    };
+#else
+    pixman_transform_t trans = {
+	{ { pixman_fixed_1, 0, 0 },
+	  { 0, pixman_fixed_1, 0 },
+	  { 0, 0, pixman_fixed_1 } }
+    };
+#endif
+
+#if 0
+    pixman_point_fixed_t c_inner;
+    pixman_point_fixed_t c_outer;
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+#endif
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	dest[i] = 0xff00ff00;
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT, 
+					 dest,
+					 WIDTH * 4);
+
+#if 0
+    c_inner.x = pixman_double_to_fixed (50.0);
+    c_inner.y = pixman_double_to_fixed (50.0);
+    c_outer.x = pixman_double_to_fixed (50.0);
+    c_outer.y = pixman_double_to_fixed (50.0);
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+    
+    src_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+						    stops, 2);
+#endif
+#if 0
+    src_img = pixman_image_create_conical_gradient (&c_inner, r_inner,
+						    stops, 2);
+    src_img = pixman_image_create_linear_gradient (&c_inner, &c_outer,
+						   r_inner, r_outer,
+						   stops, 2);
+#endif
+    
+    src_img = pixman_image_create_linear_gradient  (&p1, &p2,
+						    stops, 2);
+    
+    pixman_image_set_transform (src_img, &trans);
+    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NONE);
+    
+    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+			    0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+    
+    printf ("0, 0: %x\n", dest[0]);
+    printf ("10, 10: %x\n", dest[10 * 10 + 10]);
+    printf ("w, h: %x\n", dest[(HEIGHT - 1) * 100 + (WIDTH - 1)]);
+    
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (dest);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/gtk-utils.c b/src/pixman/demos/gtk-utils.c
new file mode 100644
index 0000000..32d4aec
--- /dev/null
+++ b/src/pixman/demos/gtk-utils.c
@@ -0,0 +1,179 @@
+#include <gtk/gtk.h>
+#include <config.h>
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+pixman_image_t *
+pixman_image_from_file (const char *filename, pixman_format_code_t format)
+{
+    GdkPixbuf *pixbuf;
+    pixman_image_t *image;
+    int width, height;
+    uint32_t *data, *d;
+    uint8_t *gdk_data;
+    int n_channels;
+    int j, i;
+    int stride;
+
+    if (!(pixbuf = gdk_pixbuf_new_from_file (filename, NULL)))
+	return NULL;
+
+    image = NULL;
+
+    width = gdk_pixbuf_get_width (pixbuf);
+    height = gdk_pixbuf_get_height (pixbuf);
+    n_channels = gdk_pixbuf_get_n_channels (pixbuf);
+    gdk_data = gdk_pixbuf_get_pixels (pixbuf);
+    stride = gdk_pixbuf_get_rowstride (pixbuf);
+
+    if (!(data = malloc (width * height * sizeof (uint32_t))))
+	goto out;
+
+    d = data;
+    for (j = 0; j < height; ++j)
+    {
+	uint8_t *gdk_line = gdk_data;
+
+	for (i = 0; i < width; ++i)
+	{
+	    int r, g, b, a;
+	    uint32_t pixel;
+
+	    r = gdk_line[0];
+	    g = gdk_line[1];
+	    b = gdk_line[2];
+
+	    if (n_channels == 4)
+		a = gdk_line[3];
+	    else
+		a = 0xff;
+
+	    r = (r * a + 127) / 255;
+	    g = (g * a + 127) / 255;
+	    b = (b * a + 127) / 255;
+
+	    pixel = (a << 24) | (r << 16) | (g << 8) | b;
+
+	    *d++ = pixel;
+	    gdk_line += n_channels;
+	}
+
+	gdk_data += stride;
+    }
+
+    image = pixman_image_create_bits (
+	format, width, height, data, width * 4);
+
+out:
+    g_object_unref (pixbuf);
+    return image;
+}
+
+GdkPixbuf *
+pixbuf_from_argb32 (uint32_t *bits,
+		    int width,
+		    int height,
+		    int stride)
+{
+    GdkPixbuf *pixbuf = gdk_pixbuf_new (GDK_COLORSPACE_RGB, TRUE,
+					8, width, height);
+    int p_stride = gdk_pixbuf_get_rowstride (pixbuf);
+    guint32 *p_bits = (guint32 *)gdk_pixbuf_get_pixels (pixbuf);
+    int i;
+
+    for (i = 0; i < height; ++i)
+    {
+	uint32_t *src_row = &bits[i * (stride / 4)];
+	uint32_t *dst_row = p_bits + i * (p_stride / 4);
+
+	a8r8g8b8_to_rgba_np (dst_row, src_row, width);
+    }
+
+    return pixbuf;
+}
+
+static gboolean
+on_expose (GtkWidget *widget, GdkEventExpose *expose, gpointer data)
+{
+    pixman_image_t *pimage = data;
+    int width = pixman_image_get_width (pimage);
+    int height = pixman_image_get_height (pimage);
+    int stride = pixman_image_get_stride (pimage);
+    cairo_surface_t *cimage;
+    cairo_format_t format;
+    cairo_t *cr;
+
+    if (pixman_image_get_format (pimage) == PIXMAN_x8r8g8b8)
+	format = CAIRO_FORMAT_RGB24;
+    else
+	format = CAIRO_FORMAT_ARGB32;
+
+    cimage = cairo_image_surface_create_for_data (
+	(uint8_t *)pixman_image_get_data (pimage),
+	format, width, height, stride);
+    
+    cr = gdk_cairo_create (widget->window);
+
+    cairo_rectangle (cr, 0, 0, width, height);
+    cairo_set_source_surface (cr, cimage, 0, 0);
+    cairo_fill (cr);
+
+    cairo_destroy (cr);
+    cairo_surface_destroy (cimage);
+    
+    return TRUE;
+}
+
+void
+show_image (pixman_image_t *image)
+{
+    GtkWidget *window;
+    int width, height;
+    int argc;
+    char **argv;
+    char *arg0 = g_strdup ("pixman-test-program");
+    pixman_format_code_t format;
+    pixman_image_t *copy;
+
+    argc = 1;
+    argv = (char **)&arg0;
+
+    gtk_init (&argc, &argv);
+    
+    window = gtk_window_new (GTK_WINDOW_TOPLEVEL);
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+
+    gtk_window_set_default_size (GTK_WINDOW (window), width, height);
+
+    format = pixman_image_get_format (image);
+
+    /* We always display the image as if it contains sRGB data. That
+     * means that no conversion should take place when the image
+     * has the a8r8g8b8_sRGB format.
+     */
+    switch (format)
+    {
+    case PIXMAN_a8r8g8b8_sRGB:
+    case PIXMAN_a8r8g8b8:
+    case PIXMAN_x8r8g8b8:
+	copy = pixman_image_ref (image);
+	break;
+
+    default:
+	copy = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 width, height, NULL, -1);
+	pixman_image_composite32 (PIXMAN_OP_SRC,
+				  image, NULL, copy,
+				  0, 0, 0, 0, 0, 0,
+				  width, height);
+	break;
+    }
+
+    g_signal_connect (window, "expose_event", G_CALLBACK (on_expose), copy);
+    g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
+    
+    gtk_widget_show (window);
+    
+    gtk_main ();
+}
diff --git a/src/pixman/demos/gtk-utils.h b/src/pixman/demos/gtk-utils.h
new file mode 100644
index 0000000..36be4de
--- /dev/null
+++ b/src/pixman/demos/gtk-utils.h
@@ -0,0 +1,15 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <glib.h>
+#include <gtk/gtk.h>
+#include "pixman.h"
+
+void show_image (pixman_image_t *image);
+
+pixman_image_t *
+pixman_image_from_file (const char *filename, pixman_format_code_t format);
+
+GdkPixbuf *pixbuf_from_argb32 (uint32_t *bits,
+                               int width,
+                               int height,
+                               int stride);
diff --git a/src/pixman/demos/linear-gradient.c b/src/pixman/demos/linear-gradient.c
new file mode 100644
index 0000000..46433a6
--- /dev/null
+++ b/src/pixman/demos/linear-gradient.c
@@ -0,0 +1,50 @@
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+#define WIDTH 1024
+#define HEIGHT 640
+
+int
+main (int argc, char **argv)
+{
+    pixman_image_t *src_img, *dest_img;
+    pixman_gradient_stop_t stops[] = {
+        { 0x00000, { 0x0000, 0x0000, 0x4444, 0xdddd } },
+        { 0x10000, { 0xeeee, 0xeeee, 0x8888, 0xdddd } },
+#if 0
+        /* These colors make it very obvious that dithering
+         * is useful even for 8-bit gradients
+         */
+	{ 0x00000, { 0x6666, 0x3333, 0x3333, 0xffff } },
+	{ 0x10000, { 0x3333, 0x6666, 0x6666, 0xffff } },
+#endif
+    };
+    pixman_point_fixed_t p1, p2;
+
+    enable_divbyzero_exceptions ();
+
+    dest_img = pixman_image_create_bits (PIXMAN_x8r8g8b8,
+					 WIDTH, HEIGHT,
+					 NULL, 0);
+
+    p1.x = p1.y = 0x0000;
+    p2.x = WIDTH << 16;
+    p2.y = HEIGHT << 16;
+    
+    src_img = pixman_image_create_linear_gradient (&p1, &p2, stops, ARRAY_LENGTH (stops));
+
+    pixman_image_composite32 (PIXMAN_OP_OVER,
+			      src_img,
+			      NULL,
+			      dest_img,
+			      0, 0,
+			      0, 0,
+			      0, 0,
+			      WIDTH, HEIGHT);
+
+    show_image (dest_img);
+
+    pixman_image_unref (dest_img);
+
+    return 0;
+}
diff --git a/src/pixman/demos/parrot.c b/src/pixman/demos/parrot.c
new file mode 100644
index 0000000..60fd270
--- /dev/null
+++ b/src/pixman/demos/parrot.c
@@ -0,0 +1,1079 @@
+/* This parrot is a finger painting by Rubens LP:
+ *
+ *     http://www.flickr.com/photos/dorubens/4030604504/in/set-72157622586088192/
+ *
+ * Used here under Creative Commons Attribution. The artist's web site:
+ *
+ *     http://www.rubenslp.com.br/
+ *
+ */
+static const uint32_t parrot_bits[] =
+{
+    0x716f7070, 0x1c1b1b1b, 0x110f1010, 0x16151415, 0x14121313, 0x2c292b2b,
+    0x403e3f3f, 0x19181818, 0x06050605, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x02010101, 0x08070707,
+    0x05040404, 0x0a060908, 0x27262426, 0xb3b0b1b2, 0x99979897, 0x2b2a2929,
+    0x100f0f0f, 0x0f0d0e0e, 0x0e0e0c0d, 0x0d0d0b0c, 0x12111111, 0x10100e0f,
+    0x0e0e0c0d, 0x0e0e0c0d, 0x12101211, 0x13121212, 0x17151516, 0x100f0e0f,
+    0x15141414, 0x423f4042, 0x3b393a3a, 0x13121212, 0x16151515, 0x2b282b29,
+    0x13121112, 0x100f0f0f, 0x0f0d0f0e, 0x08070807, 0x0d0c0c0c, 0x0a090a09,
+    0x0e0e0c0d, 0x0c0c0a0b, 0x10100f0f, 0x0f0e0e0e, 0x07060706, 0x0d0c0d0c,
+    0x0e0d0e0d, 0x05040504, 0x08070807, 0x0c0b0c0b, 0x0d0c0d0c, 0x05040504,
+    0x110f1110, 0x08070707, 0x04030303, 0x09080808, 0x06050605, 0x01000000,
+    0x08070707, 0x06050505, 0x05040504, 0x100e100f, 0x0b0a0b0a, 0x01000100,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x04030403,
+    0x03020302, 0x0b0a0b0a, 0x14131313, 0x0e0d0d0d, 0x0e0d0e0d, 0x231f2222,
+    0x4d4b4b4d, 0xa7a5a6a6, 0x5b595a5a, 0x07060606, 0x00000000, 0x00000000,
+    0x01000100, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x02010201,
+    0x05040404, 0x07050706, 0x04020303, 0x403e3e3f, 0xb6b3b5b5, 0x84828283,
+    0x1a191819, 0x0e0d0d0d, 0x0d0c0b0c, 0x0f0f0d0e, 0x0e0d0d0d, 0x0f0f0e0e,
+    0x0e0e0c0d, 0x0c0b0a0b, 0x0b0a090a, 0x11111010, 0x100f0f0f, 0x100f0f0f,
+    0x1b19191a, 0x1f1e1e1e, 0x46434544, 0x3a37383a, 0x1c1b1a1b, 0x1e1d1d1d,
+    0x29272828, 0x19171818, 0x0e0d0d0d, 0x0f0e0e0e, 0x06050505, 0x0c0b0b0b,
+    0x100e100f, 0x09080908, 0x0c0c0a0b, 0x0f0f0e0e, 0x0c0c0a0b, 0x05040404,
+    0x08070807, 0x0c0b0c0b, 0x05040504, 0x06050605, 0x100e100f, 0x09080908,
+    0x09080908, 0x12101211, 0x09080908, 0x03020202, 0x08070707, 0x01000100,
+    0x04030403, 0x07060606, 0x08070707, 0x08070707, 0x0f0e0f0e, 0x0b0a0b0a,
+    0x03020302, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x03020202,
+    0x09080908, 0x05040504, 0x00000000, 0x00000000, 0x0b0a0b0a, 0x0f0e0f0e,
+    0x1a191a19, 0x77757576, 0xc3c1c2c2, 0x75737374, 0x1f1e1e1f, 0x06050505,
+    0x07030605, 0x00000000, 0x03020302, 0x00000000, 0x00000000, 0x00000000,
+    0x03020202, 0x04030403, 0x04030303, 0x02010101, 0x5b5a5959, 0xafacaeae,
+    0x5a575859, 0x1b19191a, 0x0e0d0d0d, 0x100e100f, 0x100f0f0f, 0x11101010,
+    0x12121111, 0x0c0c0a0b, 0x09090708, 0x0d0d0b0c, 0x0f0e0d0e, 0x0d0d0b0c,
+    0x14131313, 0x1c1b1b1b, 0x322f3132, 0x514f504f, 0x2d2b2b2b, 0x2e2b2c2e,
+    0x21202020, 0x201f1f1f, 0x15141414, 0x12101211, 0x0e0d0d0d, 0x08070807,
+    0x0b0a0a0a, 0x100e0f0f, 0x07060706, 0x0a090a09, 0x0f0f0d0e, 0x0c0c0a0b,
+    0x09090708, 0x0d0c0c0c, 0x0b0a0b0a, 0x06050605, 0x0b0a0b0a, 0x0c0b0c0b,
+    0x08070807, 0x07060706, 0x0f0e0f0e, 0x0a090a09, 0x01000000, 0x05040504,
+    0x03020202, 0x01000000, 0x08070707, 0x05040504, 0x09080908, 0x0d0c0d0c,
+    0x07060606, 0x04030403, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x02010201, 0x06050505, 0x08070807, 0x01000100, 0x00000000, 0x00000000,
+    0x0f0d0e0e, 0x2c2a2a2b, 0x9c9a9b9b, 0xcac7cac9, 0x4d4c4b4d, 0x1b19181a,
+    0x0a090909, 0x00000000, 0x02010201, 0x05040504, 0x00000000, 0x00000000,
+    0x00000000, 0x02010101, 0x06040605, 0x03020302, 0x100d0e10, 0x615f5f60,
+    0x9d9a9c9c, 0x32313131, 0x14131313, 0x0d0c0c0c, 0x0f0e0e0e, 0x0e0d0d0d,
+    0x100f0f0f, 0x11101010, 0x08070807, 0x06050405, 0x0f0e0e0e, 0x100e0f0f,
+    0x0d0d0c0c, 0x2a282929, 0x3d3c3b3c, 0x38373637, 0x4f4d4d4f, 0x19181718,
+    0x27262626, 0x14131313, 0x29272828, 0x2e2b2c2d, 0x201d1e20, 0x15121414,
+    0x04030403, 0x07060606, 0x0c0b0c0b, 0x0a090a09, 0x08070707, 0x0e0d0e0d,
+    0x0b0a0a0a, 0x07060606, 0x0c0b0b0b, 0x0a090909, 0x04030403, 0x0a090909,
+    0x0e0d0e0d, 0x0a090a09, 0x09080908, 0x0e0d0e0d, 0x07060706, 0x08070807,
+    0x08070807, 0x00000000, 0x01000000, 0x07060606, 0x04030403, 0x08070807,
+    0x0e0d0e0d, 0x07060706, 0x06050605, 0x01000100, 0x00000000, 0x00000000,
+    0x00000000, 0x02010201, 0x07060706, 0x01000100, 0x00000000, 0x00000000,
+    0x01000100, 0x01000100, 0x322e3131, 0xa9a8a8a8, 0xb9b8b8b8, 0x39383639,
+    0x1d1b1b1c, 0x0c0b0b0b, 0x04030303, 0x05040404, 0x07060706, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x06030605, 0x09060807, 0x0e0d0c0d,
+    0x605e5e5f, 0x99959898, 0x2e2c2c2e, 0x13121212, 0x0d0d0b0c, 0x11111010,
+    0x12121111, 0x13131212, 0x0f0e0e0e, 0x09080908, 0x03020302, 0x0c0b0b0b,
+    0x0e0d0d0d, 0x11101010, 0x38363737, 0x514f4f50, 0x34333134, 0x46434546,
+    0x24222124, 0x29262827, 0x04030303, 0x05040404, 0x14131313, 0x15151414,
+    0x100f100f, 0x07060706, 0x07060606, 0x0d0c0d0c, 0x0a090909, 0x07060706,
+    0x0f0e0f0e, 0x0c0b0c0b, 0x01000100, 0x0c0b0c0b, 0x0a090a09, 0x01000100,
+    0x08070707, 0x12101211, 0x0b0a0b0a, 0x06050605, 0x0f0e0f0e, 0x07060706,
+    0x04030403, 0x06050605, 0x02010201, 0x00000000, 0x05040504, 0x03020302,
+    0x06050605, 0x0d0c0d0c, 0x08070707, 0x07060706, 0x100e100f, 0x05040504,
+    0x01000100, 0x00000000, 0x02010201, 0x07060606, 0x03020202, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x06050605, 0xb8b6b6b7, 0xa4a2a3a3,
+    0x2c2b2b2b, 0x1c191a1b, 0x0e0c0d0d, 0x08070707, 0x35323433, 0x1a191919,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x04020403,
+    0x1b19191a, 0x68666667, 0x7d7a7c7c, 0x23212023, 0x1c191a1c, 0x0f0e0d0e,
+    0x11111010, 0x11111010, 0x10100f0f, 0x0c0b0b0b, 0x09080908, 0x04030403,
+    0x0e0d0e0d, 0x0e0d0d0d, 0x18171717, 0x52505150, 0x5d5a5b5c, 0x3b3a383a,
+    0x3d3a3b3d, 0x24212224, 0x29252729, 0x06050505, 0x04030303, 0x04030403,
+    0x06050505, 0x0c0b0b0b, 0x09080908, 0x04030303, 0x0d0c0c0c, 0x06050605,
+    0x05040504, 0x0c0b0b0b, 0x08070807, 0x06050605, 0x09080908, 0x0c0b0c0b,
+    0x05040504, 0x0a090909, 0x0e0d0e0d, 0x0a090a09, 0x09080908, 0x0f0e0e0e,
+    0x09080908, 0x04030403, 0x09080908, 0x02010201, 0x00000000, 0x07060706,
+    0x05040504, 0x07060606, 0x0f0d0f0e, 0x06050605, 0x08070807, 0x11101010,
+    0x0c0b0b0b, 0x11101010, 0x09080808, 0x03020302, 0x05040404, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x7c7a7b7b, 0x1d1c1c1d, 0x100f0f0f, 0x110e1010, 0x07070607, 0x5b585a59,
+    0x3e3b3d3c, 0x05040404, 0x00000000, 0x01010001, 0x00000000, 0x00000000,
+    0x02010101, 0x1d1b1b1c, 0x615f6060, 0x54525253, 0x1e1c1b1e, 0x18151617,
+    0x1b18181b, 0x1a171819, 0x1b181a1a, 0x1b18191a, 0x100e0f0f, 0x03020302,
+    0x07060606, 0x0d0c0c0c, 0x120f1111, 0x27252626, 0x73727272, 0x706e6f6f,
+    0x524f5152, 0x2f2c2d2f, 0x1e1c1d1d, 0x1f1c1e1e, 0x09070808, 0x03020202,
+    0x04030303, 0x03020302, 0x0b0a0a0a, 0x08070807, 0x02010101, 0x0b0a0b0a,
+    0x0b0a0b0a, 0x04030303, 0x0d0c0d0c, 0x09080808, 0x05040504, 0x0b0a0b0a,
+    0x08070807, 0x02010201, 0x0c0b0c0b, 0x0c0b0b0b, 0x0a090a09, 0x08070807,
+    0x100e100f, 0x06050605, 0x04030403, 0x07060706, 0x02010201, 0x00000000,
+    0x06050605, 0x03020302, 0x09080908, 0x0d0c0d0c, 0x0d0c0c0c, 0x0a090909,
+    0x0d0c0c0c, 0x15131314, 0x1b19191a, 0x1d1b1b1c, 0x11101010, 0x02010201,
+    0x01000100, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x02010201, 0x1c1b1b1b, 0x100f0f0f, 0x07060606, 0x0b0b090b,
+    0x8f898e8c, 0x37353636, 0x01010001, 0x00000000, 0x00000000, 0x01000000,
+    0x00000000, 0x00000000, 0x14121313, 0x57545557, 0x43414142, 0x211e1f21,
+    0x18161517, 0x18161518, 0x1f1d1c1f, 0x1b191a1a, 0x13111212, 0x0c0b0b0b,
+    0x02010101, 0x08070807, 0x0f0e0e0e, 0x14121313, 0x403e3e40, 0x8b888a8a,
+    0x68666767, 0x4c4a4a4c, 0x28252628, 0x23202123, 0x16141615, 0x03020202,
+    0x06050605, 0x04030403, 0x04030403, 0x0d0c0c0c, 0x0c0b0c0b, 0x00000000,
+    0x0a090a09, 0x0b0a0b0a, 0x03020302, 0x09080908, 0x0c0b0b0b, 0x04030403,
+    0x0c0b0c0b, 0x0b0a0b0a, 0x01000100, 0x09080908, 0x0f0e0e0e, 0x09080908,
+    0x0c0b0b0b, 0x0b0a0909, 0x03020202, 0x06050605, 0x08070707, 0x04030303,
+    0x00000000, 0x06050605, 0x02010201, 0x0c0b0c0b, 0x0f0d0e0e, 0x05040504,
+    0x0c0c0a0b, 0x100f0f0f, 0x0f0f0d0e, 0x14121313, 0x18161717, 0x100e0f0f,
+    0x02010101, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x0b0a0a0a, 0x0d0c0c0c, 0x08070707,
+    0x201f1f1f, 0xa29ca19f, 0x27262626, 0x00000000, 0x09080808, 0x05040404,
+    0x02010101, 0x03010302, 0x03020302, 0x1b19191a, 0x35323335, 0x39373738,
+    0x1c1a1a1c, 0x19161718, 0x1c1a191c, 0x211f1f21, 0x201d1d20, 0x14121313,
+    0x09080808, 0x04030403, 0x09080908, 0x0b0a0a0a, 0x1d1b1b1c, 0x4f4c4d4e,
+    0x87838685, 0x4a494749, 0x32303031, 0x1b1a1a1a, 0x1a191919, 0x13121312,
+    0x03020302, 0x09080908, 0x0d0c0d0c, 0x0d0c0d0c, 0x100e0f0f, 0x09080908,
+    0x01000000, 0x09080808, 0x09080808, 0x02010201, 0x0a090a09, 0x09080908,
+    0x04030403, 0x0c0b0c0b, 0x07060706, 0x00000000, 0x08070807, 0x0a090909,
+    0x09080808, 0x08070707, 0x0c0b090a, 0x03020000, 0x04030101, 0x06050405,
+    0x03020202, 0x00000000, 0x05040504, 0x04030403, 0x07060606, 0x100e100f,
+    0x07060706, 0x09080808, 0x11111010, 0x0d0d0b0c, 0x0e0e0c0d, 0x100f0f0f,
+    0x0f0e0f0e, 0x03020302, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x08070707, 0x10100f0f,
+    0x0c0a0b0b, 0x41403f40, 0x89858887, 0x1e1c1d1d, 0x00000000, 0x110f1010,
+    0x37353636, 0x1f1e1e1e, 0x05040404, 0x06050505, 0x201d1f1f, 0x322f3031,
+    0x33303132, 0x18151618, 0x19161719, 0x1f1d1c1f, 0x1c1a1a1b, 0x1c1a1a1b,
+    0x17151516, 0x09080808, 0x04030403, 0x09080908, 0x0f0d0e0e, 0x1a191819,
+    0x66646465, 0x77747676, 0x2b292a29, 0x07050706, 0x07060606, 0x27242726,
+    0x25232424, 0x08070707, 0x09080808, 0x11101010, 0x12111111, 0x15131414,
+    0x0b0a0b0a, 0x04030403, 0x04030203, 0x06050304, 0x01000000, 0x06050505,
+    0x07060606, 0x07060606, 0x09080808, 0x09080808, 0x02010101, 0x07060606,
+    0x100e0f0f, 0x09080808, 0x0c0b090a, 0x0a090708, 0x05040203, 0x03020000,
+    0x05040203, 0x04030102, 0x03020000, 0x05040304, 0x03020202, 0x09080808,
+    0x0a090a09, 0x08070807, 0x0a090a09, 0x110f1110, 0x0d0d0b0c, 0x0e0e0c0d,
+    0x11110f10, 0x0c0b0b0b, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x04030303,
+    0x0d0c0c0c, 0x0a090909, 0x504d4e4f, 0x78757776, 0x1a191919, 0x00000000,
+    0x16151515, 0x31303030, 0x312f3030, 0x100f0f0f, 0x0b0a0b0a, 0x14121313,
+    0x2c292a2c, 0x26232426, 0x19171619, 0x1b171a1b, 0x1f1c1d1f, 0x201d1d20,
+    0x0d0c0c0c, 0x0a090909, 0x06050505, 0x05040504, 0x0c0b0b0b, 0x0b0a0a0a,
+    0x22212121, 0x817d7f7e, 0x59575857, 0x17161616, 0x04030303, 0x01000000,
+    0x07060706, 0x0c0b0b0b, 0x02010201, 0x04030403, 0x0b0a0a0a, 0x201e1e1f,
+    0x17151616, 0x0d0c0c0c, 0x02010201, 0x04030303, 0x04030102, 0x04030101,
+    0x05040202, 0x06050304, 0x05040203, 0x09080707, 0x05040303, 0x03020102,
+    0x07060405, 0x09080607, 0x09080506, 0x0a090708, 0x0c0b090a, 0x06050303,
+    0x04030101, 0x06050302, 0x03020000, 0x03020001, 0x04030102, 0x05040202,
+    0x06050304, 0x0a090808, 0x07060505, 0x08070707, 0x0a090909, 0x0a090909,
+    0x09080808, 0x12111111, 0x09080808, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x03020202,
+    0x03020302, 0x09080908, 0x0a090a09, 0x110f1110, 0x66646465, 0x100e0f0f,
+    0x05030504, 0x1f1d1e1f, 0x25242424, 0x22212121, 0x0f0e0e0e, 0x17151616,
+    0x0f0e0e0e, 0x1e1d1d1d, 0x211e1e21, 0x1a18171a, 0x17151417, 0x1d1a1a1d,
+    0x201d1e20, 0x19161719, 0x00000000, 0x00000000, 0x01000100, 0x15121414,
+    0x16141415, 0x32303031, 0x78747776, 0x2f2e2d2d, 0x09080808, 0x06030605,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x09080808, 0x18171617, 0x11101010, 0x02010201, 0x01000000, 0x05040303,
+    0x03020000, 0x05040202, 0x06050203, 0x04030102, 0x06050304, 0x04030001,
+    0x03020001, 0x09090605, 0x09090607, 0x09080608, 0x16131216, 0x0a0a0709,
+    0x06050305, 0x05040204, 0x07060406, 0x03020002, 0x03020001, 0x04030102,
+    0x02010000, 0x06050304, 0x06050304, 0x03020001, 0x09080607, 0x100f0c0d,
+    0x0e0d0a0b, 0x0d0c090a, 0x0d0c0b0b, 0x05040303, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x03020302, 0x0b0a0a0a, 0x09080908, 0x0e0d0e0d, 0x0f0e0f0e,
+    0x0f0c0d0e, 0x07060606, 0x18171717, 0x27242526, 0x1d1b1c1c, 0x0b0a0a0a,
+    0x16141515, 0x0d0c0c0c, 0x17161617, 0x1a18191a, 0x1a17181a, 0x18151618,
+    0x1e1b1d1d, 0x201e1e1f, 0x17141516, 0x0e0b0c0e, 0x00000000, 0x00000000,
+    0x03020202, 0x15121314, 0x2f2c2d2e, 0x58565657, 0x18161717, 0x06030505,
+    0x01000100, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x08070707, 0x0c0b0b0b, 0x15131314, 0x07060606, 0x01000000,
+    0x03020001, 0x03020000, 0x05040202, 0x09080607, 0x05040102, 0x0b0a0809,
+    0x0e0c0b0c, 0x18121110, 0x271b1d19, 0x39262a25, 0x4f363b33, 0x6042483f,
+    0x3f2a2a1f, 0x36272314, 0x3a2c2615, 0x3a2e2717, 0x382e2617, 0x3c322b1c,
+    0x362e271b, 0x29221c12, 0x28231e17, 0x1815120f, 0x0d0b0909, 0x0b0a0808,
+    0x0c0c090b, 0x100f0c0d, 0x11100d0e, 0x100f0c0d, 0x03020001, 0x02010000,
+    0x03020000, 0x01000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x05040504, 0x03020302, 0x00000000, 0x0a090909, 0x0a090909, 0x08070707,
+    0x0c0b0c0b, 0x09080908, 0x09080908, 0x0a090909, 0x1b191a1a, 0x211e1f20,
+    0x14111214, 0x2c292a2b, 0x19171619, 0x1e1c1c1e, 0x1d1b1b1d, 0x14131213,
+    0x0d0c0c0c, 0x13121212, 0x13121212, 0x0b0a0b0a, 0x00000000, 0x00000000,
+    0x00000000, 0x04030403, 0x17151516, 0x1a191919, 0x2d2b2c2c, 0x04030303,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x03020302, 0x100e0f0f, 0x1c191b1b, 0x0f0e0f0e, 0x03020202,
+    0x00000000, 0x07050404, 0x09060507, 0x0a080606, 0x05050202, 0x09060607,
+    0x32272a2f, 0x5e43535c, 0x8f617882, 0xd18b9f91, 0xffacbb99, 0xffafb687,
+    0xffb3b37a, 0xffb6b071, 0xffb8ae69, 0xffb8ab61, 0xffbaaa5c, 0xffbca955,
+    0xffbea854, 0xffbfa958, 0xf9bba553, 0xefb29e4c, 0xe7ae9d56, 0xb88e7f49,
+    0xa27d7046, 0x7c5d5436, 0x4c3c3629, 0x1f1c1a1a, 0x12120f12, 0x08060404,
+    0x03010000, 0x03020000, 0x03020000, 0x03020000, 0x01000000, 0x00000000,
+    0x00000000, 0x02010201, 0x06050605, 0x02010201, 0x07060706, 0x0a090a09,
+    0x0b0a0a0a, 0x17141616, 0x0a090a09, 0x08070807, 0x05040504, 0x0e0d0d0d,
+    0x11101010, 0x0d0b0b0d, 0x4b49494a, 0x4f4c4e4e, 0x100e0f0f, 0x0f0e0e0e,
+    0x08070707, 0x0b0a0a0a, 0x0a090a09, 0x0d0c0d0c, 0x07060706, 0x00000000,
+    0x00000000, 0x00000000, 0x03020302, 0x0e0d0e0d, 0x1a181819, 0x0e0d0d0d,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x02010201, 0x01000000, 0x12111111, 0x14131313, 0x13121212,
+    0x03020102, 0x02010000, 0x02010000, 0x0e0c090b, 0x03030000, 0x0d0a090a,
+    0x4330393e, 0x9269808b, 0xd193b0b5, 0xf7a9c1ab, 0xffa8bea4, 0xffa5bda7,
+    0xffa9b998, 0xffafb27d, 0xffb6ae6c, 0xffb6ac65, 0xffb8aa5c, 0xffbda650,
+    0xffbda54d, 0xffbea54c, 0xffbfa54c, 0xffbea54c, 0xffbfa44b, 0xffbfa64c,
+    0xffbfa54c, 0xffbca34b, 0xffb09a46, 0xffa48f40, 0xeb988541, 0xb67d7040,
+    0x59463f2d, 0x07070506, 0x03030002, 0x04030002, 0x03020000, 0x03020000,
+    0x02010000, 0x01000000, 0x06050605, 0x08070707, 0x01000100, 0x06050605,
+    0x0a090a09, 0x07060706, 0x0e0c0d0d, 0x120f1111, 0x06050605, 0x0b0a0b0a,
+    0x05040404, 0x02020102, 0x12111111, 0x5b595a5a, 0x48464747, 0x06050505,
+    0x03020202, 0x02010201, 0x01000101, 0x00000000, 0x00000000, 0x01000100,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x05040504, 0x04030403,
+    0x0c0a0a0b, 0x06050505, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x100f0f0f, 0x22201f21,
+    0x14131112, 0x08080305, 0x02010000, 0x02010000, 0x00000000, 0x06040404,
+    0x62455259, 0xbf86a4b0, 0xf4abcccb, 0xfdadbd9b, 0xffadb891, 0xffabbb9b,
+    0xffacb686, 0xffb6ab66, 0xffbaa958, 0xffb9a85c, 0xffb6ac67, 0xffbaa95c,
+    0xffbba752, 0xffbda64e, 0xffc0a54d, 0xffc0a44c, 0xffbea64d, 0xffbfa64c,
+    0xffbea64c, 0xffbfa64c, 0xfebfa54a, 0xffbea54b, 0xffbba249, 0xffb39b45,
+    0xff9b8739, 0xff8f7b32, 0xf8907e37, 0xac756b43, 0x1c1a1711, 0x02020001,
+    0x02000000, 0x02010000, 0x03020000, 0x05040303, 0x07060606, 0x01000100,
+    0x07060706, 0x09080908, 0x07060706, 0x0d0c0d0c, 0x0b0a0a0a, 0x0f0c0d0e,
+    0x07060706, 0x04030403, 0x00000000, 0x0a090909, 0x33323232, 0x211f2021,
+    0x02020101, 0x00000000, 0x00000000, 0x00000000, 0x01000100, 0x03010202,
+    0x05040404, 0x03010202, 0x05030403, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x02010101, 0x03020001,
+    0x08070506, 0x12100e10, 0x0a080606, 0x06040203, 0x0b070707, 0x241b1c1f,
+    0x7752656c, 0xe9a1c9d6, 0xffb7ddea, 0xfcb7cbb7, 0xf9abb890, 0xfeb1b98a,
+    0xfeb4b175, 0xffb8ad69, 0xffb7ab65, 0xffb2af70, 0xffafb484, 0xffb1b380,
+    0xffb8aa5c, 0xffbea54e, 0xffbfa64b, 0xffc0a64d, 0xffc0a64b, 0xffbfa64b,
+    0xffc0a54d, 0xffbfa54a, 0xffbca34a, 0xffbba149, 0xffbba149, 0xffbaa04a,
+    0xffb59c44, 0xffb09845, 0xffab9543, 0xfe9d883a, 0xfe8d7931, 0xff88772f,
+    0xde837744, 0x4c3b372d, 0x06050303, 0x03020001, 0x03020000, 0x05040203,
+    0x03020202, 0x01000100, 0x07060706, 0x06050605, 0x0e0d0e0d, 0x0d0c0d0c,
+    0x04030403, 0x08070807, 0x03020202, 0x00000000, 0x00000000, 0x08070707,
+    0x06060505, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x06050505, 0x05040404, 0x04030302, 0x0a09080a, 0x03020302,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x01000000, 0x02010201, 0x04030303,
+    0x03020101, 0x02010000, 0x03020000, 0x04030002, 0x1c141617, 0x452d383c,
+    0x8858707b, 0xe89fc8d4, 0xffb6dadf, 0xffbbd3c8, 0xfebac5a4, 0xffbab885,
+    0xffb9ba89, 0xf7b0b68a, 0xfeb1b88a, 0xffacb484, 0xffaeb484, 0xffafb382,
+    0xffb7ab62, 0xffbfa547, 0xffc0a44b, 0xffbea64b, 0xffbca248, 0xffb9a049,
+    0xffb8a04a, 0xffb69f4d, 0xffb39c49, 0xffad9644, 0xffa89242, 0xffa58f3f,
+    0xffa18b3b, 0xffa08b39, 0xffa5903e, 0xffa38c3b, 0xff958035, 0xff907c34,
+    0xfe887531, 0xfe7e6c29, 0xff7c6a29, 0xe6837540, 0x5a4b473d, 0x08070506,
+    0x03020001, 0x04030102, 0x0a090909, 0x09080908, 0x04030403, 0x0e0d0e0d,
+    0x0e0d0e0d, 0x04030403, 0x04030403, 0x04030303, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x03010201, 0x01000100, 0x01000100,
+    0x05030403, 0x02010101, 0x01000100, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x03020302,
+    0x08070707, 0x06050304, 0x05040202, 0x02010000, 0x02010000, 0x5d3f4d51,
+    0xc37da2af, 0xeb8bbed2, 0xffacd1d2, 0xffb6d2cc, 0xffb9d5d0, 0xfeb7cab6,
+    0xffbac39f, 0xffb9c4a1, 0xfeb6c19a, 0xffb5b785, 0xffb5b279, 0xfeb1af76,
+    0xffb1b179, 0xffbaa859, 0xffbea64d, 0xffc0a74e, 0xffb9a248, 0xffa48d3a,
+    0xffab9752, 0xffb1a268, 0xffbeb282, 0xffcdc4a0, 0xffd2caa8, 0xffbfb79a,
+    0xffb9b197, 0xff938b6e, 0xffcbc3a7, 0xff9b9171, 0xff998b60, 0xff897737,
+    0xff83702a, 0xff82702b, 0xff7e6c2b, 0xff7a6827, 0xff766525, 0xff7a6828,
+    0xc9887c57, 0x17161211, 0x06050304, 0x05040203, 0x05040404, 0x00000000,
+    0x0b0a0b0a, 0x0a090a09, 0x07060706, 0x07060706, 0x02010101, 0x00000000,
+    0x00000000, 0x00000000, 0x01000100, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x03020202, 0x09080808, 0x01000100, 0x00000000, 0x00000000, 0x00000000,
+    0x02010201, 0x03020302, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x05050404, 0x0d0c0c0c, 0x08070507, 0x06050204, 0x07060305, 0x03030000,
+    0x38263032, 0xd482acb7, 0xff96c5c8, 0xff9ac1bf, 0xff9acbd5, 0xfe9cd2e5,
+    0xffb1d5d8, 0xffaec7b4, 0xffb1ba90, 0xffb6af72, 0xffb9aa5c, 0xffb7ac64,
+    0xffb1b07c, 0xffaeb27d, 0xffb0b178, 0xffa8b68e, 0xffa5c0aa, 0xffb29e4b,
+    0xffa48f3f, 0xffdfd7b7, 0xfff9f7f1, 0xfffaf8f4, 0xffe5e5e2, 0xffcacac8,
+    0xffd2d2d1, 0xffebebea, 0xffd8d8d7, 0xffb0b0ae, 0xfff2f2f1, 0xffc6c6c6,
+    0xffc3c2c1, 0xffe7e3da, 0xffada78c, 0xff8f7f45, 0xff786424, 0xff7a682c,
+    0xfe8b7c48, 0xff978a57, 0xff887a42, 0x805d5b4a, 0x13120f13, 0x0b0a0808,
+    0x09090607, 0x0d0c0c0c, 0x0a090a09, 0x02010201, 0x05040504, 0x01000100,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x01000100, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x03020202, 0x08070707, 0x05050405, 0x01000100, 0x00000000,
+    0x00000000, 0x00000000, 0x05040504, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x03020202, 0x0a090909, 0x05040202, 0x07070202, 0x07060302,
+    0x0e090a09, 0x89567077, 0xff94c4cc, 0xff8bbfc3, 0xfe92c7d5, 0xff91cfeb,
+    0xff9dd4ec, 0xffa0cfd8, 0xff9bbeb4, 0xffa9b487, 0xffb3ad68, 0xffafb074,
+    0xffadb486, 0xffa7b894, 0xffabb68f, 0xffa8b99b, 0xffa3bca9, 0xffa4b189,
+    0xffaba15b, 0xffb5a460, 0xffe5dfc8, 0xfffefefe, 0xfffdfdfe, 0xfffdfdfd,
+    0xffd2d2d3, 0xffeeeeee, 0xfff2f2f2, 0xffcecece, 0xffc9c9c9, 0xffaaaaaa,
+    0xfff7f7f7, 0xffe4e4e4, 0xffc5c5c4, 0xffe2e1e3, 0xffc3c2c5, 0xffdad5cc,
+    0xffa89d71, 0xffeceada, 0xfffffffe, 0xfffafafd, 0xfff3f2e7, 0x8978766e,
+    0x100f0c0f, 0x0c0b0909, 0x13121011, 0x0e0d0d0d, 0x00000000, 0x04030403,
+    0x03020302, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x01000100, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x04040303, 0x09090809, 0x09080808, 0x0f0e0d0f, 0x03030202, 0x02010101,
+    0x07050606, 0x00000000, 0x02010201, 0x08070807, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x02010101, 0x07060505, 0x04030102, 0x04040000,
+    0x02020000, 0x35242a2f, 0xc06b99ac, 0xff7bc1d9, 0xff79c0d5, 0xfe7bc3e1,
+    0xff8bc6d3, 0xffa8c5aa, 0xffb1bc85, 0xffafbb85, 0xffacbe95, 0xffb1b884,
+    0xffadbb94, 0xffa9bc9e, 0xffa6b99a, 0xffa2b99f, 0xff9db497, 0xff9ca57b,
+    0xffa09552, 0xffaf9b55, 0xffd4c8a2, 0xfff8f5ee, 0xffffffff, 0xfffefefc,
+    0xfffefefe, 0xfff1f1f1, 0xffe2e2e2, 0xff737373, 0xffdadada, 0xffe7e7e7,
+    0xffc2c2c2, 0xffc8c8c8, 0xffcfcfcf, 0xffececec, 0xffcececd, 0xffb2b2b2,
+    0xfff6f6f6, 0xffc1c2c4, 0xffe5e6e6, 0xfffcfbfc, 0xff8c8b8d, 0xfec5c5c3,
+    0xffffffff, 0xc8c7c7c7, 0x11110e0f, 0x0c0b080a, 0x0d0d090a, 0x0a090909,
+    0x04030403, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x01000100, 0x05040504, 0x01000100, 0x00000000, 0x00000000, 0x00000000,
+    0x06050505, 0x0c0b0b0c, 0x0a0a0909, 0x07070607, 0x02010101, 0x01000000,
+    0x03020202, 0x06050605, 0x08060707, 0x06050605, 0x04030403, 0x00000000,
+    0x00000000, 0x00000000, 0x03020101, 0x09080508, 0x0c0c0808, 0x05040203,
+    0x02020000, 0x06050303, 0x6745555a, 0xde6fabc2, 0xff6bc0e0, 0xff6cbee2,
+    0xfe5fbae1, 0xff7bbcb7, 0xffccca5e, 0xffedce35, 0xffeace3e, 0xffe7d146,
+    0xffebd13f, 0xffeece35, 0xffeccc3c, 0xffdccf56, 0xffcdd077, 0xffd6d373,
+    0xffdbcb60, 0xffdac24e, 0xffdfc451, 0xfff1ebcb, 0xffffffff, 0xffffffff,
+    0xffffffff, 0xffffffff, 0xffffffff, 0xfffcfcfc, 0xffdadada, 0xffb8b8b8,
+    0xffd9d9d9, 0xffd9d9d9, 0xffe5e5e5, 0xffe3e3e3, 0xffd3d3d3, 0xffdedfde,
+    0xffe3e3e3, 0xffd3d3d4, 0xffefefef, 0xffd8d8d7, 0xfff4f4f4, 0xffacabab,
+    0xff868686, 0xffffffff, 0xfefcfcfa, 0xe3a09f9f, 0x2d262325, 0x04040102,
+    0x05040102, 0x05040303, 0x01000101, 0x00000000, 0x02010201, 0x00000000,
+    0x01000000, 0x00000000, 0x06050505, 0x0d0c0c0c, 0x00000000, 0x00000000,
+    0x02010201, 0x19181718, 0x11100f11, 0x08080707, 0x04040303, 0x00000000,
+    0x00000000, 0x00000000, 0x05040504, 0x02010101, 0x07060706, 0x04030303,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x08070506, 0x11100c0f,
+    0x06040305, 0x00000000, 0x2b1e2223, 0xce89acb6, 0xf872bddb, 0xff5eb9e2,
+    0xff64bce3, 0xfe55b5e6, 0xff5eb6d3, 0xffc3ce6a, 0xffffda27, 0xfffeda25,
+    0xfffdda2e, 0xffffd928, 0xffffd926, 0xffffd925, 0xffffda25, 0xfffdd92e,
+    0xfffade3f, 0xfffeda33, 0xfffed933, 0xffffdc36, 0xffe2cd57, 0xffb6a878,
+    0xffb1afa7, 0xffb1b0b2, 0xffacacac, 0xffababab, 0xffefefef, 0xffffffff,
+    0xfffafafa, 0xfff7f7f7, 0xfff8f8f8, 0xfffbfbfb, 0xffe8e8e8, 0xffd6d6d6,
+    0xffececec, 0xfff5f5f5, 0xffededed, 0xfffcfcfc, 0xfffbfbfb, 0xfffdfdfd,
+    0xfffefefe, 0xffe7e7e7, 0xfff7f7f7, 0xffffffff, 0xfeb4b3b4, 0xff504c4d,
+    0x9a404242, 0x00000000, 0x02010000, 0x02010000, 0x00000000, 0x00000000,
+    0x00000000, 0x01000100, 0x01000100, 0x0a090909, 0x0c0b0b0b, 0x312f3030,
+    0x0d0c0c0c, 0x0e0d0d0d, 0x0c0b0b0b, 0x04010302, 0x03010201, 0x04040303,
+    0x04030303, 0x00000000, 0x00000000, 0x01000100, 0x06050605, 0x0d0c0d0c,
+    0x03020302, 0x00000000, 0x00000000, 0x00000000, 0x01000000, 0x05040404,
+    0x09080607, 0x09080605, 0x07060405, 0x29212022, 0xca85a8b2, 0xff79c1de,
+    0xfe4baad5, 0xff55b3db, 0xff57b6e0, 0xff62b8cd, 0xffbacb74, 0xffffda26,
+    0xfffed924, 0xfffed82a, 0xfffed92d, 0xfffed828, 0xfffed926, 0xffffd829,
+    0xfffcdb30, 0xfffdda34, 0xfffed92f, 0xfffdd92e, 0xfffddc2f, 0xfffdd829,
+    0xfffad734, 0xfff1d565, 0xffedece7, 0xfff5f5f6, 0xfffefefe, 0xfffdfdfd,
+    0xffb9b9b9, 0xffbebebe, 0xfff3f3f3, 0xfff8f8f8, 0xfffbfbfb, 0xfffdfdfd,
+    0xfff7f7f7, 0xffeaeaea, 0xffe6e6e6, 0xfff6f6f6, 0xfff7f7f7, 0xffc7c7c7,
+    0xfff8f8f8, 0xfffbfbfb, 0xfffefefe, 0xffffffff, 0xfef9f9f9, 0xfeaca9ab,
+    0xff3d393d, 0xff4d484c, 0xe9424242, 0x01010101, 0x04030101, 0x02010000,
+    0x00000000, 0x03020202, 0x0a090909, 0x00000000, 0x07060706, 0x0d0c0c0c,
+    0x28262727, 0x47454547, 0x08070707, 0x0e0d0d0d, 0x0f0f0d0e, 0x08070707,
+    0x07060606, 0x05040405, 0x03020203, 0x02010102, 0x02010201, 0x0a090a09,
+    0x09080908, 0x01000100, 0x00000000, 0x00000000, 0x00000000, 0x04030304,
+    0x0c0a0b0b, 0x0b0a0909, 0x0b0a0708, 0x09080608, 0x1d17171a, 0xaf6e8f97,
+    0xff79c2dd, 0xff49a8d3, 0xff4eaed8, 0xff55b6e2, 0xff6ebbc1, 0xffc9ce64,
+    0xffffda26, 0xfffdd927, 0xfffed926, 0xfffcd929, 0xfffddc40, 0xfffdda2f,
+    0xfffed926, 0xfffdda2b, 0xfffedc33, 0xfffdda2f, 0xfffdd92b, 0xffffd827,
+    0xfffdd827, 0xffffda27, 0xffe7d16d, 0xff959590, 0xff6f6b6e, 0xff8f8d8e,
+    0xffb0afaf, 0xffd2d1d1, 0xffdfdfdf, 0xffededed, 0xffc4c4c4, 0xffc8c8c8,
+    0xffd6d6d6, 0xffdbdbdb, 0xffd8d8d8, 0xffd4d4d4, 0xff9d9d9d, 0xff8e8d8e,
+    0xffb1b0b1, 0xfff7f7f7, 0xffe0e0e0, 0xfffcfdfd, 0xfffdfdfd, 0xffddddde,
+    0xfb9f9d9e, 0xfc2d292c, 0xff2e282c, 0xfe5a565b, 0xf63c383b, 0x54333333,
+    0x08080708, 0x07060304, 0x02010101, 0x02010201, 0x08070807, 0x06050505,
+    0x0d0c0c0c, 0x201e1f1f, 0x34313333, 0x18171617, 0x0e0e0d0e, 0x17171617,
+    0x1d1c1b1d, 0x02010101, 0x00000000, 0x02000001, 0x01000001, 0x03010103,
+    0x07060706, 0x09080908, 0x04030403, 0x00000000, 0x00000000, 0x00000000,
+    0x03020203, 0x09050607, 0x0b090909, 0x13111012, 0x09070607, 0x110e0d0f,
+    0xbc7899a6, 0xff77c1dd, 0xff4ba8d4, 0xfe4dadd6, 0xff56b7e1, 0xff78bcbb,
+    0xffd8d152, 0xffffda26, 0xffffd827, 0xfffcdd43, 0xfffcdc3f, 0xfffbdf5e,
+    0xfff9dd4e, 0xfffddc3f, 0xfffddb38, 0xfffed927, 0xfffdd928, 0xfffdd92d,
+    0xfffdd829, 0xfffed829, 0xfffed928, 0xfff5d430, 0xffa39036, 0xff534d49,
+    0xffaba8aa, 0xfffdfdfd, 0xffdadada, 0xffd8d7d7, 0xffc4c4c4, 0xffbfbfbf,
+    0xffc7c7c7, 0xffcecece, 0xffc3c3c3, 0xffbfbfbf, 0xffb8b8b8, 0xffb1b1b1,
+    0xffb8b8b8, 0xffdedede, 0xffb6b6b6, 0xfff1f1f1, 0xffffffff, 0xffc2c1c1,
+    0xff757274, 0xfe4b474b, 0xff2b252a, 0xff2e292d, 0xfe332e32, 0xff514c50,
+    0xfe514d50, 0xab606262, 0x1a1a1a1a, 0x0b0b0809, 0x04030203, 0x03020302,
+    0x0a090a09, 0x0e0d0d0d, 0x13121112, 0x29262729, 0x19171818, 0x06050605,
+    0x0d0c0c0d, 0x1b1a1a1a, 0x1a171819, 0x03010103, 0x01000000, 0x01000001,
+    0x02000002, 0x02000002, 0x00000000, 0x03020302, 0x06050605, 0x01000000,
+    0x00000000, 0x01000000, 0x04000002, 0x06000001, 0x0f07080a, 0x0d0a070a,
+    0x03030101, 0xbd789ba2, 0xff75c1de, 0xff4eacd5, 0xfe4bacd2, 0xff56b4e1,
+    0xff69bacf, 0xffbecb6f, 0xfffed928, 0xffffd822, 0xfffbdc3c, 0xfffbe068,
+    0xfff9e179, 0xfff8e37d, 0xfffcdf65, 0xfffcdd4d, 0xfffbdb37, 0xfffed928,
+    0xfffddb31, 0xfffdd932, 0xfffed82a, 0xfffeda29, 0xffffda2b, 0xfff5d333,
+    0xff82722d, 0xff2d2925, 0xff959298, 0xffffffff, 0xffffffff, 0xffffffff,
+    0xffffffff, 0xfff6f6f6, 0xfff0f0f0, 0xffe6e6e6, 0xffe2e2e2, 0xffe5e5e5,
+    0xffededed, 0xfff5f5f5, 0xfff7f7f7, 0xffb9b9b9, 0xffe4e4e3, 0xffe9e8e9,
+    0xff5c5a5d, 0xff342f33, 0xff2e292d, 0xff2d282c, 0xff2d282c, 0xff2e292d,
+    0xff322d31, 0xff464044, 0xff4a464a, 0xea8f8e8f, 0x5c524f52, 0x0c0c080a,
+    0x04030102, 0x07060706, 0x0e0d0e0d, 0x11101010, 0x16151515, 0x14131213,
+    0x05040404, 0x00000000, 0x22212121, 0x0e0d0c0e, 0x04020204, 0x01000001,
+    0x02000002, 0x01000001, 0x01000001, 0x03000002, 0x04030403, 0x02010101,
+    0x00000000, 0x03020302, 0x02000101, 0x05000001, 0x05000001, 0x08010203,
+    0x0f060407, 0x0c050407, 0xa069838b, 0xff89c6df, 0xfe59b3d7, 0xff4ba8d3,
+    0xff57b5e1, 0xff5ebad5, 0xff9fc28d, 0xfffad930, 0xffffd824, 0xfffdd830,
+    0xfffbdf57, 0xfff8e272, 0xfff6e589, 0xfff7e283, 0xfff6e382, 0xfff8df64,
+    0xfffddc4a, 0xfffdda2c, 0xfffdd82b, 0xfffacf26, 0xfffbd527, 0xfffed827,
+    0xfffed825, 0xfffdd225, 0xff917a39, 0xff352d33, 0xff302e30, 0xff3b373a,
+    0xff5b585b, 0xff7e7b7e, 0xffaeaeae, 0xffeaeaea, 0xfff4f3f4, 0xfff6f6f6,
+    0xfff5f5f5, 0xfff5f5f5, 0xfff6f6f6, 0xffd8d8d8, 0xffa7a7a7, 0xffefefef,
+    0xffe5e4e5, 0xff524d51, 0xff2b262b, 0xff2f2a2e, 0xff2f2a2e, 0xff2f2a2e,
+    0xff2e292d, 0xff2e292d, 0xff302b2f, 0xff433d41, 0xff484246, 0xff939192,
+    0x8c6c6b6d, 0x10100d0e, 0x07060405, 0x0d0c0c0c, 0x0e0d0e0d, 0x12111111,
+    0x11101010, 0x09080808, 0x07060706, 0x08070707, 0x08060608, 0x01000000,
+    0x01000001, 0x02000002, 0x01000001, 0x01000001, 0x08050608, 0x06050605,
+    0x05040404, 0x00000000, 0x03020202, 0x04020202, 0x05000001, 0x06000001,
+    0x0a020304, 0x0c040305, 0x0a030003, 0x764e5e64, 0xff97ccdb, 0xff80c0d6,
+    0xff5dafd0, 0xff52b3dd, 0xff53b3df, 0xff6ebac3, 0xffcbd06b, 0xffedd53c,
+    0xfffed824, 0xfffcdb3a, 0xfffae36a, 0xfff8e173, 0xfff8e387, 0xfff6e593,
+    0xfff7e27b, 0xfff9e170, 0xfffbdf57, 0xfffed92c, 0xfffdd528, 0xffdab123,
+    0xffc2a528, 0xfffcd82a, 0xffe1b824, 0xffb89326, 0xff40392e, 0xff726e71,
+    0xffbcbbbb, 0xffdad9da, 0xffdbdada, 0xffebeaea, 0xffc9c8c9, 0xffa5a4a4,
+    0xffa9a8a8, 0xffacabab, 0xffadabac, 0xffaeacad, 0xffbbbbbb, 0xffdadada,
+    0xffffffff, 0xffffffff, 0xff989798, 0xff2f2a2e, 0xff2e292d, 0xff2e292d,
+    0xff2f2a2e, 0xff2e292d, 0xff2f2a2e, 0xff2e292d, 0xff312c30, 0xff423d41,
+    0xfe474246, 0xff8b898b, 0x71434344, 0x12110e0f, 0x0c0b080a, 0x110f0f10,
+    0x0f0e0f0e, 0x13111312, 0x09080908, 0x03020302, 0x06050505, 0x100f0f0f,
+    0x01000001, 0x01000001, 0x01000001, 0x02000001, 0x03000103, 0x0d0a0b0d,
+    0x13101113, 0x03020202, 0x05040504, 0x02010101, 0x04010101, 0x06000001,
+    0x07000002, 0x08000103, 0x0e050406, 0x06000000, 0x50364144, 0xf8a7ccd2,
+    0xf599c5d0, 0xfd93c4d4, 0xff57b3d6, 0xff41a0cc, 0xff5ab6dc, 0xff82c4d1,
+    0xffafca90, 0xfff5d732, 0xffffd925, 0xfffedb33, 0xfffcdb3f, 0xfff9e170,
+    0xfff7e48e, 0xfff8e27c, 0xfff8e16d, 0xfff9df62, 0xfffbde52, 0xfffedb2b,
+    0xfffdc825, 0xff9c8129, 0xff74652a, 0xffb79b2e, 0xff7a6026, 0xff4d4533,
+    0xff97959a, 0xfffbfdfa, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+    0xffffffff, 0xffffffff, 0xffffffff, 0xfffefefe, 0xfffbfbfb, 0xfffefffe,
+    0xffffffff, 0xffffffff, 0xfffefefe, 0xfff6f6f6, 0xff656365, 0xff2c272b,
+    0xff2e282c, 0xff2d282c, 0xff2e292d, 0xff2d282c, 0xff2f2a2e, 0xff2f2a2e,
+    0xff302b2f, 0xff423d41, 0xff3d383c, 0xff807e81, 0xa7676969, 0x13100d0e,
+    0x04030102, 0x0b0a090b, 0x0f0e0f0e, 0x09080908, 0x04030303, 0x0a090a09,
+    0x0f0e0e0e, 0x24222223, 0x01000001, 0x02000002, 0x01000001, 0x00000000,
+    0x04030303, 0x02010201, 0x02000002, 0x00000000, 0x05040504, 0x04030303,
+    0x04000001, 0x07000002, 0x07000002, 0x0c040404, 0x08020000, 0x01000000,
+    0x583b4549, 0xf7a1c8d2, 0xd38cabb5, 0xfc6ebbd8, 0xff3191be, 0xff50b1d8,
+    0xff63bde5, 0xff88c3b5, 0xffe4d449, 0xffffda29, 0xffffd826, 0xffffd829,
+    0xfffddc3a, 0xfff9e06d, 0xfff8e277, 0xfffbde4f, 0xfffcdb43, 0xfffddd49,
+    0xfff5d447, 0xffeaca44, 0xffc19930, 0xff4e4429, 0xff51472d, 0xff433828,
+    0xff302c2c, 0xff726d70, 0xfff8f8f8, 0xfffffefe, 0xfffdfdfd, 0xfffefdfe,
+    0xfffdfdfd, 0xfffefefe, 0xfffefefe, 0xfffdfdfd, 0xfffefefe, 0xfffefefe,
+    0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffdfdfd, 0xfffefefe, 0xffe3e2e2,
+    0xff524d50, 0xfe2c272b, 0xff2b282b, 0xfe2d282c, 0xff2e282c, 0xff2e292d,
+    0xff2e292d, 0xff2f2a2e, 0xff2e292d, 0xff3f3a3e, 0xff353034, 0xff767477,
+    0xbb757878, 0x110a0809, 0x05000000, 0x07030103, 0x05030403, 0x01000100,
+    0x0d0c0d0c, 0x0d0c0c0c, 0x201e1e1f, 0x25222325, 0x01000001, 0x02000002,
+    0x08050607, 0x0f0c0d0f, 0x02000002, 0x02000001, 0x01000000, 0x04000000,
+    0x01000000, 0x04030303, 0x04010201, 0x05000001, 0x07000002, 0x09040103,
+    0x04010100, 0x0e080709, 0xbf7c9aa0, 0x9b647c82, 0xc46597a8, 0xff3996bd,
+    0xff42a3cd, 0xff56b4e0, 0xff6ab9ca, 0xffcdd162, 0xffefd846, 0xfffcd829,
+    0xfffed929, 0xfffed925, 0xfffcda3b, 0xfffae273, 0xfffbdc40, 0xffffd929,
+    0xfffed927, 0xffffdb33, 0xffe5b32d, 0xffad8e3e, 0xff63583b, 0xff2f2a29,
+    0xff383129, 0xff2f292e, 0xff2c2a2b, 0xff817d81, 0xffffffff, 0xfffefefe,
+    0xfffefefe, 0xfffdfefd, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe,
+    0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe,
+    0xfffefefe, 0xffd4d3d3, 0xff413f41, 0xff2f292d, 0xf1464748, 0xff2d292d,
+    0xff2c292e, 0xff302b2e, 0xff2d282c, 0xff2e292d, 0xff2e292d, 0xff3a373a,
+    0xff332f33, 0xff787376, 0xb9727373, 0x12080508, 0x0a030003, 0x0c030204,
+    0x0b060608, 0x1c181a1b, 0x19161718, 0x16161515, 0x19181818, 0x12111111,
+    0x04020204, 0x0f0c0d0f, 0x0b08090b, 0x01000001, 0x02000001, 0x04000001,
+    0x07000002, 0x07000002, 0x06000002, 0x05000001, 0x03000000, 0x04010102,
+    0x05020102, 0x06040203, 0x0e0b0709, 0xae758d92, 0xa96d868c, 0x76445b64,
+    0xfa4fa0c4, 0xff399ac6, 0xff4bacd3, 0xff4eadd4, 0xff8bc2a5, 0xffc0cc75,
+    0xffe7d440, 0xffffda2b, 0xfffeda28, 0xfffdd927, 0xfffcdf53, 0xfffcdc45,
+    0xfffed825, 0xffffd825, 0xfffed629, 0xfffed731, 0xffd8a428, 0xff715b2c,
+    0xff372f2b, 0xff2e2a2b, 0xff322b2c, 0xff2f292e, 0xff2d292d, 0xff777576,
+    0xfffdfdfd, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe,
+    0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffdfdfd,
+    0xfffefefe, 0xfffefefe, 0xfffefefe, 0xffc9c8c8, 0xfe363236, 0xff322f32,
+    0x583d3a3c, 0xe23d3d3e, 0xff2b272a, 0xfe2c272b, 0xff2d282c, 0xff2e292d,
+    0xff2f2a2e, 0xff342f33, 0xfe332f33, 0xff757174, 0xb36c6d6d, 0x12060507,
+    0x0b030003, 0x0b020204, 0x07010103, 0x15121314, 0x221f2022, 0x211e1e20,
+    0x0f0b0b0d, 0x0e0b0c0d, 0x1a17181a, 0x0c090a0b, 0x03000001, 0x05000001,
+    0x06000002, 0x07000002, 0x07000002, 0x07000002, 0x06000001, 0x07000002,
+    0x05000001, 0x03010101, 0x04010000, 0x04000000, 0x835a686c, 0xb97d9698,
+    0x3822292d, 0xeb62a5c0, 0xff3797c1, 0xfe40a1ca, 0xff48a7d2, 0xff63b9d1,
+    0xff98c498, 0xffe7d445, 0xffffda27, 0xfffdd926, 0xffffd826, 0xfffcda38,
+    0xfffddd45, 0xfffdda25, 0xfffed826, 0xfffeda2b, 0xfffbc521, 0xffffd12e,
+    0xffcf9822, 0xff584a33, 0xff302c2a, 0xff2f2b2c, 0xff2d292d, 0xff2d292c,
+    0xff2e282c, 0xff747172, 0xfffbfbfb, 0xfffefefe, 0xfffefefe, 0xfffefefe,
+    0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe,
+    0xfffefefe, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffc3c3c4,
+    0xff322e32, 0xff383738, 0x332a2929, 0x7d282829, 0xff312d30, 0xff2d282d,
+    0xff2d282c, 0xff2e292d, 0xff2f2a2e, 0xff312c30, 0xfe373135, 0xff6e686b,
+    0x8b545354, 0x0d030205, 0x0a030003, 0x0c060508, 0x110d0e10, 0x2a27282a,
+    0x1b18181a, 0x08030205, 0x05000001, 0x05000002, 0x0a060608, 0x05000001,
+    0x06000001, 0x07000002, 0x07000002, 0x06000001, 0x07000002, 0x06000001,
+    0x07000002, 0x06000001, 0x0a020304, 0x08020002, 0x09020002, 0x0a010002,
+    0x36222528, 0x150a0a0e, 0xc36994a4, 0xff3896be, 0xff3596c0, 0xfe41a3cd,
+    0xff51afd7, 0xff6db9c2, 0xffd5d156, 0xfffbd429, 0xfffed525, 0xfffdd926,
+    0xfffed627, 0xfffbca24, 0xffffd82c, 0xfffed925, 0xffffd928, 0xffffd729,
+    0xfff9bb1d, 0xfff8c628, 0xffb48320, 0xff594f3e, 0xff2e2929, 0xff2e292d,
+    0xff2d282d, 0xff2d292c, 0xff2d282c, 0xff6a686a, 0xfff6f6f6, 0xfffefefe,
+    0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe, 0xfffefefe,
+    0xfffefefe, 0xfffdfdfd, 0xfffefefe, 0xffebebec, 0xff999798, 0xff6e6d6d,
+    0xffc1c0c0, 0xffb6b4b6, 0xff312c30, 0xff484748, 0x07070707, 0x38171717,
+    0xe5333034, 0xff2d292a, 0xff2d282d, 0xff302b2f, 0xff302b2f, 0xff302b2f,
+    0xff363034, 0xff605d5f, 0x5c333334, 0x07010001, 0x06030102, 0x08070707,
+    0x0c0b0c0b, 0x0b08090a, 0x07000103, 0x06000001, 0x07000002, 0x06000002,
+    0x06000001, 0x07000002, 0x06000001, 0x07000002, 0x07000002, 0x07000002,
+    0x07000002, 0x07000002, 0x07000002, 0x09010203, 0x0b030204, 0x0a020002,
+    0x0a010002, 0x0a020003, 0x05000000, 0x7142575f, 0xff4ba2c8, 0xfe2a8bb9,
+    0xff409fca, 0xff49a6d2, 0xff4ba6c2, 0xffb6cb7b, 0xffe1d048, 0xfff3cc2c,
+    0xfffddb25, 0xffffd92a, 0xfffbc822, 0xfffcd026, 0xfffed927, 0xffffd827,
+    0xfffed929, 0xfffed328, 0xfff9b81a, 0xfff4bd23, 0xff956f25, 0xff4a4337,
+    0xff2d282c, 0xff2d292c, 0xff2d282d, 0xff2e292c, 0xff2d282c, 0xff474446,
+    0xffd3d2d3, 0xfffefefe, 0xfffdfdfd, 0xfffefefe, 0xfffefefe, 0xfffefefe,
+    0xfffefefe, 0xfffefefe, 0xfffdfdfd, 0xfffefefe, 0xfff4f3f4, 0xff908e91,
+    0xff716c71, 0xff575256, 0xff464145, 0xff5c585c, 0xff2e292d, 0xff454547,
+    0x24181616, 0x120e0e0d, 0xb73f3e40, 0xff322d31, 0xfe2c272b, 0xff302b2f,
+    0xff312c30, 0xff312c30, 0xff342f33, 0xe8434144, 0x3c1f1d1f, 0x09080508,
+    0x09090607, 0x05040404, 0x02000000, 0x06000001, 0x07000002, 0x07000002,
+    0x06000001, 0x05000002, 0x07000002, 0x06000001, 0x07000002, 0x07000002,
+    0x07000002, 0x07000002, 0x07000002, 0x06000001, 0x09010203, 0x10060608,
+    0x0a020002, 0x09010002, 0x09020002, 0x09030001, 0x52323e42, 0xfa5facca,
+    0xfe2789b4, 0xfe3a99c3, 0xff41a2cd, 0xff3e9dc0, 0xff90c3a3, 0xffb6c976,
+    0xffd2c551, 0xfffdd629, 0xfffedb2e, 0xfffcd42a, 0xfffdcb26, 0xfffed72a,
+    0xfffdd926, 0xfffed828, 0xfffeda29, 0xfffed52b, 0xfff8b91c, 0xffd9a621,
+    0xff5f4723, 0xff302a2b, 0xff2d292d, 0xff2e292c, 0xff2e282d, 0xff2d292c,
+    0xff2e292d, 0xff292428, 0xff575457, 0xffdcdbdc, 0xffffffff, 0xfffdfdfd,
+    0xfffdfdfd, 0xfffefefe, 0xfffefefe, 0xfffdfdfd, 0xfffdfdfd, 0xffffffff,
+    0xffbebdbf, 0xff757074, 0xff4e494d, 0xff2e292d, 0xff2d282c, 0xff322e32,
+    0xfe2d292d, 0xff2b282c, 0xe4464546, 0x52232222, 0x241e1d1d, 0xad373839,
+    0xff342f33, 0xff2e292d, 0xff302b2f, 0xff312c30, 0xff302c30, 0xce48484a,
+    0x231e1c1d, 0x08080506, 0x07050306, 0x08030406, 0x06000001, 0x06000002,
+    0x06000002, 0x07000002, 0x05000001, 0x05040403, 0x07000002, 0x07000002,
+    0x07000002, 0x07000002, 0x07000002, 0x07000002, 0x07000002, 0x05000001,
+    0x0f060608, 0x0b020003, 0x0a020003, 0x0a020002, 0x08000000, 0x3620262b,
+    0xde5b97ad, 0xff348fb7, 0xfe2c8bb7, 0xff3799c3, 0xff3b9ac3, 0xff6eb8c2,
+    0xff87bfad, 0xff9abb8e, 0xffe8d23d, 0xffffda31, 0xfffcd72b, 0xfffccc25,
+    0xfffdd227, 0xfffed829, 0xfffed927, 0xfffdd829, 0xfffed929, 0xfffed52a,
+    0xfff5b71c, 0xffb88b24, 0xff564425, 0xff2f2b2b, 0xff2e292e, 0xff2e292d,
+    0xff2e282d, 0xff2d292c, 0xff2e292d, 0xff2e282c, 0xff282327, 0xff534f53,
+    0xffdddcde, 0xffffffff, 0xfffdfdfd, 0xfffdfdfd, 0xfffefefe, 0xfffdfdfd,
+    0xffffffff, 0xffd5d4d6, 0xff767276, 0xff544e53, 0xff322c30, 0xff2e292d,
+    0xff2d292d, 0xff2d292c, 0xff2f2a2e, 0xfe2d282c, 0xff312c30, 0xf83a383a,
+    0x702f2f2f, 0x16161616, 0xff413f41, 0xff2c272b, 0xff2e292d, 0xfe2d282c,
+    0xff322f33, 0xb8545656, 0x0f0e0c0d, 0x05030002, 0x110a090b, 0x06000001,
+    0x06000001, 0x07000002, 0x06000001, 0x06000002, 0x03010101, 0x07060706,
+    0x07000002, 0x07000002, 0x07000002, 0x07000002, 0x07000002, 0x07000002,
+    0x06000001, 0x0e050608, 0x0d040305, 0x0a030002, 0x0b010003, 0x08010001,
+    0x170f0e13, 0xc1588798, 0xff338db5, 0xff3090ba, 0xff2b8db8, 0xff3091bc,
+    0xff50acce, 0xff6abac6, 0xff83b9ab, 0xffc7ca60, 0xfffdd32a, 0xfffcd329,
+    0xfffccb27, 0xfffbc620, 0xfffdd428, 0xfffcda28, 0xffffd82b, 0xfffed928,
+    0xfffdd928, 0xffffd72b, 0xfff4b81d, 0xffb98c1f, 0xff664d26, 0xff312c2a,
+    0xff2e2a2b, 0xff2d292b, 0xff2d282d, 0xff2e292c, 0xff2e292d, 0xff2e292d,
+    0xff2d282c, 0xff2e292d, 0xff4b484c, 0xffd6d6d6, 0xffffffff, 0xfffdfdfd,
+    0xfffdfdfd, 0xffffffff, 0xffefefef, 0xff736f73, 0xff443f43, 0xff363034,
+    0xff2d282c, 0xff2d282c, 0xff2d282c, 0xff2e292d, 0xff2e292d, 0xff2e292d,
+    0xfe2f2a2e, 0xff2f2a2e, 0xfa383637, 0x784d4e4f, 0xff3b393c, 0xff2d272b,
+    0xff2e292d, 0xfe2f292d, 0xf239373a, 0x34252124, 0x05020002, 0x09010002,
+    0x1209080a, 0x06000001, 0x06000001, 0x06000001, 0x07000002, 0x07000002,
+    0x05000001, 0x03020202, 0x07000002, 0x07000002, 0x07000002, 0x07000002,
+    0x06000001, 0x06000001, 0x09010203, 0x0f060507, 0x09010002, 0x0a020003,
+    0x08000002, 0x0e090505, 0xad608391, 0xff338ab2, 0xfe2b8cb5, 0xff2787b3,
+    0xff2d8eb8, 0xff3e9dc8, 0xff5fb8d8, 0xff6bb8c9, 0xffa6c286, 0xffe7c63c,
+    0xfff9ca24, 0xfffdca24, 0xfff9bd1c, 0xfffdd52a, 0xfffdd62a, 0xffffd62a,
+    0xfffad023, 0xffffd929, 0xfffdd829, 0xffffd92c, 0xfff7b81c, 0xffd39f1e,
+    0xff8a6a24, 0xff3c3429, 0xff2f2a2d, 0xff2d292e, 0xff2e282d, 0xff2e282c,
+    0xff2d292d, 0xff2e292d, 0xff2e292d, 0xff2d282c, 0xff2e292d, 0xff4c484c,
+    0xffd1d0d1, 0xfffefefe, 0xfffefefe, 0xffffffff, 0xff9a9899, 0xff4e494d,
+    0xff353034, 0xff2d282c, 0xff2d292c, 0xff2e282d, 0xff2e292d, 0xff2d282c,
+    0xff2d282c, 0xff2f2a2e, 0xff2f2a2e, 0xfe322e32, 0xff2c2529, 0xff545657,
+    0xfe3f3d40, 0xfe2b262a, 0xfe2d282c, 0xff302d2f, 0x99414342, 0x00000000,
+    0x09010002, 0x0a020003, 0x0f060708, 0x06000001, 0x07000002, 0x06000001,
+    0x06000001, 0x07000002, 0x06000002, 0x06020204, 0x07000002, 0x07000002,
+    0x07000002, 0x06000001, 0x06000001, 0x0a020304, 0x0c020205, 0x0a020002,
+    0x0a020002, 0x09010002, 0x06030001, 0x894f6772, 0xff4293b5, 0xfe2181ab,
+    0xfe2282ae, 0xff2b88b4, 0xff3695c1, 0xff54b3da, 0xff61b7d7, 0xff98be96,
+    0xffd2c251, 0xffefc32d, 0xfffcc41d, 0xfffabe20, 0xffffd428, 0xfffcd128,
+    0xfffcd62d, 0xfffbc622, 0xfffdd628, 0xfffed929, 0xfffdda2a, 0xfffdd42a,
+    0xfff9ba1b, 0xfffcc023, 0xffdca524, 0xff664f24, 0xff322b28, 0xff2d292c,
+    0xff2d292c, 0xff2e292c, 0xff2d292d, 0xff2e292d, 0xff2e292d, 0xff2d292d,
+    0xff2d282c, 0xff2a2529, 0xff484547, 0xffc1bfc1, 0xffffffff, 0xffe9e8e9,
+    0xff5f5c5e, 0xff312c30, 0xff2e292d, 0xff2d292c, 0xff2d292d, 0xff2e282d,
+    0xff2d282c, 0xff2d282c, 0xff2e292d, 0xff2f2a2e, 0xff302b2f, 0xff342f33,
+    0xfe2f2b2f, 0xff322d32, 0xfe363437, 0xfe2b262a, 0xff332f32, 0xdf494c4c,
+    0x1f161314, 0x06000000, 0x0a020003, 0x10070709, 0x06000001, 0x06000001,
+    0x06000001, 0x07000002, 0x07000002, 0x06000002, 0x05000002, 0x05000001,
+    0x07000002, 0x06000001, 0x06000001, 0x07000002, 0x0d040506, 0x0d040306,
+    0x0b030003, 0x0b020004, 0x09000000, 0x04000000, 0x5f36464b, 0xf64794b4,
+    0xff2282ab, 0xfe2383ab, 0xff2b88ae, 0xff3894be, 0xff4ba7d1, 0xff57b3d9,
+    0xff80bab0, 0xffbebd67, 0xffedc32e, 0xfffdc115, 0xfffbc01e, 0xfffed22a,
+    0xfffcd132, 0xfffccf2a, 0xfffbc525, 0xfffdd027, 0xfffdd829, 0xfffed82b,
+    0xfffeda30, 0xfffcc621, 0xfffdcb23, 0xfffcc624, 0xfffebf23, 0xffd3981c,
+    0xff7c6026, 0xff433728, 0xff2c292a, 0xff2d292c, 0xff2e292d, 0xff2d292d,
+    0xff2d292d, 0xff2e292d, 0xff2d282c, 0xff2d282c, 0xff2b252a, 0xff3d383c,
+    0xff959394, 0xffa5a4a6, 0xff373337, 0xff2d282c, 0xff2d282c, 0xff2d282c,
+    0xff2e282d, 0xff2d282c, 0xff2d282c, 0xff2d282c, 0xff2d282c, 0xff2e292d,
+    0xff2c272b, 0xff302b2f, 0xff312c30, 0xff3c363a, 0xfe2e2a2e, 0xff312c2f,
+    0xc7414343, 0x351c1a1d, 0x05000000, 0x09010002, 0x0d040305, 0x09010103,
+    0x05000001, 0x03000000, 0x05000001, 0x06000001, 0x05000001, 0x06000002,
+    0x06000002, 0x07000002, 0x06000001, 0x07000002, 0x09010203, 0x0f060507,
+    0x08010001, 0x06000000, 0x05000000, 0x08000002, 0x08020001, 0x3720262b,
+    0xdc4d8ba5, 0xff2a89b2, 0xfe2684af, 0xff3089aa, 0xff469ab2, 0xff49a6ca,
+    0xff53b1d8, 0xff73b6c4, 0xffa3b787, 0xffd9c34a, 0xffdeb836, 0xfffdc01b,
+    0xfffdd326, 0xfffbc933, 0xfffcd33a, 0xfffcc923, 0xfffac222, 0xfffdd528,
+    0xfffed829, 0xfffdd931, 0xfffdd02f, 0xfffbc922, 0xfffed92a, 0xfffdd132,
+    0xfffcc223, 0xfffebb21, 0xffcb951d, 0xff705725, 0xff322d2a, 0xff2d282c,
+    0xff2e292d, 0xff2e292d, 0xff2d282c, 0xff2d282c, 0xff2e292d, 0xff2e292d,
+    0xff2d282c, 0xff2d282c, 0xff332e31, 0xff373337, 0xff2f292d, 0xff2e282c,
+    0xff2f2a2e, 0xff2f2a2e, 0xff2d282c, 0xff2e292d, 0xff2e292d, 0xff353034,
+    0xff423d41, 0xff3d383c, 0xff423d41, 0xff343134, 0xff3d393d, 0xff3c393c,
+    0xff302c30, 0xde3d3d3e, 0x351b1b1d, 0x09010002, 0x0a020002, 0x0a020002,
+    0x0a030204, 0x04010102, 0x00000000, 0x01000000, 0x03000001, 0x04000001,
+    0x05000001, 0x05000001, 0x05000001, 0x05000001, 0x0d040406, 0x0d050406,
+    0x10060407, 0x1f0c0a0d, 0x411c1d21, 0x6134393b, 0x7f4c5356, 0x59343b3f,
+    0x3621282d, 0xdb6c9dad, 0xff358fb7, 0xfe2480aa, 0xff3b8faa, 0xff75a588,
+    0xff68a9a7, 0xff55aed3, 0xff6eb5c5, 0xff96b798, 0xffaec17c, 0xffc7b24b,
+    0xfffbbb16, 0xfffdd025, 0xfffbcc26, 0xfffbcc2f, 0xfffdcc25, 0xfffac31f,
+    0xfffbcb23, 0xfffdd727, 0xfffdd82b, 0xfffdd42e, 0xfff8c120, 0xfffcd629,
+    0xfffed92a, 0xfffdd62b, 0xfffdce2d, 0xfffac222, 0xfff7b61d, 0xff9f7a25,
+    0xff463b29, 0xff2d282c, 0xff2d292d, 0xff2d292d, 0xff2e292d, 0xff2e292d,
+    0xff2e292d, 0xff2d282c, 0xff2e292d, 0xff2f2a2e, 0xff2e292d, 0xff2c292c,
+    0xff2d292c, 0xff2e282d, 0xfe2c272b, 0xff2f2a2d, 0xff302b2f, 0xff383337,
+    0xff555054, 0xff777377, 0xff8a888b, 0xff767477, 0xfb555356, 0xf7555656,
+    0xfb636464, 0xff3b3a3b, 0xf4373638, 0x5d2f2f30, 0x02000000, 0x0a000002,
+    0x09010002, 0x08020103, 0x06030304, 0x02010101, 0x01000000, 0x01000000,
+    0x01000000, 0x01000000, 0x02000000, 0x03000000, 0x03000000, 0x02000000,
+    0x09030003, 0x3818191d, 0x972f3439, 0xec465358, 0xff4b6471, 0xff527686,
+    0xff548ba4, 0xff6daac3, 0xf961a3bf, 0xff529ebd, 0xfe2682ae, 0xfe2484b0,
+    0xff80ad8c, 0xff9db06d, 0xff66adbb, 0xff70b3bf, 0xffbab46a, 0xffa0b887,
+    0xffbdb964, 0xfff5bf22, 0xfffccb23, 0xfffcd227, 0xfffbc827, 0xfffbcc27,
+    0xfffdca24, 0xfffcc822, 0xfffcd026, 0xfffcd426, 0xfffdd529, 0xfffac021,
+    0xfffbcc29, 0xfffdd82b, 0xfffed727, 0xfffed92a, 0xfffdda32, 0xfffdd22c,
+    0xfff9c220, 0xffc59826, 0xff584828, 0xff2e292d, 0xff2e292c, 0xff2d292d,
+    0xff2d282d, 0xff2d282c, 0xff2e292d, 0xff2e292d, 0xff2d282c, 0xff2e292d,
+    0xff2f2a2e, 0xff2e292d, 0xff2d282c, 0xff2e292c, 0xff2f2a2e, 0xf2363335,
+    0xd9302e30, 0xdd363437, 0xd7373537, 0xc42c2a2b, 0xc2313031, 0xae3b3c3c,
+    0x82343435, 0x7d4d5050, 0xe0626767, 0xff444346, 0x83444445, 0x00000000,
+    0x09000002, 0x09020002, 0x08020103, 0x08050606, 0x05040504, 0x01000100,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x01000000, 0x00000000, 0x552c2b2c, 0xe23d454b, 0xff3c6577, 0xff6aa0bb,
+    0xff8ec4d8, 0xfe92c9e2, 0xfe70b7d4, 0xff7ebad4, 0xff8bbed6, 0xfe7ab7ce,
+    0xff1d7eaa, 0xff2b8dbb, 0xff99af78, 0xffa3b077, 0xff89b19d, 0xffc5b45c,
+    0xffecb72f, 0xffbaba65, 0xffecbf2e, 0xfffcca21, 0xfffbce23, 0xfffccc27,
+    0xfffbc723, 0xfffcce29, 0xfffac21f, 0xfffdcd25, 0xfffac61a, 0xfffcd125,
+    0xfffabc1e, 0xfffabf26, 0xfffcd333, 0xfffbca22, 0xfffdd426, 0xffffd82a,
+    0xfffdda2e, 0xfffdd62b, 0xfff5bc20, 0xffa58524, 0xff70572a, 0xff352d2a,
+    0xff2d2a2c, 0xff2e282e, 0xff2d292d, 0xff2e292c, 0xff2e292d, 0xff2e292d,
+    0xff2e292d, 0xff2d282c, 0xff2d282c, 0xff2e292d, 0xff2d282c, 0xff2d292c,
+    0xff312d31, 0xb63c3c3e, 0x26100e11, 0x301b191b, 0x2d202020, 0x1b171616,
+    0x13131112, 0x00000000, 0x00000000, 0x5d313031, 0xff616765, 0x6b2e2e32,
+    0x06020003, 0x09000001, 0x09010002, 0x07020001, 0x0b08090a, 0x06040505,
+    0x01000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xf144484c, 0xff457b94,
+    0xff5bb2d8, 0xfe8acae6, 0xff91cde8, 0xff79bfdc, 0xff4e9dc2, 0xff3d92b4,
+    0xff9cc3d8, 0xff97b8c5, 0xfe3881a3, 0xff3399c5, 0xfea2b68b, 0xffd3af45,
+    0xffecb12c, 0xfff8b821, 0xffd1bb4d, 0xffe5bb36, 0xfffdc722, 0xfff9c21d,
+    0xfffdd128, 0xfffbc624, 0xfffcce28, 0xfffabd1c, 0xfffecd25, 0xfffcca1e,
+    0xfffcca21, 0xfffabc1e, 0xfffbaf17, 0xfffdce2c, 0xfffabd1d, 0xfffdc722,
+    0xfffed82b, 0xfffdd72a, 0xffffd82c, 0xfff9c31e, 0xfffdca25, 0xffdaa421,
+    0xff5d4922, 0xff53422a, 0xff2d282c, 0xff2d292b, 0xff2d282d, 0xff2e282c,
+    0xff2d282d, 0xff2d282c, 0xff2d282c, 0xff2e292d, 0xff2d282c, 0xff2e292d,
+    0xff2e292d, 0xff2c292c, 0xff322b2f, 0xba373739, 0x110c0a0d, 0x0f0c0b0d,
+    0x1a18191a, 0x08070807, 0x05040304, 0x02010000, 0x0a060505, 0x94535755,
+    0x733c3d3d, 0x0f050306, 0x09010002, 0x09010002, 0x0a040205, 0x07060505,
+    0x03000001, 0x02000000, 0x01000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0xff4f7484, 0xfe5bb8e0, 0xff5db6dc, 0xff7fc8e6, 0xff68b6da, 0xff5fadca,
+    0xff348fb6, 0xff267da2, 0xfe96bccc, 0xfcb2887d, 0xff3b6577, 0xfe8aada6,
+    0xffd7af45, 0xfff8ad16, 0xfffcb516, 0xffe9b931, 0xffe8b72e, 0xfffebc1a,
+    0xfff9ba18, 0xfffdcd25, 0xfffbc520, 0xfffdca26, 0xfff9bd1e, 0xfffdc21e,
+    0xfff8b70f, 0xfffbc71c, 0xfffbc221, 0xfff9ae15, 0xfffcc522, 0xfffbc223,
+    0xfffbbc1b, 0xfffbca21, 0xfffccc24, 0xfffdd333, 0xfffbca24, 0xfffcc220,
+    0xfff7cf29, 0xfff8b921, 0xffa17520, 0xff463727, 0xff443b2a, 0xff302b2b,
+    0xff2e2a2d, 0xff2d292d, 0xff2e2a2c, 0xff2f2a2e, 0xff2e292d, 0xff2d282c,
+    0xff2d282c, 0xff2e292d, 0xff2f2a2e, 0xff2f2a2e, 0xff2f2a2d, 0xc6322f31,
+    0x1d111013, 0x110e0d10, 0x13131212, 0x0b0a0a0a, 0x01000101, 0x02010000,
+    0x07050303, 0x22171717, 0x02000000, 0x08020002, 0x0a020001, 0x0a040404,
+    0x05020302, 0x04000001, 0x03000001, 0x02000000, 0x01000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0xff5db1d6, 0xff50add4, 0xff65b7da, 0xff6abcde,
+    0xff6db2cf, 0xff469abf, 0xff1d7aa3, 0xff4a97b9, 0xffa37e73, 0xff9d5d35,
+    0xfe417280, 0xffdd9c26, 0xfef1a01e, 0xfffcb017, 0xfffaaf16, 0xfff9ae15,
+    0xfffbb114, 0xfff9b218, 0xfffcc11e, 0xfffbc622, 0xfffdc724, 0xfff9bd1d,
+    0xfffbb91d, 0xfffab313, 0xfffabe17, 0xfffdc61f, 0xfffcb418, 0xfffbbe1e,
+    0xfffdc422, 0xfffbbd1f, 0xfffcbe20, 0xfffabd1c, 0xfffabb22, 0xfffcce2b,
+    0xfffbc11e, 0xfffcc01c, 0xffeabc25, 0xffb98a1e, 0xffe4a424, 0xff6c5223,
+    0xff3f3327, 0xff3c332b, 0xff2e2a2a, 0xff2d282e, 0xff2e292d, 0xff2d282c,
+    0xff2d282c, 0xff2e292d, 0xff2e292d, 0xff2d282c, 0xff2e292d, 0xff2f2a2c,
+    0xff2e292c, 0xda373638, 0x331b1d1e, 0x09050607, 0x09080908, 0x00000000,
+    0x00000000, 0x05030203, 0x07030103, 0x06030102, 0x04010001, 0x08030304,
+    0x09040406, 0x0a060709, 0x06000102, 0x06000002, 0x05000001, 0x02000000,
+    0x01000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xff5fb6dd, 0xff419bc2,
+    0xff5ca8ca, 0xff51a3c6, 0xff3e92b9, 0xff2581a9, 0xff1c7ba4, 0xff428bad,
+    0xff7c401d, 0xffbc782c, 0xff61624e, 0xffbe6314, 0xfff4a525, 0xfffaac15,
+    0xfffaac1a, 0xfffba911, 0xfffab31d, 0xfffcba1a, 0xfffbc321, 0xfffcc122,
+    0xfffbb81c, 0xfffbb41a, 0xfffab518, 0xfffcbd1e, 0xfffaba19, 0xfffcbf1d,
+    0xfffbbb1e, 0xfffbbf1e, 0xfffcc023, 0xfffbc020, 0xfffbbb1d, 0xfffab51a,
+    0xfffcc722, 0xfffbc21e, 0xfffab61b, 0xfffbbb1d, 0xfff9b11a, 0xffbf8b1e,
+    0xffbd881f, 0xffd39520, 0xff684f22, 0xff403628, 0xff3b332a, 0xff2c292d,
+    0xff2d292d, 0xff2e282c, 0xff2e282d, 0xff2d282d, 0xff2d282c, 0xff2e282d,
+    0xff2e292d, 0xff2f2a2e, 0xff2c292d, 0xdc373333, 0x32191818, 0x07020204,
+    0x00000000, 0x00000000, 0x00000000, 0x07010203, 0x08010103, 0x0a030305,
+    0x02000000, 0x03020202, 0x04010203, 0x03000000, 0x05000001, 0x05000001,
+    0x05000001, 0x04000001, 0x02000000, 0x01000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0xff5bafd2, 0xff4496b8, 0xff4397bb, 0xff2a85b1, 0xff2083aa, 0xff1d79a1,
+    0xff2583ac, 0xff3b6e85, 0xff77331a, 0xff994a26, 0xff52443b, 0xffd98218,
+    0xfffcad17, 0xfffbab1f, 0xfffaac1c, 0xfffcb223, 0xfffcb215, 0xfffbb719,
+    0xfffbb61a, 0xfffaaf18, 0xfffbb116, 0xfffbb419, 0xfffcbb1d, 0xfffaad17,
+    0xfffab41e, 0xfffcb917, 0xfffcba1d, 0xfffbbf1d, 0xfffbc123, 0xfffab91c,
+    0xfffab51b, 0xfffcc01d, 0xfffed329, 0xfffbb81c, 0xfffcb51c, 0xfffab117,
+    0xfffbac19, 0xfff9ac17, 0xffad7b19, 0xffbb861e, 0xffdd9c1f, 0xff6e5220,
+    0xff564a2d, 0xff3a3027, 0xff282528, 0xff2c292e, 0xff2d2a2b, 0xff2e292c,
+    0xff2e292d, 0xff2d2a2c, 0xff2d282c, 0xff322c30, 0xff312c2a, 0xe54f412f,
+    0x43332b1f, 0x0e050707, 0x0a080709, 0x03000201, 0x04000000, 0x06000001,
+    0x05000001, 0x04000001, 0x02000000, 0x01000000, 0x02000000, 0x02000000,
+    0x03000000, 0x04000001, 0x05000001, 0x05000001, 0x03000000, 0x01000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x01000000, 0xff2c8ab1, 0xff3f94b8, 0xff2b86ad, 0xff2686ae,
+    0xff1e77a1, 0xff207ba3, 0xff3387ab, 0xff3a5363, 0xff903616, 0xff933920,
+    0xff64402e, 0xffdf8e19, 0xffde7f0e, 0xffd67a10, 0xfffab12a, 0xfffba919,
+    0xfffbab12, 0xfffbaf16, 0xfff9ac16, 0xfffaab14, 0xfffdb318, 0xfffbba1d,
+    0xfffaad16, 0xfffcae17, 0xfffab013, 0xfffcbd1e, 0xfffabb19, 0xfffabd20,
+    0xfffdb51e, 0xfffcb61a, 0xfffcb918, 0xfff8b012, 0xfff9b71a, 0xfff8ae17,
+    0xfffbad16, 0xfffaac16, 0xfffbab15, 0xfff9a816, 0xfffeac1c, 0xffba841c,
+    0xffb37d1a, 0xffdda624, 0xff8c6620, 0xff9a6f1f, 0xffb77f2a, 0xff473424,
+    0xff302a2d, 0xff2f2a30, 0xff2d292c, 0xff2e292e, 0xff2f2b2c, 0xff37322c,
+    0xfe5b4627, 0xffbc8b2a, 0x887c602b, 0x130a0808, 0x1b16171b, 0x02000000,
+    0x06000001, 0x05000001, 0x03000000, 0x02000000, 0x01000000, 0x00000000,
+    0x00000000, 0x00000000, 0x01000000, 0x02000000, 0x04000001, 0x05000001,
+    0x05000001, 0x03000000, 0x02000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x01000000, 0x01000100, 0x03020302, 0xff2f8bb2, 0xff2e89b0,
+    0xff2c89b2, 0xff1a799f, 0xff1f759f, 0xff2782a6, 0xff3b84a4, 0xff393c43,
+    0xff863519, 0xff7a2f18, 0xff6d301f, 0xffb24e1b, 0xffc65a10, 0xffe59528,
+    0xfffdb123, 0xfffba60d, 0xfffbaa12, 0xfffba913, 0xfffaab15, 0xfffcb319,
+    0xfffbb519, 0xfffaab16, 0xfffbac16, 0xfffab417, 0xfffcbc1c, 0xfffcb61a,
+    0xfff5ac1c, 0xfff8b01d, 0xfffbb419, 0xfffab217, 0xfffbb616, 0xfff19d0b,
+    0xfffbaa17, 0xfff9a713, 0xfffaa914, 0xfffbac16, 0xfffaab16, 0xfff6a216,
+    0xfff9a512, 0xffffae15, 0xffe7a71e, 0xffc98d19, 0xffda931e, 0xffd48e1b,
+    0xffdb8f17, 0xffb5791d, 0xff6e5123, 0xff53432d, 0xff433626, 0xff3c3228,
+    0xff564225, 0xff84622c, 0xfe9e7016, 0xffffb424, 0xdab29758, 0x5f3a3c3b,
+    0x09060709, 0x04020201, 0x04000001, 0x04000001, 0x02000000, 0x01000000,
+    0x01000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x01000000,
+    0x03000000, 0x05000001, 0x06000001, 0x04000001, 0x03000001, 0x01000000,
+    0x00000000, 0x00000000, 0x01000000, 0x03020302, 0x03020202, 0x08070807,
+    0xff348db4, 0xff2a88b0, 0xff257ba2, 0xff1b6e90, 0xff2280a8, 0xff3080a2,
+    0xff327692, 0xff42322e, 0xff612b1e, 0xff5f2a1b, 0xff693120, 0xffa3350f,
+    0xffc55a0d, 0xfff0a023, 0xfffba913, 0xfffca912, 0xfff8a911, 0xfffaaa13,
+    0xfff9aa13, 0xfffbae15, 0xfffba812, 0xfffaac16, 0xfffcb317, 0xfffab518,
+    0xfffcb51b, 0xfff3a419, 0xfff39d18, 0xfffbad17, 0xfff9ac17, 0xfffbb215,
+    0xfff7aa15, 0xfff8a816, 0xfff9a914, 0xfffaa815, 0xfffaab17, 0xfffbad17,
+    0xfff9aa18, 0xfff5a114, 0xfffba815, 0xfff6a714, 0xfffcaf19, 0xfff7a417,
+    0xffec9c19, 0xfff7a813, 0xfff8a516, 0xffe99412, 0xffda981d, 0xffcb8f22,
+    0xffc18d2a, 0xffb98321, 0xffda9719, 0xfffdb222, 0xffffb01a, 0xffe8a828,
+    0xf39c8b63, 0x924a4d4c, 0x05040405, 0x0a080809, 0x01000000, 0x02000000,
+    0x01000000, 0x01000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x01000000, 0x03000000, 0x05000001, 0x06000002, 0x06000001,
+    0x04000001, 0x02000001, 0x07050506, 0x0b0a0a0b, 0x07050606, 0x04030403,
+    0x0a090a09, 0x110f1110, 0xff2885ae, 0xff2586b1, 0xff1c6484, 0xff247393,
+    0xff24799c, 0xff25769a, 0xff2b596f, 0xff432e30, 0xff3d2925, 0xff5c2b1c,
+    0xff642718, 0xffaf410d, 0xffd16d0e, 0xfffdad1e, 0xfff7a20f, 0xfff8a911,
+    0xfffaa913, 0xfffaaa13, 0xfffbaa13, 0xfffca70f, 0xfffcac16, 0xfffcac16,
+    0xfffbb015, 0xfffbae16, 0xfff39e15, 0xfff29116, 0xfff9a617, 0xfffbab16,
+    0xfffbae14, 0xfff9b118, 0xffef9912, 0xfffaa915, 0xfffba915, 0xfff8a512,
+    0xfff6a013, 0xfffaac16, 0xfffaa214, 0xfff39e13, 0xfff7a713, 0xfff9a816,
+    0xfff69f15, 0xffef9714, 0xfff9a817, 0xfff8a613, 0xfff19910, 0xfffca817,
+    0xfffbae17, 0xfffaab15, 0xfffab126, 0xfffaaa14, 0xfffbab16, 0xfffaad13,
+    0xfff8ac16, 0xffaa7f2e, 0xff69695f, 0x9c424244, 0x0b0b0b0a, 0x0a070a09,
+    0x02000000, 0x03000001, 0x01000000, 0x01000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x01000000, 0x03000001, 0x05000001,
+    0x07000003, 0x06000002, 0x06000001, 0x0c0a0b0a, 0x1b1a1815, 0x2524211e,
+    0x0d0b0c0b, 0x0d0c0d0c, 0x15131514, 0x0f0e0e0e, 0xff2988b1, 0xff1d7ca5,
+    0xff1e5e79, 0xff216685, 0xff257696, 0xff1b6689, 0xff2c4451, 0xff362f34,
+    0xff342927, 0xff5e2b1f, 0xff602a18, 0xffa73705, 0xffed951d, 0xfff6a311,
+    0xfff9a412, 0xfff9a910, 0xfffaaa14, 0xfffaaa12, 0xfffaa911, 0xfffbab16,
+    0xfff49e15, 0xfffaa30f, 0xfffba813, 0xfff29211, 0xffe57d0b, 0xfff29b11,
+    0xfffcad16, 0xfffaac15, 0xfffba415, 0xffef9314, 0xfff59f12, 0xfff9a917,
+    0xfffbab16, 0xfff6a114, 0xfff6a014, 0xfff8a112, 0xffe37d12, 0xfff9a516,
+    0xfff9a816, 0xfff7a016, 0xffee930e, 0xfff8a216, 0xfff6a212, 0xfff39a12,
+    0xfff7a214, 0xfffbab16, 0xfffcab17, 0xfffba911, 0xfffbab14, 0xfffcab14,
+    0xfffaab13, 0xfffbad1b, 0xffba8726, 0xff2f393c, 0xff213f4c, 0xab394346,
+    0x12121112, 0x1f1c1c1e, 0x0d0a0b0c, 0x04000001, 0x03000000, 0x01000000,
+    0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x01000000, 0x03000101,
+    0x04000102, 0x05000001, 0x07000100, 0x0a020507, 0x18151515, 0x211c1710,
+    0x2c21190d, 0x12100b03, 0x17131512, 0x17161616, 0x0c0b0b0b, 0x08070807,
+    0xff2382ac, 0xff19759a, 0xff21617e, 0xff24607c, 0xff176e91, 0xff1c5b79,
+    0xff2b3c47, 0xff302b2e, 0xff432922, 0xff5e2b1d, 0xff672816, 0xffbc5410,
+    0xffcc650b, 0xfff19b17, 0xfff5a011, 0xfffaa813, 0xfff8a714, 0xfff9aa11,
+    0xfffcab16, 0xfff6a119, 0xfff08e0d, 0xfff9a412, 0xfff1920d, 0xffed8b0e,
+    0xfff49813, 0xfffbad16, 0xfffaab15, 0xfffbad15, 0xfff69d11, 0xfff5a014,
+    0xfffaaa16, 0xfffbac15, 0xfff7a216, 0xffea8810, 0xfff39914, 0xffe78010,
+    0xfff29515, 0xfffbab15, 0xfff6a314, 0xffef8f15, 0xfff5a015, 0xfff7a417,
+    0xffe68913, 0xfff6a113, 0xfffaab16, 0xfffaab16, 0xfffbaa15, 0xfff7a411,
+    0xfffbaa14, 0xfffaaa13, 0xfffead15, 0xffbf871e, 0xff383228, 0xff284b5a,
+    0xff1e3a49, 0xfc5d747b, 0x87596c6f, 0x523b4346, 0x09060607, 0x05000001,
+    0x04000001, 0x04000001, 0x03000000, 0x02000000, 0x02000000, 0x02000000,
+    0x04020203, 0x12101111, 0x12101111, 0x0a060607, 0x03000001, 0x0c0a0908,
+    0x5a473d24, 0x6a50472a, 0x241e180e, 0x56433b29, 0x664e452d, 0x1814120e,
+    0x0d0b0b0b, 0x0a09090a, 0xff1f79a0, 0xff1c749c, 0xff22576e, 0xff235b76,
+    0xff106589, 0xff24536a, 0xff2c363d, 0xff31292c, 0xff5f2f22, 0xff58291c,
+    0xff6f2b12, 0xffb44f10, 0xffd97612, 0xffe48a17, 0xfff49a0f, 0xfff6a015,
+    0xfff9a813, 0xfffdaa17, 0xfff7a41a, 0xffef920e, 0xfff69e0f, 0xffef8e0c,
+    0xfff09011, 0xffdf7109, 0xffee8e10, 0xfffcab16, 0xfffbab17, 0xfffba816,
+    0xffee9010, 0xfff69e13, 0xfff19b12, 0xfff79e14, 0xffeb8c10, 0xfff09112,
+    0xffed8d11, 0xfff09916, 0xfffdac17, 0xfff49a12, 0xffe57913, 0xffed8c13,
+    0xfff8a815, 0xffe58513, 0xffe78914, 0xfff5a116, 0xfff7a615, 0xfffbac14,
+    0xfffaa915, 0xfff8a716, 0xfffbac18, 0xfffea814, 0xffcc8f23, 0xff443827,
+    0xff2b2e35, 0xff2c5363, 0xfe213341, 0xff3d5d6b, 0xeb95bac2, 0x60364348,
+    0x00000000, 0x06000002, 0x05000001, 0x05000001, 0x05000001, 0x05000001,
+    0x04000001, 0x07040504, 0x110f100f, 0x1a191717, 0x100e0e0f, 0x05040403,
+    0x1613100b, 0x93726743, 0xf3b5a057, 0xeeada148, 0xa67b7845, 0x82665d43,
+    0xb7897e50, 0x3e302b1e, 0x03030000, 0x4e3b3421, 0xff166c90, 0xff236f90,
+    0xff243b47, 0xff1a536c, 0xff1a6e93, 0xff2b4a5b, 0xff333139, 0xff393433,
+    0xff7b301a, 0xff55271d, 0xff7a321c, 0xffbf5912, 0xffd77510, 0xffda730b,
+    0xfff69d15, 0xfff6a211, 0xfff7a110, 0xfff39813, 0xffeb8d10, 0xfff39613,
+    0xffea7e05, 0xfff49312, 0xffda6706, 0xffd05303, 0xffed8910, 0xfffcaf16,
+    0xfffaa813, 0xffe8820e, 0xffe87e0f, 0xfff79e14, 0xffe47a0c, 0xffef8d13,
+    0xffec860b, 0xfff39a13, 0xfff6a213, 0xfffbab13, 0xfff1910f, 0xffdc6309,
+    0xffe88211, 0xfff8a314, 0xfff29619, 0xffda7714, 0xffda6e11, 0xffe38011,
+    0xfff9a715, 0xfffbaa16, 0xfff29d12, 0xfff2a016, 0xfffaa814, 0xffdf961f,
+    0xff574627, 0xff2b282c, 0xff2e4955, 0xff2a4d5f, 0xff253542, 0xfe2b4f61,
+    0xf68fb8c1, 0x59323f45, 0x00000000, 0x06000001, 0x06000001, 0x06000001,
+    0x06000001, 0x05000001, 0x0c080a0b, 0x14111313, 0x100e0f0f, 0x09080708,
+    0x08040603, 0x28221c14, 0xb0867748, 0xfeb8a84d, 0xe6aaa14e, 0xcc968d41,
+    0x44322e15, 0x2019170d, 0x48383021, 0x1e19140e, 0x73584d34, 0xf4b5a566,
+    0xff207ba1, 0xff275d75, 0xff272e35, 0xff1d4e65, 0xff287599, 0xff30414c,
+    0xff373539, 0xff533835, 0xff8e2e0d, 0xff4d2920, 0xff7d3118, 0xffc5600d,
+    0xffbd4c08, 0xffed9010, 0xfff19711, 0xfff79d12, 0xffec8a0d, 0xffec890f,
+    0xffeb8110, 0xffe57e08, 0xfff69414, 0xffd55f03, 0xffcf5104, 0xffe2740c,
+    0xfff59c10, 0xfff79d14, 0xffef8f11, 0xffe77a0a, 0xfff49d12, 0xffe8850f,
+    0xffea800f, 0xffe27708, 0xfff59e15, 0xfff8a813, 0xfff5a215, 0xffe7870f,
+    0xffd95a07, 0xffdf710b, 0xfff39614, 0xfff49d14, 0xffd86f0a, 0xffdd7b12,
+    0xffd06c0b, 0xffed9114, 0xfff3a114, 0xfff6a514, 0xfff29b14, 0xfff6a214,
+    0xffe19318, 0xff664d26, 0xff2a242b, 0xff253d4c, 0xff2b4251, 0xff234656,
+    0xff253845, 0xff2a4d5b, 0xf79cc3cb, 0x60374548, 0x01000000, 0x07000002,
+    0x07000002, 0x08010103, 0x0b040406, 0x17131211, 0x413c372f, 0x1f1b1713,
+    0x08070302, 0x09070301, 0x362c2317, 0xca978b4c, 0xfdb7a94a, 0xb2857e42,
+    0x34262009, 0x231b170d, 0x1c181009, 0xb78b7e52, 0xe5ab9b61, 0x7055492d,
+    0xe0a8975f, 0x7a605537, 0xff2a81a7, 0xff1e4d63, 0xff282d34, 0xff294c5f,
+    0xff185977, 0xff38373c, 0xff34353a, 0xff7e3623, 0xff7e2b11, 0xff4b2a23,
+    0xff904618, 0xffb34109, 0xffbf5008, 0xfff29313, 0xfff09511, 0xffe9830c,
+    0xffed8811, 0xffe4750d, 0xffe77705, 0xfff19118, 0xffda6807, 0xffcd5105,
+    0xffde6909, 0xffe1720b, 0xffe7840c, 0xffed8910, 0xffec830c, 0xffef8f0d,
+    0xfff19212, 0xffe77f0d, 0xffe77f0e, 0xffed8c11, 0xfff29813, 0xffeb8f10,
+    0xffda740f, 0xffc94f05, 0xffe3720d, 0xffe38110, 0xffe98914, 0xffde790e,
+    0xffe88b13, 0xffe38712, 0xffdd7e12, 0xffdb780d, 0xffed9713, 0xffdf8112,
+    0xffd66f0d, 0xffe5991b, 0xff6f4e23, 0xff272229, 0xff2b333a, 0xff234b5e,
+    0xff29363f, 0xff244556, 0xff263946, 0xff2d4753, 0xf57fabb7, 0x5229373c,
+    0x03000000, 0x09010204, 0x08000103, 0x0b060405, 0x372b251a, 0x67534934,
+    0x5a463b26, 0x2e252018, 0x0b080300, 0x624a4123, 0xe0a5964a, 0xd99e9240,
+    0x77595121, 0x392b2716, 0x27201c15, 0x3b2b2417, 0xc590814f, 0xefb29c5d,
+    0x9a73663a, 0x624b4027, 0x71564c30, 0xd48d834b, 0xff1f7194, 0xff1d4b60,
+    0xff283038, 0xff213f50, 0xff21495d, 0xff383234, 0xff343134, 0xff9a300f,
+    0xff652a19, 0xff563220, 0xff933e16, 0xffa92f04, 0xffce5e0a, 0xffef8d0f,
+    0xffe68411, 0xffea8511, 0xffe27008, 0xffe16505, 0xffe67407, 0xffd76308,
+    0xffcc4e04, 0xffda5e06, 0xffd55a07, 0xffcb5005, 0xffe1750a, 0xffe87a0d,
+    0xffe97d08, 0xffea810b, 0xffe9820f, 0xfff49815, 0xffeb890e, 0xffe87d0e,
+    0xffdf770c, 0xffc95d0e, 0xffbe4909, 0xffe17510, 0xffd67110, 0xffd0640d,
+    0xffe28211, 0xffea8d16, 0xffd77710, 0xffca610b, 0xffc15109, 0xffd57413,
+    0xffc14f0b, 0xffb03604, 0xffd57213, 0xff7f5c28, 0xff2f292b, 0xff2b2b2e,
+    0xff283f4d, 0xff273a46, 0xff2a2e36, 0xff244353, 0xff273a44, 0xff344448,
+    0xfa6b9aa1, 0x91546b70, 0x17100f0f, 0x04010002, 0x1912110d, 0x664d4731,
+    0x80635c41, 0x9e796c47, 0xd39d8c52, 0x98756845, 0x9d746934, 0xf4afa139,
+    0xfeb4a836, 0xaa817a44, 0x56433c26, 0x04030000, 0x5d403b24, 0xe891894a,
+    0xd98e8546, 0x85645734, 0x73564b2d, 0x79574e2e, 0xdb887f40, 0xc67b753f,
+    0xff126488, 0xff1e4c64, 0xff28313b, 0xff253f4c, 0xff2b363c, 0xff333137,
+    0xff412a26, 0xffa0300d, 0xff4f2a20, 0xff633820, 0xff94300e, 0xffa72c03,
+    0xffd86a0d, 0xffec9014, 0xffc25407, 0xffdb690a, 0xffe06911, 0xffe06706,
+    0xffdd680a, 0xffcb5005, 0xffd65a05, 0xffd05206, 0xffc84703, 0xffda6706,
+    0xffec8112, 0xffde6604, 0xffe77c09, 0xffe97f12, 0xffed8814, 0xffe2770a,
+    0xffe5760d, 0xffd66208, 0xffc7570d, 0xffb03705, 0xffcf5f0d, 0xffc95d09,
+    0xffcd6210, 0xffd7750f, 0xffe18213, 0xffba4c07, 0xffc2520b, 0xffac3506,
+    0xffbd4f0b, 0xffc2581c, 0xffa72c06, 0xff9c3b15, 0xff80562a, 0xff44392e,
+    0xff2d292c, 0xff2d2e34, 0xff293740, 0xff2a3033, 0xff292d35, 0xff234657,
+    0xff273d4a, 0xff363f3f, 0xff57888c, 0xea8cb0b4, 0x4d393e3d, 0x7c5e563a,
+    0xbe8f825b, 0x7f625a3f, 0x795e5640, 0xbc918258, 0xffbda75f, 0xfab7a444,
+    0xfdb5a630, 0xffb5a734, 0xecac9d46, 0xecae9d57, 0x5c463f28, 0x835a5639,
+    0xfa938e42, 0xc97c7940, 0x8e615c3e, 0x996b6745, 0x815f5737, 0xe188803e,
+    0x8f4e4818, 0x2714120a, 0xff15658a, 0xff224d63, 0xff26333d, 0xff2e353c,
+    0xff2f2d33, 0xff312f32, 0xff4c2921, 0xff9a2f10, 0xff382826, 0xff682f1c,
+    0xff9e2c08, 0xffa83201, 0xffec8c1a, 0xffcd660f, 0xffcb5309, 0xffdf6b12,
+    0xffe17009, 0xffea8010, 0xffcf5506, 0xffd75804, 0xffd15009, 0xffc54704,
+    0xffd96206, 0xfff08b11, 0xffdd6507, 0xffe06d07, 0xffe77b0c, 0xffe46e0b,
+    0xffdb6704, 0xffe7770d, 0xffcf5707, 0xffbd4506, 0xffae3203, 0xffb74207,
+    0xffc35009, 0xffcc5f0e, 0xffca600c, 0xffd46f0d, 0xffb43f04, 0xffb33c09,
+    0xffa62c01, 0xffac3406, 0xffb84714, 0xffae3d16, 0xffa62d05, 0xff623324,
+    0xff363134, 0xff302c2d, 0xff2e2a2d, 0xff2e2b2f, 0xff2b2b2f, 0xff2c2c30,
+    0xff292e35, 0xff1e4d62, 0xff26495c, 0xff36414a, 0xff508a95, 0xea7d988b,
+    0xb4817d5e, 0xbf8e8053, 0x5a483e28, 0x44312917, 0x221b150e, 0x382e2618,
+    0xf4b8a563, 0xffb8a83d, 0xfbb2a63a, 0x9a756732, 0x78584d25, 0xffafa354,
+    0xffa19a4f, 0xfe969145, 0xba787245, 0x8f5e5b36, 0xcc827f49, 0xa76f6b42,
+    0xf8948d45, 0x914f4819, 0x1a0b0900, 0x83565744, 0xff1c6c90, 0xff1e4254,
+    0xff27333c, 0xff2d2d31, 0xff2f2b2e, 0xff2f2a2e, 0xff5b2b1e, 0xff812d13,
+    0xff332727, 0xff6b2c1a, 0xff9f2906, 0xffc45a0f, 0xffe78c1d, 0xffbe4403,
+    0xffde6909, 0xffee8711, 0xfff18b13, 0xffd35d06, 0xffd65605, 0xffce4f06,
+    0xffc94f02, 0xffdd6307, 0xffed8510, 0xffe57407, 0xffde6a05, 0xffef880b,
+    0xffe16908, 0xffdd6707, 0xffe5720e, 0xffce5305, 0xffbb3e06, 0xffad3208,
+    0xffaa3206, 0xffbe4507, 0xffba4309, 0xffb13b03, 0xffd1670e, 0xffba4909,
+    0xffa82c04, 0xffa72b02, 0xffa82e06, 0xffae3608, 0xffb4431c, 0xffaa2f07,
+    0xff9e2f0b, 0xff3c2826, 0xff2e2a2e, 0xff2c2a2d, 0xff2f292e, 0xff2e292d,
+    0xff2c292e, 0xff2c2d31, 0xff293038, 0xff195973, 0xff235e79, 0xff314a55,
+    0xff48859f, 0xf981a193, 0xb07f7a58, 0x664f4732, 0x4836301f, 0x17140e0b,
+    0x45342e1d, 0xc897864d, 0xffbba84f, 0xebad9e45, 0xa1797339, 0xa8776f3b,
+    0xffafa356, 0xffaba352, 0xdd8c8644, 0x9a66623d, 0xa96b693a, 0xff9b9045,
+    0xff9a8f40, 0xfa938a3d, 0x82474117, 0x1b0d0a03, 0x86504f3d, 0xff7c7b4d,
+    0xff146388, 0xff213c4b, 0xff29323a, 0xff2f2c30, 0xff302b2e, 0xff2f282d,
+    0xff662c19, 0xff6c2b17, 0xff352728, 0xff712c16, 0xffac3708, 0xffe5851b,
+    0xffcc5609, 0xffdf730e, 0xfff9a116, 0xffe9840d, 0xffdd6806, 0xffdf6707,
+    0xffcf5205, 0xffd35902, 0xffe77008, 0xffe97d0b, 0xffe47408, 0xffe26f06,
+    0xfff7990f, 0xffe97b07, 0xffe36f0a, 0xffe06b09, 0xffcd4e04, 0xffbd4005,
+    0xffb03506, 0xffac3005, 0xffae3103, 0xffbb3d08, 0xffa82d03, 0xffbd4909,
+    0xffc15009, 0xffa92c04, 0xffa82c04, 0xffa72c02, 0xffa82e06, 0xffad360d,
+    0xffad3911, 0xffaa2d07, 0xff9e310c, 0xff342828, 0xff2d292d, 0xff2e292d,
+    0xff2d292c, 0xff2d292c, 0xff2c292d, 0xff2a2e33, 0xff2b3039, 0xff185d7c,
+    0xff206f92, 0xff325a6d, 0xff377e98, 0xf380abac, 0xe6a59f70, 0x83645838,
+    0x16151110, 0x84635a33, 0xecaf9d49, 0xffb7a63c, 0xdfa49a42, 0x96706a34,
+    0xdd9d954b, 0xffaea450, 0xffada352, 0xb9807a44, 0x8c65603c, 0xd68b8547,
+    0xff999043, 0xff978e3c, 0xf08d853b, 0x6a3a3613, 0x180b0903, 0x97565641,
+    0xf9747048, 0xff706a39, 0xff115f80, 0xff253642, 0xff292f36, 0xff2f2b2f,
+    0xff2e2a2c, 0xff31292a, 0xff732d16, 0xff552a1d, 0xff41292a, 0xff782d13,
+    0xffca5910, 0xffd15809, 0xffc7560a, 0xfffcac1e, 0xfff8a114, 0xffe57b0a,
+    0xffe8790a, 0xffd35b04, 0xffd96306, 0xfff18708, 0xffe77505, 0xffe16f07,
+    0xffe37106, 0xfffa9c0d, 0xffe87705, 0xffe47409, 0xffdf6908, 0xffce4e05,
+    0xffbd4003, 0xffb53904, 0xffaf3405, 0xffa92f07, 0xffb73907, 0xffb13704,
+    0xffb13904, 0xffbd4907, 0xffa82c05, 0xffa82c04, 0xffa72d05, 0xffa92b03,
+    0xffa92e06, 0xffac360e, 0xffaa3008, 0xffaa2e07, 0xff9e320e, 0xff302728,
+    0xff302b2f, 0xff2d2a2e, 0xff2e292d, 0xff2d282c, 0xff2e282c, 0xff2b2e35,
+    0xff293239, 0xff1d6280, 0xff227196, 0xff2a6a84, 0xff2e7792, 0xff84aba1,
+    0xbb858058, 0x7861573b, 0xbf8b823c, 0xfdb5a736, 0xffb5a734, 0xc28f893f,
+    0x916e6931, 0xf4aca23e, 0xffafa347, 0xffada659, 0xa5757247, 0x976e6a44,
+    0xed9d944b, 0xff9a9142, 0xff978e3b, 0xe988823b, 0x59332d13, 0x14080501,
+    0xb3676951, 0xff726f44, 0xff706a3a, 0xfe716a3a, 0xff145d7b, 0xff25323c,
+    0xff2d2d33, 0xff2f2b2e, 0xff2e282d, 0xff31282a, 0xff7c2d16, 0xff422923,
+    0xff542c21, 0xff8b4016, 0xffd45d0d, 0xffac3303, 0xfff19519, 0xfff8a311,
+    0xffea850c, 0xffec850f, 0xffe47407, 0xffe9810d, 0xfffc9b09, 0xfff18805,
+    0xffdf6906, 0xffe47208, 0xfff8950d, 0xffe17203, 0xffe3720a, 0xffe67409,
+    0xffcc4e05, 0xffbd3f02, 0xffbd3f07, 0xffb63906, 0xffb03507, 0xffab2f03,
+    0xffb33504, 0xffb53b05, 0xffb73f06, 0xffac3406, 0xffad3203, 0xffa92e04,
+    0xffa72c04, 0xffaa2d04, 0xffa92f06, 0xffab310a, 0xffa92e04, 0xffab3006,
+    0xff9b2f0d, 0xff2d2829, 0xff2f292d, 0xff2d292d, 0xff302a2e, 0xff2d282c,
+    0xff2e282c, 0xff2d2e34, 0xff2b353d, 0xff256888, 0xff207293, 0xff287494,
+    0xff2b7790, 0xe275a1a2, 0xab7d7d51, 0xf7b0a53c, 0xffb7a52f, 0xfbb4a637,
+    0xaa7d7635, 0x9d756f2e, 0xffb4a837, 0xffb2a535, 0xffafa654, 0x74534d26,
+    0xb07e794d, 0xffa59b4c, 0xff998f41, 0xff958d3a, 0xe0827b38, 0x4426220e,
+    0x1b0c0a07, 0xba65644a, 0xff6f6a3d, 0xfe6f6a39, 0xfe6e6a3b, 0xff6e6b39,
+    0xff195773, 0xff273138, 0xff2d2c31, 0xff2e292d, 0xff2d292d, 0xff362828,
+    0xff7e2d15, 0xff3a2728, 0xff62311c, 0xff9c501a, 0xffb84709, 0xffc75f1f,
+    0xfff79d13, 0xffed8c0f, 0xffeb850f, 0xffea7f0a, 0xffee870a, 0xffffa10c,
+    0xfff6920b, 0xffe26f05, 0xffeb7e0b, 0xfff08d10, 0xffe57506, 0xffe87605,
+    0xffe46f07, 0xffcd4e04, 0xffc04004, 0xffbf4307, 0xffbc400a, 0xffbf4309,
+    0xffaa3002, 0xffb53808, 0xffb33707, 0xffb94005, 0xffb23905, 0xffb13906,
+    0xffb53c05, 0xffa62c04, 0xffa82d03, 0xffa82e05, 0xffab3007, 0xffaa2e06,
+    0xffa92e06, 0xffac2f03, 0xff923010, 0xff2b282b, 0xff2f292c, 0xff2d292d,
+    0xff2f2a2e, 0xff2d282c, 0xff2d292c, 0xff2b3036, 0xff273641, 0xff256b8c,
+    0xff1d6e93, 0xff227292, 0xff368098, 0xfe88b398, 0xffb4a838, 0xffb3a72f,
+    0xeda99f3a, 0xa2787236, 0xb8867f31, 0xffb5a631, 0xfeb3a52b, 0xffb1a53e,
+    0xffaea453, 0xefa19952, 0xffa19646, 0xff958d3c, 0xfe938b3a, 0xff958d38,
+    0xc66f692d, 0x53282617, 0xbe5d5c41, 0xff706b3d, 0xfe7b7639, 0xff77713b,
+    0xff726b39, 0xff857c3c, 0xff1c5069, 0xff28343c, 0xff2d2d32, 0xff2d2a2d,
+    0xff30282d, 0xff3e2825, 0xff762b17, 0xff352925, 0xff6a3321, 0xff984112,
+    0xffb7440f, 0xffd56f18, 0xfff19010, 0xffee8b10, 0xffec850e, 0xfff08f0c,
+    0xfff4950d, 0xfff39110, 0xffeb7f0c, 0xffef8309, 0xffea800f, 0xffe47407,
+    0xffea7b09, 0xffe06805, 0xffd05307, 0xffc04304, 0xffc44604, 0xffc34709,
+    0xffc94f11, 0xffaf3404, 0xffaf3204, 0xffb83c07, 0xffaf3404, 0xffbf4507,
+    0xffb23804, 0xffbc4607, 0xffa82d04, 0xffaa2f04, 0xffaa2e05, 0xffaa2f06,
+    0xffa92e05, 0xffa82d04, 0xffa92f06, 0xffa82d03, 0xff923618, 0xff29272a,
+    0xff2d292c, 0xff2d292d, 0xff2d282c, 0xff2e282c, 0xff2d292b, 0xff2b343d,
+    0xff263945, 0xff1d698a, 0xff1d6e91, 0xff1d6e91, 0xfe408183, 0xff87ac85,
+    0xffb9a43a, 0xfdb7ac49, 0x96726c35, 0xd0958d35, 0xffb4a62e, 0xffb4a729,
+    0xfeb2a530, 0xfeafa346, 0xfea69c4a, 0xff9b9240, 0xff958d3b, 0xfe948b3a,
+    0xff958c3b, 0xff948c3a, 0xfc8c833a, 0xf56e6b3e, 0xff75703c, 0xff88803a,
+    0xff938c3c, 0xff8b843b, 0xff8c833b, 0xff958c3c, 0xff1b495e, 0xff293843,
+    0xff2f2c30, 0xff2e2a2e, 0xff30282c, 0xff4a2a22, 0xff6b2b19, 0xff362629,
+    0xff6f301c, 0xff98310a, 0xffc9540d, 0xffed9014, 0xfff19614, 0xffee8e14,
+    0xfff89a10, 0xffef930f, 0xfff79e16, 0xfff38e12, 0xffef8108, 0xffe57205,
+    0xffe37007, 0xffe87c0c, 0xffdf6706, 0xffd15808, 0xffbf4702, 0xffcd4d08,
+    0xffc44605, 0xffcf550a, 0xffb23504, 0xffb43906, 0xffbe4408, 0xffbc4105,
+    0xffbe4408, 0xffb74006, 0xffbd490a, 0xffab3304, 0xffa92f03, 0xffac3106,
+    0xffab3007, 0xffaa2f06, 0xffa72c03, 0xffa82e05, 0xffa92d04, 0xffad3109,
+    0xff8a3214, 0xff29282b, 0xff32282b, 0xff2e2a2f, 0xff2e292c, 0xff2d282c,
+    0xff2d282c, 0xff2c3741, 0xff233b48, 0xff1b688a, 0xff1d6e8f, 0xff196c8e,
+    0xff528b8b, 0xff8cab82, 0xffb8a632, 0xfdb4a833, 0xeeab9e38, 0xfdb3a632,
+    0xe1a29b40, 0xf3ada13b, 0xffb2a536, 0xfeada34d, 0xffa19949, 0xff968e3d,
+    0xff958d3c, 0xff958d3c, 0xfe978f3e, 0xff988f3d, 0xff958b3d, 0xff89823c,
+    0xff978d43, 0xfe9c913f, 0xff958d3b, 0xff948c3c, 0xff948c3b, 0xff958d3b,
+    0xff204253, 0xff293e4c, 0xff2e2b2f, 0xff2e292e, 0xff2f2a2b, 0xff582b1f,
+    0xff612f22, 0xff392728, 0xff6e2d18, 0xffa63a13, 0xffdc781a, 0xfff7a316,
+    0xfff79e17, 0xfffaa214, 0xfff19911, 0xfff49910, 0xfff59a13, 0xffeb7e0a,
+    0xffe77504, 0xffe57607, 0xffec7e0f, 0xffdc6205, 0xffd56005, 0xffca5001,
+    0xffd75f0a, 0xffc54905, 0xffd0540a, 0xffb53804, 0xffbd3f06, 0xffc74f08,
+    0xffc45006, 0xffc34b09, 0xffb93e02, 0xffcc5b10, 0xffb53f0d, 0xffa93002,
+    0xffb94008, 0xffac3204, 0xffac3105, 0xffa72c03, 0xffa92e05, 0xffa92d05,
+    0xffa82c03, 0xffb0350d, 0xff8d2f10, 0xff2c2729, 0xff3d2c2c, 0xff2f2c2e,
+    0xff332b2e, 0xff2d292c, 0xff2d292c, 0xff293640, 0xff223f4e, 0xff1d6c8e,
+    0xff206f91, 0xff186b8c, 0xff60938a, 0xfe91a96a, 0xffb5a728, 0xffb5a72b,
+    0xf7b0a134, 0xae7e7639, 0x5b43412d, 0xba868342, 0xffb2a538, 0xffb0a449,
+    0xffac9f4e, 0xff9c9443, 0xff968e3d, 0xff9e9544, 0xfeab9f4d, 0xec978d45,
+    0xe6918a43, 0xffac9f50, 0xffaa9e4c, 0xff9a9040, 0xff948b3c, 0xff958d3c,
+    0xff978d3d, 0xff998d3e, 0xff243847, 0xff254250, 0xff2e2b2e, 0xff2f292d,
+    0xff312929, 0xff623025, 0xff553027, 0xff412925, 0xff632b1a, 0xffb44117,
+    0xffdf7b12, 0xffed9012, 0xfff8a21b, 0xfff6a012, 0xffef8e0e, 0xfff09212,
+    0xfff08b10, 0xffe87402, 0xffe97905, 0xffe8790f, 0xffd96002, 0xffe06d05,
+    0xffd45c05, 0xffda6405, 0xffd05506, 0xffd2550b, 0xffbf4005, 0xffca4e08,
+    0xffc74c09, 0xffd66108, 0xffcc580b, 0xffba4004, 0xffce5906, 0xffc2500b,
+    0xffa92e01, 0xffc6530f, 0xffb63d06, 0xffb4390c, 0xffaa2f04, 0xffa82d04,
+    0xffa92e05, 0xffa92d04, 0xffab3006, 0xffad3507, 0xff9a300f, 0xff332728,
+    0xff46302d, 0xff302c2d, 0xff31282b, 0xff2d2a2d, 0xff2c292e, 0xff2a3944,
+    0xff224252, 0xff1e6e91, 0xff227094, 0xff186a8b, 0xfe6d9e89, 0xff9ba958,
+    0xffb6a72b, 0xeca99e3b, 0x8c636134, 0x543e3b2b, 0x8f666238, 0xeba49b4a,
+    0xffafa44a, 0xfeaea34b, 0xfeaea34e, 0xffaba04e, 0xffa69c4a, 0xfaaa9f4f,
+    0xbd7f773a, 0x482c260d, 0x5138341c, 0xf8aba55c, 0xffa59b4a, 0xff978e3e,
+    0xff958d3b, 0xff968c3d, 0xff9e9042, 0xffad9c50, 0xff243642, 0xff214150,
+    0xff2e2a2c, 0xff2d2a2e, 0xff31292a, 0xff56342d, 0xff452c25, 0xff422b28,
+    0xff642e1a, 0xffae3309, 0xffd76a09, 0xffec8e13, 0xfffaa318, 0xffee8d0e,
+    0xffec8a0f, 0xfff39111, 0xffe47208, 0xffe47306, 0xffe36c07, 0xffda5c04,
+    0xffeb7d07, 0xffda6504, 0xffe77606, 0xffe26e05, 0xffcf5a07, 0xffc44908,
+    0xffd35207, 0xffcd4e09, 0xffd05307, 0xffd96609, 0xffc34c08, 0xffcb5306,
+    0xffca5808, 0xffb33b04, 0xffc65009, 0xffca5c14, 0xffaf3507, 0xffb93f0c,
+    0xffa82c03, 0xffa92e05, 0xffa82d03, 0xffa92e04, 0xffb53707, 0xffa92f06,
+    0xff9c2e0a, 0xff352626, 0xff513631, 0xff303c44, 0xff30282a, 0xff2d2b2f,
+    0xff2c2a30, 0xff283e4b, 0xff214657, 0xff196c8e, 0xff1f7092, 0xff1c6c8a,
+    0xff78a892, 0xffa5a94d, 0xcd938a38, 0x5e444027, 0x58413f2d, 0xb7817e48,
+    0xf6a9a04f, 0xffb0a346, 0xfeb0a43d, 0xfeb2a631, 0xffb2a540, 0xffada450,
+    0xeda29851, 0x86595328, 0x29171506, 0x452f2c1e, 0xad757140, 0xfdaea356,
+    0xffaca14f, 0xff9e9443, 0xff99903f, 0xffa9994b, 0xffb9a459, 0xffbca65d,
+    0xff243946, 0xff224456, 0xff2d282b, 0xff2d292d, 0xff362f31, 0xff41312f,
+    0xff3c2b29, 0xff3c2b2a, 0xff732a15, 0xffbc4004, 0xffea880f, 0xfff2991b,
+    0xffec910f, 0xffea850e, 0xffe9820d, 0xffe26d09, 0xffdf690b, 0xffde650d,
+    0xffda610b, 0xffe8750a, 0xffdf6b05, 0xffeb7c09, 0xffed8109, 0xffcc5602,
+    0xffd76713, 0xffda5b05, 0xffd35606, 0xffcf5109, 0xffd55a07, 0xffce5b06,
+    0xffd46009, 0xffc75105, 0xffbc4805, 0xffc44b05, 0xffe0720e, 0xffae3803,
+    0xffc0450d, 0xffaa2e03, 0xffaa2f06, 0xffa92e05, 0xffa82d02, 0xffb13607,
+    0xffb33805, 0xffae3006, 0xff9a2e0c, 0xff332625, 0xff523731, 0xff2b4a5b,
+    0xff31282a, 0xff2b2c33, 0xff2b2c30, 0xff283f4c, 0xff1f495e, 0xff16678b,
+    0xff1a6e93, 0xfe2b7286, 0xfc77a9a2, 0xaf777740, 0x43312d1d, 0x563c3b25,
+    0xd2938c49, 0xfeb0a344, 0xffb3a538, 0xffb2a62f, 0xfeb3a52b, 0xffb7a736,
+    0xffb9a64e, 0xcc8e844b, 0x4e322e15, 0x25161309, 0x7d565436, 0xe29a9452,
+    0xfdada253, 0xffaea351, 0xfeada24a, 0xffb5a351, 0xffbba35b, 0xffbea75e,
+    0xffae9c56, 0xff847b43, 0xff233b48, 0xff21465b, 0xff2c282a, 0xff2e292b,
+    0xff332c31, 0xff332828, 0xff372a29, 0xff362829, 0xff882e11, 0xffce5f0b,
+    0xffe98d15, 0xfff29719, 0xffe8850d, 0xffea810f, 0xffe57511, 0xffde670e,
+    0xffdb620b, 0xffda6c1a, 0xffd8600c, 0xffe26c08, 0xffe97d08, 0xffed8a0d,
+    0xffd96702, 0xffe78612, 0xffe8790a, 0xffdc6208, 0xffd1570c, 0xffd45406,
+    0xffc95106, 0xffd9690c, 0xffca5406, 0xffc04a05, 0xffc74f04, 0xffdc6809,
+    0xffc85809, 0xffb13505, 0xffbf410a, 0xffa82c04, 0xffaa3006, 0xffa82c03,
+    0xffac3205, 0xffb63a05, 0xffb63906, 0xffae3103, 0xff952e0f, 0xff312727,
+    0xff4e3834, 0xff294556, 0xff302b2b, 0xff2a3138, 0xff2a2a30, 0xff284351,
+    0xff1c4d64, 0xff13678c, 0xff156d94, 0xff407e7d, 0xec6da6af, 0x291c1b12,
+    0x6c4c482e, 0xeea39d58, 0xffafa444, 0xfeb3a62d, 0xfeb4a62c, 0xffb8a637,
+    0xfebaa549, 0xf3b39f59, 0xa1756b44, 0x2c1c170c, 0x3a262212, 0xb0797546,
+    0xfca8a056, 0xff988f4b, 0xfe867e42, 0xffaa9e40, 0xffb5a730, 0xffb7a63c,
+    0xffb3a04b, 0xff938846, 0xff736e3d, 0xff706b3c, 0xff263844, 0xff1e475c,
+    0xff2d282c, 0xff2e292c, 0xff312a2e, 0xff30292b, 0xff382a2a, 0xff332728,
+    0xffa14017, 0xffe07713, 0xffe78d17, 0xffea890f, 0xffec8711, 0xffe0700a,
+    0xffe17015, 0xffd65d09, 0xffdc6b20, 0xffd1590e, 0xffde6307, 0xffe67809,
+    0xfff0900e, 0xffe57707, 0xffef8d10, 0xffef8b0a, 0xffe06a05, 0xffe1730e,
+    0xffd35d0e, 0xffce5204, 0xffe27b1a, 0xffce5d09, 0xffc24c03, 0xffcf5806,
+    0xffd55d05, 0xffdd740a, 0xffb43e06, 0xffbe4009, 0xffae3204, 0xffac3206,
+    0xffa92f04, 0xffaf3305, 0xffb23905, 0xffbc3e07, 0xffb93c06, 0xffac2f04,
+    0xff8e2e10, 0xff312627, 0xff513a37, 0xff293f4f, 0xff322b2e, 0xff2b2b2f,
+    0xff2a2b2f, 0xff24495a, 0xff1d4e68, 0xff136688, 0xff15678a, 0xff4f8b87,
+    0xeb7aa7a3, 0x956d6641, 0xf3a79d55, 0xffb0a454, 0xfeb4a450, 0xffb9a644,
+    0xffbca64c, 0xffbea75a, 0xf5b39f5c, 0x7b595136, 0x25141109, 0x683f3c1f,
+    0xd8908b4c, 0xffa0984f, 0xff867f42, 0xfe6f6b39, 0xff797137, 0xffa2972f,
+    0xffb7a92c, 0xffa3982e, 0xff7a7336, 0xff6d6937, 0xff7b743d, 0xff928a47,
+    0xff26323d, 0xff20465a, 0xff2d292c, 0xff2d2a2d, 0xff30292b, 0xff30292a,
+    0xff392928, 0xff3a2825, 0xffc05a17, 0xffde7b11, 0xffe07710, 0xffe98310,
+    0xffe27408, 0xffee8b19, 0xffd95f09, 0xffd15e1b, 0xffd05711, 0xffd95a08,
+    0xffe37106, 0xfff18e0d, 0xffe87b08, 0xffe67d0b, 0xfff49814, 0xffe47705,
+    0xffe57409, 0xffe97d07, 0xffd35b04, 0xffeb7c0e, 0xffdc7115, 0xffc24b02,
+    0xffd05d05, 0xffd35705, 0xffe67e0d, 0xffc95b08, 0xffb33805, 0xffb83c05,
+    0xffaf3404, 0xffb23707, 0xffa82c04, 0xffb94206, 0xffb63b05, 0xffc14409,
+    0xffb63b05, 0xffaa2f04, 0xff8d3214, 0xff362725, 0xff523d3b, 0xff283d49,
+    0xff302e32, 0xff2c292d, 0xff2b2d32, 0xff24495d, 0xff1e4b5d, 0xff185e80,
+    0xff185873, 0xff5e9799, 0xfa95a784, 0xfbbba658, 0xffb3a356, 0xff9d904e,
+    0xffb59f58, 0xffbfa75b, 0xffbca55c, 0xffad9b4e, 0xf89c9148, 0xa96d693f,
+    0xaf6a6635, 0xf38f8741, 0xff8b8344, 0xff78723d, 0xfe6f683a, 0xff877f36,
+    0xffa69a30, 0xffb5a42e, 0xffb6a82b, 0xffa39937, 0xff7c7437, 0xff938a35,
+    0xffab9e33, 0xffb0a54d, 0xff283038, 0xff234658, 0xff2d292b, 0xff2c2a2d,
+    0xff31292b, 0xff33282b, 0xff3b2827, 0xff442822, 0xffd46b15, 0xffd2670b,
+    0xffe0710c, 0xffde6b06, 0xfff59d1b, 0xffe2710f, 0xffcb520e, 0xffcf5513,
+    0xffd35406, 0xffe16c08, 0xffed8008, 0xffe67b08, 0xffe77708, 0xffef8910,
+    0xffe77f0a, 0xffe6780a, 0xffeb8310, 0xffe77707, 0xffe5730c, 0xffe57c12,
+    0xffc74f03, 0xffd05d05, 0xffd35804, 0xffe4770a, 0xffda7409, 0xffbf490a,
+    0xffb83a06, 0xffb23504, 0xffbb3f08, 0xffad3004, 0xffac3105, 0xffb23904,
+    0xffc84a07, 0xffbe4206, 0xffb43a08, 0xffab2c05, 0xff933818, 0xff372523,
+    0xff503f3f, 0xff283c47, 0xff2d3036, 0xff2d292c, 0xff2a2e34, 0xff24485b,
+    0xff1d495c, 0xff1d546d, 0xff21566d, 0xfe65a1a9, 0xffada971, 0xffb39d57,
+    0xfe877b44, 0xfe706b3a, 0xff8c7e46, 0xffc1a95f, 0xffb39f52, 0xfe998f3f,
+    0xfe958b3c, 0xfd958d3f, 0xfe8c833e, 0xff78713a, 0xff706b3b, 0xfe7a713f,
+    0xffa29353, 0xffb5a449, 0xffb6a72f, 0xffb3a72f, 0xffb2a43c, 0xffb2a537,
+    0xffafa32f, 0xffb4a72a, 0xffb4a739, 0xffa79c4c
+};
diff --git a/src/pixman/demos/parrot.jpg b/src/pixman/demos/parrot.jpg
new file mode 100644
index 0000000..e7727f3
--- /dev/null
+++ b/src/pixman/demos/parrot.jpg
diff --git a/src/pixman/demos/quad2quad.c b/src/pixman/demos/quad2quad.c
new file mode 100644
index 0000000..66b838f
--- /dev/null
+++ b/src/pixman/demos/quad2quad.c
@@ -0,0 +1,2183 @@
+#include <math.h>
+#include <stdio.h>
+#include <pixman.h>
+
+/* This code is basically the output of Maxima translated into C.
+ *
+ * See http://maxima.sourceforge.net/
+ */
+static void
+quad_to_quad (double x0, double y0,
+	      double x1, double y1,
+	      double x2, double y2,
+	      double x3, double y3,
+
+	      double px0, double py0,
+	      double px1, double py1,
+	      double px2, double py2,
+	      double px3, double py3,
+
+	      struct pixman_f_transform *trans)
+{
+    double
+	t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18,
+	t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29, t30, t31, t32, t33, t34,
+	t35, t36, t37, t38, t39, t40, t41, t42, t43, t44, t45, t46, t47, t48, t49, t50,
+	t51, t52, t53, t54, t55, t56, t57, t58, t59, t60, t61, t62, t63, t64, t65, t66,
+	t67, t68, t69, t70, t71, t72, t73, t74, t75, t76, t77, t78, t79, t80, t81, t82,
+	t83, t84, t85, t86, t87, t88, t89, t90, t91, t92, t93, t94, t95, t96, t97, t98,
+	t99, t100, t101, t102, t103, t104, t105, t106, t107, t108, t109, t110, t111,
+	t112, t113, t114, t115, t116, t117, t118, t119, t120, t121, t122, t123,
+	t124, t125, t126, t127, t128, t129, t130, t131, t132, t133, t134, t135,
+	t136, t137, t138, t139, t140, t141, t142, t143, t144, t145, t146, t147,
+	t148, t149, t150, t151, t152, t153, t154, t155, t156, t157, t158, t159,
+	t160, t161, t162, t163, t164, t165, t166, t167, t168, t169, t170, t171,
+	t172, t173, t174, t175, t176, t177, t178, t179, t180, t181, t182, t183,
+	t184, t185, t186, t187, t188, t189, t190, t191, t192, t193, t194, t195,
+	t196, t197, t198, t199, t200, t201, t202, t203, t204, t205, t206, t207,
+	t208, t209, t210, t211, t212, t213, t214, t215, t216, t217, t218, t219,
+	t220, t221, t222, t223, t224, t225, t226, t227, t228, t229, t230, t231,
+	t232, t233, t234, t235, t236, t237, t238, t239, t240, t241, t242, t243,
+	t244, t245, t246, t247, t248, t249, t250, t251, t252, t253, t254, t255,
+	t256, t257, t258, t259, t260, t261, t262, t263, t264, t265, t266, t267,
+	t268, t269, t270, t271, t272, t273, t274, t275, t276, t277, t278, t279,
+	t280, t281, t282, t283, t284, t285, t286, t287, t288, t289, t290, t291,
+	t292, t293, t294, t295, t296, t297, t298, t299, t300, t301, t302, t303,
+	t304, t305, t306, t307, t308, t309, t310, t311, t312, t313, t314, t315,
+	t316, t317, t318, t319, t320, t321, t322, t323, t324, t325, t326, t327,
+	t328, t329, t330, t331, t332, t333, t334, t335, t336, t337, t338, t339,
+	t340, t341, t342, t343, t344, t345, t346, t347, t348, t349, t350, t351,
+	t352, t353, t354, t355, t356, t357, t358, t359, t360, t361, t362, t363,
+	t364, t365, t366, t367, t368, t369, t370, t371, t372, t373, t374, t375,
+	t376, t377, t378, t379, t380, t381, t382, t383, t384, t385, t386, t387,
+	t388, t389, t390, t391, t392, t393, t394, t395, t396, t397, t398, t399,
+	t400, t401, t402, t403, t404, t405, t406, t407, t408, t409, t410, t411,
+	t412, t413, t414, t415, t416, t417, t418, t419, t420, t421, t422, t423,
+	t424, t425, t426, t427, t428, t429, t430, t431, t432, t433, t434, t435,
+	t436, t437, t438, t439, t440, t441, t442, t443, t444, t445, t446, t447,
+	t448, t449, t450, t451, t452, t453, t454, t455, t456, t457, t458, t459,
+	t460, t461, t462, t463, t464, t465, t466, t467, t468, t469, t470, t471,
+	t472, t473, t474, t475, t476, t477, t478, t479, t480, t481, t482, t483,
+	t484, t485, t486, t487, t488, t489, t490, t491, t492, t493, t494, t495,
+	t496, t497, t498, t499, t500, t501, t502, t503, t504, t505, t506, t507,
+	t508, t509, t510, t511, t512, t513, t514, t515, t516, t517, t518, t519,
+	t520, t521, t522, t523, t524, t525, t526, t527, t528, t529, t530, t531,
+	t532, t533, t534, t535, t536, t537, t538, t539, t540, t541, t542, t543,
+	t544, t545, t546, t547, t548, t549, t550, t551, t552, t553, t554, t555,
+	t556, t557, t558, t559, t560, t561, t562, t563, t564, t565, t566, t567,
+	t568, t569, t570, t571, t572, t573, t574, t575, t576, t577, t578, t579,
+	t580, t581, t582, t583, t584, t585, t586, t587, t588, t589, t590, t591,
+	t592, t593, t594, t595, t596, t597, t598, t599, t600, t601, t602, t603,
+	t604, t605, t606, t607, t608, t609, t610, t611, t612, t613, t614, t615,
+	t616, t617, t618, t619, t620, t621, t622, t623, t624, t625, t626, t627,
+	t628, t629, t630, t631, t632, t633, t634, t635, t636, t637, t638, t639,
+	t640, t641, t642, t643, t644, t645, t646, t647, t648, t649, t650, t651,
+	t652, t653, t654, t655, t656, t657, t658, t659, t660, t661, t662, t663,
+	t664, t665, t666, t667, t668, t669, t670, t671, t672, t673, t674, t675,
+	t676, t677, t678, t679, t680, t681, t682, t683, t684, t685, t686, t687,
+	t688, t689, t690, t691, t692, t693, t694, t695, t696, t697, t698, t699,
+	t700, t701, t702, t703, t704, t705, t706, t707, t708, t709, t710, t711,
+	t712, t713, t714, t715, t716, t717, t718, t719, t720, t721, t722, t723,
+	t724, t725, t726, t727, t728, t729, t730, t731, t732, t733, t734, t735,
+	t736, t737, t738, t739, t740, t741, t742, t743, t744, t745, t746, t747,
+	t748, t749, t750, t751, t752, t753, t754, t755, t756, t757, t758, t759,
+	t760, t761, t762, t763, t764, t765, t766, t767, t768, t769, t770, t771,
+	t772, t773, t774, t775, t776, t777, t778, t779, t780, t781, t782, t783,
+	t784, t785, t786, t787, t788, t789, t790, t791, t792, t793, t794, t795,
+	t796, t797, t798, t799, t800, t801, t802, t803, t804, t805, t806, t807,
+	t808, t809, t810, t811, t812, t813, t814, t815, t816, t817, t818, t819,
+	t820, t821, t822, t823, t824, t825, t826, t827, t828, t829, t830, t831,
+	t832, t833, t834, t835, t836, t837, t838, t839, t840, t841, t842, t843,
+	t844, t845, t846, t847, t848, t849, t850, t851, t852, t853, t854, t855,
+	t856, t857, t858, t859, t860, t861, t862, t863, t864, t865, t866, t867,
+	t868, t869, t870, t871, t872, t873, t874, t875, t876, t877, t878, t879,
+	t880, t881, t882, t883, t884, t885, t886, t887, t888, t889, t890, t891,
+	t892, t893, t894, t895, t896, t897, t898, t899, t900, t901, t902, t903,
+	t904, t905, t906, t907, t908, t909, t910, t911, t912, t913, t914, t915,
+	t916, t917, t918, t919, t920, t921, t922, t923, t924, t925, t926, t927,
+	t928, t929, t930, t931, t932, t933, t934, t935, t936, t937, t938, t939,
+	t940, t941, t942, t943, t944, t945, t946, t947, t948, t949, t950, t951,
+	t952, t953, t954, t955, t956, t957, t958, t959, t960, t961, t962, t963,
+	t964, t965, t966, t967, t968, t969, t970, t971, t972, t973, t974, t975,
+	t976, t977, t978, t979, t980, t981, t982, t983, t984, t985, t986, t987,
+	t988, t989, t990, t991, t992, t993, t994, t995, t996, t997, t998, t999,
+	t1000, t1001, t1002, t1003, t1004, t1005, t1006, t1007, t1008, t1009,
+	t1010, t1011, t1012, t1013, t1014, t1015, t1016, t1017, t1018, t1019,
+	t1020, t1021, t1022, t1023, t1024, t1025, t1026, t1027, t1028, t1029,
+	t1030, t1031, t1032, t1033, t1034, t1035, t1036, t1037, t1038, t1039,
+	t1040, t1041, t1042, t1043, t1044, t1045, t1046, t1047, t1048, t1049,
+	t1050, t1051, t1052, t1053, t1054, t1055, t1056, t1057, t1058, t1059,
+	t1060, t1061, t1062, t1063, t1064, t1065, t1066, t1067, t1068, t1069,
+	t1070, t1071, t1072, t1073;
+
+    t1 = y1 * y1;
+    t2 = x3 * x3;
+    t3 = px2 * px3 * t2;
+    t4 = (t3 - px2 * px3 * x2 * x3) * y2;
+    t5 = x2 * x2;
+    t6 = px2 * px3 * t5 * y3;
+
+    t7 = - px2 * px3 * x2 * x3 * y3;
+    t8 = py1 * (t7 + t6 + t4);
+    t9 = px3 * py2 * x2 * x3;
+
+    t10 = - px3 * py2 * t2;
+    t11 = (t10 + t9) * y2;
+    t12 = - px2 * py3 * t5 * y3;
+
+    t13 = px2 * py3 * x2 * x3 * y3;
+    t14 = y0 * y0;
+    t15 = - px3 * py2;
+    t16 = px2 * py3;
+
+    t17 = t16 + t15;
+    t18 = t17 * x2;
+    t19 = px3 * py2 * x3;
+    t20 = - px2 * py3 * x3;
+
+    t21 = t20 + t19 + t18;
+    t22 = px2 * px3 * t5;
+    t23 = - 2 * px2 * px3 * x2 * x3;
+
+    t24 = py1 * (t3 + t23 + t22);
+    t25 = - px2 * py3 * t5;
+    t26 = px2 * py3 * x3;
+
+    t27 = x2 * (t26 + t19);
+    t28 = t10 + t27 + t25;
+    t29 = x1 * x1;
+    t30 = px3 * py2;
+
+    t31 = - px2 * py3;
+    t32 = t31 + t30;
+    t33 = t32 * y2;
+    t34 = - px3 * py2 * y3;
+
+    t35 = px2 * py3 * y3;
+    t36 = t35 + t34 + t33;
+    t37 = - px2 * px3 * t2;
+
+    t38 = (t37 + px2 * px3 * x2 * x3) * y2;
+    t39 = - px2 * px3 * t5 * y3;
+
+    t40 = px2 * px3 * x2 * x3 * y3;
+    t41 = py1 * (t40 + t39 + t38);
+    t42 = - px2 * py3 * x2 * x3;
+
+    t43 = px3 * py2 * t2;
+    t44 = (t43 + t42) * y2;
+    t45 = px2 * py3 * t5 * y3;
+
+    t46 = - px3 * py2 * x2 * x3 * y3;
+    t47 = (px2 * px3 * x3 - px2 * px3 * x2) * y2;
+
+    t48 = px2 * px3 * x2 * y3;
+    t49 = - px2 * px3 * x3 * y3;
+    t50 = py1 * (t49 + t48 + t47);
+
+    t51 = px2 * py3 * x2;
+    t52 = - 2 * px3 * py2 * x3;
+    t53 = (t26 + t52 + t51) * y2;
+
+    t54 = px3 * py2 * x3 * y3;
+    t55 = px3 * py2 * y3;
+    t56 = - 2 * px2 * py3 * y3;
+    t57 = t56 + t55;
+
+    t58 = x2 * t57;
+    t59 = - px2 * px3 * t5;
+    t60 = 2 * px2 * px3 * x2 * x3;
+    t61 = - px2;
+
+    t62 = px3 + t61;
+    t63 = t62 * x2;
+    t64 = px2 * x3;
+    t65 = - px3 * x3;
+    t66 = t65 + t64 + t63;
+
+    t67 = px2 * t5;
+    t68 = - px2 * x3;
+    t69 = x2 * (t65 + t68);
+    t70 = px3 * t2;
+
+    t71 = t70 + t69 + t67;
+    t72 = - px3;
+    t73 = t72 + px2;
+    t74 = - px2 * y3;
+    t75 = px3 * y3;
+
+    t76 = t75 + t74 + t73 * y2;
+    t77 = px2 * x2 * x3;
+    t78 = - px3 * t2;
+    t79 = - px2 * t5 * y3;
+
+    t80 = px3 * x2 * x3 * y3;
+    t81 = t80 + t79 + (t78 + t77) * y2;
+
+    t82 = (px2 * px3 * x2 - px2 * px3 * x3) * y2;
+    t83 = - px2 * px3 * x2 * y3;
+
+    t84 = px2 * px3 * x3 * y3;
+    t85 = - px2 * x2;
+    t86 = 2 * px3 * x3;
+    t87 = - px3 * x3 * y3;
+
+    t88 = 2 * px2 * y3;
+    t89 = - px3 * y3;
+    t90 = t89 + t88;
+    t91 = x2 * t90;
+
+    t92 = t91 + t87 + (t86 + t68 + t85) * y2;
+    t93 = px2 * py3 * t5;
+    t94 = - px3 * py2 * x3;
+
+    t95 = x2 * (t20 + t94);
+    t96 = t32 * x2;
+    t97 = t73 * x2;
+    t98 = px3 * x3;
+
+    t99 = t98 + t68 + t97;
+    t100 = py1 * t99;
+    t101 = - px2 * t5;
+    t102 = x2 * (t98 + t64);
+
+    t103 = t78 + t102 + t101;
+    t104 = py1 * t103;
+    t105 = - py2;
+    t106 = py3 + t105;
+
+    t107 = py2 * y3;
+    t108 = - py3 * y3;
+    t109 = t108 + t107 + t106 * y2;
+    t110 = - px3 * x2 * x3;
+
+    t111 = px2 * t5 * y3;
+    t112 = - px2 * x2 * x3 * y3;
+    t113 = t112 + t111 + (t70 + t110) * y2;
+
+    t114 = - py2 * x3;
+    t115 = py3 * x3;
+    t116 = t115 + t114;
+    t117 = py2 * x3 * y3;
+
+    t118 = - py3 * x3 * y3;
+    t119 = t118 + t117;
+    t120 = x2 * t119;
+
+    t121 = px1 * (t120 + x2 * t116 * y2);
+    t122 = - px3 * py2 * x2;
+    t123 = (t19 + t122) * y2;
+
+    t124 = px2 * py3 * x2 * y3;
+    t125 = - px2 * py3 * x3 * y3;
+    t126 = px3 * x2;
+
+    t127 = - px2 * x2 * y3;
+    t128 = px2 * x3 * y3;
+    t129 = t128 + t127 + (t65 + t126) * y2;
+
+    t130 = - py3;
+    t131 = t130 + py2;
+    t132 = t131 * x2;
+    t133 = py2 * x3;
+    t134 = - py3 * x3;
+
+    t135 = - py2 * x3 * y3;
+    t136 = py3 * x3 * y3;
+    t137 = - py2 * y3;
+    t138 = py3 * y3;
+
+    t139 = t138 + t137;
+    t140 = x2 * t139;
+
+    t141 = px1 * (t140 + t136 + t135 + (t134 + t133 + t132) * y2);
+    t142 = y2 * y2;
+
+    t143 = - px3 * py2 * x3 * y3;
+    t144 = px2 * py3 * x3 * y3;
+    t145 = t144 + t143;
+
+    t146 = t142 * t145;
+    t147 = y3 * y3;
+    t148 = px3 * py2 * t147;
+    t149 = - px2 * py3 * t147;
+
+    t150 = t149 + t148;
+    t151 = x2 * y2 * t150;
+    t152 = t151 + t146;
+    t153 = - px2 * py3 * y3;
+
+    t154 = t153 + t55;
+    t155 = t142 * t154;
+    t156 = - px3 * py2 * t147;
+
+    t157 = px2 * py3 * t147;
+    t158 = t157 + t156;
+    t159 = y2 * t158;
+    t160 = t159 + t155;
+
+    t161 = x0 * x0;
+    t162 = py1 * t76;
+    t163 = px1 * t109;
+    t164 = px2 * y3;
+    t165 = t89 + t164;
+
+    t166 = - px2 * t147;
+    t167 = px3 * t147;
+    t168 = t167 + t166;
+
+    t169 = y2 * t168 + t142 * t165;
+    t170 = py1 * t169;
+    t171 = py2 * t147;
+
+    t172 = - py3 * t147;
+    t173 = t172 + t171;
+    t174 = y2 * t173 + t142 * t139;
+
+    t175 = px1 * t174;
+    t176 = t17 * t142;
+    t177 = px2 * t147;
+    t178 = - px3 * t147;
+
+    t179 = t178 + t177 + t62 * t142;
+    t180 = - py2 * t147;
+    t181 = py3 * t147;
+
+    t182 = t181 + t180 + t131 * t142;
+
+    t183 = y1 * (px1 * t182 + py1 * t179 + t149 + t148 + t176)
+	+ t175 + t170 + t159 + t1 * (t163 + t162 + t35 + t34 + t33) + t155;
+
+    t184 = - px2 * px3 * t2 * t142;
+    t185 = 2 * px2 * px3 * x2 * x3 * y2 * y3;
+
+    t186 = - px2 * px3 * t5 * t147;
+    t187 = py1 * (t186 + t185 + t184);
+
+    t188 = px3 * py2 * t2 * t142;
+    t189 = x2 * y2 * (t125 + t143);
+    t190 = px2 * py3 * t5 * t147;
+
+    t191 = t190 + t189 + t188;
+    t192 = px2 * px3 * x3 * t142;
+    t193 = y2 * (t49 + t83);
+
+    t194 = px2 * px3 * x2 * t147;
+    t195 = py1 * (t194 + t193 + t192);
+
+    t196 = - px3 * py2 * x3 * t142;
+    t197 = 2 * px3 * py2 * x3 * y3;
+    t198 = 2 * px2 * py3 * y3;
+
+    t199 = t198 + t34;
+    t200 = x2 * t199;
+    t201 = y2 * (t200 + t125 + t197);
+
+    t202 = - px2 * py3 * x2 * t147;
+    t203 = - px2 * x3 * y3;
+    t204 = px3 * x3 * y3;
+
+    t205 = t204 + t203;
+    t206 = t142 * t205;
+    t207 = t178 + t177;
+    t208 = x2 * y2 * t207;
+
+    t209 = t208 + t206;
+    t210 = px2 * px3 * t2 * t142;
+    t211 = - 2 * px2 * px3 * x2 * x3 * y2 * y3;
+
+    t212 = px2 * px3 * t5 * t147;
+    t213 = - px3 * t2 * t142;
+    t214 = x2 * y2 * (t204 + t128);
+
+    t215 = - px2 * t5 * t147;
+    t216 = t215 + t214 + t213;
+    t217 = - px2 * px3 * x3 * t142;
+
+    t218 = y2 * (t84 + t48);
+    t219 = - px2 * px3 * x2 * t147;
+    t220 = px3 * x3 * t142;
+
+    t221 = - 2 * px3 * x3 * y3;
+    t222 = - 2 * px2 * y3;
+    t223 = t75 + t222;
+    t224 = x2 * t223;
+
+    t225 = y2 * (t224 + t221 + t128);
+    t226 = px2 * x2 * t147;
+    t227 = t226 + t225 + t220;
+
+    t228 = t125 + t54;
+    t229 = t142 * t228;
+    t230 = x2 * y2 * t158;
+    t231 = t87 + t128;
+
+    t232 = t142 * t231;
+    t233 = x2 * y2 * t168;
+    t234 = t233 + t232;
+    t235 = py1 * t234;
+
+    t236 = - px3 * py2 * t2 * t142;
+    t237 = x2 * y2 * (t144 + t54);
+
+    t238 = - px2 * py3 * t5 * t147;
+    t239 = px3 * t2 * t142;
+    t240 = x2 * y2 * (t87 + t203);
+
+    t241 = px2 * t5 * t147;
+    t242 = t241 + t240 + t239;
+    t243 = py1 * t242;
+
+    t244 = px2 * py3 * x3 * t142;
+    t245 = - px2 * py3 * x2 * y3;
+    t246 = y2 * (t143 + t245);
+
+    t247 = px3 * py2 * x2 * t147;
+    t248 = - px2 * x3 * t142;
+    t249 = px2 * x2 * y3;
+
+    t250 = y2 * (t204 + t249);
+    t251 = - px3 * x2 * t147;
+    t252 = t251 + t250 + t248;
+
+    t253 = t134 + t133;
+    t254 = t253 * t142;
+    t255 = t108 + t107;
+    t256 = x2 * t255;
+
+    t257 = t256 + t136 + t135;
+    t258 = y2 * t257;
+    t259 = t181 + t180;
+    t260 = x2 * t259;
+
+    t261 = px1 * (t260 + t258 + t254);
+    t262 = py1 * (t37 + t60 + t59);
+
+    t263 = t43 + t95 + t93;
+    t264 = px1 * t263;
+    t265 = t26 + t94;
+    t266 = x2 * t265 * y2;
+
+    t267 = x2 * t228;
+    t268 = t267 + t266;
+    t269 = py1 * (t84 + t83 + t82);
+
+    t270 = - 2 * px2 * py3;
+    t271 = (t26 + (t270 + t30) * x2) * y2;
+    t272 = px3 * py2 * x2 * y3;
+
+    t273 = - 2 * px3 * py2 * x3 * y3;
+    t274 = t149 + t148 + t176;
+
+    t275 = py1 * (t212 + t211 + t210);
+    t276 = t238 + t237 + t236;
+    t277 = px1 * t276;
+
+    t278 = py1 * (t219 + t218 + t217);
+    t279 = 2 * px3 * py2 * x3;
+    t280 = t20 + t279;
+
+    t281 = t280 * t142;
+    t282 = - px3 * py2 * x2 * y3;
+    t283 = y2 * (t125 + t282);
+
+    t284 = 2 * px2 * py3 * t147;
+    t285 = x2 * (t284 + t156);
+    t286 = px1 * t103;
+
+    t287 = t98 + t68;
+    t288 = x2 * t287 * y2;
+    t289 = x2 * t231;
+    t290 = t289 + t288;
+
+    t291 = 2 * px2;
+    t292 = - px3 * x2 * y3;
+    t293 = 2 * px3 * x3 * y3;
+
+    t294 = t293 + t203 + t292 + (t68 + (t72 + t291) * x2) * y2;
+    t295 = px1 * t242;
+
+    t296 = - 2 * px3 * x3;
+    t297 = t296 + t64;
+    t298 = px3 * x2 * y3;
+    t299 = y2 * (t128 + t298);
+
+    t300 = - 2 * px2 * t147;
+    t301 = x2 * (t167 + t300) + t299 + t297 * t142;
+    t302 = py1 * t71;
+
+    t303 = py1 * t290;
+    t304 = 2 * py2 * x3;
+    t305 = - 2 * py3 * x3;
+    t306 = - 2 * py2 * x3 * y3;
+
+    t307 = 2 * py3 * x3 * y3;
+    t308 = t307 + t306;
+    t309 = - 2 * px2 * py3 * x3;
+
+    t310 = (t309 + t19 + t51) * y2;
+    t311 = - 2 * px3 * py2 * y3;
+    t312 = t35 + t311;
+
+    t313 = x2 * t312;
+    t314 = 2 * px2 * x3;
+    t315 = 2 * px3 * y3;
+    t316 = t315 + t74;
+
+    t317 = x2 * t316;
+    t318 = t317 + t87 + (t65 + t314 + t85) * y2;
+    t319 = t106 * x2;
+
+    t320 = px1 * (t256 + t118 + t117 + (t115 + t114 + t319) * y2);
+    t321 = py1 * t216;
+
+    t322 = 2 * px2 * py3 * x3 * y3;
+    t323 = 2 * px3 * py2 * y3;
+    t324 = t153 + t323;
+
+    t325 = x2 * t324;
+    t326 = y2 * (t325 + t322 + t143);
+    t327 = - 2 * px2 * x3 * y3;
+
+    t328 = - 2 * px3 * y3;
+    t329 = t328 + t164;
+    t330 = x2 * t329;
+
+    t331 = y2 * (t330 + t204 + t327);
+    t332 = t226 + t331 + t220;
+    t333 = t116 * t142;
+
+    t334 = t140 + t118 + t117;
+    t335 = y2 * t334;
+    t336 = x2 * t173;
+
+    t337 = px1 * (t336 + t335 + t333);
+    t338 = t26 + t94 + t96;
+    t339 = t17 * y2;
+
+    t340 = t153 + t55 + t339;
+    t341 = px2 * px3 * t142;
+    t342 = - 2 * px2 * px3 * y2 * y3;
+
+    t343 = px2 * px3 * t147;
+    t344 = py1 * (t343 + t342 + t341);
+    t345 = - px2 * py3 * t142;
+
+    t346 = y2 * (t35 + t55);
+    t347 = t156 + t346 + t345;
+    t348 = px1 * t347 + t344;
+
+    t349 = t89 + t164 + t62 * y2;
+    t350 = - px2 * px3 * t142;
+    t351 = 2 * px2 * px3 * y2 * y3;
+
+    t352 = - px2 * px3 * t147;
+    t353 = px2 * t142;
+    t354 = y2 * (t89 + t74);
+
+    t355 = t167 + t354 + t353;
+    t356 = px1 * t355 + t352 + t351 + t350;
+    t357 = py1 * t66;
+
+    t358 = py1 * t349;
+    t359 = 2 * py2;
+    t360 = - 2 * py3;
+    t361 = - 2 * py2 * y3;
+
+    t362 = 2 * py3 * y3;
+    t363 = px3 * py2 * t142;
+    t364 = y2 * (t153 + t34);
+
+    t365 = - px3 * t142;
+    t366 = y2 * (t75 + t164);
+    t367 = t166 + t366 + t365;
+
+    t368 = py1 * t367;
+    t369 = px1 * (t172 + t171 + t106 * t142);
+    t370 = t35 + t34;
+
+    t371 = t142 * t370;
+    t372 = y2 * t150;
+    t373 = t372 + t371;
+    t374 = t230 + t229;
+
+    t375 = py1 * (t352 + t351 + t350);
+    t376 = t157 + t364 + t363;
+    t377 = px1 * t376 + t375;
+
+    t378 = t75 + t74;
+    t379 = y2 * t207 + t142 * t378;
+    t380 = px1 * t367 + t343 + t342 + t341;
+
+    t381 = py1 * t209;
+    t382 = py1 * t355;
+    t383 = py1 * t379;
+    t384 = 2 * py2 * y3;
+
+    t385 = - 2 * py3 * y3;
+    t386 = t385 + t384;
+    t387 = - 2 * py2 * t147;
+    t388 = 2 * py3 * t147;
+
+    t389 = px2 * py3 * t2;
+    t390 = t389 + t10;
+    t391 = x2 * t390 * y2;
+    t392 = t5 * t228;
+
+    t393 = - px2 * t2;
+    t394 = t70 + t393;
+    t395 = x2 * t394 * y2;
+    t396 = t5 * t231;
+
+    t397 = t396 + t395;
+    t398 = py1 * t397;
+    t399 = py2 * t2;
+    t400 = - py3 * t2;
+
+    t401 = t400 + t399;
+    t402 = x2 * t401 * y2;
+    t403 = t136 + t135;
+    t404 = t5 * t403;
+
+    t405 = t404 + t402;
+    t406 = px1 * t405;
+    t407 = t1 * (t406 + t398 + t392 + t391);
+
+    t408 = t65 + t64;
+    t409 = t5 * t408;
+    t410 = x2 * t394;
+    t411 = t410 + t409;
+
+    t412 = py1 * t411;
+    t413 = t5 * t116;
+    t414 = x2 * t401;
+    t415 = t414 + t413;
+
+    t416 = px1 * t415;
+    t417 = py2 * t5;
+    t418 = x2 * (t134 + t114);
+    t419 = py3 * t2;
+
+    t420 = t419 + t418 + t417;
+    t421 = px1 * t420;
+    t422 = t265 * y2;
+    t423 = x2 * t154;
+
+    t424 = px2 * x2;
+    t425 = (t68 + t424) * y2;
+    t426 = - py2 * x2;
+    t427 = (t133 + t426) * y2;
+
+    t428 = py3 * x2 * y3;
+    t429 = t20 + t19;
+    t430 = x2 * t429;
+    t431 = - px2 * py3 * t2;
+
+    t432 = (t431 + t43 + t430) * y2;
+    t433 = t5 * t370;
+    t434 = x2 * t145;
+
+    t435 = - px2 * x2 * x3;
+    t436 = px2 * t2;
+    t437 = (t436 + t435) * y2;
+    t438 = px3 * t5 * y3;
+
+    t439 = - px3 * x2 * x3 * y3;
+    t440 = py2 * x2 * x3;
+    t441 = - py2 * t2;
+
+    t442 = (t441 + t440) * y2;
+    t443 = - py3 * t5 * y3;
+    t444 = py3 * x2 * x3 * y3;
+
+    t445 = t5 * t287;
+    t446 = t78 + t436;
+    t447 = x2 * t446;
+    t448 = - t2;
+
+    t449 = t448 + 2 * x2 * x3 - t5;
+    t450 = px1 * t449;
+    t451 = (t98 + t85) * y2;
+    t452 = - x2 * y3;
+
+    t453 = x3 * y3;
+    t454 = t453 + t452 + (x2 - x3) * y2;
+    t455 = px1 * t454;
+    t456 = t65 + t314;
+
+    t457 = x2 * t456;
+    t458 = (t78 + t457) * y2;
+    t459 = x2 * (t293 + t203);
+
+    t460 = - x2 * x3 * y3 + t5 * y3 + (t2 - x2 * x3) * y2;
+    t461 = px1 * t460;
+    t462 = t5 * t253;
+
+    t463 = t419 + t441;
+    t464 = x2 * t463;
+    t465 = - py2 * t5;
+    t466 = x2 * (t115 + t133);
+
+    t467 = t2 - 2 * x2 * x3 + t5;
+    t468 = py1 * t467;
+    t469 = py2 * x2;
+    t470 = (t134 + t469) * y2;
+
+    t471 = - py2 * x2 * y3;
+    t472 = x2 * y3;
+    t473 = - x3 * y3;
+    t474 = t473 + t472 + (x3 - x2) * y2;
+
+    t475 = py1 * t474;
+    t476 = - 2 * py2 * x3;
+    t477 = t115 + t476;
+    t478 = x2 * t477;
+
+    t479 = (t419 + t478) * y2;
+    t480 = py2 * t5 * y3;
+    t481 = - 2 * py3 * x3 * y3;
+
+    t482 = x2 * (t481 + t117);
+    t483 = x2 * x3 * y3 - t5 * y3 + (t448 + x2 * x3) * y2;
+
+    t484 = py1 * t483;
+    t485 = t431 + t43;
+    t486 = t485 * t142;
+    t487 = t5 * t158;
+
+    t488 = t446 * t142;
+    t489 = t5 * t168;
+    t490 = t489 + t488;
+    t491 = py1 * t490;
+
+    t492 = t463 * t142;
+    t493 = t5 * t173;
+    t494 = t493 + t492;
+    t495 = px1 * t494;
+
+    t496 = x1 * y1 * (t495 + t491 + t487 + t486);
+    t497 = t142 * t119;
+    t498 = x2 * y2 * t259;
+
+    t499 = t498 + t497;
+    t500 = px1 * t499;
+    t501 = t29 * (t500 + t381 + t151 + t146);
+
+    t502 = t429 * t142;
+    t503 = x2 * t370;
+    t504 = y2 * (t503 + t125 + t54);
+    t505 = x2 * t158;
+
+    t506 = - px3 * x3 * t142;
+    t507 = - px2 * x2 * t147;
+    t508 = py3 * x3 * t142;
+
+    t509 = y2 * (t118 + t471);
+    t510 = py2 * x2 * t147;
+    t511 = - py2 * t142;
+
+    t512 = y2 * (t138 + t107);
+    t513 = t172 + t512 + t511;
+    t514 = px1 * t513;
+
+    t515 = y2 * t259 + t142 * t255;
+    t516 = px1 * t515;
+    t517 = py1 * t454;
+
+    t518 = - py2 * x3 * t142;
+    t519 = t108 + t384;
+    t520 = x2 * t519;
+
+    t521 = y2 * (t520 + t307 + t135);
+    t522 = - py3 * x2 * t147;
+    t523 = py2 * t142;
+
+    t524 = y2 * (t108 + t137);
+    t525 = - t147 + 2 * y2 * y3 - t142;
+    t526 = py1 * t525;
+
+    t527 = x2 * t147 + y2 * (t473 + t452) + x3 * t142;
+    t528 = py1 * t527;
+    t529 = px1 * t474;
+
+    t530 = px2 * x3 * t142;
+    t531 = px3 * x2 * t147;
+
+    t532 = - x2 * t147 + y2 * (t453 + t472) - x3 * t142;
+    t533 = px1 * t532;
+
+    t534 = - px2 * t142;
+    t535 = t147 - 2 * y2 * y3 + t142;
+    t536 = px1 * t535;
+
+    t537 = t447 + t445;
+    t538 = py1 * t537;
+    t539 = t464 + t462;
+    t540 = px1 * t539;
+
+    t541 = 2 * px3 * py2 * t2;
+    t542 = - 2 * px2 * py3 * t2;
+    t543 = x2 * t446 * y2;
+
+    t544 = t5 * t205;
+    t545 = t544 + t543;
+    t546 = py1 * t545;
+    t547 = x2 * t463 * y2;
+
+    t548 = t5 * t119;
+    t549 = t548 + t547;
+    t550 = px1 * t549;
+    t551 = x2 * t265;
+
+    t552 = (t389 + t10 + t551) * y2;
+    t553 = t5 * t154;
+    t554 = 2 * px3 * t2;
+
+    t555 = (t554 + t393 + t110) * y2;
+    t556 = t5 * t90;
+    t557 = py3 * x2 * x3;
+
+    t558 = - 2 * py3 * t2;
+    t559 = (t558 + t399 + t557) * y2;
+    t560 = py2 * x2 * x3 * y3;
+
+    t561 = t138 + t361;
+    t562 = t5 * t561;
+    t563 = t390 * t142;
+    t564 = t5 * t150;
+
+    t565 = - px2 * t2 * t142;
+    t566 = - px3 * t5 * t147;
+    t567 = t566 + t214 + t565;
+
+    t568 = py1 * t567;
+    t569 = py2 * t2 * t142;
+    t570 = x2 * y2 * (t118 + t135);
+
+    t571 = py3 * t5 * t147;
+    t572 = t571 + t570 + t569;
+    t573 = px1 * t572;
+    t574 = t86 + t68;
+
+    t575 = x2 * t574;
+    t576 = (t78 + t575) * y2;
+    t577 = 2 * px2 * x3 * y3;
+
+    t578 = x2 * (t87 + t577);
+    t579 = px1 * t527;
+
+    t580 = - t5 * t147 + 2 * x2 * x3 * y2 * y3 - t2 * t142;
+    t581 = px1 * t580;
+    t582 = t305 + t133;
+
+    t583 = x2 * t582;
+    t584 = (t419 + t583) * y2;
+    t585 = x2 * (t136 + t306);
+
+    t586 = py1 * t532;
+    t587 = - py3 * t2 * t142;
+    t588 = x2 * y2 * (t136 + t117);
+
+    t589 = - py2 * t5 * t147;
+    t590 = t5 * t147 - 2 * x2 * x3 * y2 * y3 + t2 * t142;
+
+    t591 = py1 * t590;
+    t592 = t400 + t466 + t465;
+    t593 = px1 * t592;
+    t594 = t309 + t279;
+
+    t595 = t198 + t311;
+    t596 = x2 * t378;
+    t597 = t596 + t408 * y2;
+    t598 = py1 * t597;
+
+    t599 = t256 + t116 * y2;
+    t600 = px1 * t599;
+    t601 = t178 + t366 + t534;
+
+    t602 = py1 * t601;
+    t603 = t181 + t524 + t523;
+    t604 = px1 * t603;
+    t605 = t265 * t142;
+
+    t606 = t423 + t144 + t143;
+    t607 = y2 * t606;
+    t608 = x2 * t150;
+    t609 = 2 * py2 * x3 * y3;
+
+    t610 = t362 + t137;
+    t611 = x2 * t610;
+    t612 = y2 * (t611 + t118 + t609);
+
+    t613 = py1 * t449;
+    t614 = t419 + t613 + t418 + t417;
+    t615 = py1 * t460;
+
+    t616 = py1 * t535;
+    t617 = t616 + t172 + t512 + t511;
+    t618 = t134 + t304;
+
+    t619 = t618 * t142;
+    t620 = - py3 * x2 * y3;
+    t621 = y2 * (t135 + t620);
+
+    t622 = x2 * (t388 + t180);
+    t623 = px1 * t467;
+    t624 = t623 + t78 + t102 + t101;
+
+    t625 = px1 * t483;
+    t626 = px1 * t525;
+    t627 = t167 + t626 + t354 + t353;
+
+    t628 = - 2 * px2 * x3;
+    t629 = t98 + t628;
+    t630 = t629 * t142;
+    t631 = - 2 * px3 * t147;
+
+    t632 = x2 * (t631 + t177);
+    t633 = - 2 * px2 * py3 * x3 * y3;
+    t634 = t633 + t197;
+
+    t635 = - 2 * px3 * py2 * t147;
+    t636 = t142 * t403;
+    t637 = x2 * y2 * t173;
+
+    t638 = t637 + t636;
+    t639 = px1 * t638;
+    t640 = t589 + t588 + t587;
+    t641 = px1 * t640;
+
+    t642 = px1 * t590;
+    t643 = py1 * t580;
+
+    t644 = (x0 * (px0 * (y1 * (x1 * (t528 + t522 + t612 + t518)
+			       + t643 + t571 + t570 + t569)
+			 + t29 * t515 + x1 * t638 + t1 * (t615 + t444 + t443 + t442))
+		  + py0 * (y1 * (x1 * (t533 + t531 + t331 + t530)
+				 + t642 + t566 + t214 + t565)
+			   + x1 * t234 + t29 * t379 + t1 * (t625 + t439 + t438 + t437))
+		  + y1 * (x1 * (px1 * (t622 + t621 + t619) + py1 * (t632 + t299 + t630)
+				+ t608 + t607 + t605)
+			  + t641 + t243 + t564 + t563)
+		  + x1 * (t639 + t235 + x2 * y2 * (t284 + t635) + t142 * t634)
+		  + t29 * (t175 + t170)
+		  + t1 * (px1 * (t482 + t480 + t479) + py1 * (t459 + t79 + t458) + t434
+			  + t433 + t432))
+	    + y0 * (x0 * (py0 * (x1 * (t579 + t632 + t299 + t630)
+				 + t489 + t29 * t627
+				 + y1 * (x1 * t597 + t625 + t556 + t112 + t555) + t488
+				 + t624 * t1)
+			  + px0 * (x1 * (t586 + t622 + t621 + t619)
+				   + t29 * t617 + t493
+				   + y1 * (x1 * t599 + t615 + t562 + t560 + t559) + t492
+				   + t614 * t1)
+			  + x1 * (px1 * (t522 + t612 + t518) + py1 * (t531 + t331 + t530)
+				  + t608 + t607 + t605)
+			  + t29 * (t604 + t602) + t487
+			  + y1 * (x1 * (t600 + t598 + x2 * t595 + t594 * y2)
+				  + px1 * (t585 + t480 + t584) + py1 * (t578 + t79 + t576) + t267
+				  + t553 + t552) + t486 + (t593 + t302) * t1)
+		    + px0 * (x1 * (t591 + t589 + t588 + t587)
+			     + t29 * (t586 + t510 + t509 + t508)
+			     + y1 * (x1 * (t484 + t585 + t480 + t584) + t548 + t547) + t415 * t1)
+		    + py0 * (x1 * (t581 + t241 + t240 + t239)
+			     + t29 * (t579 + t507 + t250 + t506)
+			     + y1 * (x1 * (t461 + t578 + t79 + t576) + t544 + t543) + t411 * t1)
+		    + x1 * (t573 + t568 + t564 + t563)
+		    + t29 * (px1 * (t522 + t521 + t518) + py1 * (t531 + t225 + t530) + t505
+			     + t504 + t502)
+		    + y1 * (x1 * (px1 * (t562 + t560 + t559) + py1 * (t556 + t112 + t555)
+				  + t267 + t553 + t552)
+			    + t550 + t546 + t5 * (t322 + t273) + x2 * (t542 + t541) * y2)
+		    + (t540 + t538) * t1)
+	    + t161 * (py0 * (y1 * (x1 * (t536 + t178 + t366 + t534)
+				   + t533 + t531 + t225 + t530)
+			     + x1 * t169 + t208 + t1 * (t529 + t204 + t292 + t425) + t206)
+		      + px0 * (y1 * (t528 + x1 * (t181 + t526 + t524 + t523) + t522 + t521
+				     + t518)
+			       + x1 * t174 + t498 + t1 * (t517 + t118 + t428 + t427) + t497)
+		      + x1 * (t516 + t383)
+		      + y1 * (x1 * (t514 + t382) + px1 * (t510 + t509 + t508)
+			      + py1 * (t507 + t250 + t506) + t505 + t504
+			      + t502) + t151
+		      + t1 * (px1 * (t136 + t471 + t470) + py1 * (t87 + t249 + t451) + t423
+			      + t422) + t146) + t501 + t496
+	    + t14 * (px0 * (x1 * (t484 + t482 + t480 + t479)
+			    + t29 * (t475 + t136 + t471 + t470) + t404 + t402
+			    + (x1 * (t468 + t400 + t466 + t465) + t464 + t462) * y1)
+		     + py0 * (x1 * (t461 + t459 + t79 + t458)
+			      + t29 * (t455 + t87 + t249 + t451) + t396 + t395
+			      + (x1 * (t70 + t450 + t69 + t67) + t447 + t445) * y1)
+		     + x1 * (px1 * (t444 + t443 + t442) + py1 * (t439 + t438 + t437) + t434
+			     + t433 + t432)
+		     + t29 * (px1 * (t118 + t428 + t427) + py1 * (t204 + t292 + t425) + t423
+			      + t422) + t392 + t391
+		     + (x1 * (t421 + t104) + t416 + t412) * y1) + t407);
+    t645 = t5 * t265;
+
+    t646 = t115 + t114 + t132;
+    t647 = px1 * t646;
+    t648 = x2 * t485;
+    t649 = t32 * t5;
+
+    t650 = t70 + t393 + t73 * t5;
+    t651 = t400 + t399 + t106 * t5;
+
+    t652 = t540 + x1 * (px1 * t651 + py1 * t650 + t389 + t10 + t649) + t538 + t648
+	+ t29 * (t647 + t357 + t20 + t19 + t18) + t645;
+    t653 = t648 + t645;
+
+    t654 = t392 + t391;
+    t655 = px1 * t654;
+    t656 = t309 + t19;
+    t657 = x2 * t656;
+
+    t658 = (t389 + t657) * y2;
+    t659 = px3 * py2 * t5 * y3;
+    t660 = x2 * (t144 + t273);
+
+    t661 = - px3 * py2 * t5;
+    t662 = t431 + t27 + t661;
+    t663 = px1 * t662 + t24;
+
+    t664 = t5 * t429;
+    t665 = x2 * t390;
+    t666 = t665 + t664;
+    t667 = px3 * py2 * x2;
+
+    t668 = (t20 + t667) * y2;
+    t669 = x2 * t485 * y2;
+    t670 = t5 * t145;
+    t671 = t670 + t669;
+
+    t672 = px1 * t671;
+    t673 = t26 + t52;
+    t674 = x2 * t673;
+    t675 = (t389 + t674) * y2;
+
+    t676 = x2 * (t633 + t54);
+    t677 = px3 * t5;
+    t678 = t436 + t69 + t677;
+
+    t679 = px1 * t678 + t37 + t60 + t59;
+    t680 = - px3 * x2;
+
+    t681 = t203 + t298 + (t64 + t680) * y2;
+    t682 = px1 * t545;
+    t683 = - px3 * t5 * y3;
+
+    t684 = t578 + t683 + (t393 + t575) * y2;
+    t685 = 2 * py3 * x3;
+    t686 = t685 + t476;
+
+    t687 = 2 * py2 * t2;
+    t688 = px1 * (t419 + t441 + t131 * t5);
+    t689 = - px2 * py3 * x2;
+
+    t690 = 2 * px2 * py3 * x3;
+    t691 = (t690 + t94 + t689) * y2;
+
+    t692 = t330 + t204 + (t98 + t628 + t424) * y2;
+    t693 = t134 + t133 + t319;
+
+    t694 = px1 * (t140 + t118 + t117 + t693 * y2);
+    t695 = (t542 + t43 + t9) * y2;
+
+    t696 = t5 * t312;
+    t697 = 2 * px2 * t2;
+    t698 = t5 * t316 + t112 + (t78 + t697 + t110) * y2;
+
+    t699 = x2 * t253;
+    t700 = t5 * t255;
+    t701 = x2 * t403;
+
+    t702 = px1 * (t701 + t700 + (t419 + t441 + t699) * y2);
+    t703 = px2 * py3 * x2 * x3;
+
+    t704 = (t10 + t703) * y2;
+    t705 = px3 * py2 * x2 * x3 * y3;
+    t706 = (t20 + t279 + t689) * y2;
+
+    t707 = t439 + t111 + (t70 + t435) * y2;
+    t708 = t224 + t204 + (t296 + t64 + t424) * y2;
+
+    t709 = - 2 * py2;
+    t710 = 2 * py3;
+    t711 = py1 * t678;
+
+    t712 = t459 + t683 + (t393 + t457) * y2;
+    t713 = x2 * t116;
+    t714 = t5 * t139;
+
+    t715 = px1 * (t120 + t714 + (t400 + t399 + t713) * y2);
+    t716 = 2 * px2 * py3;
+
+    t717 = (t94 + (t716 + t15) * x2) * y2;
+    t718 = - 2 * px2;
+
+    t719 = t221 + t128 + t249 + (t98 + (px3 + t718) * x2) * y2;
+
+    t720 = px1 * (t256 + t136 + t135 + t646 * y2);
+    t721 = - px2 * py3 * t2 * t142;
+
+    t722 = - px3 * py2 * t5 * t147;
+    t723 = t722 + t237 + t721;
+    t724 = - px2 * py3 * x3 * t142;
+
+    t725 = y2 * (t54 + t124);
+    t726 = px1 * y2 * t257;
+    t727 = - px3 * py2 * x2 * t147;
+
+    t728 = y2 * (t87 + t127);
+    t729 = t531 + t728 + t530;
+    t730 = px2 * py3 * t2 * t142;
+
+    t731 = px3 * py2 * t5 * t147;
+    t732 = px1 * t397;
+    t733 = t251 + t299 + t248;
+
+    t734 = px2 * t2 * t142;
+    t735 = px3 * t5 * t147;
+    t736 = t735 + t240 + t734;
+
+    t737 = t389 + t10 + t649;
+    t738 = t731 + t189 + t730;
+    t739 = px1 * t738;
+
+    t740 = x2 * t165;
+    t741 = t740 + t204 + t203;
+    t742 = py1 * y2 * t741;
+    t743 = py1 * t736;
+
+    t744 = px2 * py3 * t142;
+    t745 = px1 * t567;
+    t746 = t148 + t364 + t744;
+
+    t747 = px3 * py2 * t5;
+    t748 = t389 + t95 + t747;
+    t749 = (t26 + t122) * y2;
+
+    t750 = x2 * t280;
+    t751 = (t431 + t750) * y2;
+    t752 = - px3 * py2 * t5 * y3;
+
+    t753 = x2 * (t322 + t143);
+    t754 = - px3 * t5;
+    t755 = t393 + t102 + t754;
+
+    t756 = t128 + t292 + (t68 + t126) * y2;
+    t757 = x2 * t297;
+    t758 = x2 * (t204 + t327);
+
+    t759 = t758 + t438 + (t436 + t757) * y2;
+    t760 = (t94 + t667) * y2;
+
+    t761 = t203 + t249 + (t98 + t680) * y2;
+    t762 = px1 * (t140 + t253 * y2);
+
+    t763 = - px3 * py2 * x2 * x3;
+    t764 = (t43 + t763) * y2;
+    t765 = - px2 * py3 * x2 * x3 * y3;
+
+    t766 = px3 * x2 * x3;
+    t767 = px2 * x2 * x3 * y3;
+    t768 = t767 + t79 + (t78 + t766) * y2;
+
+    t769 = px1 * (t120 + t700 + (t419 + t441 + t713) * y2);
+    t770 = t501 + t496 + t407;
+
+    t771 = px3 * py2 * x3 * t142;
+    t772 = y2 * (t313 + t633 + t54);
+
+    t773 = px2 * py3 * x2 * t147;
+    t774 = - px3 * py2 * t142;
+    t775 = t149 + t346 + t774;
+
+    t776 = y2 * (t317 + t87 + t577);
+    t777 = t507 + t776 + t506;
+    t778 = px3 * t142;
+
+    t779 = t177 + t354 + t778;
+    t780 = y2 * (t144 + t272);
+    t781 = y2 * (t203 + t292);
+
+    t782 = t531 + t781 + t530;
+    t783 = px1 * (t336 + t258 + t333);
+    t784 = t690 + t94;
+
+    t785 = x2 * t784;
+    t786 = (t431 + t785) * y2;
+    t787 = x2 * (t125 + t197);
+
+    t788 = x2 * t629;
+    t789 = x2 * (t221 + t128);
+    t790 = t789 + t438 + (t436 + t788) * y2;
+
+    t791 = - 2 * py2 * t2;
+    t792 = 2 * py3 * t2;
+    t793 = 2 * px2 * py3 * t2;
+
+    t794 = (t793 + t10 + t42) * y2;
+    t795 = t5 * t324;
+    t796 = - 2 * px2 * t2;
+
+    t797 = t5 * t329 + t80 + (t70 + t796 + t77) * y2;
+
+    t798 = px1 * (t701 + t714 + (t400 + t399 + t699) * y2);
+
+    t799 = px1 * (t5 * t259 + t401 * t142);
+    t800 = t429 * y2;
+    t801 = t503 + t800;
+
+    t802 = t487 + t486;
+    t803 = t673 * t142;
+    t804 = - 2 * px2 * py3 * t147;
+
+    t805 = x2 * (t804 + t148);
+    t806 = 2 * px2 * t147;
+
+    t807 = x2 * (t178 + t806) + t728 + t574 * t142;
+    t808 = py1 * t755;
+    t809 = py1 * t779;
+
+    t810 = y2 * (t58 + t144 + t273);
+    t811 = y2 * (t91 + t293 + t203);
+
+    t812 = t507 + t811 + t506;
+    t813 = px1 * (t260 + t335 + t254);
+    t814 = 2 * py2 * t147;
+
+    t815 = - 2 * py3 * t147;
+    t816 = (t389 + t42) * y2;
+    t817 = - py2 * py3 * t2;
+
+    t818 = (t817 + py2 * py3 * x2 * x3) * y2;
+    t819 = - py2 * py3 * t5 * y3;
+
+    t820 = py2 * py3 * x2 * x3 * y3;
+    t821 = px1 * (t820 + t819 + t818);
+    t822 = - py2 * py3 * t5;
+
+    t823 = 2 * py2 * py3 * x2 * x3;
+    t824 = px1 * (t817 + t823 + t822);
+    t825 = (t431 + t9) * y2;
+
+    t826 = py2 * py3 * t2;
+    t827 = (t826 - py2 * py3 * x2 * x3) * y2;
+    t828 = py2 * py3 * t5 * y3;
+
+    t829 = - py2 * py3 * x2 * x3 * y3;
+    t830 = px1 * (t829 + t828 + t827);
+
+    t831 = (py2 * py3 * x2 - py2 * py3 * x3) * y2;
+    t832 = - py2 * py3 * x2 * y3;
+
+    t833 = py2 * py3 * x3 * y3;
+    t834 = px1 * (t833 + t832 + t831);
+
+    t835 = (t690 + t94 + t122) * y2;
+    t836 = px1 * t693;
+    t837 = - py2 * t5 * y3;
+
+    t838 = t560 + t837 + (t400 + t557) * y2;
+    t839 = x2 * t205;
+
+    t840 = py1 * (t839 + x2 * t408 * y2);
+    t841 = (t20 + t51) * y2;
+    t842 = - py3 * x2;
+
+    t843 = py2 * x2 * y3;
+    t844 = t135 + t843 + (t115 + t842) * y2;
+
+    t845 = py1 * (t740 + t87 + t128 + (t98 + t68 + t63) * y2);
+    t846 = py2 * py3 * t5;
+
+    t847 = - 2 * py2 * py3 * x2 * x3;
+    t848 = - py2 * x2 * x3;
+    t849 = - py3 * x2 * x3 * y3;
+
+    t850 = t849 + t480 + (t419 + t848) * y2;
+    t851 = (py2 * py3 * x3 - py2 * py3 * x2) * y2;
+
+    t852 = py2 * py3 * x2 * y3;
+    t853 = - py2 * py3 * x3 * y3;
+    t854 = x2 * t561;
+
+    t855 = t854 + t136 + (t305 + t133 + t469) * y2;
+    t856 = py2 * py3 * t2 * t142;
+
+    t857 = - 2 * py2 * py3 * x2 * x3 * y2 * y3;
+    t858 = py2 * py3 * t5 * t147;
+
+    t859 = px1 * (t858 + t857 + t856);
+    t860 = - py2 * py3 * x3 * t142;
+
+    t861 = y2 * (t833 + t852);
+    t862 = - py2 * py3 * x2 * t147;
+
+    t863 = px1 * (t862 + t861 + t860);
+    t864 = - py2 * py3 * t2 * t142;
+
+    t865 = 2 * py2 * py3 * x2 * x3 * y2 * y3;
+    t866 = - py2 * py3 * t5 * t147;
+
+    t867 = py3 * t2 * t142;
+    t868 = py2 * t5 * t147;
+    t869 = t868 + t570 + t867;
+
+    t870 = py2 * py3 * x3 * t142;
+    t871 = y2 * (t853 + t832);
+    t872 = py2 * py3 * x2 * t147;
+
+    t873 = - py3 * x3 * t142;
+    t874 = - py2 * x2 * t147;
+    t875 = t874 + t521 + t873;
+
+    t876 = py2 * x3 * t142;
+    t877 = py3 * x2 * t147;
+    t878 = t877 + t509 + t876;
+
+    t879 = t287 * t142;
+    t880 = t596 + t87 + t128;
+    t881 = y2 * t880;
+    t882 = x2 * t207;
+
+    t883 = py1 * (t882 + t881 + t879);
+    t884 = py1 * t662;
+
+    t885 = px1 * (t826 + t847 + t846);
+    t886 = 2 * px3 * py2;
+
+    t887 = (t94 + (t31 + t886) * x2) * y2;
+    t888 = px1 * (t853 + t852 + t851);
+
+    t889 = py1 * t738;
+    t890 = px1 * (t866 + t865 + t864);
+
+    t891 = px1 * (t872 + t871 + t870);
+    t892 = t656 * t142;
+    t893 = x2 * (t157 + t635);
+
+    t894 = t221 + t577;
+    t895 = x2 * t253 * y2;
+    t896 = t701 + t895;
+    t897 = px1 * t896;
+
+    t898 = (t20 + t279 + t122) * y2;
+
+    t899 = py1 * (t596 + t204 + t203 + (t65 + t64 + t97) * y2);
+    t900 = t385 + t107;
+
+    t901 = x2 * t900;
+    t902 = t901 + t136 + (t115 + t476 + t469) * y2;
+    t903 = px1 * t869;
+
+    t904 = t874 + t612 + t873;
+    t905 = t408 * t142;
+    t906 = y2 * t741;
+    t907 = x2 * t168;
+
+    t908 = py1 * (t907 + t906 + t905);
+    t909 = - py2 * py3 * t142;
+
+    t910 = 2 * py2 * py3 * y2 * y3;
+    t911 = - py2 * py3 * t147;
+
+    t912 = px1 * (t911 + t910 + t909);
+    t913 = t912 + py1 * t376;
+
+    t914 = t481 + t117 + t428 + (t133 + (py3 + t709) * x2) * y2;
+    t915 = 2 * px3;
+
+    t916 = t138 + t137 + t131 * y2;
+    t917 = px1 * t916;
+
+    t918 = py1 * (t167 + t166 + t73 * t142);
+    t919 = py3 * t142;
+    t920 = t171 + t524 + t919;
+
+    t921 = px1 * t920;
+    t922 = py2 * py3 * t142;
+    t923 = - 2 * py2 * py3 * y2 * y3;
+
+    t924 = py2 * py3 * t147;
+    t925 = py1 * t513 + t924 + t923 + t922;
+    t926 = py1 * t420;
+
+    t927 = py1 * t640;
+    t928 = t685 + t114;
+    t929 = x2 * (t172 + t814) + t621 + t928 * t142;
+
+    t930 = px1 * (t924 + t923 + t922);
+    t931 = t930 + py1 * t347;
+
+    t932 = py1 * t920 + t911 + t910 + t909;
+    t933 = t315 + t222;
+    t934 = py1 * t654;
+
+    t935 = (t10 + t750) * y2;
+    t936 = t824 + py1 * t263;
+    t937 = py1 * t671;
+
+    t938 = (t19 + t689) * y2;
+    t939 = (t10 + t785) * y2;
+    t940 = t296 + t314;
+
+    t941 = py1 * (t78 + t436 + t62 * t5);
+    t942 = (t26 + t52 + t667) * y2;
+
+    t943 = py1 * (t740 + t204 + t203 + t99 * y2);
+
+    t944 = t611 + t118 + (t134 + t304 + t426) * y2;
+    t945 = (t431 + t541 + t42) * y2;
+
+    t946 = t5 * t199;
+    t947 = t5 * t900 + t560 + (t419 + t791 + t557) * y2;
+    t948 = x2 * t287;
+
+    t949 = t5 * t378;
+    t950 = py1 * (t289 + t949 + (t78 + t436 + t948) * y2);
+
+    t951 = - py3 * t5;
+    t952 = t441 + t466 + t951;
+    t953 = py1 * t952 + t826 + t847 + t846;
+
+    t954 = py3 * x2;
+    t955 = t117 + t620 + (t114 + t954) * y2;
+    t956 = py1 * t549;
+
+    t957 = py3 * t5 * y3;
+    t958 = t585 + t957 + (t399 + t583) * y2;
+    t959 = (t389 + t763) * y2;
+
+    t960 = (t309 + t19 + t667) * y2;
+    t961 = - 2 * px3;
+    t962 = px1 * t952;
+    t963 = x2 * t408;
+
+    t964 = t5 * t165;
+    t965 = py1 * (t839 + t964 + (t70 + t393 + t963) * y2);
+
+    t966 = t482 + t957 + (t399 + t478) * y2;
+    t967 = - 2 * px3 * py2;
+
+    t968 = (t26 + (t16 + t967) * x2) * y2;
+
+    t969 = t307 + t135 + t471 + (t134 + (t130 + t359) * x2) * y2;
+
+    t970 = py1 * (t596 + t87 + t128 + t66 * y2);
+    t971 = t444 + t837 + (t400 + t440) * y2;
+
+    t972 = t520 + t118 + (t685 + t114 + t426) * y2;
+    t973 = py1 * t405;
+
+    t974 = t877 + t621 + t876;
+    t975 = - py2 * t2 * t142;
+    t976 = - py3 * t5 * t147;
+
+    t977 = t976 + t588 + t975;
+    t978 = py1 * y2 * t880;
+    t979 = y2 * (t136 + t843);
+
+    t980 = t522 + t979 + t518;
+    t981 = py1 * t276;
+    t982 = py1 * t572;
+    t983 = px1 * y2 * t334;
+
+    t984 = px1 * t977;
+    t985 = (t94 + t51) * y2;
+    t986 = (t43 + t657) * y2;
+
+    t987 = (t26 + t689) * y2;
+    t988 = t117 + t471 + (t134 + t954) * y2;
+
+    t989 = py1 * (t740 + t287 * y2);
+    t990 = (t431 + t703) * y2;
+    t991 = - py3 * x2 * x3;
+
+    t992 = - py2 * x2 * x3 * y3;
+    t993 = t992 + t480 + (t419 + t991) * y2;
+
+    t994 = py1 * (t839 + t949 + (t78 + t436 + t963) * y2);
+    t995 = py3 * t5;
+
+    t996 = t399 + t418 + t995;
+    t997 = t135 + t428 + (t133 + t842) * y2;
+    t998 = x2 * t928;
+
+    t999 = x2 * (t118 + t609);
+    t1000 = t999 + t443 + (t441 + t998) * y2;
+
+    t1001 = y2 * (t901 + t136 + t306);
+    t1002 = t510 + t1001 + t508;
+    t1003 = - py3 * t142;
+
+    t1004 = t180 + t512 + t1003;
+    t1005 = y2 * (t117 + t428);
+    t1006 = t522 + t1005 + t518;
+
+    t1007 = py1 * (t907 + t881 + t905);
+    t1008 = y2 * (t854 + t481 + t117);
+
+    t1009 = t510 + t1008 + t508;
+    t1010 = 2 * px3 * t147;
+
+    t1011 = py1 * (t5 * t207 + t394 * t142);
+    t1012 = t784 * t142;
+
+    t1013 = 2 * px3 * py2 * t147;
+    t1014 = x2 * (t149 + t1013);
+
+    t1015 = py1 * (t882 + t906 + t879);
+    t1016 = x2 * (t181 + t387) + t979 + t582 * t142;
+
+    t1017 = (t43 + t674) * y2;
+    t1018 = x2 * t618;
+    t1019 = x2 * (t307 + t135);
+
+    t1020 = t1019 + t443 + (t441 + t1018) * y2;
+    t1021 = - 2 * px3 * t2;
+
+    t1022 = - 2 * px3 * py2 * t2;
+    t1023 = (t389 + t1022 + t9) * y2;
+    t1024 = t5 * t57;
+
+    t1025 = t5 * t610 + t849 + (t400 + t687 + t848) * y2;
+
+    t1026 = py1 * (t289 + t964 + (t70 + t393 + t948) * y2);
+    t1027 = px1 * t996;
+
+    t1028 = px1 * t1004;
+    t1029 = x2 * t429 * y2;
+    t1030 = (t436 + t110) * y2;
+
+    t1031 = (t441 + t557) * y2;
+    t1032 = (t393 + t77) * y2;
+    t1033 = (t399 + t848) * y2;
+
+    t1034 = (t26 + t94 + t18) * y2;
+    t1035 = (t64 + t85) * y2;
+    t1036 = (t114 + t469) * y2;
+
+    t1037 = (t98 + t628 + t126) * y2;
+    t1038 = (t134 + t304 + t842) * y2;
+
+    t1039 = (t20 + t19 + t96) * y2;
+    t1040 = (t296 + t64 + t126) * y2;
+
+    t1041 = (t685 + t114 + t842) * y2;
+    t1042 = (t98 + (t961 + px2) * x2) * y2;
+
+    t1043 = t456 * t142;
+    t1044 = x2 * (t1010 + t166);
+
+    t1045 = (t134 + (t710 + t105) * x2) * y2;
+    t1046 = t477 * t142;
+
+    t1047 = x2 * (t815 + t171);
+    t1048 = t32 * t142;
+    t1049 = t171 + t526 + t524 + t919;
+
+    t1050 = t536 + t166 + t366 + t365;
+    t1051 = (t389 + t10 + t430) * y2;
+
+    t1052 = (t393 + t766) * y2;
+    t1053 = (t399 + t991) * y2;
+    t1054 = t17 * t5;
+
+    t1055 = (t431 + t43 + t551) * y2;
+    t1056 = (t1021 + t436 + t77) * y2;
+    t1057 = t5 * t223;
+
+    t1058 = (t792 + t441 + t848) * y2;
+    t1059 = t5 * t519;
+    t1060 = t338 * y2;
+
+    t1061 = (t86 + t68 + t680) * y2;
+    t1062 = (t305 + t133 + t954) * y2;
+
+    t1063 = (t115 + t426) * y2;
+    t1064 = (t400 + t1018) * y2;
+    t1065 = (t65 + t424) * y2;
+
+    t1066 = (t70 + t788) * y2;
+    t1067 = (t70 + t757) * y2;
+    t1068 = (t400 + t998) * y2;
+
+    t1069 = t21 * y2;
+    t1070 = (t68 + (t915 + t61) * x2) * y2;
+
+    t1071 = (t133 + (t360 + py2) * x2) * y2;
+    t1072 = (t115 + t476 + t954) * y2;
+
+    t1073 = (t65 + t314 + t680) * y2;
+
+    trans->m[0][0]
+	= (x0 * (px0 * (x1 * (px1 * (y2 * (t388 + t387) + t142 * t386)
+			      + t383 + t372 + t371)
+			+ y1 * (x1 * (t369 + t382 + t156 + t346 + t345)
+				+ t337 + py1 * t301 + t285 + t283 + t281) + t381 + t151
+			+ t1 * (t141 + py1 * t92 + t58 + t54 + t53) + t146)
+		 + py0 * (y1 * (x1 * t380 + px1 * t332 + t219 + t218 + t217)
+			  + px1 * t234 + px1 * x1 * t379 + t1 * (px1 * t129 + t49 + t48 + t47))
+		 + y1 * (x1 * t377 + px1 * (t202 + t326 + t196) + t195) + px1 * t374
+		 + px1 * x1 * t373 + t1 * (px1 * (t125 + t124 + t123) + t269))
+	   + y0 * (x0 * (px0 * (t261 + x1 * (t369 + t368 + t157 + t364 + t363) + py1 * t227
+				+ t202
+				+ y1
+				* (x1
+				   * (px1 * (t362 + t361 + (t360 + t359) * y2)
+				      + t358 + t153 + t55 + t339)
+				   + t320 + py1 * t294 + t144 + t273 + t272 + t271)
+				+ t201 + t196 + (t357 + t20 + t19 + t18) * t1)
+			 + py0 * (x1 * t356 + px1 * t252 + t194
+				  + y1 * (px1 * t318 + px1 * x1 * t349 + t84 + t83 + t82)
+				  + t193 + t192 + px1 * t99 * t1) + x1 * t348
+			 + px1 * (t247 + t246 + t244) + t278
+			 + y1 * (px1 * (t313 + t54 + t310) + t50 + px1 * x1 * t340)
+			 + px1 * t338 * t1)
+		   + px0 * (x1 * (t337 + py1 * t332 + t202 + t326 + t196)
+			    + t321 + px1 * t29 * t182 + t190
+			    + y1 * (x1 * (t320 + py1 * t318 + t313 + t54 + t310)
+				    + px1 * (x2 * t308 + x2 * (t305 + t304) * y2) + t303 + t267
+				    + t266) + t189 + t188 + (t302 + t10 + t27 + t25) * t1)
+		   + py0 * (x1 * (px1 * t301 + t194 + t193 + t192)
+			    + t295 + px1 * t29 * t179 + t186
+			    + y1 * (x1 * (px1 * t294 + t49 + t48 + t47) + px1 * t290) + t185 + t184
+			    + (t286 + t3 + t23 + t22) * t1)
+		   + x1 * (px1 * (t285 + t283 + t281) + t278) + t277 + t275 + px1 * t29 * t274
+		   + y1 * (x1 * (px1 * (t144 + t273 + t272 + t271) + t269) + px1 * t268)
+		   + (t264 + t262) * t1)
+	   + px0 * (y1 * (x1 * (t261 + py1 * t252 + t247 + t246 + t244)
+			  + t243 + t238 + t237 + t236)
+		    + x1 * (t235 + t230 + t229) + px1 * t29 * t174
+		    + t1 * (t121 + py1 * t81 + t46 + t45 + t44))
+	   + py0 * (y1 * (x1 * (px1 * t227 + t219 + t218 + t217)
+			  + px1 * t216 + t212 + t211 + t210)
+		    + px1 * t29 * t169 + px1 * x1 * t209 + t1 * (px1 * t113 + t40 + t39 + t38))
+	   + y1 * (x1 * (px1 * (t202 + t201 + t196) + t195) + px1 * t191 + t187)
+	   + px0 * t161 * t183 + px1 * t29 * t160 + px1 * x1 * t152
+	   + t14 * (px0 * (x1 * (t141 + py1 * t129 + t125 + t124 + t123)
+			   + t121 + py1 * t113 + px1 * t29 * t109 + t13 + t12 + t11
+			   + (t104 + t43 + x1 * (t100 + t26 + t94 + t96) + t95 + t93) * y1)
+		    + py0 * (x1 * (px1 * t92 + t84 + t83 + t82)
+			     + px1 * t81 + px1 * t29 * t76 + t7 + t6 + t4
+			     + (px1 * t71 + t37 + px1 * x1 * t66 + t60 + t59) * y1)
+		    + x1 * (px1 * (t58 + t54 + t53) + t50) + px1 * (t46 + t45 + t44) + t41
+		    + px1 * t29 * t36 + (px1 * t28 + t24 + px1 * x1 * t21) * y1)
+	   + t1 * (px1 * (t13 + t12 + t11) + t8));
+
+    trans->m[0][1] =
+	(t161 * (px0 * (x1 * (t382 + t156 + t346 + t345)
+			+ py1 * t733 + t247
+			+ y1 * (t694 + x1 * (t358 + t153 + t55 + t339) + py1 * t681
+				+ t144 + t282 + t668) + t726 + t283 + t244
+			+ px1 * t646 * t1)
+		 + py0 * (x1 * (px1 * t601 + t343 + t342 + t341)
+			  + px1 * t729 + t219
+			  + y1 * (px1 * t692 + px1 * x1 * t76 + t49 + t48 + t47) + t218
+			  + t217 + px1 * t66 * t1) + x1 * (px1 * t746 + t375)
+		 + px1 * (t727 + t725 + t724) + t195
+		 + y1 * (px1 * (t325 + t143 + t691) + t269 + px1 * x1 * t36)
+		 + px1 * t21 * t1)
+	 + x0 * (py0 * (t29 * t356 + t745 + t212
+			+ y1
+			* (x1 * (px1 * t719 + t84 + t83 + t82)
+			   + px1 * t698 + t40 + t39 + t38) + px1 * x1 * y2 * t741
+			+ t211 + t210 + px1 * t650 * t1)
+		 + px0 * (t29 * (t602 + t148 + t364 + t744)
+			  + t743 + t722
+			  + y1 * (x1 * (t720 + py1 * t708 + t200 + t143 + t706)
+				  + t702 + py1 * t684 + t676 + t659 + t675)
+			  + x1 * (t607 + px1 * y2 * (x2 * (t362 + t361) + t481 + t609) + t742)
+			  + t237 + t721 + px1 * t651 * t1) + t29 * t348 + t739 + t187
+		 + y1 * (x1 * (px1 * (t125 + t197 + t245 + t717) + t50)
+			 + px1 * (t696 + t13 + t695) + t8) + px1 * x1 * y2 * t606
+		 + px1 * t737 * t1)
+	 + py0 * (x1 * (px1 * t736 + t186 + t185 + t184)
+		  + t29 * (px1 * t733 + t194 + t193 + t192)
+		  + y1 * (x1 * (px1 * t712 + t7 + t6 + t4) + t732) + px1 * t537 * t1)
+	 + px0 * (x1 * (t568 + t731 + t189 + t730)
+		  + t29 * (py1 * t729 + t727 + t726 + t725 + t724)
+		  + y1 * (x1 * (t715 + py1 * t707 + t705 + t12 + t704) + t546 + t670 + t669)
+		  + px1 * t539 * t1) + x1 * (px1 * t723 + t275)
+	 + t29 * (px1 * (t247 + t283 + t244) + t278)
+	 + y0 * (x0 * (px0 * (x1 * (t720 + py1 * t719 + t125 + t197 + t245 + t717)
+			      + t715 + py1 * t712 + t29 * (t162 + t35 + t34 + t33) + t660
+			      + t659 + t658
+			      + (t688 + t711 + t431
+				 + x1
+				 * (px1 * (t305 + t304 + (t710 + t709) * x2)
+				    + t100 + t26 + t94 + t96) + t27 + t661)
+			      * y1)
+		       + py0 * (x1 * (px1 * t708 + t49 + t48 + t47)
+				+ px1 * t707 + px1 * t29 * t349 + t40 + t39 + t38
+				+ (t286 + t3 + px1 * x1 * t99 + t23 + t22) * y1)
+		       + x1 * (px1 * (t200 + t143 + t706) + t269) + px1 * (t705 + t12 + t704)
+		       + t8 + px1 * t29 * t340 + (t264 + t262 + px1 * x1 * t338) * y1)
+		 + px0 * (x1 * (t702 + py1 * t698 + t696 + t13 + t695)
+			  + t29 * (t694 + py1 * t692 + t325 + t143 + t691) + t398 + t392 + t391
+			  + (x1 * (t688 + t104 + t43 + t95 + t93)
+			     + px1 * (x2 * (t558 + t687) + t5 * t686) + t412 + t665 + t664)
+			  * y1)
+		 + py0 * (x1 * (px1 * t684 + t7 + t6 + t4) + t682
+			  + t29 * (px1 * t681 + t84 + t83 + t82)
+			  + (px1 * t411 + x1 * t679) * y1)
+		 + x1 * (px1 * (t676 + t659 + t675) + t41) + t672
+		 + t29 * (px1 * (t144 + t282 + t668) + t50) + (px1 * t666 + x1 * t663) * y1)
+	 + y1 * (x1 * (px1 * (t660 + t659 + t658) + t41) + t655) + px1 * t653 * t1
+	 + px0 * t652 * t14)
+	;
+
+    trans->m[0][2] =
+	(x0 * (px0 * (y1 * (x1 * (t813 + py1 * t807 + t805 + t725 + t803)
+			    + t799 + t568 + t731 + t189 + t730)
+		      + x1 * (px1 * (x2 * y2 * (t815 + t814) + t142 * t308)
+			      + t235 + t230 + t229) + t29 * (t170 + t159 + t155)
+		      + t1 * (t769 + py1 * t759 + t753 + t752 + t751))
+	       + py0 * (y1 * (x1 * (px1 * t812 + t194 + t193 + t192)
+			      + t295 + t186 + t185 + t184)
+			+ px1 * x1 * t234 + px1 * t29 * t379
+			+ t1 * (px1 * t768 + t7 + t6 + t4))
+	       + y1 * (x1 * (px1 * (t773 + t810 + t771) + t278) + t277 + t275)
+	       + px1 * x1 * t374 + px1 * t29 * t373
+	       + t1 * (px1 * (t765 + t45 + t764) + t41))
+	 + y0 * (x0 * (px0 * (x1 * (t813 + py1 * t812 + t773 + t810 + t771)
+			      + t495 + t29 * (t809 + t149 + t346 + t774)
+			      + y1
+			      * (x1
+				 * (px1 * (x2 * t386 + t686 * y2)
+				    + t598 + t503 + t800)
+				 + t798 + py1 * t790 + t787 + t752 + t786)
+			      + (t808 + t389 + t95 + t747) * t1)
+		       + py0 * (x1 * (px1 * t807 + t219 + t218 + t217)
+				+ px1 * t490 + t29 * t380
+				+ y1 * (px1 * x1 * t597 + px1 * t797 + t7 + t6 + t4)
+				+ t679 * t1)
+		       + x1 * (px1 * (t805 + t725 + t803) + t195) + px1 * t802
+		       + t29 * t377
+		       + y1 * (px1 * x1 * t801 + px1 * (t795 + t46 + t794) + t41)
+		       + t663 * t1)
+		 + px0 * (x1 * (t799 + t243 + t238 + t237 + t236)
+			  + t29 * (t783 + py1 * t777 + t773 + t772 + t771)
+			  + y1 * (x1 * (t798 + py1 * t797 + t795 + t46 + t794)
+				  + px1 * (t5 * (t481 + t609) + x2 * (t792 + t791) * y2)
+				  + t546 + t670 + t669) + (t538 + t648 + t645) * t1)
+		 + py0 * (x1 * (t745 + t212 + t211 + t210)
+			  + t29 * (px1 * t782 + t219 + t218 + t217)
+			  + y1 * (x1 * (px1 * t790 + t40 + t39 + t38) + t682)
+			  + px1 * t411 * t1) + x1 * (t739 + t187)
+		 + t29 * (px1 * (t727 + t780 + t724) + t195)
+		 + y1 * (x1 * (px1 * (t787 + t752 + t786) + t8) + t672)
+		 + px1 * t666 * t1)
+	 + t161 * (px0 * (y1
+			  * (t783 + x1 * (t368 + t157 + t364 + t363) + py1 * t782
+			     + t727 + t780 + t724)
+			  + x1 * (t383 + t372 + t371) + t500
+			  + t1 * (t762 + py1 * t756 + t125 + t272 + t749))
+		   + py0 * (y1
+			    * (x1 * (px1 * t779 + t352 + t351 + t350)
+			       + px1 * t777 + t194 + t193 + t192)
+			    + px1 * x1 * t169 + px1 * t209
+			    + t1 * (px1 * t761 + t84 + t83 + t82))
+		   + y1 * (x1 * (px1 * t775 + t344) + px1 * (t773 + t772 + t771)
+			   + t278) + px1 * x1 * t160
+		   + px1 * t152 + t1 * (px1 * (t144 + t245 + t760) + t50))
+	 + px0 * t770
+	 + t14 * (px0 * (x1 * (t769 + py1 * t768 + t765 + t45 + t764)
+			 + t29 * (t762 + py1 * t761 + t144 + t245 + t760) + t406
+			 + (t412 + x1 * (t711 + t431 + t27 + t661) + t665 + t664)
+			 * y1)
+		  + py0 * (x1 * (px1 * t759 + t40 + t39 + t38)
+			   + t732 + t29 * (px1 * t756 + t49 + t48 + t47)
+			   + (px1 * t537 + x1 * (px1 * t755 + t3 + t23 + t22)) * y1)
+		  + x1 * (px1 * (t753 + t752 + t751) + t8) + t655
+		  + t29 * (px1 * (t125 + t272 + t749) + t269)
+		  + (x1 * (px1 * t748 + t262) + px1 * t653) * y1));
+
+    trans->m[1][0] = (x0 * (py0 * (x1 * (t516 + py1 * (y2 * (t631 + t806) + t142 * t933) + t372
+					 + t371)
+				   + y1 * (px1 * t929 + x1 * (t514 + t918 + t157 + t364 + t363) + t908
+					   + t893 + t725 + t892) + t500 + t151
+				   + t1 * (px1 * t855 + t845 + t325 + t125 + t835) + t146)
+			    + px0 * (y1 * (x1 * t932 + py1 * t904 + t872 + t871 + t870)
+				     + py1 * x1 * t515 + py1 * t638
+				     + t1 * (py1 * t844 + t833 + t832 + t831))
+			    + y1 * (x1 * t931 + t863 + py1 * (t247 + t810 + t244)) + py1 * t374
+			    + py1 * x1 * t373 + t1 * (t888 + py1 * (t54 + t282 + t841)))
+		      + y0 * (px0 * (x1 * (py1 * t929 + t862 + t861 + t860)
+				     + t927 + py1 * t29 * t182 + t858
+				     + y1 * (py1 * t896 + x1 * (py1 * t914 + t833 + t832 + t831)) + t857
+				     + t856 + (t926 + t817 + t823 + t822) * t1)
+			      + x0 * (px0 * (x1 * t925 + py1 * t878 + t862
+					     + y1
+					     * (py1 * t902 + py1 * x1 * t916 + t853 + t852
+						+ t851) + t861 + t860
+					     + py1 * t693 * t1)
+				      + py0 * (x1 * (t921 + t918 + t156 + t346 + t345)
+					       + t883 + px1 * t875 + t247
+					       + y1 * (x1 * (t917 + py1 * (t328 + t88 + (t915 + t718) * y2)
+							     + t153 + t55 + t339)
+						       + t899 + px1 * t914 + t322 + t143 + t245 + t887) + t772
+					       + t244 + (t647 + t20 + t19 + t18) * t1) + x1 * t913 + t891
+				      + py1 * (t202 + t780 + t196)
+				      + y1 * (py1 * (t200 + t125 + t898) + t834 + py1 * x1 * t340)
+				      + py1 * t338 * t1)
+			      + py0 * (x1 * (t908 + px1 * t904 + t247 + t810 + t244)
+				       + t903 + py1 * t29 * t179 + t722
+				       + y1 * (x1 * (px1 * t902 + t899 + t200 + t125 + t898)
+					       + t897 + py1 * (x2 * t894 + x2 * (t86 + t628) * y2) + t267
+					       + t266) + t237 + t721 + (t593 + t389 + t95 + t747) * t1)
+			      + x1 * (py1 * (t893 + t725 + t892) + t891) + t890 + t889 + py1 * t29 * t274
+			      + y1 * (x1 * (t888 + py1 * (t322 + t143 + t245 + t887)) + py1 * t268)
+			      + (t885 + t884) * t1)
+		      + py0 * (y1 * (x1 * (t883 + px1 * t878 + t202 + t780 + t196)
+				     + t641 + t731 + t189 + t730)
+			       + x1 * (t639 + t230 + t229) + py1 * t29 * t169
+			       + t1 * (t840 + px1 * t850 + t13 + t752 + t825))
+		      + px0 * (y1 * (x1 * (py1 * t875 + t872 + t871 + t870)
+				     + py1 * t869 + t866 + t865 + t864)
+			       + py1 * x1 * t499 + py1 * t29 * t174
+			       + t1 * (py1 * t838 + t829 + t828 + t827))
+		      + y1 * (x1 * (t863 + py1 * (t247 + t772 + t244)) + t859 + py1 * t723)
+		      + py0 * t161 * t183 + py1 * t29 * t160 + py1 * x1 * t152
+		      + t14 * (px0 * (x1 * (py1 * t855 + t853 + t852 + t851)
+				      + py1 * t850 + py1 * t29 * t109 + t820 + t819 + t818
+				      + (py1 * t592 + t826 + py1 * x1 * t646 + t847 + t846) * y1)
+			       + py0 * (x1 * (t845 + px1 * t844 + t54 + t282 + t841)
+					+ t840 + px1 * t838 + py1 * t29 * t76 + t46 + t659 + t816
+					+ (t421 + t431 + x1 * (t836 + t26 + t94 + t96) + t27 + t661) * y1)
+			       + x1 * (py1 * (t325 + t125 + t835) + t834) + t830
+			       + py1 * (t13 + t752 + t825) + py1 * t29 * t36
+			       + (t824 + py1 * t748 + py1 * x1 * t21) * y1)
+		      + t1 * (t821 + py1 * (t46 + t659 + t816)))
+	;
+
+    trans->m[1][1] = (t161 * (px0 * (x1 * (py1 * t603 + t911 + t910 + t909)
+				     + py1 * t980 + t872
+				     + y1 * (py1 * t944 + py1 * x1 * t109 + t833 + t832 + t831) + t871
+				     + t870 + py1 * t646 * t1)
+			      + py0 * (x1 * (t514 + t157 + t364 + t363)
+				       + px1 * t974 + t202
+				       + y1 * (x1 * (t917 + t153 + t55 + t339)
+					       + t943 + px1 * t955 + t143 + t124 + t938) + t978 + t725
+				       + t196 + py1 * t66 * t1) + x1 * (t930 + py1 * t775) + t863
+			      + py1 * (t773 + t283 + t771)
+			      + y1 * (py1 * (t58 + t144 + t942) + t888 + py1 * x1 * t36)
+			      + py1 * t21 * t1)
+		      + x0 * (py0 * (t29 * (t604 + t149 + t346 + t774)
+				     + t984 + t190
+				     + y1 * (x1 * (px1 * t972 + t970 + t313 + t144 + t960)
+					     + px1 * t958 + t950 + t787 + t12 + t939)
+				     + x1 * (t607 + t983 + py1 * y2 * (x2 * (t328 + t88) + t293 + t327))
+				     + t189 + t188 + py1 * t650 * t1)
+			      + px0 * (t29 * t925 + t982 + t866
+				       + y1
+				       * (x1 * (py1 * t969 + t853 + t852 + t851)
+					  + py1 * t947 + t829 + t828 + t827)
+				       + py1 * x1 * y2 * t334 + t865 + t864 + py1 * t651 * t1)
+			      + t29 * t913 + t859 + t981
+			      + y1 * (x1 * (t834 + py1 * (t633 + t54 + t272 + t968))
+				      + py1 * (t946 + t46 + t945) + t821) + py1 * x1 * y2 * t606
+			      + py1 * t737 * t1)
+		      + py0 * (x1 * (t573 + t238 + t237 + t236)
+			       + t29 * (px1 * t980 + t773 + t978 + t283 + t771)
+			       + y1 * (x1 * (t965 + px1 * t971 + t765 + t659 + t959) + t550 + t670 + t669)
+			       + py1 * t537 * t1)
+		      + px0 * (x1 * (py1 * t977 + t858 + t857 + t856)
+			       + t29 * (py1 * t974 + t862 + t861 + t860)
+			       + y1 * (x1 * (py1 * t966 + t820 + t819 + t818) + t973) + py1 * t539 * t1)
+		      + x1 * (t890 + py1 * t191) + t29 * (t891 + py1 * (t202 + t725 + t196))
+		      + y0 * (x0 * (px0 * (x1 * (py1 * t972 + t833 + t832 + t831)
+					   + py1 * t971 + py1 * t29 * t916 + t829 + t828 + t827
+					   + (t926 + t817 + py1 * x1 * t693 + t823 + t822) * y1)
+				    + py0 * (x1 * (t970 + px1 * t969 + t633 + t54 + t272 + t968)
+					     + px1 * t966 + t965 + t29 * (t163 + t35 + t34 + t33) + t753 + t12
+					     + t935
+					     + (t962 + t941 + t43
+						+ x1
+						* (t836 + py1 * (t86 + t628 + (t961 + t291) * x2)
+						   + t26 + t94 + t96) + t95 + t93)
+					     * y1) + x1 * (py1 * (t313 + t144 + t960) + t888) + t821
+				    + py1 * (t765 + t659 + t959) + py1 * t29 * t340
+				    + (t885 + t884 + py1 * x1 * t338) * y1)
+			      + px0 * (x1 * (py1 * t958 + t820 + t819 + t818)
+				       + t956 + t29 * (py1 * t955 + t853 + t852 + t851)
+				       + (py1 * t415 + x1 * t953) * y1)
+			      + py0 * (x1 * (t950 + px1 * t947 + t946 + t46 + t945)
+				       + t29 * (px1 * t944 + t943 + t58 + t144 + t942) + t406 + t392 + t391
+				       + (x1 * (t421 + t941 + t431 + t27 + t661)
+					  + t416 + py1 * (x2 * (t554 + t796) + t5 * t940) + t665 + t664)
+				       * y1) + x1 * (py1 * (t787 + t12 + t939) + t830)
+			      + t29 * (t834 + py1 * (t143 + t124 + t938)) + t937
+			      + (x1 * t936 + py1 * t666) * y1)
+		      + y1 * (x1 * (py1 * (t753 + t12 + t935) + t830) + t934) + py1 * t653 * t1
+		      + py0 * t652 * t14)
+	;
+
+    trans->m[1][2] = (y0 * (x0 * (px0 * (x1 * (py1 * t1016 + t872 + t871 + t870)
+					 + py1 * t494 + t29 * t932
+					 + y1
+					 * (py1 * t1025 + py1 * x1 * t599 + t820 + t819
+					    + t818) + t953 * t1)
+				  + py0 * (x1 * (t1015 + px1 * t1009 + t727 + t326 + t724)
+					   + t29 * (t1028 + t148 + t364 + t744) + t491
+					   + y1
+					   * (x1
+					      * (t600 + py1 * (x2 * t933 + t940 * y2) + t503
+						 + t800)
+					      + px1 * t1020 + t1026 + t676 + t45 + t1017)
+					   + (t1027 + t10 + t27 + t25) * t1)
+				  + x1 * (py1 * (t1014 + t283 + t1012) + t863) + t29 * t931
+				  + py1 * t802
+				  + y1 * (py1 * x1 * t801 + py1 * (t1024 + t13 + t1023) + t830)
+				  + t936 * t1)
+			    + py0 * (t29 * (t1007 + px1 * t1002 + t727 + t201 + t724)
+				     + x1 * (t1011 + t641 + t731 + t189 + t730)
+				     + y1 * (x1 * (t1026 + px1 * t1025 + t1024 + t13 + t1023)
+					     + t550
+					     + py1
+					     * (t5 * (t293 + t327) + x2 * (t1021 + t697) * y2)
+					     + t670 + t669) + (t540 + t648 + t645) * t1)
+			    + px0 * (x1 * (t982 + t866 + t865 + t864)
+				     + t29 * (py1 * t1006 + t872 + t871 + t870)
+				     + y1 * (x1 * (py1 * t1020 + t829 + t828 + t827) + t956)
+				     + py1 * t415 * t1) + x1 * (t859 + t981)
+			    + t29 * (t863 + py1 * (t773 + t246 + t771))
+			    + y1 * (x1 * (py1 * (t676 + t45 + t1017) + t821) + t937)
+			    + py1 * t666 * t1)
+		      + x0 * (py0 * (y1 * (x1 * (px1 * t1016 + t1015 + t1014 + t283 + t1012)
+					   + t1011 + t573 + t238 + t237 + t236)
+				     + x1 * (t639
+					     + py1 * (x2 * y2 * (t1010 + t300) + t142 * t894)
+					     + t230 + t229) + t29 * (t175 + t159 + t155)
+				     + t1 * (px1 * t1000 + t994 + t660 + t45 + t986))
+			      + px0 * (y1 * (x1 * (py1 * t1009 + t862 + t861 + t860)
+					     + t927 + t858 + t857 + t856)
+				       + py1 * t29 * t515 + py1 * x1 * t638
+				       + t1 * (py1 * t993 + t820 + t819 + t818))
+			      + y1 * (x1 * (t891 + py1 * (t727 + t326 + t724)) + t890 + t889)
+			      + py1 * x1 * t374 + py1 * t29 * t373
+			      + t1 * (t830 + py1 * (t705 + t752 + t990)))
+		      + t161 * (py0 * (x1 * (t516 + t372 + t371)
+				       + y1
+				       * (x1 * (t921 + t156 + t346 + t345)
+					  + t1007 + px1 * t1006 + t773 + t246 + t771) + t381
+				       + t1 * (t989 + px1 * t997 + t54 + t245 + t985))
+				+ px0 * (y1
+					 * (x1 * (py1 * t1004 + t924 + t923 + t922)
+					    + py1 * t1002 + t862 + t861 + t860)
+					 + py1 * t499 + py1 * x1 * t174
+					 + t1 * (py1 * t988 + t853 + t852 + t851))
+				+ y1 * (x1 * (t912 + py1 * t746) + t891
+					+ py1 * (t727 + t201 + t724))
+				+ py1 * x1 * t160 + py1 * t152
+				+ t1 * (t834 + py1 * (t143 + t272 + t987))) + py0 * t770
+		      + t14 * (px0 * (x1 * (py1 * t1000 + t829 + t828 + t827)
+				      + t973 + t29 * (py1 * t997 + t833 + t832 + t831)
+				      + (py1 * t539 + x1 * (py1 * t996 + t817 + t823 + t822))
+				      * y1)
+			       + py0 * (x1 * (t994 + px1 * t993 + t705 + t752 + t990)
+					+ t29 * (t989 + px1 * t988 + t143 + t272 + t987) + t398
+					+ (t416 + x1 * (t962 + t43 + t95 + t93) + t665 + t664)
+					* y1) + x1 * (py1 * (t660 + t45 + t986) + t821)
+			       + t29 * (t888 + py1 * (t54 + t245 + t985)) + t934
+			       + (x1 * (t885 + py1 * t28) + py1 * t653) * y1));
+
+    trans->m[2][0] = (x0 * (px0 * (y1 * (x1 * t617 + t586 + t877 + t1008 + t876)
+				   + x1 * t515 + t637 + t1 * (t475 + t136 + t620 + t1036) + t636)
+			    + py0 * (y1 * (t579 + x1 * t627 + t251 + t811 + t248)
+				     + x1 * t379 + t233 + t1 * (t455 + t87 + t298 + t1035) + t232)
+			    + x1 * (t516 + t383 + y2 * (t804 + t1013) + t142 * t595)
+			    + y1 * (px1 * (t1047 + t979 + t1046)
+				    + x1 * (t921 + t368 + t157 + t156 + t1048)
+				    + py1 * (t1044 + t728 + t1043) + t505 + t607 + t502) + t500 + t381
+			    + t1 * (px1 * (t611 + t135 + t1038) + py1 * (t330 + t128 + t1037) + t423
+				    + t125 + t54 + t1034))
+		      + y0 * (x0 * (py0 * (x1 * t1050 + t533 + t226
+					   + y1 * (t529 + t224 + x1 * t349 + t128 + t1040)
+					   + t781 + t220 + t99 * t1)
+				    + px0 * (t528 + x1 * t1049 + t874
+					     + y1 * (t517 + x1 * t916 + t520 + t135 + t1041) + t1005
+					     + t873 + t693 * t1)
+				    + x1 * (t514 + t382 + t157 + t156 + t1048)
+				    + px1 * (t877 + t1001 + t876) + py1 * (t251 + t776 + t248) + t608
+				    + y1 * (x1 * (t917 + t358 + t56 + t323 + (t716 + t967) * y2)
+					    + px1 * (t118 + t609 + t471 + t1045)
+					    + py1 * (t204 + t327 + t249 + t1042) + t503 + t144 + t143
+					    + t1039) + t504 + t605 + (t647 + t357) * t1)
+			      + px0 * (x1 * (t528 + t1047 + t979 + t1046)
+				       + t643 + t29 * t182 + t571
+				       + y1 * (x1 * (t475 + t118 + t609 + t471 + t1045) + t701 + t895)
+				       + t570 + t569 + (t468 + t441 + t466 + t951) * t1)
+			      + py0 * (x1 * (t533 + t1044 + t728 + t1043)
+				       + t642 + t29 * t179 + t566
+				       + y1 * (x1 * (t455 + t204 + t327 + t249 + t1042) + t289 + t288)
+				       + t214 + t565 + (t436 + t450 + t69 + t677) * t1)
+			      + x1 * (px1 * (t877 + t1008 + t876) + py1 * (t251 + t811 + t248) + t505
+				      + t607 + t502) + t984 + t743
+			      + t29 * t274
+			      + y1 * (x1 * (px1 * (t520 + t135 + t1041) + py1 * (t224 + t128 + t1040)
+					    + t503 + t144 + t143 + t1039)
+				      + t897 + t303 + x2 * t634 + x2 * (t690 + t52) * y2)
+			      + (t1027 + t808) * t1)
+		      + py0 * (y1 * (x1 * (t579 + t251 + t776 + t248) + t581 + t735 + t240 + t734)
+			       + t29 * t169 + x1 * t209 + t1 * (t461 + t80 + t683 + t1032))
+		      + px0 * (y1 * (x1 * (t586 + t877 + t1001 + t876) + t591 + t976 + t588 + t975)
+			       + x1 * t499 + t29 * t174 + t1 * (t484 + t849 + t957 + t1033))
+		      + y1 * (x1 * (px1 * (t874 + t1005 + t873) + py1 * (t226 + t781 + t220) + t608
+				    + t504 + t605)
+			      + t573 + t568) + t161 * t183 + x1 * (t639 + t235) + t29 * t160
+		      + t14 * (px0 * (x1 * (t517 + t611 + t135 + t1038)
+				      + t615 + t29 * t109 + t560 + t443 + t1031
+				      + (t399 + t613 + x1 * t646 + t418 + t995) * y1)
+			       + py0 * (x1 * (t529 + t330 + t128 + t1037)
+					+ t625 + t29 * t76 + t112 + t438 + t1030
+					+ (t623 + t393 + t102 + x1 * t66 + t754) * y1)
+			       + x1 * (px1 * (t136 + t620 + t1036) + py1 * (t87 + t298 + t1035) + t423
+				       + t125 + t54 + t1034)
+			       + px1 * (t849 + t957 + t1033) + py1 * (t80 + t683 + t1032) + t434
+			       + t29 * t36 + t1029 + (t962 + t711 + x1 * (t836 + t100)) * y1)
+		      + t1 * (px1 * (t560 + t443 + t1031) + py1 * (t112 + t438 + t1030) + t434
+			      + t1029))
+	;
+
+    trans->m[2][1] = (t161 * (px0 * (x1 * (t616 + t180 + t512 + t1003)
+				     + t586 + t510 + y1 * (t475 + t854 + x1 * t109 + t117 + t1062)
+				     + t621 + t508 + t646 * t1)
+			      + py0 * (t579 + x1 * (t177 + t626 + t354 + t778) + t507
+				       + y1 * (t455 + x1 * t76 + t91 + t203 + t1061) + t299
+				       + t506 + t66 * t1) + x1 * (t921 + t368)
+			      + px1 * (t874 + t979 + t873) + py1 * (t226 + t728 + t220)
+			      + y1 * (x1 * (t917 + t358) + px1 * (t118 + t843 + t1063)
+				      + py1 * (t204 + t127 + t1065) + t423 + t144
+				      + t143 + t1060) + t504 + t21 * t1)
+		      + x0 * (py0 * (t29 * t1050 + t581 + t241
+				     + y1
+				     * (x1 * (t529 + t87 + t577 + t292 + t1070)
+					+ t461 + t1057 + t80 + t1056) + x1 * y2 * t741 + t240
+				     + t239 + t650 * t1)
+			      + px0 * (t591 + t29 * t1049 + t589
+				       + y1 * (x1 * (t517 + t136 + t306 + t428 + t1071)
+					       + t484 + t1059 + t849 + t1058) + x1 * y2 * t334 + t588
+				       + t587 + t651 * t1) + t29 * (t1028 + t809) + t903 + t321
+			      + y1 * (x1 * (px1 * (t901 + t117 + t1072) + py1 * (t317 + t203 + t1073)
+					    + t503 + t125 + t54 + t1069)
+				      + px1 * (t1019 + t837 + t1064) + py1 * (t789 + t111 + t1066) + t267
+				      + t433 + t1055)
+			      + x1 * (y2 * (x2 * (t56 + t323) + t322 + t273) + t983 + t742) + t737 * t1)
+		      + py0 * (x1 * (t642 + t215 + t214 + t213) + t29 * (t533 + t226 + t728 + t220)
+			       + y1
+			       * (x1 * (t625 + t758 + t111 + t1067)
+				  + t396 + t395) + t537 * t1)
+		      + px0 * (x1 * (t643 + t868 + t570 + t867) + t29 * (t528 + t874 + t979 + t873)
+			       + y1
+			       * (x1 * (t615 + t999 + t837 + t1068)
+				  + t404 + t402) + t539 * t1)
+		      + x1 * (t641 + t243)
+		      + t29 * (px1 * (t510 + t621 + t508) + py1 * (t507 + t299 + t506) + t504)
+		      + y0 * (x0 * (py0 * (x1 * (t455 + t317 + t203 + t1073)
+					   + t461 + t29 * t349 + t767 + t683 + t1052
+					   + (t436 + t450 + x1 * t99 + t69 + t677) * y1)
+				    + px0 * (x1 * (t475 + t901 + t117 + t1072)
+					     + t484 + t29 * t916 + t992 + t957 + t1053
+					     + (t468 + t441 + t466 + x1 * t693 + t951) * y1)
+				    + x1 * (px1 * (t136 + t306 + t428 + t1071)
+					    + py1 * (t87 + t577 + t292 + t1070) + t503 + t125 + t54 + t1069)
+				    + px1 * (t999 + t837 + t1068) + py1 * (t758 + t111 + t1067)
+				    + t29 * (t163 + t162) + t434 + t553 + t1051
+				    + (t421 + t104 + t431 + t43
+				       + x1 * (t836 + t100 + t690 + t52 + (t270 + t886) * x2)
+				       + t1054)
+				    * y1)
+			      + py0 * (x1 * (t625 + t789 + t111 + t1066)
+				       + t29 * (t529 + t204 + t127 + t1065) + t544 + t543
+				       + (x1 * t624 + t410 + t409) * y1)
+			      + px0 * (x1 * (t615 + t1019 + t837 + t1064)
+				       + t29 * (t517 + t118 + t843 + t1063) + t548 + t547
+				       + (x1 * t614 + t414 + t413) * y1)
+			      + t29 * (px1 * (t854 + t117 + t1062) + py1 * (t91 + t203 + t1061) + t423
+				       + t144 + t143 + t1060)
+			      + x1 * (px1 * (t1059 + t849 + t1058) + py1 * (t1057 + t80 + t1056) + t267
+				      + t433 + t1055) + t406 + t398
+			      + (t416 + x1 * (t962 + t711 + t431 + t43 + t1054) + t412
+				 + x2 * (t793 + t1022) + t5 * t594)
+			      * y1)
+		      + y1 * (x1 * (px1 * (t992 + t957 + t1053) + py1 * (t767 + t683 + t1052) + t434
+				    + t553 + t1051)
+			      + t550 + t546) + t653 * t1 + t652 * t14)
+	;
+    trans->m[2][2] = t644;
+}
+
+static void
+print_trans (const char *header, struct pixman_f_transform *trans)
+{
+    int i, j;
+    double max;
+
+    max = 0;
+
+    printf ("%s\n", header);
+
+    for (i = 0; i < 3; ++i)
+    {
+	for (j = 0; j < 3; ++j)
+	{
+	    double a = fabs (trans->m[i][j]);
+
+	    if (a > max)
+		max = a;
+	}
+    }
+
+    if (max == 0.0)
+	max = 1.0;
+
+    for (i = 0; i < 3; ++i)
+    {
+	printf ("{ ");
+	for (j = 0; j < 3; ++j)
+	{
+	    printf ("D2F (%.5f)%s", 16384 * (trans->m[i][j] / max), j == 2 ? "" : ", ");
+	}
+
+	printf ("},\n");
+    }
+}
+
+int
+main ()
+{
+    struct pixman_f_transform t;
+
+#if 0
+    quad_to_quad (75, 200,
+		  325, 200,
+		  450, 335,
+		  -50, 335,
+
+		  0, 0,
+		  400, 0,
+		  400, 400,
+		  0, 400,
+
+		  &t);
+#endif
+    quad_to_quad (
+	1, 0,
+	1, 2,
+	2, 2,
+	2, 0,
+
+	1, 0,
+	1, 112,
+	2, 2,
+	2, 0,
+
+	&t);
+
+    print_trans ("0->0", &t);
+
+    return 0;
+}
diff --git a/src/pixman/demos/radial-test.c b/src/pixman/demos/radial-test.c
new file mode 100644
index 0000000..08a367c
--- /dev/null
+++ b/src/pixman/demos/radial-test.c
@@ -0,0 +1,208 @@
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+#define NUM_GRADIENTS 9
+#define NUM_STOPS 3
+#define NUM_REPEAT 4
+#define SIZE 128
+#define WIDTH (SIZE * NUM_GRADIENTS)
+#define HEIGHT (SIZE * NUM_REPEAT)
+
+/*
+ * We want to test all the possible relative positions of the start
+ * and end circle:
+ *
+ *  - The start circle can be smaller/equal/bigger than the end
+ *    circle. A radial gradient can be classified in one of these
+ *    three cases depending on the sign of dr.
+ *
+ *  - The smaller circle can be completely inside/internally
+ *    tangent/outside (at least in part) of the bigger circle. This
+ *    classification is the same as the one which can be computed by
+ *    examining the sign of a = (dx^2 + dy^2 - dr^2).
+ *
+ *  - If the two circles have the same size, neither can be inside or
+ *    internally tangent
+ *
+ * This test draws radial gradients whose circles always have the same
+ * centers (0, 0) and (1, 0), but with different radiuses. From left
+ * to right:
+ *
+ * - Degenerate start circle completely inside the end circle
+ *     0.00 -> 1.75; dr = 1.75 > 0; a = 1 - 1.75^2 < 0
+ *
+ * - Small start circle completely inside the end circle
+ *     0.25 -> 1.75; dr =  1.5 > 0; a = 1 - 1.50^2 < 0
+ *
+ * - Small start circle internally tangent to the end circle
+ *     0.50 -> 1.50; dr =  1.0 > 0; a = 1 - 1.00^2 = 0
+ *
+ * - Small start circle outside of the end circle
+ *     0.50 -> 1.00; dr =  0.5 > 0; a = 1 - 0.50^2 > 0
+ *
+ * - Start circle with the same size as the end circle
+ *     1.00 -> 1.00; dr =  0.0 = 0; a = 1 - 0.00^2 > 0
+ *
+ * - Small end circle outside of the start circle
+ *     1.00 -> 0.50; dr = -0.5 > 0; a = 1 - 0.50^2 > 0
+ *
+ * - Small end circle internally tangent to the start circle
+ *     1.50 -> 0.50; dr = -1.0 > 0; a = 1 - 1.00^2 = 0
+ *
+ * - Small end circle completely inside the start circle
+ *     1.75 -> 0.25; dr = -1.5 > 0; a = 1 - 1.50^2 < 0
+ *
+ * - Degenerate end circle completely inside the start circle
+ *     0.00 -> 1.75; dr = 1.75 > 0; a = 1 - 1.75^2 < 0
+ *
+ */
+
+const static double radiuses[NUM_GRADIENTS] = {
+    0.00,
+    0.25,
+    0.50,
+    0.50,
+    1.00,
+    1.00,
+    1.50,
+    1.75,
+    1.75
+};
+
+#define double_to_color(x)					\
+    (((uint32_t) ((x)*65536)) - (((uint32_t) ((x)*65536)) >> 16))
+
+#define PIXMAN_STOP(offset,r,g,b,a)		\
+    { pixman_double_to_fixed (offset),		\
+	{					\
+	double_to_color (r),			\
+	double_to_color (g),			\
+	double_to_color (b),			\
+	double_to_color (a)			\
+	}					\
+    }
+
+static const pixman_gradient_stop_t stops[NUM_STOPS] = {
+    PIXMAN_STOP (0.0,        1, 0, 0, 0.75),
+    PIXMAN_STOP (0.70710678, 0, 1, 0, 0),
+    PIXMAN_STOP (1.0,        0, 0, 1, 1)
+};
+
+static pixman_image_t *
+create_radial (int index)
+{
+    pixman_point_fixed_t p0, p1;
+    pixman_fixed_t r0, r1;
+    double x0, x1, radius0, radius1, left, right, center;
+
+    x0 = 0;
+    x1 = 1;
+    radius0 = radiuses[index];
+    radius1 = radiuses[NUM_GRADIENTS - index - 1];
+
+    /* center the gradient */
+    left = MIN (x0 - radius0, x1 - radius1);
+    right = MAX (x0 + radius0, x1 + radius1);
+    center = (left + right) * 0.5;
+    x0 -= center;
+    x1 -= center;
+
+    /* scale to make it fit within a 1x1 rect centered in (0,0) */
+    x0 *= 0.25;
+    x1 *= 0.25;
+    radius0 *= 0.25;
+    radius1 *= 0.25;
+
+    p0.x = pixman_double_to_fixed (x0);
+    p0.y = pixman_double_to_fixed (0);
+
+    p1.x = pixman_double_to_fixed (x1);
+    p1.y = pixman_double_to_fixed (0);
+
+    r0 = pixman_double_to_fixed (radius0);
+    r1 = pixman_double_to_fixed (radius1);
+
+    return pixman_image_create_radial_gradient (&p0, &p1,
+						r0, r1,
+						stops, NUM_STOPS);
+}
+
+static const pixman_repeat_t repeat[NUM_REPEAT] = {
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_REFLECT,
+    PIXMAN_REPEAT_PAD
+};
+
+int
+main (int argc, char **argv)
+{
+    pixman_transform_t transform;
+    pixman_image_t *src_img, *dest_img;
+    int i, j;
+
+    enable_divbyzero_exceptions ();
+
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT,
+					 NULL, 0);
+
+    draw_checkerboard (dest_img, 25, 0xffaaaaaa, 0xffbbbbbb);
+    
+    pixman_transform_init_identity (&transform);
+
+    /*
+     * The create_radial() function returns gradients centered in the
+     * origin and whose interesting part fits a 1x1 square. We want to
+     * paint these gradients on a SIZExSIZE square and to make things
+     * easier we want the origin in the top-left corner of the square
+     * we want to see.
+     */
+    pixman_transform_translate (NULL, &transform,
+				pixman_double_to_fixed (0.5),
+				pixman_double_to_fixed (0.5));
+
+    pixman_transform_scale (NULL, &transform,
+			    pixman_double_to_fixed (SIZE),
+			    pixman_double_to_fixed (SIZE));
+
+    /*
+     * Gradients are evaluated at the center of each pixel, so we need
+     * to translate by half a pixel to trigger some interesting
+     * cornercases. In particular, the original implementation of PDF
+     * radial gradients tried to divide by 0 when using this transform
+     * on the "tangent circles" cases.
+     */
+    pixman_transform_translate (NULL, &transform,
+				pixman_double_to_fixed (0.5),
+				pixman_double_to_fixed (0.5));
+
+    for (i = 0; i < NUM_GRADIENTS; i++)
+    {
+	src_img = create_radial (i);
+	pixman_image_set_transform (src_img, &transform);
+
+	for (j = 0; j < NUM_REPEAT; j++)
+	{
+	    pixman_image_set_repeat (src_img, repeat[j]);
+
+	    pixman_image_composite32 (PIXMAN_OP_OVER,
+				      src_img,
+				      NULL,
+				      dest_img,
+				      0, 0,
+				      0, 0,
+				      i * SIZE, j * SIZE,
+				      SIZE, SIZE);
+
+	}
+
+	pixman_image_unref (src_img);
+    }
+
+    show_image (dest_img);
+
+    pixman_image_unref (dest_img);
+
+    return 0;
+}
diff --git a/src/pixman/demos/scale.c b/src/pixman/demos/scale.c
new file mode 100644
index 0000000..d00307e
--- /dev/null
+++ b/src/pixman/demos/scale.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright 2012, Red Hat, Inc.
+ * Copyright 2012, Soren Sandmann
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann <soren.sandmann@gmail.com>
+ */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include <math.h>
+#include <gtk/gtk.h>
+#include <pixman.h>
+#include <stdlib.h>
+#include "gtk-utils.h"
+
+typedef struct
+{
+    GtkBuilder *        builder;
+    pixman_image_t *	original;
+    GtkAdjustment *     scale_x_adjustment;
+    GtkAdjustment *     scale_y_adjustment;
+    GtkAdjustment *     rotate_adjustment;
+    GtkAdjustment *	subsample_adjustment;
+    int                 scaled_width;
+    int                 scaled_height;
+} app_t;
+
+static GtkWidget *
+get_widget (app_t *app, const char *name)
+{
+    GtkWidget *widget = GTK_WIDGET (gtk_builder_get_object (app->builder, name));
+
+    if (!widget)
+        g_error ("Widget %s not found\n", name);
+
+    return widget;
+}
+
+static double
+min4 (double a, double b, double c, double d)
+{
+    double m1, m2;
+
+    m1 = MIN (a, b);
+    m2 = MIN (c, d);
+    return MIN (m1, m2);
+}
+
+static double
+max4 (double a, double b, double c, double d)
+{
+    double m1, m2;
+
+    m1 = MAX (a, b);
+    m2 = MAX (c, d);
+    return MAX (m1, m2);
+}
+
+static void
+compute_extents (pixman_f_transform_t *trans, double *sx, double *sy)
+{
+    double min_x, max_x, min_y, max_y;
+    pixman_f_vector_t v[4] =
+    {
+	{ { 1, 1, 1 } },
+	{ { -1, 1, 1 } },
+	{ { -1, -1, 1 } },
+	{ { 1, -1, 1 } },
+    };
+
+    pixman_f_transform_point (trans, &v[0]);
+    pixman_f_transform_point (trans, &v[1]);
+    pixman_f_transform_point (trans, &v[2]);
+    pixman_f_transform_point (trans, &v[3]);
+
+    min_x = min4 (v[0].v[0], v[1].v[0], v[2].v[0], v[3].v[0]);
+    max_x = max4 (v[0].v[0], v[1].v[0], v[2].v[0], v[3].v[0]);
+    min_y = min4 (v[0].v[1], v[1].v[1], v[2].v[1], v[3].v[1]);
+    max_y = max4 (v[0].v[1], v[1].v[1], v[2].v[1], v[3].v[1]);
+
+    *sx = (max_x - min_x) / 2.0;
+    *sy = (max_y - min_y) / 2.0;
+}
+
+typedef struct
+{
+    char	name [20];
+    int		value;
+} named_int_t;
+
+static const named_int_t filters[] =
+{
+    { "Box",			PIXMAN_KERNEL_BOX },
+    { "Impulse",		PIXMAN_KERNEL_IMPULSE },
+    { "Linear",			PIXMAN_KERNEL_LINEAR },
+    { "Cubic",			PIXMAN_KERNEL_CUBIC },
+    { "Lanczos2",		PIXMAN_KERNEL_LANCZOS2 },
+    { "Lanczos3",		PIXMAN_KERNEL_LANCZOS3 },
+    { "Lanczos3 Stretched",	PIXMAN_KERNEL_LANCZOS3_STRETCHED },
+    { "Gaussian",		PIXMAN_KERNEL_GAUSSIAN },
+};
+
+static const named_int_t repeats[] =
+{
+    { "None",                   PIXMAN_REPEAT_NONE },
+    { "Normal",                 PIXMAN_REPEAT_NORMAL },
+    { "Reflect",                PIXMAN_REPEAT_REFLECT },
+    { "Pad",                    PIXMAN_REPEAT_PAD },
+};
+
+static int
+get_value (app_t *app, const named_int_t table[], const char *box_name)
+{
+    GtkComboBox *box = GTK_COMBO_BOX (get_widget (app, box_name));
+
+    return table[gtk_combo_box_get_active (box)].value;
+}
+
+static void
+copy_to_counterpart (app_t *app, GObject *object)
+{
+    static const char *xy_map[] =
+    {
+	"reconstruct_x_combo_box", "reconstruct_y_combo_box",
+	"sample_x_combo_box",      "sample_y_combo_box",
+	"scale_x_adjustment",      "scale_y_adjustment",
+    };
+    GObject *counterpart = NULL;
+    int i;
+
+    for (i = 0; i < G_N_ELEMENTS (xy_map); i += 2)
+    {
+	GObject *x = gtk_builder_get_object (app->builder, xy_map[i]);
+	GObject *y = gtk_builder_get_object (app->builder, xy_map[i + 1]);
+
+	if (object == x)
+	    counterpart = y;
+	if (object == y)
+	    counterpart = x;
+    }
+
+    if (!counterpart)
+	return;
+    
+    if (GTK_IS_COMBO_BOX (counterpart))
+    {
+	gtk_combo_box_set_active (
+	    GTK_COMBO_BOX (counterpart),
+	    gtk_combo_box_get_active (
+		GTK_COMBO_BOX (object)));
+    }
+    else if (GTK_IS_ADJUSTMENT (counterpart))
+    {
+	gtk_adjustment_set_value (
+	    GTK_ADJUSTMENT (counterpart),
+	    gtk_adjustment_get_value (
+		GTK_ADJUSTMENT (object)));
+    }
+}
+
+static double
+to_scale (double v)
+{
+    return pow (1.15, v);
+}
+
+static void
+rescale (GtkWidget *may_be_null, app_t *app)
+{
+    pixman_f_transform_t ftransform;
+    pixman_transform_t transform;
+    double new_width, new_height;
+    double fscale_x, fscale_y;
+    double rotation;
+    pixman_fixed_t *params;
+    int n_params;
+    double sx, sy;
+
+    pixman_f_transform_init_identity (&ftransform);
+
+    if (may_be_null && gtk_toggle_button_get_active (
+	    GTK_TOGGLE_BUTTON (get_widget (app, "lock_checkbutton"))))
+    {
+	copy_to_counterpart (app, G_OBJECT (may_be_null));
+    }
+    
+    fscale_x = gtk_adjustment_get_value (app->scale_x_adjustment);
+    fscale_y = gtk_adjustment_get_value (app->scale_y_adjustment);
+    rotation = gtk_adjustment_get_value (app->rotate_adjustment);
+
+    fscale_x = to_scale (fscale_x);
+    fscale_y = to_scale (fscale_y);
+    
+    new_width = pixman_image_get_width (app->original) * fscale_x;
+    new_height = pixman_image_get_height (app->original) * fscale_y;
+
+    pixman_f_transform_scale (&ftransform, NULL, fscale_x, fscale_y);
+
+    pixman_f_transform_translate (&ftransform, NULL, - new_width / 2.0, - new_height / 2.0);
+
+    rotation = (rotation / 360.0) * 2 * M_PI;
+    pixman_f_transform_rotate (&ftransform, NULL, cos (rotation), sin (rotation));
+
+    pixman_f_transform_translate (&ftransform, NULL, new_width / 2.0, new_height / 2.0);
+
+    pixman_f_transform_invert (&ftransform, &ftransform);
+
+    compute_extents (&ftransform, &sx, &sy);
+    
+    pixman_transform_from_pixman_f_transform (&transform, &ftransform);
+    pixman_image_set_transform (app->original, &transform);
+
+    params = pixman_filter_create_separable_convolution (
+        &n_params,
+        sx * 65536.0 + 0.5,
+	sy * 65536.0 + 0.5,
+	get_value (app, filters, "reconstruct_x_combo_box"),
+	get_value (app, filters, "reconstruct_y_combo_box"),
+	get_value (app, filters, "sample_x_combo_box"),
+	get_value (app, filters, "sample_y_combo_box"),
+	gtk_adjustment_get_value (app->subsample_adjustment),
+	gtk_adjustment_get_value (app->subsample_adjustment));
+
+    pixman_image_set_filter (app->original, PIXMAN_FILTER_SEPARABLE_CONVOLUTION, params, n_params);
+
+    pixman_image_set_repeat (
+        app->original, get_value (app, repeats, "repeat_combo_box"));
+    
+    free (params);
+
+    app->scaled_width = ceil (new_width);
+    app->scaled_height = ceil (new_height);
+    
+    gtk_widget_set_size_request (
+        get_widget (app, "drawing_area"), new_width + 0.5, new_height + 0.5);
+
+    gtk_widget_queue_draw (
+        get_widget (app, "drawing_area"));
+}
+
+static gboolean
+on_expose (GtkWidget *da, GdkEvent *event, gpointer data)
+{
+    app_t *app = data;
+    GdkRectangle *area = &event->expose.area;
+    cairo_surface_t *surface;
+    pixman_image_t *tmp;
+    cairo_t *cr;
+    uint32_t *pixels;
+
+    pixels = calloc (1, area->width * area->height * 4);
+    tmp = pixman_image_create_bits (
+        PIXMAN_a8r8g8b8, area->width, area->height, pixels, area->width * 4);
+
+    if (area->x < app->scaled_width && area->y < app->scaled_height)
+    {
+        pixman_image_composite (
+            PIXMAN_OP_SRC,
+            app->original, NULL, tmp,
+            area->x, area->y, 0, 0, 0, 0,
+            app->scaled_width - area->x, app->scaled_height - area->y);
+    }
+
+    surface = cairo_image_surface_create_for_data (
+        (uint8_t *)pixels, CAIRO_FORMAT_ARGB32,
+        area->width, area->height, area->width * 4);
+
+    cr = gdk_cairo_create (da->window);
+
+    cairo_set_source_surface (cr, surface, area->x, area->y);
+
+    cairo_paint (cr);
+
+    cairo_destroy (cr);
+    cairo_surface_destroy (surface);
+    free (pixels);
+    pixman_image_unref (tmp);
+
+    return TRUE;
+}
+
+static void
+set_up_combo_box (app_t *app, const char *box_name,
+                  int n_entries, const named_int_t table[])
+{
+    GtkWidget *widget = get_widget (app, box_name);
+    GtkListStore *model;
+    GtkCellRenderer *cell;
+    int i;
+
+    model = gtk_list_store_new (1, G_TYPE_STRING);
+    
+    cell = gtk_cell_renderer_text_new ();
+    gtk_cell_layout_pack_start (GTK_CELL_LAYOUT (widget), cell, TRUE);
+    gtk_cell_layout_set_attributes (GTK_CELL_LAYOUT (widget), cell,
+				    "text", 0,
+				    NULL);
+
+    gtk_combo_box_set_model (GTK_COMBO_BOX (widget), GTK_TREE_MODEL (model));
+    
+    for (i = 0; i < n_entries; ++i)
+    {
+	const named_int_t *info = &(table[i]);
+	GtkTreeIter iter;
+
+	gtk_list_store_append (model, &iter);
+	gtk_list_store_set (model, &iter, 0, info->name, -1);
+    }
+
+    gtk_combo_box_set_active (GTK_COMBO_BOX (widget), 0);
+
+    g_signal_connect (widget, "changed", G_CALLBACK (rescale), app);
+}
+
+static void
+set_up_filter_box (app_t *app, const char *box_name)
+{
+    set_up_combo_box (app, box_name, G_N_ELEMENTS (filters), filters);
+}
+
+static char *
+format_value (GtkWidget *widget, double value)
+{
+    return g_strdup_printf ("%.4f", to_scale (value));
+}
+
+static app_t *
+app_new (pixman_image_t *original)
+{
+    GtkWidget *widget;
+    app_t *app = g_malloc (sizeof *app);
+    GError *err = NULL;
+
+    app->builder = gtk_builder_new ();
+    app->original = original;
+
+    if (!gtk_builder_add_from_file (app->builder, "scale.ui", &err))
+	g_error ("Could not read file scale.ui: %s", err->message);
+
+    app->scale_x_adjustment =
+        GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "scale_x_adjustment"));
+    app->scale_y_adjustment =
+        GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "scale_y_adjustment"));
+    app->rotate_adjustment =
+        GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "rotate_adjustment"));
+    app->subsample_adjustment =
+	GTK_ADJUSTMENT (gtk_builder_get_object (app->builder, "subsample_adjustment"));
+
+    g_signal_connect (app->scale_x_adjustment, "value_changed", G_CALLBACK (rescale), app);
+    g_signal_connect (app->scale_y_adjustment, "value_changed", G_CALLBACK (rescale), app);
+    g_signal_connect (app->rotate_adjustment, "value_changed", G_CALLBACK (rescale), app);
+    g_signal_connect (app->subsample_adjustment, "value_changed", G_CALLBACK (rescale), app);
+    
+    widget = get_widget (app, "scale_x_scale");
+    gtk_scale_add_mark (GTK_SCALE (widget), 0.0, GTK_POS_LEFT, NULL);
+    g_signal_connect (widget, "format_value", G_CALLBACK (format_value), app);
+    widget = get_widget (app, "scale_y_scale");
+    gtk_scale_add_mark (GTK_SCALE (widget), 0.0, GTK_POS_LEFT, NULL);
+    g_signal_connect (widget, "format_value", G_CALLBACK (format_value), app);
+    widget = get_widget (app, "rotate_scale");
+    gtk_scale_add_mark (GTK_SCALE (widget), 0.0, GTK_POS_LEFT, NULL);
+
+    widget = get_widget (app, "drawing_area");
+    g_signal_connect (widget, "expose_event", G_CALLBACK (on_expose), app);
+
+    set_up_filter_box (app, "reconstruct_x_combo_box");
+    set_up_filter_box (app, "reconstruct_y_combo_box");
+    set_up_filter_box (app, "sample_x_combo_box");
+    set_up_filter_box (app, "sample_y_combo_box");
+
+    set_up_combo_box (
+        app, "repeat_combo_box", G_N_ELEMENTS (repeats), repeats);
+
+    g_signal_connect (
+	gtk_builder_get_object (app->builder, "lock_checkbutton"),
+	"toggled", G_CALLBACK (rescale), app);
+    
+    rescale (NULL, app);
+    
+    return app;
+}
+
+int
+main (int argc, char **argv)
+{
+    GtkWidget *window;
+    pixman_image_t *image;
+    app_t *app;
+    
+    gtk_init (&argc, &argv);
+
+    if (argc < 2)
+    {
+	printf ("%s <image file>\n", argv[0]);
+	return -1;
+    }
+
+    if (!(image = pixman_image_from_file (argv[1], PIXMAN_a8r8g8b8)))
+    {
+	printf ("Could not load image \"%s\"\n", argv[1]);
+	return -1;
+    }
+
+    app = app_new (image);
+    
+    window = get_widget (app, "main");
+
+    g_signal_connect (window, "delete_event", G_CALLBACK (gtk_main_quit), NULL);
+    
+    gtk_window_set_default_size (GTK_WINDOW (window), 1024, 768);
+    
+    gtk_widget_show_all (window);
+    
+    gtk_main ();
+
+    return 0;
+}
diff --git a/src/pixman/demos/scale.ui b/src/pixman/demos/scale.ui
new file mode 100644
index 0000000..ee985dd
--- /dev/null
+++ b/src/pixman/demos/scale.ui
@@ -0,0 +1,332 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<interface>
+  <!-- interface-requires gtk+ 2.12 -->
+  <!-- interface-naming-policy toplevel-contextual -->
+  <object class="GtkAdjustment" id="rotate_adjustment">
+    <property name="lower">-180</property>
+    <property name="upper">190</property>
+    <property name="step_increment">1</property>
+    <property name="page_increment">10</property>
+    <property name="page_size">10</property>
+  </object>
+  <object class="GtkAdjustment" id="scale_y_adjustment">
+    <property name="lower">-32</property>
+    <property name="upper">42</property>
+    <property name="step_increment">1</property>
+    <property name="page_increment">10</property>
+    <property name="page_size">10</property>
+  </object>
+  <object class="GtkAdjustment" id="scale_x_adjustment">
+    <property name="lower">-32</property>
+    <property name="upper">42</property>
+    <property name="step_increment">1</property>
+    <property name="page_increment">10</property>
+    <property name="page_size">10</property>
+  </object>
+  <object class="GtkAdjustment" id="subsample_adjustment">
+    <property name="lower">0</property>
+    <property name="upper">12</property>
+    <property name="step_increment">1</property>
+    <property name="page_increment">1</property>
+    <property name="page_size">0</property>
+    <property name="value">4</property>
+  </object>
+  <object class="GtkWindow" id="main">
+    <child>
+      <object class="GtkHBox" id="u">
+        <property name="visible">True</property>
+        <property name="spacing">12</property>
+        <child>
+          <object class="GtkScrolledWindow" id="scrolledwindow1">
+            <property name="visible">True</property>
+            <property name="can_focus">True</property>
+            <property name="shadow_type">in</property>
+            <child>
+              <object class="GtkViewport" id="viewport1">
+                <property name="visible">True</property>
+                <child>
+                  <object class="GtkDrawingArea" id="drawing_area">
+                    <property name="visible">True</property>
+                  </object>
+                </child>
+              </object>
+            </child>
+          </object>
+          <packing>
+            <property name="position">0</property>
+          </packing>
+        </child>
+        <child>
+          <object class="GtkVBox" id="box1">
+            <property name="visible">True</property>
+	    <property name="spacing">12</property>
+            <child>
+              <object class="GtkHBox" id="box2">
+                <property name="visible">True</property>
+                <property name="homogeneous">True</property>
+                <child>
+                  <object class="GtkVBox" id="box3">
+                    <property name="visible">True</property>
+                    <property name="spacing">6</property>
+                    <child>
+                      <object class="GtkLabel" id="label1">
+                        <property name="visible">True</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Scale X&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="expand">False</property>
+                        <property name="position">0</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkVScale" id="scale_x_scale">
+                        <property name="visible">True</property>
+                        <property name="can_focus">True</property>
+                        <property name="adjustment">scale_x_adjustment</property>
+                        <property name="fill_level">32</property>
+                        <property name="value_pos">right</property>
+                      </object>
+                      <packing>
+                        <property name="position">1</property>
+                      </packing>
+                    </child>
+                  </object>
+                  <packing>
+                    <property name="expand">False</property>
+                    <property name="position">0</property>
+                  </packing>
+                </child>
+                <child>
+                  <object class="GtkVBox" id="box4">
+                    <property name="visible">True</property>
+                    <property name="spacing">6</property>
+                    <child>
+                      <object class="GtkLabel" id="label2">
+                        <property name="visible">True</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Scale Y&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="expand">False</property>
+                        <property name="position">0</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkVScale" id="scale_y_scale">
+                        <property name="visible">True</property>
+                        <property name="can_focus">True</property>
+                        <property name="adjustment">scale_y_adjustment</property>
+                        <property name="fill_level">32</property>
+                        <property name="value_pos">right</property>
+                      </object>
+                      <packing>
+                        <property name="position">1</property>
+                      </packing>
+                    </child>
+                  </object>
+                  <packing>
+                    <property name="expand">False</property>
+                    <property name="position">1</property>
+                  </packing>
+                </child>
+                <child>
+                  <object class="GtkVBox" id="box5">
+                    <property name="visible">True</property>
+                    <property name="spacing">6</property>
+                    <child>
+                      <object class="GtkLabel" id="label3">
+                        <property name="visible">True</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Rotate&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="expand">False</property>
+                        <property name="position">0</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkVScale" id="rotate_scale">
+                        <property name="visible">True</property>
+                        <property name="can_focus">True</property>
+                        <property name="adjustment">rotate_adjustment</property>
+                        <property name="fill_level">180</property>
+                        <property name="value_pos">right</property>
+                      </object>
+                      <packing>
+                        <property name="position">1</property>
+                      </packing>
+                    </child>
+                  </object>
+                  <packing>
+                    <property name="expand">False</property>
+                    <property name="position">2</property>
+                  </packing>
+                </child>
+              </object>
+              <packing>
+                <property name="padding">6</property>
+                <property name="position">0</property>
+              </packing>
+            </child>
+            <child>
+              <object class="GtkVBox" id="box6">
+                <property name="visible">True</property>
+		<child>
+		  <object class="GtkCheckButton"
+			  id="lock_checkbutton">
+		    <property name="label" translatable="yes">Lock X and Y Dimensions</property>
+		    <property name="xalign">0.0</property>
+		  </object>
+                  <packing>
+                    <property name="expand">False</property>
+                    <property name="fill">False</property>
+                    <property name="padding">6</property>
+                    <property name="position">1</property>
+                  </packing>
+		</child>
+                <child>
+                  <object class="GtkTable" id="grid1">
+                    <property name="visible">True</property>
+                    <property name="column_spacing">8</property>
+                    <property name="row_spacing">6</property>
+                    <child>
+                      <object class="GtkLabel" id="label4">
+                        <property name="visible">True</property>
+                        <property name="xalign">1</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Reconstruct X:&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                    </child>
+                    <child>
+                      <object class="GtkLabel" id="label5">
+                        <property name="visible">True</property>
+                        <property name="xalign">1</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Reconstruct Y:&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="top_attach">1</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkLabel" id="label6">
+                        <property name="visible">True</property>
+                        <property name="xalign">1</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Sample X:&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="top_attach">2</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkLabel" id="label7">
+                        <property name="visible">True</property>
+                        <property name="xalign">1</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Sample Y:&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="top_attach">3</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkLabel" id="label8">
+                        <property name="visible">True</property>
+                        <property name="xalign">1</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Repeat:&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="top_attach">4</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkLabel" id="label9">
+                        <property name="visible">True</property>
+                        <property name="xalign">1</property>
+                        <property name="label" translatable="yes">&lt;b&gt;Subsample:&lt;/b&gt;</property>
+                        <property name="use_markup">True</property>
+                      </object>
+                      <packing>
+                        <property name="top_attach">5</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkComboBox" id="reconstruct_x_combo_box">
+                        <property name="visible">True</property>
+                      </object>
+                      <packing>
+                        <property name="left_attach">1</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkComboBox" id="reconstruct_y_combo_box">
+                        <property name="visible">True</property>
+                      </object>
+                      <packing>
+                        <property name="left_attach">1</property>
+                        <property name="top_attach">1</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkComboBox" id="sample_x_combo_box">
+                        <property name="visible">True</property>
+                      </object>
+                      <packing>
+                        <property name="left_attach">1</property>
+                        <property name="top_attach">2</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkComboBox" id="sample_y_combo_box">
+                        <property name="visible">True</property>
+                      </object>
+                      <packing>
+                        <property name="left_attach">1</property>
+                        <property name="top_attach">3</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkComboBox" id="repeat_combo_box">
+                        <property name="visible">True</property>
+                      </object>
+                      <packing>
+                        <property name="left_attach">1</property>
+                        <property name="top_attach">4</property>
+                      </packing>
+                    </child>
+                    <child>
+                      <object class="GtkSpinButton" id="subsample_spin_button">
+                        <property name="visible">True</property>
+			<property name="adjustment">subsample_adjustment</property>
+                      </object>
+                      <packing>
+                        <property name="left_attach">1</property>
+                        <property name="top_attach">5</property>
+                      </packing>
+                    </child>
+                  </object>
+                  <packing>
+                    <property name="expand">False</property>
+                    <property name="padding">6</property>
+                    <property name="position">1</property>
+                  </packing>
+                </child>
+              </object>
+              <packing>
+                <property name="expand">False</property>
+                <property name="position">0</property>
+              </packing>
+            </child>
+          </object>
+          <packing>
+            <property name="expand">False</property>
+            <property name="position">1</property>
+          </packing>
+        </child>
+      </object>
+    </child>
+  </object>
+</interface>
diff --git a/src/pixman/demos/screen-test.c b/src/pixman/demos/screen-test.c
new file mode 100644
index 0000000..e69dba3
--- /dev/null
+++ b/src/pixman/demos/screen-test.c
@@ -0,0 +1,44 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 40
+#define HEIGHT 40
+    
+    uint32_t *src1 = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src2 = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src3 = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *dest = malloc (3 * WIDTH * 2 * HEIGHT * 4);
+    pixman_image_t *simg1, *simg2, *simg3, *dimg;
+
+    int i;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+    {
+	src1[i] = 0x7ff00000;
+	src2[i] = 0x7f00ff00;
+	src3[i] = 0x7f0000ff;
+    }
+
+    for (i = 0; i < 3 * WIDTH * 2 * HEIGHT; ++i)
+    {
+	dest[i] = 0x0;
+    }
+
+    simg1 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src1, WIDTH * 4);
+    simg2 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src2, WIDTH * 4);
+    simg3 = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, src3, WIDTH * 4);
+    dimg  = pixman_image_create_bits (PIXMAN_a8r8g8b8, 3 * WIDTH, 2 * HEIGHT, dest, 3 * WIDTH * 4);
+
+    pixman_image_composite (PIXMAN_OP_SCREEN, simg1, NULL, dimg, 0, 0, 0, 0, WIDTH, HEIGHT / 4, WIDTH, HEIGHT);
+    pixman_image_composite (PIXMAN_OP_SCREEN, simg2, NULL, dimg, 0, 0, 0, 0, (WIDTH/2), HEIGHT / 4 + HEIGHT / 2, WIDTH, HEIGHT);
+    pixman_image_composite (PIXMAN_OP_SCREEN, simg3, NULL, dimg, 0, 0, 0, 0, (4 * WIDTH) / 3, HEIGHT, WIDTH, HEIGHT);
+
+    show_image (dimg);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/srgb-test.c b/src/pixman/demos/srgb-test.c
new file mode 100644
index 0000000..681d521
--- /dev/null
+++ b/src/pixman/demos/srgb-test.c
@@ -0,0 +1,87 @@
+#include <math.h>
+
+#include "pixman.h"
+#include "gtk-utils.h"
+
+static uint32_t
+linear_argb_to_premult_argb (float a,
+			     float r,
+			     float g,
+			     float b)
+{
+    r *= a;
+    g *= a;
+    b *= a;
+    return (uint32_t) (a * 255.0f + 0.5f) << 24
+	 | (uint32_t) (r * 255.0f + 0.5f) << 16
+	 | (uint32_t) (g * 255.0f + 0.5f) <<  8
+	 | (uint32_t) (b * 255.0f + 0.5f) <<  0;
+}
+
+static float
+lin2srgb (float linear)
+{
+    if (linear < 0.0031308f)
+	return linear * 12.92f;
+    else
+	return 1.055f * powf (linear, 1.0f/2.4f) - 0.055f;
+}
+
+static uint32_t
+linear_argb_to_premult_srgb_argb (float a,
+				  float r,
+				  float g,
+				  float b)
+{
+    r = lin2srgb (r * a);
+    g = lin2srgb (g * a);
+    b = lin2srgb (b * a);
+    return (uint32_t) (a * 255.0f + 0.5f) << 24
+	 | (uint32_t) (r * 255.0f + 0.5f) << 16
+	 | (uint32_t) (g * 255.0f + 0.5f) <<  8
+	 | (uint32_t) (b * 255.0f + 0.5f) <<  0;
+}
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    int y, x, p;
+    float alpha;
+ 
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *src1 = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *dest_img, *src1_img;
+   
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8_sRGB,
+					 WIDTH, HEIGHT,
+					 dest,
+					 WIDTH * 4);
+    src1_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT,
+					 src1,
+					 WIDTH * 4);
+
+    for (y = 0; y < HEIGHT; y ++)
+    {
+	p = WIDTH * y;
+	for (x = 0; x < WIDTH; x ++)
+	{
+	     alpha = (float) x / WIDTH;
+	     src1[p + x] = linear_argb_to_premult_argb (alpha, 1, 0, 1);
+	     dest[p + x] = linear_argb_to_premult_srgb_argb (1-alpha, 0, 1, 0);
+	}
+    }
+    
+    pixman_image_composite (PIXMAN_OP_ADD, src1_img, NULL, dest_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    pixman_image_unref (src1_img);
+    free (src1);
+
+    show_image (dest_img);
+    pixman_image_unref (dest_img);
+    free (dest);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/srgb-trap-test.c b/src/pixman/demos/srgb-trap-test.c
new file mode 100644
index 0000000..d5ae16a
--- /dev/null
+++ b/src/pixman/demos/srgb-trap-test.c
@@ -0,0 +1,119 @@
+#include <math.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+#define F(x)								\
+    pixman_double_to_fixed (x)
+
+#define WIDTH 600
+#define HEIGHT 300
+
+static uint16_t
+convert_to_srgb (uint16_t in)
+{
+    double d = in * (1/65535.0);
+    double a = 0.055;
+
+    if (d < 0.0031308)
+	d = 12.92 * d;
+    else
+	d = (1 + a) * pow (d, 1 / 2.4) - a;
+
+    return (d * 65535.0) + 0.5;
+}
+
+static void
+convert_color (pixman_color_t *dest_srgb, pixman_color_t *linear)
+{
+    dest_srgb->alpha = convert_to_srgb (linear->alpha);
+    dest_srgb->red = convert_to_srgb (linear->red);
+    dest_srgb->green = convert_to_srgb (linear->green);
+    dest_srgb->blue = convert_to_srgb (linear->blue);
+}
+
+int
+main (int argc, char **argv)
+{
+    static const pixman_trapezoid_t traps[] =
+    {
+	{ F(10.10), F(280.0),
+	  { { F(20.0), F(10.10) },
+	    { F(5.3), F(280.0) } },
+	  { { F(20.3), F(10.10) },
+	    { F(5.6), F(280.0) } }
+	},
+	{ F(10.10), F(280.0),
+	  { { F(40.0), F(10.10) },
+	    { F(15.3), F(280.0) } },
+	  { { F(41.0), F(10.10) },
+	    { F(16.3), F(280.0) } }
+	},
+	{ F(10.10), F(280.0),
+	  { { F(120.0), F(10.10) },
+	    { F(5.3), F(280.0) } },
+	  { { F(128.3), F(10.10) },
+	    { F(6.6), F(280.0) } }
+	},
+	{ F(10.10), F(280.0),
+	  { { F(60.0), F(10.10) },
+	    { F(25.3), F(280.0) } },
+	  { { F(61.0), F(10.10) },
+	    { F(26.3), F(280.0) } }
+	},
+	{ F(10.10), F(280.0),
+	  { { F(90.0), F(10.10) },
+	    { F(55.3), F(280.0) } },
+	  { { F(93.0), F(10.10) },
+	    { F(58.3), F(280.0) } }
+	},
+	{ F(130.10), F(150.0),
+	  { { F(100.0), F(130.10) },
+	    { F(250.3), F(150.0) } },
+	  { { F(110.0), F(130.10) },
+	    { F(260.3), F(150.0) } }
+	},
+	{ F(170.10), F(240.0),
+	  { { F(100.0), F(170.10) },
+	    { F(120.3), F(240.0) } },
+	  { { F(250.0), F(170.10) },
+	    { F(250.3), F(240.0) } }
+	},
+    };
+
+    pixman_image_t *src, *dest_srgb, *dest_linear;
+    pixman_color_t bg = { 0x0000, 0x0000, 0x0000, 0xffff };
+    pixman_color_t fg = { 0xffff, 0xffff, 0xffff, 0xffff };
+    pixman_color_t fg_srgb;
+    uint32_t *d;
+
+    d = malloc (WIDTH * HEIGHT * 4);
+    
+    dest_srgb = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8_sRGB, WIDTH, HEIGHT, d, WIDTH * 4);
+    dest_linear = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, WIDTH, HEIGHT, d, WIDTH * 4);
+    
+    src = pixman_image_create_solid_fill (&bg);
+    pixman_image_composite32 (PIXMAN_OP_SRC,
+			      src, NULL, dest_srgb,
+			      0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    
+    src = pixman_image_create_solid_fill (&fg);
+    
+    pixman_composite_trapezoids (PIXMAN_OP_OVER,
+				 src, dest_srgb, PIXMAN_a8,
+				 0, 0, 10, 10, G_N_ELEMENTS (traps), traps);
+
+    convert_color (&fg_srgb, &fg);
+    src = pixman_image_create_solid_fill (&fg_srgb);
+    
+    pixman_composite_trapezoids (PIXMAN_OP_OVER,
+				 src, dest_linear, PIXMAN_a8,
+				 0, 0, 310, 10, G_N_ELEMENTS (traps), traps);
+
+    show_image (dest_linear);
+    pixman_image_unref(dest_linear);
+    free(d);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/trap-test.c b/src/pixman/demos/trap-test.c
new file mode 100644
index 0000000..19295e7
--- /dev/null
+++ b/src/pixman/demos/trap-test.c
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+    pixman_image_t *src_img;
+    pixman_image_t *mask_img;
+    pixman_image_t *dest_img;
+    pixman_trap_t trap;
+    pixman_color_t white = { 0x0000, 0xffff, 0x0000, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mbits = malloc (WIDTH * HEIGHT);
+
+    memset (mbits, 0, WIDTH * HEIGHT);
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    
+    trap.top.l = pixman_int_to_fixed (50) + 0x8000;
+    trap.top.r = pixman_int_to_fixed (150) + 0x8000;
+    trap.top.y = pixman_int_to_fixed (30);
+
+    trap.bot.l = pixman_int_to_fixed (50) + 0x8000;
+    trap.bot.r = pixman_int_to_fixed (150) + 0x8000;
+    trap.bot.y = pixman_int_to_fixed (150);
+
+    mask_img = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, mbits, WIDTH);
+    src_img = pixman_image_create_solid_fill (&white);
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    
+    pixman_add_traps (mask_img, 0, 0, 1, &trap);
+
+    pixman_image_composite (PIXMAN_OP_OVER,
+			    src_img, mask_img, dest_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+    
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (bits);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/tri-test.c b/src/pixman/demos/tri-test.c
new file mode 100644
index 0000000..a71869a
--- /dev/null
+++ b/src/pixman/demos/tri-test.c
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../test/utils.h"
+#include "gtk-utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 200
+#define HEIGHT 200
+
+#define POINT(x,y)							\
+    { pixman_double_to_fixed ((x)), pixman_double_to_fixed ((y)) }
+    
+    pixman_image_t *src_img, *dest_img;
+    pixman_triangle_t tris[4] =
+    {
+	{ POINT (100, 100), POINT (10, 50), POINT (110, 10) },
+	{ POINT (100, 100), POINT (150, 10), POINT (200, 50) },
+	{ POINT (100, 100), POINT (10, 170), POINT (90, 175) },
+	{ POINT (100, 100), POINT (170, 150), POINT (120, 190) },
+    };
+    pixman_color_t color = { 0x4444, 0x4444, 0xffff, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    int i;
+
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	bits[i] = (i / HEIGHT) * 0x01010000;
+    
+    src_img = pixman_image_create_solid_fill (&color);
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    
+    pixman_composite_triangles (PIXMAN_OP_ATOP_REVERSE,
+				src_img,
+				dest_img,
+				PIXMAN_a8,
+				200, 200,
+				-5, 5,
+				ARRAY_LENGTH (tris), tris);
+    show_image (dest_img);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dest_img);
+    free (bits);
+    
+    return 0;
+}
diff --git a/src/pixman/demos/zone_plate.png b/src/pixman/demos/zone_plate.png
new file mode 100644
index 0000000..519291d
--- /dev/null
+++ b/src/pixman/demos/zone_plate.png
diff --git a/src/pixman/pixman-1-uninstalled.pc.in b/src/pixman/pixman-1-uninstalled.pc.in
new file mode 100644
index 0000000..e0347d0
--- /dev/null
+++ b/src/pixman/pixman-1-uninstalled.pc.in
@@ -0,0 +1,5 @@
+Name: Pixman
+Description: The pixman library (version 1)
+Version: @PACKAGE_VERSION@
+Cflags: -I${pc_top_builddir}/${pcfiledir}/pixman
+Libs: ${pc_top_builddir}/${pcfiledir}/pixman/libpixman-1.la
diff --git a/src/pixman/pixman-1.pc.in b/src/pixman/pixman-1.pc.in
new file mode 100644
index 0000000..e3b9711
--- /dev/null
+++ b/src/pixman/pixman-1.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: Pixman
+Description: The pixman library (version 1)
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/pixman-1
+Libs: -L${libdir} -lpixman-1
+
diff --git a/src/pixman/pixman/Makefile.am b/src/pixman/pixman/Makefile.am
new file mode 100644
index 0000000..b376d9a
--- /dev/null
+++ b/src/pixman/pixman/Makefile.am
@@ -0,0 +1,139 @@
+include $(top_srcdir)/pixman/Makefile.sources
+
+lib_LTLIBRARIES = libpixman-1.la
+
+libpixman_1_la_LDFLAGS = -version-info $(LT_VERSION_INFO) -no-undefined @PTHREAD_LDFLAGS@ 
+libpixman_1_la_LIBADD = @PTHREAD_LIBS@ -lm
+libpixman_1_la_SOURCES = $(libpixman_sources) $(libpixman_headers)
+
+libpixmanincludedir = $(includedir)/pixman-1
+libpixmaninclude_HEADERS = pixman.h pixman-version.h
+noinst_LTLIBRARIES = 
+
+EXTRA_DIST =				\
+	Makefile.win32			\
+	pixman-region.c			\
+	solaris-hwcap.mapfile		\
+	$(NULL)
+
+# mmx code
+if USE_X86_MMX
+noinst_LTLIBRARIES += libpixman-mmx.la
+libpixman_mmx_la_SOURCES = \
+	pixman-mmx.c
+libpixman_mmx_la_CFLAGS = $(MMX_CFLAGS)
+libpixman_1_la_LDFLAGS += $(MMX_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-mmx.la
+
+ASM_CFLAGS_mmx=$(MMX_CFLAGS)
+endif
+
+# vmx code
+if USE_VMX
+noinst_LTLIBRARIES += libpixman-vmx.la
+libpixman_vmx_la_SOURCES = \
+	pixman-vmx.c \
+	pixman-combine32.h
+libpixman_vmx_la_CFLAGS = $(VMX_CFLAGS)
+libpixman_1_la_LIBADD += libpixman-vmx.la
+
+ASM_CFLAGS_vmx=$(VMX_CFLAGS)
+endif
+
+# sse2 code
+if USE_SSE2
+noinst_LTLIBRARIES += libpixman-sse2.la
+libpixman_sse2_la_SOURCES = \
+	pixman-sse2.c
+libpixman_sse2_la_CFLAGS = $(SSE2_CFLAGS)
+libpixman_1_la_LDFLAGS += $(SSE2_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-sse2.la
+
+ASM_CFLAGS_sse2=$(SSE2_CFLAGS)
+endif
+
+# ssse3 code
+if USE_SSSE3
+noinst_LTLIBRARIES += libpixman-ssse3.la
+libpixman_ssse3_la_SOURCES = \
+	pixman-ssse3.c
+libpixman_ssse3_la_CFLAGS = $(SSSE3_CFLAGS)
+libpixman_1_la_LDFLAGS += $(SSSE3_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-ssse3.la
+
+ASM_CFLAGS_ssse3=$(SSSE3_CFLAGS)
+endif
+
+# arm simd code
+if USE_ARM_SIMD
+noinst_LTLIBRARIES += libpixman-arm-simd.la
+libpixman_arm_simd_la_SOURCES = \
+	pixman-arm-simd.c	\
+	pixman-arm-common.h	\
+	pixman-arm-simd-asm.S   \
+	pixman-arm-simd-asm-scaled.S \
+	pixman-arm-simd-asm.h
+libpixman_1_la_LIBADD += libpixman-arm-simd.la
+
+ASM_CFLAGS_arm_simd=
+endif
+
+# arm neon code
+if USE_ARM_NEON
+noinst_LTLIBRARIES += libpixman-arm-neon.la
+libpixman_arm_neon_la_SOURCES = \
+        pixman-arm-neon.c	\
+        pixman-arm-common.h	\
+        pixman-arm-neon-asm.S	\
+		pixman-arm-neon-asm-bilinear.S \
+        pixman-arm-neon-asm.h
+libpixman_1_la_LIBADD += libpixman-arm-neon.la
+
+ASM_CFLAGS_arm_neon=
+endif
+
+# iwmmxt code
+if USE_ARM_IWMMXT
+libpixman_iwmmxt_la_SOURCES = pixman-mmx.c
+noinst_LTLIBRARIES += libpixman-iwmmxt.la
+libpixman_1_la_LIBADD += libpixman-iwmmxt.la
+
+libpixman_iwmmxt_la-pixman-mmx.lo: pixman-mmx.c
+	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(AM_CPPFLAGS) $(AM_CPPFLAGS) $(CPPFLAGS) $(CFLAGS) $(IWMMXT_CFLAGS) -MT libpixman_iwmmxt_la-pixman-mmx.lo -MD -MP -MF $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo -c -o libpixman_iwmmxt_la-pixman-mmx.lo `test -f 'pixman-mmx.c' || echo '$(srcdir)/'`pixman-mmx.c
+	$(AM_V_at)$(am__mv) $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Tpo $(DEPDIR)/libpixman_iwmmxt_la-pixman-mmx.Plo
+
+libpixman_iwmmxt_la_DEPENDENCIES = $(am__DEPENDENCIES_1)
+libpixman_iwmmxt_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+        $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(CFLAGS) $(IWMMXT_CFLAGS) $(AM_LDFLAGS) \
+	$(LDFLAGS) -o $@
+
+libpixman-iwmmxt.la: libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_DEPENDENCIES) 
+	$(AM_V_CCLD)$(libpixman_iwmmxt_la_LINK) libpixman_iwmmxt_la-pixman-mmx.lo $(libpixman_iwmmxt_la_LIBADD) $(LIBS)
+endif
+
+# mips dspr2 code
+if USE_MIPS_DSPR2
+noinst_LTLIBRARIES += libpixman-mips-dspr2.la
+libpixman_mips_dspr2_la_SOURCES = \
+        pixman-mips-dspr2.c \
+        pixman-mips-dspr2.h \
+        pixman-mips-dspr2-asm.S \
+        pixman-mips-dspr2-asm.h \
+        pixman-mips-memcpy-asm.S
+libpixman_1_la_LIBADD += libpixman-mips-dspr2.la
+
+ASM_CFLAGS_mips_dspr2=
+endif
+
+# loongson code
+if USE_LOONGSON_MMI
+noinst_LTLIBRARIES += libpixman-loongson-mmi.la
+libpixman_loongson_mmi_la_SOURCES = pixman-mmx.c loongson-mmintrin.h
+libpixman_loongson_mmi_la_CFLAGS = $(LS_CFLAGS)
+libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
+libpixman_1_la_LIBADD += libpixman-loongson-mmi.la
+endif
+
+.c.s : $(libpixmaninclude_HEADERS)
+	$(CC) $(CFLAGS) $(ASM_CFLAGS_$(@:pixman-%.s=%)) $(ASM_CFLAGS_$(@:pixman-arm-%.s=arm_%)) -DHAVE_CONFIG_H -I$(srcdir) -I$(builddir) -I$(top_builddir) -S -o $@ $<
diff --git a/src/pixman/pixman/Makefile.sources b/src/pixman/pixman/Makefile.sources
new file mode 100644
index 0000000..c624eb9
--- /dev/null
+++ b/src/pixman/pixman/Makefile.sources
@@ -0,0 +1,42 @@
+libpixman_sources =			\
+	pixman.c			\
+	pixman-access.c			\
+	pixman-access-accessors.c	\
+	pixman-bits-image.c		\
+	pixman-combine32.c		\
+	pixman-combine-float.c		\
+	pixman-conical-gradient.c	\
+	pixman-filter.c			\
+	pixman-x86.c			\
+	pixman-mips.c			\
+	pixman-arm.c			\
+	pixman-ppc.c			\
+	pixman-edge.c			\
+	pixman-edge-accessors.c		\
+	pixman-fast-path.c		\
+	pixman-glyph.c			\
+	pixman-general.c		\
+	pixman-gradient-walker.c	\
+	pixman-image.c			\
+	pixman-implementation.c		\
+	pixman-linear-gradient.c	\
+	pixman-matrix.c			\
+	pixman-noop.c			\
+	pixman-radial-gradient.c	\
+	pixman-region16.c		\
+	pixman-region32.c		\
+	pixman-solid-fill.c		\
+	pixman-timer.c			\
+	pixman-trap.c			\
+	pixman-utils.c			\
+	$(NULL)
+
+libpixman_headers =			\
+	pixman.h			\
+	pixman-accessor.h		\
+	pixman-combine32.h		\
+	pixman-compiler.h		\
+	pixman-edge-imp.h		\
+	pixman-inlines.h		\
+	pixman-private.h		\
+	$(NULL)
diff --git a/src/pixman/pixman/Makefile.win32 b/src/pixman/pixman/Makefile.win32
new file mode 100644
index 0000000..7b64033
--- /dev/null
+++ b/src/pixman/pixman/Makefile.win32
@@ -0,0 +1,93 @@
+default: all
+
+top_srcdir = ..
+include $(top_srcdir)/pixman/Makefile.sources
+include $(top_srcdir)/Makefile.win32.common
+
+MMX_VAR = $(MMX)
+ifeq ($(MMX_VAR),)
+MMX_VAR=on
+endif
+
+SSE2_VAR = $(SSE2)
+ifeq ($(SSE2_VAR),)
+SSE2_VAR=on
+endif
+
+SSSE3_VAR = $(SSSE3)
+ifeq ($(SSSE3_VAR),)
+SSSE3_VAR=on
+endif
+
+MMX_CFLAGS = -DUSE_X86_MMX -w14710 -w14714
+SSE2_CFLAGS = -DUSE_SSE2
+SSSE3_CFLAGS = -DUSE_SSSE3
+
+# MMX compilation flags
+ifeq ($(MMX_VAR),on)
+PIXMAN_CFLAGS += $(MMX_CFLAGS)
+libpixman_sources += pixman-mmx.c
+endif
+
+# SSE2 compilation flags
+ifeq ($(SSE2_VAR),on)
+PIXMAN_CFLAGS += $(SSE2_CFLAGS)
+libpixman_sources += pixman-sse2.c
+endif
+
+# SSSE3 compilation flags
+ifeq ($(SSSE3_VAR),on)
+PIXMAN_CFLAGS += $(SSSE3_CFLAGS)
+libpixman_sources += pixman-ssse3.c
+endif
+
+OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libpixman_sources))
+
+# targets
+all: inform informMMX informSSE2 informSSSE3 $(CFG_VAR)/$(LIBRARY).lib
+
+informMMX:
+ifneq ($(MMX),off)
+ifneq ($(MMX),on)
+ifneq ($(MMX),)
+	@echo "Invalid specified MMX option : "$(MMX_VAR)"."
+	@echo
+	@echo "Possible choices for MMX are 'on' or 'off'"
+	@exit 1
+endif
+	@echo "Setting MMX flag to default value 'on'... (use MMX=on or MMX=off)"
+endif
+endif
+
+informSSE2:
+ifneq ($(SSE2),off)
+ifneq ($(SSE2),on)
+ifneq ($(SSE2),)
+	@echo "Invalid specified SSE option : "$(SSE2)"."
+	@echo
+	@echo "Possible choices for SSE2 are 'on' or 'off'"
+	@exit 1
+endif
+	@echo "Setting SSE2 flag to default value 'on'... (use SSE2=on or SSE2=off)"
+endif
+endif
+
+informSSSE3:
+ifneq ($(SSSE3),off)
+ifneq ($(SSSE3),on)
+ifneq ($(SSSE3),)
+	@echo "Invalid specified SSE option : "$(SSSE3)"."
+	@echo
+	@echo "Possible choices for SSSE3 are 'on' or 'off'"
+	@exit 1
+endif
+	@echo "Setting SSSE3 flag to default value 'on'... (use SSSE3=on or SSSE3=off)"
+endif
+endif
+
+
+# pixman linking
+$(CFG_VAR)/$(LIBRARY).lib: $(OBJECTS)
+	@$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
+
+.PHONY: all informMMX informSSE2 informSSSE3
diff --git a/src/pixman/pixman/loongson-mmintrin.h b/src/pixman/pixman/loongson-mmintrin.h
new file mode 100644
index 0000000..086c6e0
--- /dev/null
+++ b/src/pixman/pixman/loongson-mmintrin.h
@@ -0,0 +1,410 @@
+/* The gcc-provided loongson intrinsic functions are way too fucking broken
+ * to be of any use, otherwise I'd use them.
+ *
+ * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
+ *   close enough that they could have implemented the _mm_*-style intrinsic
+ *   interface and had a ton of optimized code available to them. Instead they
+ *   implemented something much, much worse.
+ *
+ * - pshuf takes a dead first argument, causing extra instructions to be
+ *   generated.
+ *
+ * - There are no 64-bit shift or logical intrinsics, which means you have
+ *   to implement them with inline assembly, but this is a nightmare because
+ *   gcc doesn't understand that the integer vector datatypes are actually in
+ *   floating-point registers, so you end up with braindead code like
+ *
+ *	punpcklwd	$f9,$f9,$f5
+ *	    dmtc1	v0,$f8
+ *	punpcklwd	$f19,$f19,$f5
+ *	    dmfc1	t9,$f9
+ *	    dmtc1	v0,$f9
+ *	    dmtc1	t9,$f20
+ *	    dmfc1	s0,$f19
+ *	punpcklbh	$f20,$f20,$f2
+ *
+ *   where crap just gets copied back and forth between integer and floating-
+ *   point registers ad nauseum.
+ *
+ * Instead of trying to workaround the problems from these crap intrinsics, I
+ * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
+ * assembly.
+ */
+
+#include <stdint.h>
+
+/* vectors are stored in 64-bit floating-point registers */
+typedef double __m64;
+/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
+typedef float  __m32;
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void)
+{
+	return 0.0;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddush %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("paddusb %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("and %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pcmpeqw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmaddhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmulhuh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("pmullh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("or %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("packushb %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("packsswh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
+{
+	if (__builtin_constant_p (__w3) &&
+	    __builtin_constant_p (__w2) &&
+	    __builtin_constant_p (__w1) &&
+	    __builtin_constant_p (__w0))
+	{
+		uint64_t val = ((uint64_t)__w3 << 48)
+			     | ((uint64_t)__w2 << 32)
+			     | ((uint64_t)__w1 << 16)
+			     | ((uint64_t)__w0 <<  0);
+		return *(__m64 *)&val;
+	}
+	else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
+	{
+		/* TODO: handle other cases */
+		uint64_t val = __w3;
+		uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
+		__m64 ret;
+		asm("pshufh %0, %1, %2\n\t"
+		    : "=f" (ret)
+		    : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
+		);
+		return ret;
+	}
+	uint64_t val = ((uint64_t)__w3 << 48)
+		     | ((uint64_t)__w2 << 32)
+		     | ((uint64_t)__w1 << 16)
+		     | ((uint64_t)__w0 <<  0);
+	return *(__m64 *)&val;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (unsigned __i1, unsigned __i0)
+{
+	if (__builtin_constant_p (__i1) &&
+	    __builtin_constant_p (__i0))
+	{
+		uint64_t val = ((uint64_t)__i1 << 32)
+			     | ((uint64_t)__i0 <<  0);
+		return *(__m64 *)&val;
+	}
+	else if (__i1 == __i0)
+	{
+		uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
+		__m64 ret;
+		asm("pshufh %0, %1, %2\n\t"
+		    : "=f" (ret)
+		    : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
+		);
+		return ret;
+	}
+	uint64_t val = ((uint64_t)__i1 << 32)
+		     | ((uint64_t)__i0 <<  0);
+	return *(__m64 *)&val;
+}
+#undef _MM_SHUFFLE
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __m, int64_t __n)
+{
+	__m64 ret;
+	asm("pshufh %0, %1, %2\n\t"
+	    : "=f" (ret)
+	    : "f" (__m), "f" (*(__m64 *)&__n)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psllh  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("dsll  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psrlh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("psrlw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int64_t __count)
+{
+	__m64 ret;
+	asm("dsrl  %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__count)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("psubh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpckhbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpckhhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+/* Since punpcklbh doesn't care about the high 32-bits, we use the __m32 datatype which
+ * allows load8888 to use 32-bit loads */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklbh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("punpcklhw %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+	__m64 ret;
+	asm("xor %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_extract_pi16 (__m64 __m, int64_t __pos)
+{
+	__m64 ret;
+	asm("pextrh %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m), "f" (*(__m64 *)&__pos)
+	);
+	return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
+{
+	__m64 ret;
+	asm("pinsrh_%3 %0, %1, %2\n\t"
+	   : "=f" (ret)
+	   : "f" (__m1), "f" (__m2), "i" (__pos)
+	);
+	return ret;
+}
diff --git a/src/pixman/pixman/make-srgb.pl b/src/pixman/pixman/make-srgb.pl
new file mode 100644
index 0000000..cdaa80b
--- /dev/null
+++ b/src/pixman/pixman/make-srgb.pl
@@ -0,0 +1,115 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+sub linear_to_srgb
+{
+    my ($c) = @_;
+
+    if ($c < 0.0031308)
+    {
+	return $c * 12.92;
+    }
+    else
+    {
+	return 1.055 * $c ** (1.0/2.4) - 0.055;
+    }
+}
+
+sub srgb_to_linear
+{
+    my ($c) = @_;
+
+    if ($c < 0.04045)
+    {
+	return $c / 12.92;
+    }
+    else
+    {
+	return (($c + 0.055) / 1.055) ** 2.4
+    }
+}
+
+my @linear_to_srgb;
+for my $linear (0 .. 4095)
+{
+    my $srgb = int(linear_to_srgb($linear / 4095.0) * 255.0 + 0.5);
+    push @linear_to_srgb, $srgb;
+}
+
+my @srgb_to_linear;
+for my $srgb (0 .. 255)
+{
+    my $linear = int(srgb_to_linear($srgb / 255.0) * 65535.0 + 0.5);
+    push @srgb_to_linear, $linear;
+}
+
+# Ensure that we have a lossless sRGB and back conversion loop.
+# some of the darkest shades need a little bias -- maximum is just
+# 5 increments out of 16. This gives us useful property with
+# least amount of error in the sRGB-to-linear table, and keeps the actual
+# table lookup in the other direction as simple as possible.
+for my $srgb (0 .. $#srgb_to_linear)
+{
+    my $add = 0;
+    while (1)
+    {
+	my $linear = $srgb_to_linear[$srgb];
+	my $srgb_lossy = $linear_to_srgb[$linear >> 4];
+	last if $srgb == $srgb_lossy;
+
+	# Add slight bias to this component until it rounds correctly
+	$srgb_to_linear[$srgb] ++;
+	$add ++;
+    }
+    die "Too many adds at $srgb" if $add > 5;
+}
+
+print <<"PROLOG";
+/* WARNING: This file is generated by $0.
+ * Please edit that file instead of this one.
+ */
+
+#include <stdint.h>
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+PROLOG
+
+print "const uint8_t linear_to_srgb[" . @linear_to_srgb . "] =\n";
+print "{\n";
+for my $linear (0 .. $#linear_to_srgb)
+{
+    if (($linear % 10) == 0)
+    {
+	print "\t";
+    }
+    print sprintf("%d, ", $linear_to_srgb[$linear]);
+    if (($linear % 10) == 9)
+    {
+	print "\n";
+    }
+}
+print "\n};\n";
+print "\n";
+
+print "const uint16_t srgb_to_linear[" . @srgb_to_linear . "] =\n";
+print "{\n";
+for my $srgb (0 .. $#srgb_to_linear)
+{
+    if (($srgb % 10) == 0)
+    {
+	print "\t";
+    }
+    print sprintf("%d, ", $srgb_to_linear[$srgb]);
+    if (($srgb % 10) == 9)
+    {
+	print "\n";
+    }
+}
+print "\n};\n";
+
diff --git a/src/pixman/pixman/pixman-access-accessors.c b/src/pixman/pixman/pixman-access-accessors.c
new file mode 100644
index 0000000..3263582
--- /dev/null
+++ b/src/pixman/pixman/pixman-access-accessors.c
@@ -0,0 +1,3 @@
+#define PIXMAN_FB_ACCESSORS
+
+#include "pixman-access.c"
diff --git a/src/pixman/pixman/pixman-access.c b/src/pixman/pixman/pixman-access.c
new file mode 100644
index 0000000..4f0642d
--- /dev/null
+++ b/src/pixman/pixman/pixman-access.c
@@ -0,0 +1,1433 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+
+#include "pixman-accessor.h"
+#include "pixman-private.h"
+
+#define CONVERT_RGB24_TO_Y15(s)						\
+    (((((s) >> 16) & 0xff) * 153 +					\
+      (((s) >>  8) & 0xff) * 301 +					\
+      (((s)      ) & 0xff) * 58) >> 2)
+
+#define CONVERT_RGB24_TO_RGB15(s)                                       \
+    ((((s) >> 3) & 0x001f) |                                            \
+     (((s) >> 6) & 0x03e0) |                                            \
+     (((s) >> 9) & 0x7c00))
+
+/* Fetch macros */
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_1(img,l,o)						\
+    (((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> (0x1f - ((o) & 0x1f))) & 0x1)
+#else
+#define FETCH_1(img,l,o)						\
+    ((((READ ((img), ((uint32_t *)(l)) + ((o) >> 5))) >> ((o) & 0x1f))) & 0x1)
+#endif
+
+#define FETCH_8(img,l,o)    (READ (img, (((uint8_t *)(l)) + ((o) >> 3))))
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_4(img,l,o)						\
+    (((4 * (o)) & 4) ? (FETCH_8 (img,l, 4 * (o)) & 0xf) : (FETCH_8 (img,l,(4 * (o))) >> 4))
+#else
+#define FETCH_4(img,l,o)						\
+    (((4 * (o)) & 4) ? (FETCH_8 (img, l, 4 * (o)) >> 4) : (FETCH_8 (img, l, (4 * (o))) & 0xf))
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define FETCH_24(img,l,o)                                              \
+    ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 16)    |       \
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)     |       \
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 0))
+#else
+#define FETCH_24(img,l,o)						\
+    ((READ (img, (((uint8_t *)(l)) + ((o) * 3) + 0)) << 0)	|	\
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 1)) << 8)	|	\
+     (READ (img, (((uint8_t *)(l)) + ((o) * 3) + 2)) << 16))
+#endif
+
+/* Store macros */
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_1(img,l,o,v)						\
+    do									\
+    {									\
+	uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);		\
+	uint32_t __m, __v;						\
+									\
+	__m = 1 << (0x1f - ((o) & 0x1f));				\
+	__v = (v)? __m : 0;						\
+									\
+	WRITE((img), __d, (READ((img), __d) & ~__m) | __v);		\
+    }									\
+    while (0)
+#else
+#define STORE_1(img,l,o,v)						\
+    do									\
+    {									\
+	uint32_t  *__d = ((uint32_t *)(l)) + ((o) >> 5);		\
+	uint32_t __m, __v;						\
+									\
+	__m = 1 << ((o) & 0x1f);					\
+	__v = (v)? __m : 0;						\
+									\
+	WRITE((img), __d, (READ((img), __d) & ~__m) | __v);		\
+    }									\
+    while (0)
+#endif
+
+#define STORE_8(img,l,o,v)  (WRITE (img, (uint8_t *)(l) + ((o) >> 3), (v)))
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_4(img,l,o,v)						\
+    do									\
+    {									\
+	int bo = 4 * (o);						\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4) :		\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4)));	\
+    } while (0)
+#else
+#define STORE_4(img,l,o,v)						\
+    do									\
+    {									\
+	int bo = 4 * (o);						\
+	int v4 = (v) & 0x0f;						\
+									\
+	STORE_8 (img, l, bo, (						\
+		     bo & 4 ?						\
+		     (FETCH_8 (img, l, bo) & 0x0f) | (v4 << 4) :	\
+		     (FETCH_8 (img, l, bo) & 0xf0) | (v4)));		\
+    } while (0)
+#endif
+
+#ifdef WORDS_BIGENDIAN
+#define STORE_24(img,l,o,v)                                            \
+    do                                                                 \
+    {                                                                  \
+	uint8_t *__tmp = (l) + 3 * (o);				       \
+        							       \
+	WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16);	       \
+	WRITE ((img), __tmp++, ((v) & 0x0000ff00) >>  8);	       \
+	WRITE ((img), __tmp++, ((v) & 0x000000ff) >>  0);	       \
+    }                                                                  \
+    while (0)
+#else
+#define STORE_24(img,l,o,v)                                            \
+    do                                                                 \
+    {                                                                  \
+	uint8_t *__tmp = (l) + 3 * (o);				       \
+        							       \
+	WRITE ((img), __tmp++, ((v) & 0x000000ff) >>  0);	       \
+	WRITE ((img), __tmp++, ((v) & 0x0000ff00) >>  8);	       \
+	WRITE ((img), __tmp++, ((v) & 0x00ff0000) >> 16);	       \
+    }								       \
+    while (0)
+#endif
+
+/*
+ * YV12 setup and access macros
+ */
+
+#define YV12_SETUP(image)                                               \
+    bits_image_t *__bits_image = (bits_image_t *)image;                 \
+    uint32_t *bits = __bits_image->bits;                                \
+    int stride = __bits_image->rowstride;                               \
+    int offset0 = stride < 0 ?                                          \
+    ((-stride) >> 1) * ((__bits_image->height - 1) >> 1) - stride :	\
+    stride * __bits_image->height;					\
+    int offset1 = stride < 0 ?                                          \
+    offset0 + ((-stride) >> 1) * ((__bits_image->height) >> 1) :	\
+	offset0 + (offset0 >> 2)
+
+/* Note no trailing semicolon on the above macro; if it's there, then
+ * the typical usage of YV12_SETUP(image); will have an extra trailing ;
+ * that some compilers will interpret as a statement -- and then any further
+ * variable declarations will cause an error.
+ */
+
+#define YV12_Y(line)                                                    \
+    ((uint8_t *) ((bits) + (stride) * (line)))
+
+#define YV12_U(line)                                                    \
+    ((uint8_t *) ((bits) + offset1 +                                    \
+                  ((stride) >> 1) * ((line) >> 1)))
+
+#define YV12_V(line)                                                    \
+    ((uint8_t *) ((bits) + offset0 +                                    \
+                  ((stride) >> 1) * ((line) >> 1)))
+
+/* Misc. helpers */
+
+static force_inline void
+get_shifts (pixman_format_code_t  format,
+	    int			 *a,
+	    int			 *r,
+	    int                  *g,
+	    int                  *b)
+{
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_A:
+	*b = 0;
+	*g = 0;
+	*r = 0;
+	*a = 0;
+	break;
+
+    case PIXMAN_TYPE_ARGB:
+    case PIXMAN_TYPE_ARGB_SRGB:
+	*b = 0;
+	*g = *b + PIXMAN_FORMAT_B (format);
+	*r = *g + PIXMAN_FORMAT_G (format);
+	*a = *r + PIXMAN_FORMAT_R (format);
+	break;
+
+    case PIXMAN_TYPE_ABGR:
+	*r = 0;
+	*g = *r + PIXMAN_FORMAT_R (format);
+	*b = *g + PIXMAN_FORMAT_G (format);
+	*a = *b + PIXMAN_FORMAT_B (format);
+	break;
+
+    case PIXMAN_TYPE_BGRA:
+	/* With BGRA formats we start counting at the high end of the pixel */
+	*b = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_B (format);
+	*g = *b - PIXMAN_FORMAT_B (format);
+	*r = *g - PIXMAN_FORMAT_G (format);
+	*a = *r - PIXMAN_FORMAT_R (format);
+	break;
+
+    case PIXMAN_TYPE_RGBA:
+	/* With BGRA formats we start counting at the high end of the pixel */
+	*r = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_R (format);
+	*g = *r - PIXMAN_FORMAT_R (format);
+	*b = *g - PIXMAN_FORMAT_G (format);
+	*a = *b - PIXMAN_FORMAT_B (format);
+	break;
+
+    default:
+	assert (0);
+	break;
+    }
+}
+
+static force_inline uint32_t
+convert_channel (uint32_t pixel, uint32_t def_value,
+		 int n_from_bits, int from_shift,
+		 int n_to_bits, int to_shift)
+{
+    uint32_t v;
+
+    if (n_from_bits && n_to_bits)
+	v  = unorm_to_unorm (pixel >> from_shift, n_from_bits, n_to_bits);
+    else if (n_to_bits)
+	v = def_value;
+    else
+	v = 0;
+
+    return (v & ((1 << n_to_bits) - 1)) << to_shift;
+}
+
+static force_inline uint32_t
+convert_pixel (pixman_format_code_t from, pixman_format_code_t to, uint32_t pixel)
+{
+    int a_from_shift, r_from_shift, g_from_shift, b_from_shift;
+    int a_to_shift, r_to_shift, g_to_shift, b_to_shift;
+    uint32_t a, r, g, b;
+
+    get_shifts (from, &a_from_shift, &r_from_shift, &g_from_shift, &b_from_shift);
+    get_shifts (to, &a_to_shift, &r_to_shift, &g_to_shift, &b_to_shift);
+
+    a = convert_channel (pixel, ~0,
+			 PIXMAN_FORMAT_A (from), a_from_shift,
+			 PIXMAN_FORMAT_A (to), a_to_shift);
+
+    r = convert_channel (pixel, 0,
+			 PIXMAN_FORMAT_R (from), r_from_shift,
+			 PIXMAN_FORMAT_R (to), r_to_shift);
+
+    g = convert_channel (pixel, 0,
+			 PIXMAN_FORMAT_G (from), g_from_shift,
+			 PIXMAN_FORMAT_G (to), g_to_shift);
+
+    b = convert_channel (pixel, 0,
+			 PIXMAN_FORMAT_B (from), b_from_shift,
+			 PIXMAN_FORMAT_B (to), b_to_shift);
+
+    return a | r | g | b;
+}
+
+static force_inline uint32_t
+convert_pixel_to_a8r8g8b8 (bits_image_t *image,
+			   pixman_format_code_t format,
+			   uint32_t pixel)
+{
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY		||
+	PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
+    {
+	return image->indexed->rgba[pixel];
+    }
+    else
+    {
+	return convert_pixel (format, PIXMAN_a8r8g8b8, pixel);
+    }
+}
+
+static force_inline uint32_t
+convert_pixel_from_a8r8g8b8 (pixman_image_t *image,
+			     pixman_format_code_t format, uint32_t pixel)
+{
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
+    {
+	pixel = CONVERT_RGB24_TO_Y15 (pixel);
+
+	return image->bits.indexed->ent[pixel & 0x7fff];
+    }
+    else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
+    {
+	pixel = convert_pixel (PIXMAN_a8r8g8b8, PIXMAN_x1r5g5b5, pixel);
+
+	return image->bits.indexed->ent[pixel & 0x7fff];
+    }
+    else
+    {
+	return convert_pixel (PIXMAN_a8r8g8b8, format, pixel);
+    }
+}
+
+static force_inline uint32_t
+fetch_and_convert_pixel (bits_image_t *		image,
+			 const uint8_t *	bits,
+			 int			offset,
+			 pixman_format_code_t	format)
+{
+    uint32_t pixel;
+
+    switch (PIXMAN_FORMAT_BPP (format))
+    {
+    case 1:
+	pixel = FETCH_1 (image, bits, offset);
+	break;
+
+    case 4:
+	pixel = FETCH_4 (image, bits, offset);
+	break;
+
+    case 8:
+	pixel = READ (image, bits + offset);
+	break;
+
+    case 16:
+	pixel = READ (image, ((uint16_t *)bits + offset));
+	break;
+
+    case 24:
+	pixel = FETCH_24 (image, bits, offset);
+	break;
+
+    case 32:
+	pixel = READ (image, ((uint32_t *)bits + offset));
+	break;
+
+    default:
+	pixel = 0xffff00ff; /* As ugly as possible to detect the bug */
+	break;
+    }
+
+    return convert_pixel_to_a8r8g8b8 (image, format, pixel);
+}
+
+static force_inline void
+convert_and_store_pixel (bits_image_t *		image,
+			 uint8_t *		dest,
+			 int                    offset,
+			 pixman_format_code_t	format,
+			 uint32_t		pixel)
+{
+    uint32_t converted = convert_pixel_from_a8r8g8b8 (
+	(pixman_image_t *)image, format, pixel);
+
+    switch (PIXMAN_FORMAT_BPP (format))
+    {
+    case 1:
+	STORE_1 (image, dest, offset, converted & 0x01);
+	break;
+
+    case 4:
+	STORE_4 (image, dest, offset, converted & 0xf);
+	break;
+
+    case 8:
+	WRITE (image, (dest + offset), converted & 0xff);
+	break;
+
+    case 16:
+	WRITE (image, ((uint16_t *)dest + offset), converted & 0xffff);
+	break;
+
+    case 24:
+	STORE_24 (image, dest, offset, converted);
+	break;
+
+    case 32:
+	WRITE (image, ((uint32_t *)dest + offset), converted);
+	break;
+
+    default:
+	*dest = 0x0;
+	break;
+    }
+}
+
+#define MAKE_ACCESSORS(format)						\
+    static void								\
+    fetch_scanline_ ## format (bits_image_t *image,			\
+			       int	       x,			\
+			       int             y,			\
+			       int             width,			\
+			       uint32_t *      buffer,			\
+			       const uint32_t *mask)			\
+    {									\
+	uint8_t *bits =							\
+	    (uint8_t *)(image->bits + y * image->rowstride);		\
+	int i;								\
+									\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    *buffer++ =							\
+		fetch_and_convert_pixel (image, bits, x + i, PIXMAN_ ## format); \
+	}								\
+    }									\
+									\
+    static void								\
+    store_scanline_ ## format (bits_image_t *  image,			\
+			       int             x,			\
+			       int             y,			\
+			       int             width,			\
+			       const uint32_t *values)			\
+    {									\
+	uint8_t *dest =							\
+	    (uint8_t *)(image->bits + y * image->rowstride);		\
+	int i;								\
+									\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    convert_and_store_pixel (					\
+		image, dest, i + x, PIXMAN_ ## format, values[i]);	\
+	}								\
+    }									\
+									\
+    static uint32_t							\
+    fetch_pixel_ ## format (bits_image_t *image,			\
+			    int		offset,				\
+			    int		line)				\
+    {									\
+	uint8_t *bits =							\
+	    (uint8_t *)(image->bits + line * image->rowstride);		\
+									\
+	return fetch_and_convert_pixel (				\
+	    image, bits, offset, PIXMAN_ ## format);			\
+    }									\
+									\
+    static const void *const __dummy__ ## format
+
+MAKE_ACCESSORS(a8r8g8b8);
+MAKE_ACCESSORS(x8r8g8b8);
+MAKE_ACCESSORS(a8b8g8r8);
+MAKE_ACCESSORS(x8b8g8r8);
+MAKE_ACCESSORS(x14r6g6b6);
+MAKE_ACCESSORS(b8g8r8a8);
+MAKE_ACCESSORS(b8g8r8x8);
+MAKE_ACCESSORS(r8g8b8x8);
+MAKE_ACCESSORS(r8g8b8a8);
+MAKE_ACCESSORS(r8g8b8);
+MAKE_ACCESSORS(b8g8r8);
+MAKE_ACCESSORS(r5g6b5);
+MAKE_ACCESSORS(b5g6r5);
+MAKE_ACCESSORS(a1r5g5b5);
+MAKE_ACCESSORS(x1r5g5b5);
+MAKE_ACCESSORS(a1b5g5r5);
+MAKE_ACCESSORS(x1b5g5r5);
+MAKE_ACCESSORS(a4r4g4b4);
+MAKE_ACCESSORS(x4r4g4b4);
+MAKE_ACCESSORS(a4b4g4r4);
+MAKE_ACCESSORS(x4b4g4r4);
+MAKE_ACCESSORS(a8);
+MAKE_ACCESSORS(c8);
+MAKE_ACCESSORS(g8);
+MAKE_ACCESSORS(r3g3b2);
+MAKE_ACCESSORS(b2g3r3);
+MAKE_ACCESSORS(a2r2g2b2);
+MAKE_ACCESSORS(a2b2g2r2);
+MAKE_ACCESSORS(x4a4);
+MAKE_ACCESSORS(a4);
+MAKE_ACCESSORS(g4);
+MAKE_ACCESSORS(c4);
+MAKE_ACCESSORS(r1g2b1);
+MAKE_ACCESSORS(b1g2r1);
+MAKE_ACCESSORS(a1r1g1b1);
+MAKE_ACCESSORS(a1b1g1r1);
+MAKE_ACCESSORS(a1);
+MAKE_ACCESSORS(g1);
+
+/********************************** Fetch ************************************/
+/* Table mapping sRGB-encoded 8 bit numbers to linearly encoded
+ * floating point numbers. We assume that single precision
+ * floating point follows the IEEE 754 format.
+ */
+static const uint32_t to_linear_u[256] =
+{
+    0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40e, 0x3a9f22b4, 0x3ac6eb61,
+    0x3aeeb40e, 0x3b0b3e5d, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518a,
+    0x3b70f18a, 0x3b83e1c5, 0x3b8fe614, 0x3b9c87fb, 0x3ba9c9b5, 0x3bb7ad6d,
+    0x3bc63547, 0x3bd5635f, 0x3be539bd, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152,
+    0x3c15a703, 0x3c1f45bc, 0x3c293e68, 0x3c3391f4, 0x3c3e4149, 0x3c494d43,
+    0x3c54b6c7, 0x3c607eb1, 0x3c6ca5df, 0x3c792d22, 0x3c830aa8, 0x3c89af9e,
+    0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63432, 0x3cadd37d, 0x3cb5a601,
+    0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d2, 0x3cdfd00e, 0x3ce8ddb9,
+    0x3cf2212c, 0x3cfb9ac1, 0x3d02a569, 0x3d0798dc, 0x3d0ca7e4, 0x3d11d2ae,
+    0x3d171963, 0x3d1c7c2e, 0x3d21fb3a, 0x3d2796af, 0x3d2d4ebb, 0x3d332380,
+    0x3d39152b, 0x3d3f23e3, 0x3d454fd0, 0x3d4b991c, 0x3d51ffeb, 0x3d588466,
+    0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c210, 0x3d7add25, 0x3d810b65,
+    0x3d84b793, 0x3d88732e, 0x3d8c3e48, 0x3d9018f4, 0x3d940343, 0x3d97fd48,
+    0x3d9c0714, 0x3da020b9, 0x3da44a48, 0x3da883d6, 0x3daccd70, 0x3db12728,
+    0x3db59110, 0x3dba0b38, 0x3dbe95b2, 0x3dc3308f, 0x3dc7dbe0, 0x3dcc97b4,
+    0x3dd1641c, 0x3dd6412a, 0x3ddb2eec, 0x3de02d75, 0x3de53cd3, 0x3dea5d16,
+    0x3def8e52, 0x3df4d091, 0x3dfa23e5, 0x3dff885e, 0x3e027f06, 0x3e05427f,
+    0x3e080ea2, 0x3e0ae376, 0x3e0dc104, 0x3e10a752, 0x3e139669, 0x3e168e50,
+    0x3e198f0e, 0x3e1c98ab, 0x3e1fab2e, 0x3e22c6a0, 0x3e25eb08, 0x3e29186a,
+    0x3e2c4ed0, 0x3e2f8e42, 0x3e32d6c4, 0x3e362861, 0x3e39831e, 0x3e3ce702,
+    0x3e405416, 0x3e43ca5e, 0x3e4749e4, 0x3e4ad2ae, 0x3e4e64c2, 0x3e520027,
+    0x3e55a4e6, 0x3e595303, 0x3e5d0a8a, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf,
+    0x3e6c4720, 0x3e702e08, 0x3e741e7f, 0x3e78188c, 0x3e7c1c34, 0x3e8014c0,
+    0x3e822039, 0x3e84308b, 0x3e8645b8, 0x3e885fc3, 0x3e8a7eb0, 0x3e8ca281,
+    0x3e8ecb3a, 0x3e90f8df, 0x3e932b72, 0x3e9562f6, 0x3e979f6f, 0x3e99e0e0,
+    0x3e9c274e, 0x3e9e72b8, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d28a,
+    0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18332, 0x3eb3fc16, 0x3eb67a15,
+    0x3eb8fd34, 0x3ebb8576, 0x3ebe12de, 0x3ec0a56e, 0x3ec33d2a, 0x3ec5da14,
+    0x3ec87c30, 0x3ecb2380, 0x3ecdd008, 0x3ed081ca, 0x3ed338c9, 0x3ed5f508,
+    0x3ed8b68a, 0x3edb7d52, 0x3ede4962, 0x3ee11abe, 0x3ee3f168, 0x3ee6cd64,
+    0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594,
+    0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8,
+    0x3f06f105, 0x3f0884ce, 0x3f0a1b54, 0x3f0bb499, 0x3f0d509f, 0x3f0eef65,
+    0x3f1090ef, 0x3f12353c, 0x3f13dc50, 0x3f15862a, 0x3f1732cc, 0x3f18e237,
+    0x3f1a946d, 0x3f1c4970, 0x3f1e013f, 0x3f1fbbde, 0x3f21794c, 0x3f23398c,
+    0x3f24fca0, 0x3f26c286, 0x3f288b42, 0x3f2a56d3, 0x3f2c253d, 0x3f2df680,
+    0x3f2fca9d, 0x3f31a195, 0x3f337b6a, 0x3f35581e, 0x3f3737b1, 0x3f391a24,
+    0x3f3aff7a, 0x3f3ce7b2, 0x3f3ed2d0, 0x3f40c0d2, 0x3f42b1bc, 0x3f44a58e,
+    0x3f469c49, 0x3f4895ee, 0x3f4a9280, 0x3f4c91ff, 0x3f4e946c, 0x3f5099c8,
+    0x3f52a216, 0x3f54ad55, 0x3f56bb88, 0x3f58ccae, 0x3f5ae0cb, 0x3f5cf7de,
+    0x3f5f11ec, 0x3f612ef0, 0x3f634eef, 0x3f6571ea, 0x3f6797e1, 0x3f69c0d6,
+    0x3f6beccb, 0x3f6e1bc0, 0x3f704db6, 0x3f7282af, 0x3f74baac, 0x3f76f5ae,
+    0x3f7933b6, 0x3f7b74c6, 0x3f7db8de, 0x3f800000
+};
+
+static const float * const to_linear = (const float *)to_linear_u;
+
+static uint8_t
+to_srgb (float f)
+{
+    uint8_t low = 0;
+    uint8_t high = 255;
+
+    while (high - low > 1)
+    {
+	uint8_t mid = (low + high) / 2;
+
+	if (to_linear[mid] > f)
+	    high = mid;
+	else
+	    low = mid;
+    }
+
+    if (to_linear[high] - f < f - to_linear[low])
+	return high;
+    else
+	return low;
+}
+
+static void
+fetch_scanline_a8r8g8b8_sRGB_float (bits_image_t *  image,
+				    int             x,
+				    int             y,
+				    int             width,
+				    uint32_t *      b,
+				    const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits + y * image->rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    argb_t *buffer = (argb_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	argb_t *argb = buffer;
+
+	argb->a = pixman_unorm_to_float ((p >> 24) & 0xff, 8);
+
+	argb->r = to_linear [(p >> 16) & 0xff];
+	argb->g = to_linear [(p >>  8) & 0xff];
+	argb->b = to_linear [(p >>  0) & 0xff];
+
+	buffer++;
+    }
+}
+
+/* Expects a float buffer */
+static void
+fetch_scanline_a2r10g10b10_float (bits_image_t *  image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  uint32_t *      b,
+				  const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits + y * image->rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    argb_t *buffer = (argb_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t a = p >> 30;
+	uint64_t r = (p >> 20) & 0x3ff;
+	uint64_t g = (p >> 10) & 0x3ff;
+	uint64_t b = p & 0x3ff;
+
+	buffer->a = pixman_unorm_to_float (a, 2);
+	buffer->r = pixman_unorm_to_float (r, 10);
+	buffer->g = pixman_unorm_to_float (g, 10);
+	buffer->b = pixman_unorm_to_float (b, 10);
+
+	buffer++;
+    }
+}
+
+/* Expects a float buffer */
+static void
+fetch_scanline_x2r10g10b10_float (bits_image_t   *image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  uint32_t *      b,
+				  const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits + y * image->rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    argb_t *buffer = (argb_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t r = (p >> 20) & 0x3ff;
+	uint64_t g = (p >> 10) & 0x3ff;
+	uint64_t b = p & 0x3ff;
+
+	buffer->a = 1.0;
+	buffer->r = pixman_unorm_to_float (r, 10);
+	buffer->g = pixman_unorm_to_float (g, 10);
+	buffer->b = pixman_unorm_to_float (b, 10);
+
+	buffer++;
+    }
+}
+
+/* Expects a float buffer */
+static void
+fetch_scanline_a2b10g10r10_float (bits_image_t   *image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  uint32_t *      b,
+				  const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits + y * image->rowstride;
+    const uint32_t *pixel = bits + x;
+    const uint32_t *end = pixel + width;
+    argb_t *buffer = (argb_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t a = p >> 30;
+	uint64_t b = (p >> 20) & 0x3ff;
+	uint64_t g = (p >> 10) & 0x3ff;
+	uint64_t r = p & 0x3ff;
+
+	buffer->a = pixman_unorm_to_float (a, 2);
+	buffer->r = pixman_unorm_to_float (r, 10);
+	buffer->g = pixman_unorm_to_float (g, 10);
+	buffer->b = pixman_unorm_to_float (b, 10);
+
+	buffer++;
+    }
+}
+
+/* Expects a float buffer */
+static void
+fetch_scanline_x2b10g10r10_float (bits_image_t   *image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  uint32_t *      b,
+				  const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits + y * image->rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    argb_t *buffer = (argb_t *)b;
+
+    while (pixel < end)
+    {
+	uint32_t p = READ (image, pixel++);
+	uint64_t b = (p >> 20) & 0x3ff;
+	uint64_t g = (p >> 10) & 0x3ff;
+	uint64_t r = p & 0x3ff;
+
+	buffer->a = 1.0;
+	buffer->r = pixman_unorm_to_float (r, 10);
+	buffer->g = pixman_unorm_to_float (g, 10);
+	buffer->b = pixman_unorm_to_float (b, 10);
+
+	buffer++;
+    }
+}
+
+static void
+fetch_scanline_yuy2 (bits_image_t   *image,
+                     int             x,
+                     int             line,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits + image->rowstride * line;
+    int i;
+    
+    for (i = 0; i < width; i++)
+    {
+	int16_t y, u, v;
+	int32_t r, g, b;
+	
+	y = ((uint8_t *) bits)[(x + i) << 1] - 16;
+	u = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 1] - 128;
+	v = ((uint8_t *) bits)[(((x + i) << 1) & - 4) + 3] - 128;
+	
+	/* R = 1.164(Y - 16) + 1.596(V - 128) */
+	r = 0x012b27 * y + 0x019a2e * v;
+	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+	/* B = 1.164(Y - 16) + 2.018(U - 128) */
+	b = 0x012b27 * y + 0x0206a2 * u;
+	
+	*buffer++ = 0xff000000 |
+	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+    }
+}
+
+static void
+fetch_scanline_yv12 (bits_image_t   *image,
+                     int             x,
+                     int             line,
+                     int             width,
+                     uint32_t *      buffer,
+                     const uint32_t *mask)
+{
+    YV12_SETUP (image);
+    uint8_t *y_line = YV12_Y (line);
+    uint8_t *u_line = YV12_U (line);
+    uint8_t *v_line = YV12_V (line);
+    int i;
+    
+    for (i = 0; i < width; i++)
+    {
+	int16_t y, u, v;
+	int32_t r, g, b;
+
+	y = y_line[x + i] - 16;
+	u = u_line[(x + i) >> 1] - 128;
+	v = v_line[(x + i) >> 1] - 128;
+
+	/* R = 1.164(Y - 16) + 1.596(V - 128) */
+	r = 0x012b27 * y + 0x019a2e * v;
+	/* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+	g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+	/* B = 1.164(Y - 16) + 2.018(U - 128) */
+	b = 0x012b27 * y + 0x0206a2 * u;
+
+	*buffer++ = 0xff000000 |
+	    (r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	    (g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	    (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+    }
+}
+
+/**************************** Pixel wise fetching *****************************/
+
+static argb_t
+fetch_pixel_x2r10g10b10_float (bits_image_t *image,
+			       int	   offset,
+			       int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t r = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t b = p & 0x3ff;
+    argb_t argb;
+
+    argb.a = 1.0;
+    argb.r = pixman_unorm_to_float (r, 10);
+    argb.g = pixman_unorm_to_float (g, 10);
+    argb.b = pixman_unorm_to_float (b, 10);
+
+    return argb;
+}
+
+static argb_t
+fetch_pixel_a2r10g10b10_float (bits_image_t *image,
+			       int	     offset,
+			       int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t a = p >> 30;
+    uint64_t r = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t b = p & 0x3ff;
+    argb_t argb;
+
+    argb.a = pixman_unorm_to_float (a, 2);
+    argb.r = pixman_unorm_to_float (r, 10);
+    argb.g = pixman_unorm_to_float (g, 10);
+    argb.b = pixman_unorm_to_float (b, 10);
+
+    return argb;
+}
+
+static argb_t
+fetch_pixel_a2b10g10r10_float (bits_image_t *image,
+			       int           offset,
+			       int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t a = p >> 30;
+    uint64_t b = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t r = p & 0x3ff;
+    argb_t argb;
+
+    argb.a = pixman_unorm_to_float (a, 2);
+    argb.r = pixman_unorm_to_float (r, 10);
+    argb.g = pixman_unorm_to_float (g, 10);
+    argb.b = pixman_unorm_to_float (b, 10);
+
+    return argb;
+}
+
+static argb_t
+fetch_pixel_x2b10g10r10_float (bits_image_t *image,
+			       int           offset,
+			       int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    uint64_t b = (p >> 20) & 0x3ff;
+    uint64_t g = (p >> 10) & 0x3ff;
+    uint64_t r = p & 0x3ff;
+    argb_t argb;
+
+    argb.a = 1.0;
+    argb.r = pixman_unorm_to_float (r, 10);
+    argb.g = pixman_unorm_to_float (g, 10);
+    argb.b = pixman_unorm_to_float (b, 10);
+
+    return argb;
+}
+
+static argb_t
+fetch_pixel_a8r8g8b8_sRGB_float (bits_image_t *image,
+				 int	       offset,
+				 int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t p = READ (image, bits + offset);
+    argb_t argb;
+
+    argb.a = pixman_unorm_to_float ((p >> 24) & 0xff, 8);
+
+    argb.r = to_linear [(p >> 16) & 0xff];
+    argb.g = to_linear [(p >>  8) & 0xff];
+    argb.b = to_linear [(p >>  0) & 0xff];
+
+    return argb;
+}
+
+static uint32_t
+fetch_pixel_yuy2 (bits_image_t *image,
+		  int           offset,
+		  int           line)
+{
+    const uint32_t *bits = image->bits + image->rowstride * line;
+    
+    int16_t y, u, v;
+    int32_t r, g, b;
+    
+    y = ((uint8_t *) bits)[offset << 1] - 16;
+    u = ((uint8_t *) bits)[((offset << 1) & - 4) + 1] - 128;
+    v = ((uint8_t *) bits)[((offset << 1) & - 4) + 3] - 128;
+    
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+    
+    return 0xff000000 |
+	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+static uint32_t
+fetch_pixel_yv12 (bits_image_t *image,
+		  int           offset,
+		  int           line)
+{
+    YV12_SETUP (image);
+    int16_t y = YV12_Y (line)[offset] - 16;
+    int16_t u = YV12_U (line)[offset >> 1] - 128;
+    int16_t v = YV12_V (line)[offset >> 1] - 128;
+    int32_t r, g, b;
+    
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+    
+    return 0xff000000 |
+	(r >= 0 ? r < 0x1000000 ? r         & 0xff0000 : 0xff0000 : 0) |
+	(g >= 0 ? g < 0x1000000 ? (g >> 8)  & 0x00ff00 : 0x00ff00 : 0) |
+	(b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+/*********************************** Store ************************************/
+
+static void
+store_scanline_a2r10g10b10_float (bits_image_t *  image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    argb_t *values = (argb_t *)v;
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint16_t a, r, g, b;
+
+	a = pixman_float_to_unorm (values[i].a, 2);
+	r = pixman_float_to_unorm (values[i].r, 10);
+	g = pixman_float_to_unorm (values[i].g, 10);
+	b = pixman_float_to_unorm (values[i].b, 10);
+
+	WRITE (image, pixel++,
+	       (a << 30) | (r << 20) | (g << 10) | b);
+    }
+}
+
+static void
+store_scanline_x2r10g10b10_float (bits_image_t *  image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    argb_t *values = (argb_t *)v;
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint16_t r, g, b;
+
+	r = pixman_float_to_unorm (values[i].r, 10);
+	g = pixman_float_to_unorm (values[i].g, 10);
+	b = pixman_float_to_unorm (values[i].b, 10);
+
+	WRITE (image, pixel++,
+	       (r << 20) | (g << 10) | b);
+    }
+}
+
+static void
+store_scanline_a2b10g10r10_float (bits_image_t *  image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    argb_t *values = (argb_t *)v;
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint16_t a, r, g, b;
+
+	a = pixman_float_to_unorm (values[i].a, 2);
+	r = pixman_float_to_unorm (values[i].r, 10);
+	g = pixman_float_to_unorm (values[i].g, 10);
+	b = pixman_float_to_unorm (values[i].b, 10);
+
+	WRITE (image, pixel++,
+	       (a << 30) | (b << 20) | (g << 10) | r);
+    }
+}
+
+static void
+store_scanline_x2b10g10r10_float (bits_image_t *  image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    argb_t *values = (argb_t *)v;
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint16_t r, g, b;
+
+	r = pixman_float_to_unorm (values[i].r, 10);
+	g = pixman_float_to_unorm (values[i].g, 10);
+	b = pixman_float_to_unorm (values[i].b, 10);
+
+	WRITE (image, pixel++,
+	       (b << 20) | (g << 10) | r);
+    }
+}
+
+static void
+store_scanline_a8r8g8b8_sRGB_float (bits_image_t *  image,
+				    int             x,
+				    int             y,
+				    int             width,
+				    const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint32_t *pixel = bits + x;
+    argb_t *values = (argb_t *)v;
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint8_t a, r, g, b;
+
+	a = pixman_float_to_unorm (values[i].a, 8);
+	r = to_srgb (values[i].r);
+	g = to_srgb (values[i].g);
+	b = to_srgb (values[i].b);
+
+	WRITE (image, pixel++,
+	       (a << 24) | (r << 16) | (g << 8) | b);
+    }
+}
+
+/*
+ * Contracts a floating point image to 32bpp and then stores it using a
+ * regular 32-bit store proc. Despite the type, this function expects an
+ * argb_t buffer.
+ */
+static void
+store_scanline_generic_float (bits_image_t *  image,
+			      int             x,
+			      int             y,
+			      int             width,
+			      const uint32_t *values)
+{
+    uint32_t *argb8_pixels;
+
+    assert (image->common.type == BITS);
+
+    argb8_pixels = pixman_malloc_ab (width, sizeof(uint32_t));
+    if (!argb8_pixels)
+	return;
+
+    /* Contract the scanline.  We could do this in place if values weren't
+     * const.
+     */
+    pixman_contract_from_float (argb8_pixels, (argb_t *)values, width);
+
+    image->store_scanline_32 (image, x, y, width, argb8_pixels);
+
+    free (argb8_pixels);
+}
+
+static void
+fetch_scanline_generic_float (bits_image_t *  image,
+			      int	      x,
+			      int	      y,
+			      int	      width,
+			      uint32_t *      buffer,
+			      const uint32_t *mask)
+{
+    image->fetch_scanline_32 (image, x, y, width, buffer, NULL);
+
+    pixman_expand_to_float ((argb_t *)buffer, buffer, image->format, width);
+}
+
+/* The 32_sRGB paths should be deleted after narrow processing
+ * is no longer invoked for formats that are considered wide.
+ * (Also see fetch_pixel_generic_lossy_32) */
+static void
+fetch_scanline_a8r8g8b8_32_sRGB (bits_image_t   *image,
+                                 int             x,
+                                 int             y,
+                                 int             width,
+                                 uint32_t       *buffer,
+                                 const uint32_t *mask)
+{
+    const uint32_t *bits = image->bits + y * image->rowstride;
+    const uint32_t *pixel = (uint32_t *)bits + x;
+    const uint32_t *end = pixel + width;
+    uint32_t tmp;
+    
+    while (pixel < end)
+    {
+	uint8_t a, r, g, b;
+
+	tmp = READ (image, pixel++);
+
+	a = (tmp >> 24) & 0xff;
+	r = (tmp >> 16) & 0xff;
+	g = (tmp >> 8) & 0xff;
+	b = (tmp >> 0) & 0xff;
+
+	r = to_linear[r] * 255.0f + 0.5f;
+	g = to_linear[g] * 255.0f + 0.5f;
+	b = to_linear[b] * 255.0f + 0.5f;
+
+	*buffer++ = (a << 24) | (r << 16) | (g << 8) | (b << 0);
+    }
+}
+
+static uint32_t
+fetch_pixel_a8r8g8b8_32_sRGB (bits_image_t *image,
+			      int           offset,
+			      int           line)
+{
+    uint32_t *bits = image->bits + line * image->rowstride;
+    uint32_t tmp = READ (image, bits + offset);
+    uint8_t a, r, g, b;
+
+    a = (tmp >> 24) & 0xff;
+    r = (tmp >> 16) & 0xff;
+    g = (tmp >> 8) & 0xff;
+    b = (tmp >> 0) & 0xff;
+
+    r = to_linear[r] * 255.0f + 0.5f;
+    g = to_linear[g] * 255.0f + 0.5f;
+    b = to_linear[b] * 255.0f + 0.5f;
+
+    return (a << 24) | (r << 16) | (g << 8) | (b << 0);
+}
+
+static void
+store_scanline_a8r8g8b8_32_sRGB (bits_image_t   *image,
+                                 int             x,
+                                 int             y,
+                                 int             width,
+                                 const uint32_t *v)
+{
+    uint32_t *bits = image->bits + image->rowstride * y;
+    uint64_t *values = (uint64_t *)v;
+    uint32_t *pixel = bits + x;
+    uint64_t tmp;
+    int i;
+    
+    for (i = 0; i < width; ++i)
+    {
+	uint8_t a, r, g, b;
+
+	tmp = values[i];
+
+	a = (tmp >> 24) & 0xff;
+	r = (tmp >> 16) & 0xff;
+	g = (tmp >> 8) & 0xff;
+	b = (tmp >> 0) & 0xff;
+
+	r = to_srgb (r * (1/255.0f));
+	g = to_srgb (g * (1/255.0f));
+	b = to_srgb (b * (1/255.0f));
+	
+	WRITE (image, pixel++, a | (r << 16) | (g << 8) | (b << 0));
+    }
+}
+
+static argb_t
+fetch_pixel_generic_float (bits_image_t *image,
+			   int		 offset,
+			   int           line)
+{
+    uint32_t pixel32 = image->fetch_pixel_32 (image, offset, line);
+    argb_t f;
+
+    pixman_expand_to_float (&f, &pixel32, image->format, 1);
+
+    return f;
+}
+
+/*
+ * XXX: The transformed fetch path only works at 32-bpp so far.  When all
+ * paths have wide versions, this can be removed.
+ *
+ * WARNING: This function loses precision!
+ */
+static uint32_t
+fetch_pixel_generic_lossy_32 (bits_image_t *image,
+			      int           offset,
+			      int           line)
+{
+    argb_t pixel64 = image->fetch_pixel_float (image, offset, line);
+    uint32_t result;
+
+    pixman_contract_from_float (&result, &pixel64, 1);
+
+    return result;
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;
+    fetch_scanline_t		fetch_scanline_32;
+    fetch_scanline_t		fetch_scanline_float;
+    fetch_pixel_32_t		fetch_pixel_32;
+    fetch_pixel_float_t		fetch_pixel_float;
+    store_scanline_t		store_scanline_32;
+    store_scanline_t		store_scanline_float;
+} format_info_t;
+
+#define FORMAT_INFO(format) 						\
+    {									\
+	PIXMAN_ ## format,						\
+	    fetch_scanline_ ## format,					\
+	    fetch_scanline_generic_float,				\
+	    fetch_pixel_ ## format,					\
+	    fetch_pixel_generic_float,					\
+	    store_scanline_ ## format,					\
+	    store_scanline_generic_float				\
+    }
+
+static const format_info_t accessors[] =
+{
+/* 32 bpp formats */
+    FORMAT_INFO (a8r8g8b8),
+    FORMAT_INFO (x8r8g8b8),
+    FORMAT_INFO (a8b8g8r8),
+    FORMAT_INFO (x8b8g8r8),
+    FORMAT_INFO (b8g8r8a8),
+    FORMAT_INFO (b8g8r8x8),
+    FORMAT_INFO (r8g8b8a8),
+    FORMAT_INFO (r8g8b8x8),
+    FORMAT_INFO (x14r6g6b6),
+
+/* sRGB formats */
+  { PIXMAN_a8r8g8b8_sRGB,
+    fetch_scanline_a8r8g8b8_32_sRGB, fetch_scanline_a8r8g8b8_sRGB_float,
+    fetch_pixel_a8r8g8b8_32_sRGB, fetch_pixel_a8r8g8b8_sRGB_float,
+    store_scanline_a8r8g8b8_32_sRGB, store_scanline_a8r8g8b8_sRGB_float,
+  },
+
+/* 24bpp formats */
+    FORMAT_INFO (r8g8b8),
+    FORMAT_INFO (b8g8r8),
+    
+/* 16bpp formats */
+    FORMAT_INFO (r5g6b5),
+    FORMAT_INFO (b5g6r5),
+    
+    FORMAT_INFO (a1r5g5b5),
+    FORMAT_INFO (x1r5g5b5),
+    FORMAT_INFO (a1b5g5r5),
+    FORMAT_INFO (x1b5g5r5),
+    FORMAT_INFO (a4r4g4b4),
+    FORMAT_INFO (x4r4g4b4),
+    FORMAT_INFO (a4b4g4r4),
+    FORMAT_INFO (x4b4g4r4),
+    
+/* 8bpp formats */
+    FORMAT_INFO (a8),
+    FORMAT_INFO (r3g3b2),
+    FORMAT_INFO (b2g3r3),
+    FORMAT_INFO (a2r2g2b2),
+    FORMAT_INFO (a2b2g2r2),
+    
+    FORMAT_INFO (c8),
+    
+    FORMAT_INFO (g8),
+    
+#define fetch_scanline_x4c4 fetch_scanline_c8
+#define fetch_pixel_x4c4 fetch_pixel_c8
+#define store_scanline_x4c4 store_scanline_c8
+    FORMAT_INFO (x4c4),
+    
+#define fetch_scanline_x4g4 fetch_scanline_g8
+#define fetch_pixel_x4g4 fetch_pixel_g8
+#define store_scanline_x4g4 store_scanline_g8
+    FORMAT_INFO (x4g4),
+    
+    FORMAT_INFO (x4a4),
+    
+/* 4bpp formats */
+    FORMAT_INFO (a4),
+    FORMAT_INFO (r1g2b1),
+    FORMAT_INFO (b1g2r1),
+    FORMAT_INFO (a1r1g1b1),
+    FORMAT_INFO (a1b1g1r1),
+    
+    FORMAT_INFO (c4),
+    
+    FORMAT_INFO (g4),
+    
+/* 1bpp formats */
+    FORMAT_INFO (a1),
+    FORMAT_INFO (g1),
+    
+/* Wide formats */
+    
+    { PIXMAN_a2r10g10b10,
+      NULL, fetch_scanline_a2r10g10b10_float,
+      fetch_pixel_generic_lossy_32, fetch_pixel_a2r10g10b10_float,
+      NULL, store_scanline_a2r10g10b10_float },
+
+    { PIXMAN_x2r10g10b10,
+      NULL, fetch_scanline_x2r10g10b10_float,
+      fetch_pixel_generic_lossy_32, fetch_pixel_x2r10g10b10_float,
+      NULL, store_scanline_x2r10g10b10_float },
+
+    { PIXMAN_a2b10g10r10,
+      NULL, fetch_scanline_a2b10g10r10_float,
+      fetch_pixel_generic_lossy_32, fetch_pixel_a2b10g10r10_float,
+      NULL, store_scanline_a2b10g10r10_float },
+
+    { PIXMAN_x2b10g10r10,
+      NULL, fetch_scanline_x2b10g10r10_float,
+      fetch_pixel_generic_lossy_32, fetch_pixel_x2b10g10r10_float,
+      NULL, store_scanline_x2b10g10r10_float },
+
+/* YUV formats */
+    { PIXMAN_yuy2,
+      fetch_scanline_yuy2, fetch_scanline_generic_float,
+      fetch_pixel_yuy2, fetch_pixel_generic_float,
+      NULL, NULL },
+
+    { PIXMAN_yv12,
+      fetch_scanline_yv12, fetch_scanline_generic_float,
+      fetch_pixel_yv12, fetch_pixel_generic_float,
+      NULL, NULL },
+    
+    { PIXMAN_null },
+};
+
+static void
+setup_accessors (bits_image_t *image)
+{
+    const format_info_t *info = accessors;
+    
+    while (info->format != PIXMAN_null)
+    {
+	if (info->format == image->format)
+	{
+	    image->fetch_scanline_32 = info->fetch_scanline_32;
+	    image->fetch_scanline_float = info->fetch_scanline_float;
+	    image->fetch_pixel_32 = info->fetch_pixel_32;
+	    image->fetch_pixel_float = info->fetch_pixel_float;
+	    image->store_scanline_32 = info->store_scanline_32;
+	    image->store_scanline_float = info->store_scanline_float;
+	    
+	    return;
+	}
+	
+	info++;
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+void
+_pixman_bits_image_setup_accessors_accessors (bits_image_t *image);
+
+void
+_pixman_bits_image_setup_accessors (bits_image_t *image)
+{
+    if (image->read_func || image->write_func)
+	_pixman_bits_image_setup_accessors_accessors (image);
+    else
+	setup_accessors (image);
+}
+
+#else
+
+void
+_pixman_bits_image_setup_accessors_accessors (bits_image_t *image)
+{
+    setup_accessors (image);
+}
+
+#endif
diff --git a/src/pixman/pixman/pixman-accessor.h b/src/pixman/pixman/pixman-accessor.h
new file mode 100644
index 0000000..8e0b036
--- /dev/null
+++ b/src/pixman/pixman/pixman-accessor.h
@@ -0,0 +1,25 @@
+#ifdef PIXMAN_FB_ACCESSORS
+
+#define READ(img, ptr)							\
+    (((bits_image_t *)(img))->read_func ((ptr), sizeof(*(ptr))))
+#define WRITE(img, ptr,val)						\
+    (((bits_image_t *)(img))->write_func ((ptr), (val), sizeof (*(ptr))))
+
+#define MEMSET_WRAPPED(img, dst, val, size)				\
+    do {								\
+	size_t _i;							\
+	uint8_t *_dst = (uint8_t*)(dst);				\
+	for(_i = 0; _i < (size_t) size; _i++) {				\
+	    WRITE((img), _dst +_i, (val));				\
+	}								\
+    } while (0)
+
+#else
+
+#define READ(img, ptr)		(*(ptr))
+#define WRITE(img, ptr, val)	(*(ptr) = (val))
+#define MEMSET_WRAPPED(img, dst, val, size)				\
+    memset(dst, val, size)
+
+#endif
+
diff --git a/src/pixman/pixman/pixman-arm-common.h b/src/pixman/pixman/pixman-arm-common.h
new file mode 100644
index 0000000..3a7cb2b
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-common.h
@@ -0,0 +1,428 @@
+/*
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+#ifndef PIXMAN_ARM_COMMON_H
+#define PIXMAN_ARM_COMMON_H
+
+#include "pixman-inlines.h"
+
+/* Define some macros which can expand into proxy functions between
+ * ARM assembly optimized functions and the rest of pixman fast path API.
+ *
+ * All the low level ARM assembly functions have to use ARM EABI
+ * calling convention and take up to 8 arguments:
+ *    width, height, dst, dst_stride, src, src_stride, mask, mask_stride
+ *
+ * The arguments are ordered with the most important coming first (the
+ * first 4 arguments are passed to function in registers, the rest are
+ * on stack). The last arguments are optional, for example if the
+ * function is not using mask, then 'mask' and 'mask_stride' can be
+ * omitted when doing a function call.
+ *
+ * Arguments 'src' and 'mask' contain either a pointer to the top left
+ * pixel of the composited rectangle or a pixel color value depending
+ * on the function type. In the case of just a color value (solid source
+ * or mask), the corresponding stride argument is unused.
+ */
+
+#define SKIP_ZERO_SRC  1
+#define SKIP_ZERO_MASK 2
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_DST(cputype, name,                \
+                                          src_type, src_cnt,            \
+                                          dst_type, dst_cnt)            \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t   w,                   \
+                                         int32_t   h,                   \
+                                         dst_type *dst,                 \
+                                         int32_t   dst_stride,          \
+                                         src_type *src,                 \
+                                         int32_t   src_stride);         \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type *dst_line;							\
+    src_type *src_line;                                                 \
+    int32_t dst_stride, src_stride;                                     \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride);     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_DST(flags, cputype, name,           \
+                                        dst_type, dst_cnt)              \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src);               \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+			    pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);					\
+    dst_type  *dst_line;                                                \
+    int32_t    dst_stride;                                              \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (					\
+	imp, src_image, dest_image->bits.format);			\
+                                                                        \
+    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \
+	return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src);                      \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST(flags, cputype, name,      \
+                                             mask_type, mask_cnt,       \
+                                             dst_type, dst_cnt)         \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         uint32_t   src,                \
+                                         int32_t    unused,             \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;						\
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, mask_stride;                                 \
+    uint32_t   src;                                                     \
+                                                                        \
+    src = _pixman_image_get_solid (					\
+	imp, src_image, dest_image->bits.format);			\
+                                                                        \
+    if ((flags & SKIP_ZERO_SRC) && src == 0)                            \
+	return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src, 0,                    \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST(flags, cputype, name,       \
+                                            src_type, src_cnt,          \
+                                            dst_type, dst_cnt)          \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         uint32_t   mask);              \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;						\
+    src_type  *src_line;                                                \
+    int32_t    dst_stride, src_stride;                                  \
+    uint32_t   mask;                                                    \
+                                                                        \
+    mask = _pixman_image_get_solid (					\
+	imp, mask_image, dest_image->bits.format);			\
+                                                                        \
+    if ((flags & SKIP_ZERO_MASK) && mask == 0)                          \
+	return;                                                         \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask);                     \
+}
+
+#define PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST(cputype, name,           \
+                                               src_type, src_cnt,       \
+                                               mask_type, mask_cnt,     \
+                                               dst_type, dst_cnt)       \
+void                                                                    \
+pixman_composite_##name##_asm_##cputype (int32_t    w,                  \
+                                         int32_t    h,                  \
+                                         dst_type  *dst,                \
+                                         int32_t    dst_stride,         \
+                                         src_type  *src,                \
+                                         int32_t    src_stride,         \
+                                         mask_type *mask,               \
+                                         int32_t    mask_stride);       \
+                                                                        \
+static void                                                             \
+cputype##_composite_##name (pixman_implementation_t *imp,               \
+                            pixman_composite_info_t *info)              \
+{                                                                       \
+    PIXMAN_COMPOSITE_ARGS (info);                                       \
+    dst_type  *dst_line;						\
+    src_type  *src_line;                                                \
+    mask_type *mask_line;                                               \
+    int32_t    dst_stride, src_stride, mask_stride;                     \
+                                                                        \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,        \
+                           dst_stride, dst_line, dst_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,           \
+                           src_stride, src_line, src_cnt);              \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,       \
+                           mask_stride, mask_line, mask_cnt);           \
+                                                                        \
+    pixman_composite_##name##_asm_##cputype (width, height,             \
+                                             dst_line, dst_stride,      \
+                                             src_line, src_stride,      \
+                                             mask_line, mask_stride);   \
+}
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST(cputype, name, op,             \
+                                               src_type, dst_type)            \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                   int32_t          w,        \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx);  \
+                                                                              \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (dst_type *       pd,       \
+                                                   const src_type * ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   pixman_bool_t    zero_src) \
+{                                                                             \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x, \
+                                                                  max_vx);    \
+}                                                                             \
+                                                                              \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_cover_##op,                         \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, COVER)                             \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_none_##op,                          \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NONE)                              \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_pad_##op,                           \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, PAD)                               \
+FAST_NEAREST_MAINLOOP (cputype##_##name##_normal_##op,                        \
+                       scaled_nearest_scanline_##cputype##_##name##_##op,     \
+                       src_type, dst_type, NORMAL)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                      \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                              \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),                               \
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
+#define PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST(flags, cputype, name, op,   \
+                                                  src_type, dst_type)         \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (                \
+                                                   int32_t          w,        \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   const uint8_t *  mask);    \
+                                                                              \
+static force_inline void                                                      \
+scaled_nearest_scanline_##cputype##_##name##_##op (const uint8_t *  mask,     \
+                                                   dst_type *       pd,       \
+                                                   const src_type * ps,       \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x,   \
+                                                   pixman_fixed_t   max_vx,   \
+                                                   pixman_bool_t    zero_src) \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+	return;                                                               \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_##cputype (w, pd, ps,  \
+                                                                  vx, unit_x, \
+                                                                  max_vx,     \
+                                                                  mask);      \
+}                                                                             \
+                                                                              \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                  \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_none_##op,                   \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                    \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)  \
+FAST_NEAREST_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                 \
+                              scaled_nearest_scanline_##cputype##_##name##_##op,\
+                              src_type, uint8_t, dst_type, NORMAL, TRUE, FALSE)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)              \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func),                       \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
+
+/*****************************************************************************/
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST(flags, cputype, name, op,     \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+                                                                              \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint32_t * mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+	return;                                                               \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                            dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+                                                                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint32_t, dst_type, NORMAL,                  \
+                       FLAG_NONE)
+
+
+#define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
+                                                dst_type *       dst,         \
+                                                const uint8_t *  mask,        \
+                                                const src_type * top,         \
+                                                const src_type * bottom,      \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   x,           \
+                                                pixman_fixed_t   ux,          \
+                                                int              width);      \
+                                                                              \
+static force_inline void                                                      \
+scaled_bilinear_scanline_##cputype##_##name##_##op (                          \
+                                                dst_type *       dst,         \
+                                                const uint8_t *  mask,        \
+                                                const src_type * src_top,     \
+                                                const src_type * src_bottom,  \
+                                                int32_t          w,           \
+                                                int              wt,          \
+                                                int              wb,          \
+                                                pixman_fixed_t   vx,          \
+                                                pixman_fixed_t   unit_x,      \
+                                                pixman_fixed_t   max_vx,      \
+                                                pixman_bool_t    zero_src)    \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+	return;                                                                   \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
+                      dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+                                                                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, COVER,                    \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, NONE,                     \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, PAD,                      \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
+                       scaled_bilinear_scanline_##cputype##_##name##_##op,    \
+                       src_type, uint8_t, dst_type, NORMAL,                   \
+                       FLAG_HAVE_NON_SOLID_MASK)
+
+
+#endif
diff --git a/src/pixman/pixman/pixman-arm-detect-win32.asm b/src/pixman/pixman/pixman-arm-detect-win32.asm
new file mode 100644
index 0000000..8f5d5eb
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-detect-win32.asm
@@ -0,0 +1,21 @@
+    area pixman_msvc, code, readonly
+
+    export  pixman_msvc_try_arm_simd_op
+
+pixman_msvc_try_arm_simd_op
+    ;; I don't think the msvc arm asm knows how to do SIMD insns
+    ;; uqadd8 r3,r3,r3
+    dcd 0xe6633f93
+    mov pc,lr
+    endp
+
+    export  pixman_msvc_try_arm_neon_op
+
+pixman_msvc_try_arm_neon_op
+    ;; I don't think the msvc arm asm knows how to do NEON insns
+    ;; veor d0,d0,d0
+    dcd 0xf3000110
+    mov pc,lr
+    endp
+
+    end
diff --git a/src/pixman/pixman/pixman-arm-neon-asm-bilinear.S b/src/pixman/pixman/pixman-arm-neon-asm-bilinear.S
new file mode 100644
index 0000000..e37b5c2
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-neon-asm-bilinear.S
@@ -0,0 +1,1368 @@
+/*
+ * Copyright © 2011 SCore Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ * Author:  Taekyun Kim (tkq.kim@samsung.com)
+ */
+
+/*
+ * This file contains scaled bilinear scanline functions implemented
+ * using older siarhei's bilinear macro template.
+ *
+ * << General scanline function procedures >>
+ *  1. bilinear interpolate source pixels
+ *  2. load mask pixels
+ *  3. load destination pixels
+ *  4. duplicate mask to fill whole register
+ *  5. interleave source & destination pixels
+ *  6. apply mask to source pixels
+ *  7. combine source & destination pixels
+ *  8, Deinterleave final result
+ *  9. store destination pixels
+ *
+ * All registers with single number (i.e. src0, tmp0) are 64-bits registers.
+ * Registers with double numbers(src01, dst01) are 128-bits registers.
+ * All temp registers can be used freely outside the code block.
+ * Assume that symbol(register .req) OUT and MASK are defined at caller of these macro blocks.
+ *
+ * Remarks
+ *  There can be lots of pipeline stalls inside code block and between code blocks.
+ *  Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined (__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text
+.fpu neon
+.arch armv7a
+.object_arch armv4
+.eabi_attribute 10, 0
+.eabi_attribute 12, 0
+.arm
+.altmacro
+.p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-neon-asm.h"
+
+/*
+ * Bilinear macros from pixman-arm-neon-asm.S
+ */
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+    .func fname
+    .global fname
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    vld1.32   {reg1}, [TMP1], STRIDE
+    vld1.32   {reg2}, [TMP1]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    vld1.32   {reg2[0]}, [TMP1], STRIDE
+    vld1.32   {reg2[1]}, [TMP1]
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {acc2lo[1]}, [TMP1]
+    vld1.32   {acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1
+    vzip.u8   reg1, reg3
+    vzip.u8   reg2, reg4
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {xacc2lo[1]}, [TMP1]
+    vld1.32   {xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP1]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP2]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+    vst1.32   {d0, d1}, [OUT]!
+.elseif numpix == 2
+    vst1.32   {d0}, [OUT]!
+.elseif numpix == 1
+    vst1.32   {d0[0]}, [OUT, :32]!
+.else
+    .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+    vuzp.u8 d0, d1
+    vuzp.u8 d2, d3
+    vuzp.u8 d1, d3
+    vuzp.u8 d0, d2
+    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+    vst1.16   {d2}, [OUT]!
+.elseif numpix == 2
+    vst1.32   {d2[0]}, [OUT]!
+.elseif numpix == 1
+    vst1.16   {d2[0]}, [OUT]!
+.else
+    .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+
+/*
+ * Macros for loading mask pixels into register 'mask'.
+ * vdup must be done in somewhere else.
+ */
+.macro bilinear_load_mask_x numpix, mask
+.endm
+
+.macro bilinear_load_mask_8 numpix, mask
+.if numpix == 4
+    vld1.32     {mask[0]}, [MASK]!
+.elseif numpix == 2
+    vld1.16     {mask[0]}, [MASK]!
+.elseif numpix == 1
+    vld1.8      {mask[0]}, [MASK]!
+.else
+    .error bilinear_load_mask_8 numpix is unsupported
+.endif
+    pld         [MASK, #prefetch_offset]
+.endm
+
+.macro bilinear_load_mask mask_fmt, numpix, mask
+    bilinear_load_mask_&mask_fmt numpix, mask
+.endm
+
+
+/*
+ * Macros for loading destination pixels into register 'dst0' and 'dst1'.
+ * Interleave should be done somewhere else.
+ */
+.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.if numpix == 4
+    vld1.32     {dst0, dst1}, [OUT]
+.elseif numpix == 2
+    vld1.32     {dst0}, [OUT]
+.elseif numpix == 1
+    vld1.32     {dst0[0]}, [OUT]
+.else
+    .error bilinear_load_dst_8888 numpix is unsupported
+.endif
+    pld         [OUT, #(prefetch_offset * 4)]
+.endm
+
+.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
+    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
+    bilinear_load_dst_8888 numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
+    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01
+.endm
+
+/*
+ * Macros for duplicating partially loaded mask to fill entire register.
+ * We will apply mask to interleaved source pixels, that is
+ *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
+ * So, we need to duplicate loaded mask into whole register.
+ *
+ * For two pixel case
+ *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
+ * We can do some optimizations for this including last pixel cases.
+ */
+.macro bilinear_duplicate_mask_x numpix, mask
+.endm
+
+.macro bilinear_duplicate_mask_8 numpix, mask
+.if numpix == 4
+    vdup.32     mask, mask[0]
+.elseif numpix == 2
+    vdup.16     mask, mask[0]
+.elseif numpix == 1
+    vdup.8      mask, mask[0]
+.else
+    .error bilinear_duplicate_mask_8 is unsupported
+.endif
+.endm
+
+.macro bilinear_duplicate_mask mask_fmt, numpix, mask
+    bilinear_duplicate_mask_&mask_fmt numpix, mask
+.endm
+
+/*
+ * Macros for interleaving src and dst pixels to rrrr gggg bbbb aaaa form.
+ * Interleave should be done when maks is enabled or operator is 'over'.
+ */
+.macro bilinear_interleave src0, src1, dst0, dst1
+    vuzp.8      src0, src1
+    vuzp.8      dst0, dst1
+    vuzp.8      src0, src1
+    vuzp.8      dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_x_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+.macro bilinear_interleave_src_dst_x_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_x_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+.macro bilinear_interleave_src_dst_8_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_8_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst_8_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave src0, src1, dst0, dst1
+.endm
+
+.macro bilinear_interleave_src_dst \
+                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01
+
+    bilinear_interleave_src_dst_&mask_fmt&_&op \
+                numpix, src0, src1, src01, dst0, dst1, dst01
+.endm
+
+
+/*
+ * Macros for applying masks to src pixels. (see combine_mask_u() function)
+ * src, dst should be in interleaved form.
+ * mask register should be in form (m0, m1, m2, m3).
+ */
+.macro bilinear_apply_mask_to_src_x \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+.endm
+
+.macro bilinear_apply_mask_to_src_8 \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+
+    vmull.u8        tmp01, src0, mask
+    vmull.u8        tmp23, src1, mask
+    /* bubbles */
+    vrshr.u16       tmp45, tmp01, #8
+    vrshr.u16       tmp67, tmp23, #8
+    /* bubbles */
+    vraddhn.u16     src0, tmp45, tmp01
+    vraddhn.u16     src1, tmp67, tmp23
+.endm
+
+.macro bilinear_apply_mask_to_src \
+                mask_fmt, numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+
+    bilinear_apply_mask_to_src_&mask_fmt \
+                numpix, src0, src1, src01, mask, \
+                tmp01, tmp23, tmp45, tmp67
+.endm
+
+
+/*
+ * Macros for combining src and destination pixels.
+ * Interleave or not is depending on operator 'op'.
+ */
+.macro bilinear_combine_src \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+.macro bilinear_combine_over \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+
+    vdup.32     tmp8, src1[1]
+    /* bubbles */
+    vmvn.8      tmp8, tmp8
+    /* bubbles */
+    vmull.u8    tmp01, dst0, tmp8
+    /* bubbles */
+    vmull.u8    tmp23, dst1, tmp8
+    /* bubbles */
+    vrshr.u16   tmp45, tmp01, #8
+    vrshr.u16   tmp67, tmp23, #8
+    /* bubbles */
+    vraddhn.u16 dst0, tmp45, tmp01
+    vraddhn.u16 dst1, tmp67, tmp23
+    /* bubbles */
+    vqadd.u8    src01, dst01, src01
+.endm
+
+.macro bilinear_combine_add \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+
+    vqadd.u8    src01, dst01, src01
+.endm
+
+.macro bilinear_combine \
+                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+
+    bilinear_combine_&op \
+                numpix, src0, src1, src01, dst0, dst1, dst01, \
+                tmp01, tmp23, tmp45, tmp67, tmp8
+.endm
+
+/*
+ * Macros for final deinterleaving of destination pixels if needed.
+ */
+.macro bilinear_deinterleave numpix, dst0, dst1, dst01
+    vuzp.8      dst0, dst1
+    /* bubbles */
+    vuzp.8      dst0, dst1
+.endm
+
+.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
+    bilinear_deinterleave numpix, dst0, dst1, dst01
+.endm
+
+.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
+    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01
+.endm
+
+
+.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
+    bilinear_load_&src_fmt d0, d1, d2
+    bilinear_load_mask mask_fmt, 1, d4
+    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    /* 5 cycles bubble */
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    /* 5 cycles bubble */
+    bilinear_duplicate_mask mask_fmt, 1, d4
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0
+    /* 1 cycle bubble */
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
+    bilinear_apply_mask_to_src \
+                mask_fmt, 1, d0, d1, q0, d4, \
+                q3, q8, q10, q11
+    bilinear_combine \
+                op, 1, d0, d1, q0, d18, d19, q9, \
+                q3, q8, q10, q11, d5
+    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
+    bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
+    bilinear_load_mask mask_fmt, 2, d4
+    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    bilinear_duplicate_mask mask_fmt, 2, d4
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16  q12, q12, q13
+    vmovn.u16 d0, q0
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
+    bilinear_apply_mask_to_src \
+                mask_fmt, 2, d0, d1, q0, d4, \
+                q3, q8, q10, q11
+    bilinear_combine \
+                op, 2, d0, d1, q0, d18, d19, q9, \
+                q3, q8, q10, q11, d5
+    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
+    bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
+    pld       [TMP1, PF_OFFS]
+    sub       TMP1, TMP1, STRIDE
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
+    bilinear_load_mask mask_fmt, 4, d22
+    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
+    pld       [TMP1, PF_OFFS]
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
+    bilinear_duplicate_mask mask_fmt, 4, d22
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vmovn.u16 d0, q0
+    vmovn.u16 d1, q2
+    vadd.u16  q12, q12, q13
+    bilinear_interleave_src_dst \
+                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
+    bilinear_apply_mask_to_src \
+                mask_fmt, 4, d0, d1, q0, d22, \
+                q3, q8, q9, q10
+    bilinear_combine \
+                op, 4, d0, d1, q0, d2, d3, q1, \
+                q3, q8, q9, q10, d23
+    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
+    bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+.set BILINEAR_FLAG_USE_MASK,		1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS,	2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * Bilinear scanline generator macro take folling arguments:
+ *  fname			- name of the function to generate
+ *  src_fmt			- source color format (8888 or 0565)
+ *  dst_fmt			- destination color format (8888 or 0565)
+ *  src/dst_bpp_shift		- (1 << bpp_shift) is the size of src/dst pixel in bytes
+ *  process_last_pixel		- code block that interpolate one pixel and does not
+ *				  update horizontal weight
+ *  process_two_pixels		- code block that interpolate two pixels and update
+ *				  horizontal weight
+ *  process_four_pixels		- code block that interpolate four pixels and update
+ *				  horizontal weight
+ *  process_pixblock_head	- head part of middle loop
+ *  process_pixblock_tail	- tail part of middle loop
+ *  process_pixblock_tail_head	- tail_head of middle loop
+ *  pixblock_size		- number of pixels processed in a single middle loop
+ *  prefetch_distance		- prefetch in the source image by that many pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func \
+	fname, \
+	src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+	bilinear_process_last_pixel, \
+	bilinear_process_two_pixels, \
+	bilinear_process_four_pixels, \
+	bilinear_process_pixblock_head, \
+	bilinear_process_pixblock_tail, \
+	bilinear_process_pixblock_tail_head, \
+	pixblock_size, \
+	prefetch_distance, \
+	flags
+
+pixman_asm_function fname
+.if pixblock_size == 8
+.elseif pixblock_size == 4
+.else
+    .error unsupported pixblock size
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+    OUT       .req    r0
+    TOP       .req    r1
+    BOTTOM    .req    r2
+    WT        .req    r3
+    WB        .req    r4
+    X         .req    r5
+    UX        .req    r6
+    WIDTH     .req    ip
+    TMP1      .req    r3
+    TMP2      .req    r4
+    PF_OFFS   .req    r7
+    TMP3      .req    r8
+    TMP4      .req    r9
+    STRIDE    .req    r2
+
+    mov		ip, sp
+    push	{r4, r5, r6, r7, r8, r9}
+    mov		PF_OFFS, #prefetch_distance
+    ldmia	ip, {WB, X, UX, WIDTH}
+.else
+    OUT       .req      r0
+    MASK      .req      r1
+    TOP       .req      r2
+    BOTTOM    .req      r3
+    WT        .req      r4
+    WB        .req      r5
+    X         .req      r6
+    UX        .req      r7
+    WIDTH     .req      ip
+    TMP1      .req      r4
+    TMP2      .req      r5
+    PF_OFFS   .req      r8
+    TMP3      .req      r9
+    TMP4      .req      r10
+    STRIDE    .req      r3
+
+    .set prefetch_offset, prefetch_distance
+
+    mov       ip, sp
+    push      {r4, r5, r6, r7, r8, r9, r10, ip}
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WT, WB, X, UX, WIDTH}
+.endif
+
+    mul       PF_OFFS, PF_OFFS, UX
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
+    sub	      STRIDE, BOTTOM, TOP
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       3f
+
+    vdup.u16  q12, X
+    vdup.u16  q13, UX
+    vdup.u8   d28, WT
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26
+
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)
+    beq       0f
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16  q12, q12, q13
+    bilinear_process_last_pixel
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16  q12, q12, q13
+
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    beq       0f
+    bilinear_process_two_pixels
+    sub       WIDTH, WIDTH, #2
+0:
+.if pixblock_size == 8
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    beq       0f
+    bilinear_process_four_pixels
+    sub       WIDTH, WIDTH, #4
+0:
+.endif
+    subs      WIDTH, WIDTH, #pixblock_size
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+    bilinear_process_pixblock_head
+    subs      WIDTH, WIDTH, #pixblock_size
+    blt       5f
+0:
+    bilinear_process_pixblock_tail_head
+    subs      WIDTH, WIDTH, #pixblock_size
+    bge       0b
+5:
+    bilinear_process_pixblock_tail
+1:
+.if pixblock_size == 8
+    tst       WIDTH, #4
+    beq       2f
+    bilinear_process_four_pixels
+2:
+.endif
+    /* handle the remaining trailing pixels */
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_process_two_pixels
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_process_last_pixel
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+    pop       {r4, r5, r6, r7, r8, r9}
+.else
+    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
+.endif
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+    .unreq    MASK
+.endif
+
+.endfunc
+
+.endm
+
+/* src_8888_8_8888 */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head
+    bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
+    bilinear_src_8888_8_8888_process_pixblock_tail
+    bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/* src_8888_8_0565 */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head
+    bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
+    bilinear_src_8888_8_0565_process_pixblock_tail
+    bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/* src_0565_8_x888 */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+    bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+    bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_four_pixels
+    bilinear_interpolate_four_pixels 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_head
+    bilinear_src_0565_8_x888_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
+    bilinear_src_0565_8_x888_process_pixblock_tail
+    bilinear_src_0565_8_x888_process_pixblock_head
+.endm
+
+/* src_0565_8_0565 */
+.macro bilinear_src_0565_8_0565_process_last_pixel
+    bilinear_interpolate_last_pixel 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_two_pixels
+    bilinear_interpolate_two_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_four_pixels
+    bilinear_interpolate_four_pixels 0565, 8, 0565, src
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_head
+    bilinear_src_0565_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
+    bilinear_src_0565_8_0565_process_pixblock_tail
+    bilinear_src_0565_8_0565_process_pixblock_head
+.endm
+
+/* over_8888_8888 */
+.macro bilinear_over_8888_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, x, 8888, over
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_head
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+
+    vld1.32     {d22}, [TMP1], STRIDE
+    vld1.32     {d23}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    vmull.u8    q8, d22, d28
+    vmlal.u8    q8, d23, d29
+
+    vld1.32     {d22}, [TMP2], STRIDE
+    vld1.32     {d23}, [TMP2]
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vmull.u8    q9, d22, d28
+    vmlal.u8    q9, d23, d29
+
+    vld1.32     {d22}, [TMP3], STRIDE
+    vld1.32     {d23}, [TMP3]
+    vmull.u8    q10, d22, d28
+    vmlal.u8    q10, d23, d29
+
+    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16   q0, d16, d30
+    vmlal.u16   q0, d17, d30
+
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d16}, [TMP4], STRIDE
+    vld1.32     {d17}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q11, d16, d28
+    vmlal.u8    q11, d17, d29
+
+    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16   q1, d18, d31
+    vmlal.u16   q1, d19, d31
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16    q12, q12, q13
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail
+    vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16   q2, d20, d30
+    vmlal.u16   q2, d21, d30
+    vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16   q3, d22, d31
+    vmlal.u16   q3, d23, d31
+    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vld1.32     {d2, d3}, [OUT, :128]
+    pld         [OUT, #(prefetch_offset * 4)]
+    vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vmovn.u16   d6, q0
+    vmovn.u16   d7, q2
+    vuzp.8      d6, d7
+    vuzp.8      d2, d3
+    vuzp.8      d6, d7
+    vuzp.8      d2, d3
+    vdup.32     d4, d7[1]
+    vmvn.8      d4, d4
+    vmull.u8    q11, d2, d4
+    vmull.u8    q2, d3, d4
+    vrshr.u16   q1, q11, #8
+    vrshr.u16   q10, q2, #8
+    vraddhn.u16 d2, q1, q11
+    vraddhn.u16 d3, q10, q2
+    vqadd.u8    q3, q1, q3
+    vuzp.8      d6, d7
+    vuzp.8      d6, d7
+    vadd.u16    q12, q12, q13
+    vst1.32     {d6, d7}, [OUT, :128]!
+.endm
+
+.macro bilinear_over_8888_8888_process_pixblock_tail_head
+                                            vshll.u16   q2, d20, #BILINEAR_INTERPOLATION_BITS
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+                                            vmlsl.u16   q2, d20, d30
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+                                            vmlal.u16   q2, d21, d30
+                                            vshll.u16   q3, d22, #BILINEAR_INTERPOLATION_BITS
+    vld1.32     {d20}, [TMP1], STRIDE
+                                            vmlsl.u16   q3, d22, d31
+                                            vmlal.u16   q3, d23, d31
+    vld1.32     {d21}, [TMP1]
+    vmull.u8    q8, d20, d28
+    vmlal.u8    q8, d21, d29
+                                            vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vld1.32     {d2, d3}, [OUT, :128]
+                                            pld         [OUT, PF_OFFS]
+                                            vshrn.u32   d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vld1.32     {d22}, [TMP2], STRIDE
+                                            vshrn.u32   d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vmovn.u16   d6, q0
+    vld1.32     {d23}, [TMP2]
+    vmull.u8    q9, d22, d28
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vmlal.u8    q9, d23, d29
+                                            vmovn.u16   d7, q2
+    vld1.32     {d22}, [TMP3], STRIDE
+                                            vuzp.8      d6, d7
+                                            vuzp.8      d2, d3
+                                            vuzp.8      d6, d7
+                                            vuzp.8      d2, d3
+                                            vdup.32     d4, d7[1]
+    vld1.32     {d23}, [TMP3]
+                                            vmvn.8      d4, d4
+    vmull.u8    q10, d22, d28
+    vmlal.u8    q10, d23, d29
+                                            vmull.u8    q11, d2, d4
+                                            vmull.u8    q2, d3, d4
+    vshll.u16   q0, d16, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16   q0, d16, d30
+                                            vrshr.u16   q1, q11, #8
+    vmlal.u16   q0, d17, d30
+                                            vrshr.u16   q8, q2, #8
+                                            vraddhn.u16 d2, q1, q11
+                                            vraddhn.u16 d3, q8, q2
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d16}, [TMP4], STRIDE
+                                            vqadd.u8    q3, q1, q3
+    vld1.32     {d17}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q11, d16, d28
+    vmlal.u8    q11, d17, d29
+                                            vuzp.8      d6, d7
+    vshll.u16   q1, d18, #BILINEAR_INTERPOLATION_BITS
+                                            vuzp.8      d6, d7
+    vmlsl.u16   q1, d18, d31
+                                            vadd.u16    q12, q12, q13
+    vmlal.u16   q1, d19, d31
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16    q12, q12, q13
+                                            vst1.32     {d6, d7}, [OUT, :128]!
+.endm
+
+/* over_8888_8_8888 */
+.macro bilinear_over_8888_8_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 8888, over
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_head
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+    vld1.32     {d0}, [TMP1], STRIDE
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+    vld1.32     {d1}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    vld1.32     {d2}, [TMP2], STRIDE
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vld1.32     {d3}, [TMP2]
+    vmull.u8    q2, d0, d28
+    vmull.u8    q3, d2, d28
+    vmlal.u8    q2, d1, d29
+    vmlal.u8    q3, d3, d29
+    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
+    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16   q0, d4, d30
+    vmlsl.u16   q1, d6, d31
+    vmlal.u16   q0, d5, d30
+    vmlal.u16   q1, d7, d31
+    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vld1.32     {d2}, [TMP3], STRIDE
+    vld1.32     {d3}, [TMP3]
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d4}, [TMP4], STRIDE
+    vld1.32     {d5}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q3, d2, d28
+    vmlal.u8    q3, d3, d29
+    vmull.u8    q1, d4, d28
+    vmlal.u8    q1, d5, d29
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vld1.32     {d22[0]}, [MASK]!
+    pld         [MASK, #prefetch_offset]
+    vadd.u16    q12, q12, q13
+    vmovn.u16   d16, q0
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail
+    vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
+    vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16   q9, d6, d30
+    vmlsl.u16   q10, d2, d31
+    vmlal.u16   q9, d7, d30
+    vmlal.u16   q10, d3, d31
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16    q12, q12, q13
+    vdup.32     d22, d22[0]
+    vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vmovn.u16   d17, q9
+    vld1.32     {d18, d19}, [OUT, :128]
+    pld         [OUT, PF_OFFS]
+    vuzp.8      d16, d17
+    vuzp.8      d18, d19
+    vuzp.8      d16, d17
+    vuzp.8      d18, d19
+    vmull.u8    q10, d16, d22
+    vmull.u8    q11, d17, d22
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+    vrshrn.u16  d16, q10, #8
+    vrshrn.u16  d17, q11, #8
+    vdup.32     d22, d17[1]
+    vmvn.8      d22, d22
+    vmull.u8    q10, d18, d22
+    vmull.u8    q11, d19, d22
+    vrshr.u16   q9, q10, #8
+    vrshr.u16   q0, q11, #8
+    vraddhn.u16 d18, q9, q10
+    vraddhn.u16 d19, q0, q11
+    vqadd.u8    q9, q8, q9
+    vuzp.8      d18, d19
+    vuzp.8      d18, d19
+    vst1.32     {d18, d19}, [OUT, :128]!
+.endm
+
+.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
+                                            vshll.u16   q9, d6, #BILINEAR_INTERPOLATION_BITS
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+                                            vshll.u16   q10, d2, #BILINEAR_INTERPOLATION_BITS
+    vld1.32     {d0}, [TMP1], STRIDE
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+                                            vmlsl.u16   q9, d6, d30
+                                            vmlsl.u16   q10, d2, d31
+    vld1.32     {d1}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+                                            vmlal.u16   q9, d7, d30
+                                            vmlal.u16   q10, d3, d31
+    vld1.32     {d2}, [TMP2], STRIDE
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+                                            vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+                                            vadd.u16    q12, q12, q13
+    vld1.32     {d3}, [TMP2]
+                                            vdup.32     d22, d22[0]
+                                            vshrn.u32   d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vshrn.u32   d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vmull.u8    q2, d0, d28
+    vmull.u8    q3, d2, d28
+                                            vmovn.u16   d17, q9
+                                            vld1.32     {d18, d19}, [OUT, :128]
+                                            pld         [OUT, #(prefetch_offset * 4)]
+    vmlal.u8    q2, d1, d29
+    vmlal.u8    q3, d3, d29
+                                            vuzp.8      d16, d17
+                                            vuzp.8      d18, d19
+    vshll.u16   q0, d4, #BILINEAR_INTERPOLATION_BITS
+    vshll.u16   q1, d6, #BILINEAR_INTERPOLATION_BITS
+                                            vuzp.8      d16, d17
+                                            vuzp.8      d18, d19
+    vmlsl.u16   q0, d4, d30
+    vmlsl.u16   q1, d6, d31
+                                            vmull.u8    q10, d16, d22
+                                            vmull.u8    q11, d17, d22
+    vmlal.u16   q0, d5, d30
+    vmlal.u16   q1, d7, d31
+                                            vrsra.u16   q10, q10, #8
+                                            vrsra.u16   q11, q11, #8
+    vshrn.u32   d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32   d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+                                            vrshrn.u16  d16, q10, #8
+                                            vrshrn.u16  d17, q11, #8
+    vld1.32     {d2}, [TMP3], STRIDE
+                                            vdup.32     d22, d17[1]
+    vld1.32     {d3}, [TMP3]
+                                            vmvn.8      d22, d22
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d4}, [TMP4], STRIDE
+                                            vmull.u8    q10, d18, d22
+                                            vmull.u8    q11, d19, d22
+    vld1.32     {d5}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q3, d2, d28
+                                            vrshr.u16   q9, q10, #8
+                                            vrshr.u16   q15, q11, #8
+    vmlal.u8    q3, d3, d29
+    vmull.u8    q1, d4, d28
+                                            vraddhn.u16 d18, q9, q10
+                                            vraddhn.u16 d19, q15, q11
+    vmlal.u8    q1, d5, d29
+    vshr.u16    q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+                                            vqadd.u8    q9, q8, q9
+    vld1.32     {d22[0]}, [MASK]!
+                                            vuzp.8      d18, d19
+    vadd.u16    q12, q12, q13
+                                            vuzp.8      d18, d19
+    vmovn.u16   d16, q0
+                                            vst1.32     {d18, d19}, [OUT, :128]!
+.endm
+
+/* add_8888_8888 */
+.macro bilinear_add_8888_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, x, 8888, add
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_head
+    bilinear_add_8888_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8888_process_pixblock_tail_head
+    bilinear_add_8888_8888_process_pixblock_tail
+    bilinear_add_8888_8888_process_pixblock_head
+.endm
+
+/* add_8888_8_8888 */
+.macro bilinear_add_8888_8_8888_process_last_pixel
+    bilinear_interpolate_last_pixel 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_two_pixels
+    bilinear_interpolate_two_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_four_pixels
+    bilinear_interpolate_four_pixels 8888, 8, 8888, add
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_head
+    bilinear_add_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
+    bilinear_add_8888_8_8888_process_pixblock_tail
+    bilinear_add_8888_8_8888_process_pixblock_head
+.endm
+
+
+/* Bilinear scanline functions */
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_src_8888_8_8888_process_last_pixel, \
+    bilinear_src_8888_8_8888_process_two_pixels, \
+    bilinear_src_8888_8_8888_process_four_pixels, \
+    bilinear_src_8888_8_8888_process_pixblock_head, \
+    bilinear_src_8888_8_8888_process_pixblock_tail, \
+    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
+    8888, 0565, 2, 1, \
+    bilinear_src_8888_8_0565_process_last_pixel, \
+    bilinear_src_8888_8_0565_process_two_pixels, \
+    bilinear_src_8888_8_0565_process_four_pixels, \
+    bilinear_src_8888_8_0565_process_pixblock_head, \
+    bilinear_src_8888_8_0565_process_pixblock_tail, \
+    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
+    0565, 8888, 1, 2, \
+    bilinear_src_0565_8_x888_process_last_pixel, \
+    bilinear_src_0565_8_x888_process_two_pixels, \
+    bilinear_src_0565_8_x888_process_four_pixels, \
+    bilinear_src_0565_8_x888_process_pixblock_head, \
+    bilinear_src_0565_8_x888_process_pixblock_tail, \
+    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
+    0565, 0565, 1, 1, \
+    bilinear_src_0565_8_0565_process_last_pixel, \
+    bilinear_src_0565_8_0565_process_two_pixels, \
+    bilinear_src_0565_8_0565_process_four_pixels, \
+    bilinear_src_0565_8_0565_process_pixblock_head, \
+    bilinear_src_0565_8_0565_process_pixblock_tail, \
+    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_over_8888_8888_process_last_pixel, \
+    bilinear_over_8888_8888_process_two_pixels, \
+    bilinear_over_8888_8888_process_four_pixels, \
+    bilinear_over_8888_8888_process_pixblock_head, \
+    bilinear_over_8888_8888_process_pixblock_tail, \
+    bilinear_over_8888_8888_process_pixblock_tail_head, \
+    4, 28, 0
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_over_8888_8_8888_process_last_pixel, \
+    bilinear_over_8888_8_8888_process_two_pixels, \
+    bilinear_over_8888_8_8888_process_four_pixels, \
+    bilinear_over_8888_8_8888_process_pixblock_head, \
+    bilinear_over_8888_8_8888_process_pixblock_tail, \
+    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_add_8888_8888_process_last_pixel, \
+    bilinear_add_8888_8888_process_two_pixels, \
+    bilinear_add_8888_8888_process_four_pixels, \
+    bilinear_add_8888_8888_process_pixblock_head, \
+    bilinear_add_8888_8888_process_pixblock_tail, \
+    bilinear_add_8888_8888_process_pixblock_tail_head, \
+    4, 28, 0
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
+    8888, 8888, 2, 2, \
+    bilinear_add_8888_8_8888_process_last_pixel, \
+    bilinear_add_8888_8_8888_process_two_pixels, \
+    bilinear_add_8888_8_8888_process_four_pixels, \
+    bilinear_add_8888_8_8888_process_pixblock_head, \
+    bilinear_add_8888_8_8888_process_pixblock_tail, \
+    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
+    4, 28, BILINEAR_FLAG_USE_MASK
diff --git a/src/pixman/pixman/pixman-arm-neon-asm.S b/src/pixman/pixman/pixman-arm-neon-asm.S
new file mode 100644
index 0000000..187197d
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-neon-asm.S
@@ -0,0 +1,3637 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains implementations of NEON optimized pixel processing
+ * functions. There is no full and detailed tutorial, but some functions
+ * (those which are exposing some new or interesting features) are
+ * extensively commented and can be used as examples.
+ *
+ * You may want to have a look at the comments for following functions:
+ *  - pixman_composite_over_8888_0565_asm_neon
+ *  - pixman_composite_over_n_8_0565_asm_neon
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+    .text
+    .fpu neon
+    .arch armv7a
+    .object_arch armv4
+    .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
+    .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
+    .arm
+    .altmacro
+    .p2align 2
+
+#include "pixman-private.h"
+#include "pixman-arm-neon-asm.h"
+
+/* Global configuration options and preferences */
+
+/*
+ * The code can optionally make use of unaligned memory accesses to improve
+ * performance of handling leading/trailing pixels for each scanline.
+ * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
+ * example in linux if unaligned memory accesses are not configured to
+ * generate.exceptions.
+ */
+.set RESPECT_STRICT_ALIGNMENT, 1
+
+/*
+ * Set default prefetch type. There is a choice between the following options:
+ *
+ * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
+ * as NOP to workaround some HW bugs or for whatever other reason)
+ *
+ * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
+ * advanced prefetch intruduces heavy overhead)
+ *
+ * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
+ * which can run ARM and NEON instructions simultaneously so that extra ARM
+ * instructions do not add (many) extra cycles, but improve prefetch efficiency)
+ *
+ * Note: some types of function can't support advanced prefetch and fallback
+ *       to simple one (those which handle 24bpp pixels)
+ */
+.set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
+
+/* Prefetch distance in pixels for simple prefetch */
+.set PREFETCH_DISTANCE_SIMPLE, 64
+
+/*
+ * Implementation of pixman_composite_over_8888_0565_asm_neon
+ *
+ * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
+ * performs OVER compositing operation. Function fast_composite_over_8888_0565
+ * from pixman-fast-path.c does the same in C and can be used as a reference.
+ *
+ * First we need to have some NEON assembly code which can do the actual
+ * operation on the pixels and provide it to the template macro.
+ *
+ * Template macro quite conveniently takes care of emitting all the necessary
+ * code for memory reading and writing (including quite tricky cases of
+ * handling unaligned leading/trailing pixels), so we only need to deal with
+ * the data in NEON registers.
+ *
+ * NEON registers allocation in general is recommented to be the following:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
+ * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
+ * d28, d29, d30, d31 - place for storing the result (destination pixels)
+ *
+ * As can be seen above, four 64-bit NEON registers are used for keeping
+ * intermediate pixel data and up to 8 pixels can be processed in one step
+ * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
+ *
+ * This particular function uses the following registers allocation:
+ * d0,  d1,  d2,  d3  - contain loaded source pixel data
+ * d4,  d5            - contain loaded destination pixels (they are needed)
+ * d28, d29           - place for storing the result (destination pixels)
+ */
+
+/*
+ * Step one. We need to have some code to do some arithmetics on pixel data.
+ * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
+ * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
+ * perform all the needed calculations and write the result to {d28, d29}.
+ * The rationale for having two macros and not just one will be explained
+ * later. In practice, any single monolitic function which does the work can
+ * be split into two parts in any arbitrary way without affecting correctness.
+ *
+ * There is one special trick here too. Common template macro can optionally
+ * make our life a bit easier by doing R, G, B, A color components
+ * deinterleaving for 32bpp pixel formats (and this feature is used in
+ * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
+ * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
+ * actually use d0 register for blue channel (a vector of eight 8-bit
+ * values), d1 register for green, d2 for red and d3 for alpha. This
+ * simple conversion can be also done with a few NEON instructions:
+ *
+ * Packed to planar conversion:
+ *  vuzp.8 d0, d1
+ *  vuzp.8 d2, d3
+ *  vuzp.8 d1, d3
+ *  vuzp.8 d0, d2
+ *
+ * Planar to packed conversion:
+ *  vzip.8 d0, d2
+ *  vzip.8 d1, d3
+ *  vzip.8 d2, d3
+ *  vzip.8 d0, d1
+ *
+ * But pixel can be loaded directly in planar format using VLD4.8 NEON
+ * instruction. It is 1 cycle slower than VLD1.32, so this is not always
+ * desirable, that's why deinterleaving is optional.
+ *
+ * But anyway, here is the code:
+ */
+.macro pixman_composite_over_8888_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+    vsri.u8     d6, d6, #5
+    vmvn.8      d3, d3      /* invert source alpha */
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20
+    vqadd.u8    q9, q0, q11
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+/*
+ * OK, now we got almost everything that we need. Using the above two
+ * macros, the work can be done right. But now we want to optimize
+ * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
+ * a lot from good code scheduling and software pipelining.
+ *
+ * Let's construct some code, which will run in the core main loop.
+ * Some pseudo-code of the main loop will look like this:
+ *   head
+ *   while (...) {
+ *     tail
+ *     head
+ *   }
+ *   tail
+ *
+ * It may look a bit weird, but this setup allows to hide instruction
+ * latencies better and also utilize dual-issue capability more
+ * efficiently (make pairs of load-store and ALU instructions).
+ *
+ * So what we need now is a '*_tail_head' macro, which will be used
+ * in the core main loop. A trivial straightforward implementation
+ * of this macro would look like this:
+ *
+ *   pixman_composite_over_8888_0565_process_pixblock_tail
+ *   vst1.16     {d28, d29}, [DST_W, :128]!
+ *   vld1.16     {d4, d5}, [DST_R, :128]!
+ *   vld4.32     {d0, d1, d2, d3}, [SRC]!
+ *   pixman_composite_over_8888_0565_process_pixblock_head
+ *   cache_preload 8, 8
+ *
+ * Now it also got some VLD/VST instructions. We simply can't move from
+ * processing one block of pixels to the other one with just arithmetics.
+ * The previously processed data needs to be written to memory and new
+ * data needs to be fetched. Fortunately, this main loop does not deal
+ * with partial leading/trailing pixels and can load/store a full block
+ * of pixels in a bulk. Additionally, destination buffer is already
+ * 16 bytes aligned here (which is good for performance).
+ *
+ * New things here are DST_R, DST_W, SRC and MASK identifiers. These
+ * are the aliases for ARM registers which are used as pointers for
+ * accessing data. We maintain separate pointers for reading and writing
+ * destination buffer (DST_R and DST_W).
+ *
+ * Another new thing is 'cache_preload' macro. It is used for prefetching
+ * data into CPU L2 cache and improve performance when dealing with large
+ * images which are far larger than cache size. It uses one argument
+ * (actually two, but they need to be the same here) - number of pixels
+ * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
+ * details about this macro. Moreover, if good performance is needed
+ * the code from this macro needs to be copied into '*_tail_head' macro
+ * and mixed with the rest of code for optimal instructions scheduling.
+ * We are actually doing it below.
+ *
+ * Now after all the explanations, here is the optimized code.
+ * Different instruction streams (originaling from '*_head', '*_tail'
+ * and 'cache_preload' macro) use different indentation levels for
+ * better readability. Actually taking the code from one of these
+ * indentation levels and ignoring a few VLD/VST instructions would
+ * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
+ * macro!
+ */
+
+#if 1
+
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+        vqadd.u8    d16, d2, d20
+    vld1.16     {d4, d5}, [DST_R, :128]!
+        vqadd.u8    q9, q0, q11
+    vshrn.u16   d6, q2, #8
+    fetch_src_pixblock
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+        vshll.u8    q14, d16, #8
+                                    PF add PF_X, PF_X, #8
+        vshll.u8    q8, d19, #8
+                                    PF tst PF_CTL, #0xF
+    vsri.u8     d6, d6, #5
+                                    PF addne PF_X, PF_X, #8
+    vmvn.8      d3, d3
+                                    PF subne PF_CTL, PF_CTL, #1
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    vmull.u8    q10, d3, d6
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vsri.u16    q14, q8, #5
+                                    PF cmp PF_X, ORIG_W
+        vshll.u8    q9, d18, #8
+    vrshr.u16   q13, q10, #8
+                                    PF subge PF_X, PF_X, ORIG_W
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+                                    PF subges PF_CTL, PF_CTL, #0x10
+        vsri.u16    q14, q9, #11
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vraddhn.u16 d22, q12, q15
+        vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+#else
+
+/* If we did not care much about the performance, we would just use this... */
+.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
+    pixman_composite_over_8888_0565_process_pixblock_tail
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    fetch_src_pixblock
+    pixman_composite_over_8888_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+#endif
+
+/*
+ * And now the final part. We are using 'generate_composite_function' macro
+ * to put all the stuff together. We are specifying the name of the function
+ * which we want to get, number of bits per pixel for the source, mask and
+ * destination (0 if unused, like mask in this case). Next come some bit
+ * flags:
+ *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
+ *                             and written, for write-only buffer we would use
+ *                             FLAG_DST_WRITEONLY flag instead
+ *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
+ *                             and separate color channels for 32bpp format.
+ * The next things are:
+ *  - the number of pixels processed per iteration (8 in this case, because
+ *    that's the maximum what can fit into four 64-bit NEON registers).
+ *  - prefetch distance, measured in pixel blocks. In this case it is 5 times
+ *    by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
+ *    prefetch distance can be selected by running some benchmarks.
+ *
+ * After that we specify some macros, these are 'default_init',
+ * 'default_cleanup' here which are empty (but it is possible to have custom
+ * init/cleanup macros to be able to save/restore some extra NEON registers
+ * like d8-d15 or do anything else) followed by
+ * 'pixman_composite_over_8888_0565_process_pixblock_head',
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
+ * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
+ * which we got implemented above.
+ *
+ * The last part is the NEON registers allocation scheme.
+ */
+generate_composite_function \
+    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_0565_process_pixblock_head
+    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+       and put data into d6 - red, d7 - green, d30 - blue */
+    vshrn.u16   d6, q2, #8
+    vshrn.u16   d7, q2, #3
+    vsli.u16    q2, q2, #5
+    vsri.u8     d6, d6, #5
+    vsri.u8     d7, d7, #6
+    vshrn.u16   d30, q2, #2
+    /* now do alpha blending, storing results in 8-bit planar format
+       into d16 - red, d19 - green, d18 - blue */
+    vmull.u8    q10, d3, d6
+    vmull.u8    q11, d3, d7
+    vmull.u8    q12, d3, d30
+    vrshr.u16   q13, q10, #8
+    vrshr.u16   q3, q11, #8
+    vrshr.u16   q15, q12, #8
+    vraddhn.u16 d20, q10, q13
+    vraddhn.u16 d23, q11, q3
+    vraddhn.u16 d22, q12, q15
+.endm
+
+.macro pixman_composite_over_n_0565_process_pixblock_tail
+    /* ... continue alpha blending */
+    vqadd.u8    d16, d2, d20
+    vqadd.u8    q9, q0, q11
+    /* convert the result to r5g6b5 and store it into {d28, d29} */
+    vshll.u8    q14, d16, #8
+    vshll.u8    q8, d19, #8
+    vshll.u8    q9, d18, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_0565_process_pixblock_tail_head
+    pixman_composite_over_n_0565_process_pixblock_tail
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    pixman_composite_over_n_0565_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_over_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+    vmvn.8      d3, d3      /* invert source alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_0565_init, \
+    default_cleanup, \
+    pixman_composite_over_n_0565_process_pixblock_head, \
+    pixman_composite_over_n_0565_process_pixblock_tail, \
+    pixman_composite_over_n_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_0565_process_pixblock_head
+    vshll.u8    q8, d1, #8
+    vshll.u8    q14, d2, #8
+    vshll.u8    q9, d0, #8
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
+        vsri.u16    q14, q8, #5
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    fetch_src_pixblock
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vsri.u16    q14, q9, #11
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vshll.u8    q8, d1, #8
+        vst1.16     {d28, d29}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vshll.u8    q14, d2, #8
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vshll.u8    q9, d0, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_8888_process_pixblock_head
+    vshrn.u16   d30, q0, #8
+    vshrn.u16   d29, q0, #3
+    vsli.u16    q0, q0, #5
+    vmov.u8     d31, #255
+    vsri.u8     d30, d30, #5
+    vsri.u8     d29, d29, #6
+    vshrn.u16   d28, q0, #2
+.endm
+
+.macro pixman_composite_src_0565_8888_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
+    pixman_composite_src_0565_8888_process_pixblock_tail
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+    fetch_src_pixblock
+    pixman_composite_src_0565_8888_process_pixblock_head
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_process_pixblock_head
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_add_8_8_process_pixblock_tail_head
+    fetch_src_pixblock
+                                    PF add PF_X, PF_X, #32
+                                    PF tst PF_CTL, #0xF
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+                                    PF addne PF_X, PF_X, #32
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vqadd.u8    q14, q0, q2
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vqadd.u8    q15, q1, q3
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+.endm
+
+.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+    fetch_src_pixblock
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    fetch_src_pixblock
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+    vmvn.8      d22, d3
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_process_pixblock_head
+    /* deinterleaved source pixels in {d0, d1, d2, d3} */
+    /* inverted alpha in {d24} */
+    /* destination pixels in {d4, d5, d6, d7} */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q2, q10, #8
+    vrshr.u16   q3, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q2, q10
+    vraddhn.u16 d31, q3, q11
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q2, q10, #8
+        vrshr.u16   q3, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q2, q10
+        vraddhn.u16 d31, q3, q11
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vqadd.u8    q14, q0, q14
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0x0F
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vqadd.u8    q15, q1, q15
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q8, d24, d4
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vmull.u8    q9, d24, d5
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q10, d24, d6
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q11, d24, d7
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+    vmvn.8      d24, d3  /* get inverted alpha */
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+        vrshr.u16   q15, q9, #8
+        vrshr.u16   q12, q10, #8
+        vrshr.u16   q13, q11, #8
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d30, q12, q10
+        vraddhn.u16 d31, q13, q11
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
+    vmvn.8      d22, d3
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d22, d4
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d22, d5
+    vmull.u8    q10, d22, d6
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+    vmull.u8    q11, d22, d7
+.endm
+
+.macro pixman_composite_over_reverse_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d7[0]}, [DUMMY]
+    vdup.8      d4, d7[0]
+    vdup.8      d5, d7[1]
+    vdup.8      d6, d7[2]
+    vdup.8      d7, d7[3]
+.endm
+
+generate_composite_function \
+    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_reverse_n_8888_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0,  /* dst_r_basereg */ \
+    4,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_head
+    vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
+    vmull.u8    q1,  d24, d9
+    vmull.u8    q6,  d24, d10
+    vmull.u8    q7,  d24, d11
+        vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
+        vshrn.u16   d7,  q2, #3
+        vsli.u16    q2,  q2, #5
+    vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
+    vrshr.u16   q9,  q1,  #8
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q11, q7,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q9
+    vraddhn.u16 d2,  q6,  q10
+    vraddhn.u16 d3,  q7,  q11
+        vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
+        vsri.u8     d7,  d7, #6
+    vmvn.8      d3,  d3
+        vshrn.u16   d30, q2, #2
+    vmull.u8    q8,  d3, d6     /* now do alpha blending */
+    vmull.u8    q9,  d3, d7
+    vmull.u8    q10, d3, d30
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
+    /* 3 cycle bubble (after vmull.u8) */
+    vrshr.u16   q13, q8,  #8
+    vrshr.u16   q11, q9,  #8
+    vrshr.u16   q15, q10, #8
+    vraddhn.u16 d16, q8,  q13
+    vraddhn.u16 d27, q9,  q11
+    vraddhn.u16 d26, q10, q15
+    vqadd.u8    d16, d2,  d16
+    /* 1 cycle bubble */
+    vqadd.u8    q9,  q0,  q13
+    vshll.u8    q14, d16, #8    /* convert to 16bpp */
+    vshll.u8    q8,  d19, #8
+    vshll.u8    q9,  d18, #8
+    vsri.u16    q14, q8,  #5
+    /* 1 cycle bubble */
+    vsri.u16    q14, q9,  #11
+.endm
+
+.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+    vld1.16     {d4, d5}, [DST_R, :128]!
+    vshrn.u16   d6,  q2,  #8
+    fetch_mask_pixblock
+    vshrn.u16   d7,  q2,  #3
+    fetch_src_pixblock
+    vmull.u8    q6,  d24, d10
+        vrshr.u16   q13, q8,  #8
+        vrshr.u16   q11, q9,  #8
+        vrshr.u16   q15, q10, #8
+        vraddhn.u16 d16, q8,  q13
+        vraddhn.u16 d27, q9,  q11
+        vraddhn.u16 d26, q10, q15
+        vqadd.u8    d16, d2,  d16
+    vmull.u8    q1,  d24, d9
+        vqadd.u8    q9,  q0,  q13
+        vshll.u8    q14, d16, #8
+    vmull.u8    q0,  d24, d8
+        vshll.u8    q8,  d19, #8
+        vshll.u8    q9,  d18, #8
+        vsri.u16    q14, q8,  #5
+    vmull.u8    q7,  d24, d11
+        vsri.u16    q14, q9,  #11
+
+    cache_preload 8, 8
+
+    vsli.u16    q2,  q2,  #5
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q9,  q1,  #8
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q11, q7,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q9
+    vraddhn.u16 d2,  q6,  q10
+    vraddhn.u16 d3,  q7,  q11
+    vsri.u8     d6,  d6,  #5
+    vsri.u8     d7,  d7,  #6
+    vmvn.8      d3,  d3
+    vshrn.u16   d30, q2,  #2
+    vst1.16     {d28, d29}, [DST_W, :128]!
+    vmull.u8    q8,  d3,  d6
+    vmull.u8    q9,  d3,  d7
+    vmull.u8    q10, d3,  d30
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+/*
+ * This function needs a special initialization of solid mask.
+ * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
+ * offset, split into color components and replicated in d8-d11
+ * registers. Additionally, this function needs all the NEON registers,
+ * so it has to save d8-d15 registers which are callee saved according
+ * to ABI. These registers are restored from 'cleanup' macro. All the
+ * other NEON registers are caller saved, so can be clobbered freely
+ * without introducing any problems.
+ */
+.macro pixman_composite_over_n_8_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_0565_init, \
+    pixman_composite_over_n_8_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_0565_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vpush       {d8-d15}
+    vld1.32     {d24[0]}, [DUMMY]
+    vdup.8      d24, d24[3]
+.endm
+
+.macro pixman_composite_over_8888_n_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_0565_init, \
+    pixman_composite_over_8888_n_0565_cleanup, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0565_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+    fetch_src_pixblock
+    cache_preload 16, 16
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_0565_process_pixblock_head, \
+    pixman_composite_src_0565_0565_process_pixblock_tail, \
+    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8_process_pixblock_tail_head
+    vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #8
+    vsli.u64    d0, d0, #16
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_8_init, \
+    pixman_composite_src_n_8_cleanup, \
+    pixman_composite_src_n_8_process_pixblock_head, \
+    pixman_composite_src_n_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_0565_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_0565_process_pixblock_tail_head
+    vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_0565_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #16
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_0565_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    16, /* number of pixels, processed in a single block */ \
+    0,  /* prefetch distance */ \
+    pixman_composite_src_n_0565_init, \
+    pixman_composite_src_n_0565_cleanup, \
+    pixman_composite_src_n_0565_process_pixblock_head, \
+    pixman_composite_src_n_0565_process_pixblock_tail, \
+    pixman_composite_src_n_0565_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_n_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_src_n_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d0[0]}, [DUMMY]
+    vsli.u64    d0, d0, #32
+    vorr        d1, d0, d0
+    vorr        q1, q0, q0
+.endm
+
+.macro pixman_composite_src_n_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    0, /* prefetch distance */ \
+    pixman_composite_src_n_8888_init, \
+    pixman_composite_src_n_8888_cleanup, \
+    pixman_composite_src_n_8888_process_pixblock_head, \
+    pixman_composite_src_n_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_8888_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_8888_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_8888_process_pixblock_head, \
+    pixman_composite_src_8888_8888_process_pixblock_tail, \
+    pixman_composite_src_8888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_x888_8888_process_pixblock_head
+    vorr     q0, q0, q2
+    vorr     q1, q1, q2
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_x888_8888_process_pixblock_tail_head
+    vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
+    fetch_src_pixblock
+    vorr     q0, q0, q2
+    vorr     q1, q1, q2
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_x888_8888_init
+    vmov.u8  q2, #0xFF
+    vshl.u32 q2, q2, #24
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_x888_8888_init, \
+    default_cleanup, \
+    pixman_composite_src_x888_8888_process_pixblock_head, \
+    pixman_composite_src_x888_8888_process_pixblock_tail, \
+    pixman_composite_src_x888_8888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+    /* expecting solid source in {d0, d1, d2, d3} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q8, d24, d0
+    vmull.u8    q9, d24, d1
+    vmull.u8    q10, d24, d2
+    vmull.u8    q11, d24, d3
+    vrsra.u16   q8, q8, #8
+    vrsra.u16   q9, q9, #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+    vrshrn.u16  d28, q8, #8
+    vrshrn.u16  d29, q9, #8
+    vrshrn.u16  d30, q10, #8
+    vrshrn.u16  d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+    fetch_mask_pixblock
+                                    PF add PF_X, PF_X, #8
+        vrshrn.u16  d28, q8, #8
+                                    PF tst PF_CTL, #0x0F
+        vrshrn.u16  d29, q9, #8
+                                    PF addne PF_X, PF_X, #8
+        vrshrn.u16  d30, q10, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshrn.u16  d31, q11, #8
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q8, d24, d0
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q9, d24, d1
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q10, d24, d2
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q11, d24, d3
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vrsra.u16   q8, q8, #8
+    vrsra.u16   q9, q9, #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8888_init, \
+    pixman_composite_src_n_8_8888_cleanup, \
+    pixman_composite_src_n_8_8888_process_pixblock_head, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
+
+/******************************************************************************/
+
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+    vmull.u8    q0, d24, d16
+    vmull.u8    q1, d25, d16
+    vmull.u8    q2, d26, d16
+    vmull.u8    q3, d27, d16
+    vrsra.u16   q0, q0,  #8
+    vrsra.u16   q1, q1,  #8
+    vrsra.u16   q2, q2,  #8
+    vrsra.u16   q3, q3,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+    vrshrn.u16  d28, q0, #8
+    vrshrn.u16  d29, q1, #8
+    vrshrn.u16  d30, q2, #8
+    vrshrn.u16  d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+    fetch_mask_pixblock
+                                    PF add PF_X, PF_X, #8
+        vrshrn.u16  d28, q0, #8
+                                    PF tst PF_CTL, #0x0F
+        vrshrn.u16  d29, q1, #8
+                                    PF addne PF_X, PF_X, #8
+        vrshrn.u16  d30, q2, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshrn.u16  d31, q3, #8
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q0,  d24, d16
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q1,  d25, d16
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q2,  d26, d16
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q3,  d27, d16
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vrsra.u16   q0, q0,  #8
+    vrsra.u16   q1, q1,  #8
+    vrsra.u16   q2, q2,  #8
+    vrsra.u16   q3, q3,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d16[0]}, [DUMMY]
+    vdup.8      d16, d16[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8_init, \
+    pixman_composite_src_n_8_8_cleanup, \
+    pixman_composite_src_n_8_8_process_pixblock_head, \
+    pixman_composite_src_n_8_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_head
+    /* expecting deinterleaved source data in {d8, d9, d10, d11} */
+    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q6, d24, d8
+    vmull.u8    q7, d24, d9
+    vmull.u8    q8, d24, d10
+    vmull.u8    q9, d24, d11
+    vrshr.u16   q10, q6, #8
+    vrshr.u16   q11, q7, #8
+    vrshr.u16   q12, q8, #8
+    vrshr.u16   q13, q9, #8
+    vraddhn.u16 d0, q6, q10
+    vraddhn.u16 d1, q7, q11
+    vraddhn.u16 d2, q8, q12
+    vraddhn.u16 d3, q9, q13
+    vmvn.8      d25, d3  /* get inverted alpha */
+    /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
+    /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d25, d4
+    vmull.u8    q9, d25, d5
+    vmull.u8    q10, d25, d6
+    vmull.u8    q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q6, q10, #8
+    vrshr.u16   q7, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6, q10
+    vraddhn.u16 d31, q7, q11
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+.macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q15, q9, #8
+    fetch_mask_pixblock
+        vrshr.u16   q6, q10, #8
+                                    PF add PF_X, PF_X, #8
+        vrshr.u16   q7, q11, #8
+                                    PF tst PF_CTL, #0x0F
+        vraddhn.u16 d28, q14, q8
+                                    PF addne PF_X, PF_X, #8
+        vraddhn.u16 d29, q15, q9
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d30, q6, q10
+                                    PF cmp PF_X, ORIG_W
+        vraddhn.u16 d31, q7, q11
+                                    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+    vmull.u8    q6, d24, d8
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q7, d24, d9
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q8, d24, d10
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q9, d24, d11
+                                    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+        vqadd.u8    q14, q0, q14
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vqadd.u8    q15, q1, q15
+    vrshr.u16   q10, q6, #8
+    vrshr.u16   q11, q7, #8
+    vrshr.u16   q12, q8, #8
+    vrshr.u16   q13, q9, #8
+    vraddhn.u16 d0, q6, q10
+    vraddhn.u16 d1, q7, q11
+    vraddhn.u16 d2, q8, q12
+    vraddhn.u16 d3, q9, q13
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vmvn.8      d25, d3
+    vmull.u8    q8, d25, d4
+    vmull.u8    q9, d25, d5
+    vmull.u8    q10, d25, d6
+    vmull.u8    q11, d25, d7
+.endm
+
+.macro pixman_composite_over_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8888_init, \
+    pixman_composite_over_n_8_8888_cleanup, \
+    pixman_composite_over_n_8_8888_process_pixblock_head, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail, \
+    pixman_composite_over_n_8_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8_8_process_pixblock_head
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d8
+    vmull.u8    q6,  d26, d8
+    vmull.u8    q7,  d27, d8
+    vrshr.u16   q10, q0,  #8
+    vrshr.u16   q11, q1,  #8
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q13, q7,  #8
+    vraddhn.u16 d0,  q0,  q10
+    vraddhn.u16 d1,  q1,  q11
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d3,  q7,  q13
+    vmvn.8      q12, q0
+    vmvn.8      q13, q1
+    vmull.u8    q8,  d24, d4
+    vmull.u8    q9,  d25, d5
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8_8_process_pixblock_tail
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    vqadd.u8    q14, q0,  q14
+    vqadd.u8    q15, q1,  q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_n_8_8_process_pixblock_tail_head
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_n_8_8_process_pixblock_tail
+    fetch_mask_pixblock
+    cache_preload 32, 32
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    pixman_composite_over_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d8[0]}, [DUMMY]
+    vdup.8      d8, d8[3]
+.endm
+
+.macro pixman_composite_over_n_8_8_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8_8_init, \
+    pixman_composite_over_n_8_8_cleanup, \
+    pixman_composite_over_n_8_8_process_pixblock_head, \
+    pixman_composite_over_n_8_8_process_pixblock_tail, \
+    pixman_composite_over_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}
+     *         dest in          {d4,  d5,  d6,  d7 }
+     *         mask in          {d24, d25, d26, d27}
+     * output: updated src in   {d0,  d1,  d2,  d3 }
+     *         updated mask in  {d24, d25, d26, d3 }
+     */
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q7,  d27, d11
+    vmull.u8    q9,  d11, d25
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vrshr.u16   q10, q7,  #8
+    vraddhn.u16 d24, q12, q11
+    vraddhn.u16 d25, q9,  q8
+    vraddhn.u16 d26, q13, q6
+    vraddhn.u16 d3,  q7,  q10
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in {d28, d29, d30, d31}
+     */
+    vmvn.8      q12, q12
+    vmvn.8      d26, d26
+    vmull.u8    q8,  d24, d4
+    vmull.u8    q9,  d25, d5
+    vmvn.8      d27, d3
+    vmull.u8    q10, d26, d6
+    vmull.u8    q11, d27, d7
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q6,  q10, #8
+    vrshr.u16   q7,  q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q6,  q10
+    vraddhn.u16 d31, q7,  q11
+    vqadd.u8    q14, q0,  q14
+    vqadd.u8    q15, q1,  q15
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+        vrshr.u16   q14, q8, #8
+        vrshr.u16   q15, q9, #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+        vrshr.u16   q6, q10, #8
+        vrshr.u16   q7, q11, #8
+        vraddhn.u16 d28, q14, q8
+        vraddhn.u16 d29, q15, q9
+        vraddhn.u16 d30, q6, q10
+        vraddhn.u16 d31, q7, q11
+    fetch_mask_pixblock
+        vqadd.u8    q14, q0, q14
+        vqadd.u8    q15, q1, q15
+    cache_preload 8, 8
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head
+    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_8888_ca_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_8888_ca_init, \
+    pixman_composite_over_n_8888_8888_ca_cleanup, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
+    /*
+     * 'combine_mask_ca' replacement
+     *
+     * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+     *         mask in          {d24, d25, d26}       [B, G, R]
+     * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+     *         updated mask in  {d24, d25, d26}       [B, G, R]
+     */
+    vmull.u8    q0,  d24, d8
+    vmull.u8    q1,  d25, d9
+    vmull.u8    q6,  d26, d10
+    vmull.u8    q9,  d11, d25
+    vmull.u8    q12, d11, d24
+    vmull.u8    q13, d11, d26
+    vrshr.u16   q8,  q0,  #8
+    vrshr.u16   q10, q1,  #8
+    vrshr.u16   q11, q6,  #8
+    vraddhn.u16 d0,  q0,  q8
+    vraddhn.u16 d1,  q1,  q10
+    vraddhn.u16 d2,  q6,  q11
+    vrshr.u16   q11, q12, #8
+    vrshr.u16   q8,  q9,  #8
+    vrshr.u16   q6,  q13, #8
+    vraddhn.u16 d24, q12, q11
+    vraddhn.u16 d25, q9,  q8
+    /*
+     * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
+     * and put data into d16 - blue, d17 - green, d18 - red
+     */
+       vshrn.u16   d17, q2,  #3
+       vshrn.u16   d18, q2,  #8
+    vraddhn.u16 d26, q13, q6
+       vsli.u16    q2,  q2,  #5
+       vsri.u8     d18, d18, #5
+       vsri.u8     d17, d17, #6
+    /*
+     * 'combine_over_ca' replacement
+     *
+     * output: updated dest in d16 - blue, d17 - green, d18 - red
+     */
+    vmvn.8      q12, q12
+       vshrn.u16   d16, q2,  #2
+    vmvn.8      d26, d26
+    vmull.u8    q6,  d16, d24
+    vmull.u8    q7,  d17, d25
+    vmull.u8    q11, d18, d26
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
+    /* ... continue 'combine_over_ca' replacement */
+    vrshr.u16   q10, q6,  #8
+    vrshr.u16   q14, q7,  #8
+    vrshr.u16   q15, q11, #8
+    vraddhn.u16 d16, q10, q6
+    vraddhn.u16 d17, q14, q7
+    vraddhn.u16 d18, q15, q11
+    vqadd.u8    q8,  q0,  q8
+    vqadd.u8    d18, d2,  d18
+    /*
+     * convert the results in d16, d17, d18 to r5g6b5 and store
+     * them into {d28, d29}
+     */
+    vshll.u8    q14, d18, #8
+    vshll.u8    q10, d17, #8
+    vshll.u8    q15, d16, #8
+    vsri.u16    q14, q10, #5
+    vsri.u16    q14, q15, #11
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+    fetch_mask_pixblock
+        vrshr.u16   q10, q6, #8
+        vrshr.u16   q14, q7, #8
+    vld1.16     {d4, d5}, [DST_R, :128]!
+        vrshr.u16   q15, q11, #8
+        vraddhn.u16 d16, q10, q6
+        vraddhn.u16 d17, q14, q7
+        vraddhn.u16 d22, q15, q11
+            /* process_pixblock_head */
+            /*
+             * 'combine_mask_ca' replacement
+             *
+             * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
+             *         mask in          {d24, d25, d26}       [B, G, R]
+             * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
+             *         updated mask in  {d24, d25, d26}       [B, G, R]
+             */
+            vmull.u8    q6,  d26, d10
+        vqadd.u8    q8,  q0, q8
+            vmull.u8    q0,  d24, d8
+        vqadd.u8    d22, d2, d22
+            vmull.u8    q1,  d25, d9
+        /*
+         * convert the result in d16, d17, d22 to r5g6b5 and store
+         * it into {d28, d29}
+         */
+        vshll.u8    q14, d22, #8
+        vshll.u8    q10, d17, #8
+        vshll.u8    q15, d16, #8
+            vmull.u8    q9,  d11, d25
+        vsri.u16    q14, q10, #5
+            vmull.u8    q12, d11, d24
+            vmull.u8    q13, d11, d26
+        vsri.u16    q14, q15, #11
+    cache_preload 8, 8
+            vrshr.u16   q8,  q0,  #8
+            vrshr.u16   q10, q1,  #8
+            vrshr.u16   q11, q6,  #8
+            vraddhn.u16 d0,  q0,  q8
+            vraddhn.u16 d1,  q1,  q10
+            vraddhn.u16 d2,  q6,  q11
+            vrshr.u16   q11, q12, #8
+            vrshr.u16   q8,  q9,  #8
+            vrshr.u16   q6,  q13, #8
+            vraddhn.u16 d24, q12, q11
+            vraddhn.u16 d25, q9,  q8
+                /*
+                 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
+	         * 8-bit format and put data into d16 - blue, d17 - green,
+	         * d18 - red
+                 */
+                vshrn.u16   d17, q2,  #3
+                vshrn.u16   d18, q2,  #8
+            vraddhn.u16 d26, q13, q6
+                vsli.u16    q2,  q2,  #5
+                vsri.u8     d17, d17, #6
+                vsri.u8     d18, d18, #5
+            /*
+             * 'combine_over_ca' replacement
+             *
+             * output: updated dest in d16 - blue, d17 - green, d18 - red
+             */
+            vmvn.8      q12, q12
+                vshrn.u16   d16, q2,  #2
+            vmvn.8      d26, d26
+            vmull.u8    q7,  d17, d25
+            vmull.u8    q6,  d16, d24
+            vmull.u8    q11, d18, d26
+    vst1.16     {d28, d29}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d8, d11[0]
+    vdup.8      d9, d11[1]
+    vdup.8      d10, d11[2]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_over_n_8888_0565_ca_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_n_8888_0565_ca_init, \
+    pixman_composite_over_n_8888_0565_ca_cleanup, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
+    pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_in_n_8_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* and destination data in {d4, d5, d6, d7} */
+    vmull.u8    q8,  d4,  d3
+    vmull.u8    q9,  d5,  d3
+    vmull.u8    q10, d6,  d3
+    vmull.u8    q11, d7,  d3
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q8,  q14
+    vraddhn.u16 d29, q9,  q15
+    vraddhn.u16 d30, q10, q12
+    vraddhn.u16 d31, q11, q13
+.endm
+
+.macro pixman_composite_in_n_8_process_pixblock_tail_head
+    pixman_composite_in_n_8_process_pixblock_tail
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    cache_preload 32, 32
+    pixman_composite_in_n_8_process_pixblock_head
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_in_n_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d3, d3[3]
+.endm
+
+.macro pixman_composite_in_n_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_in_n_8_init, \
+    pixman_composite_in_n_8_cleanup, \
+    pixman_composite_in_n_8_process_pixblock_head, \
+    pixman_composite_in_n_8_process_pixblock_tail, \
+    pixman_composite_in_n_8_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+.macro pixman_composite_add_n_8_8_process_pixblock_head
+    /* expecting source data in {d8, d9, d10, d11} */
+    /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
+    /* and destination data in {d4, d5, d6, d7} */
+    /* mask is in d24, d25, d26, d27 */
+    vmull.u8    q0, d24, d11
+    vmull.u8    q1, d25, d11
+    vmull.u8    q6, d26, d11
+    vmull.u8    q7, d27, d11
+    vrshr.u16   q10, q0, #8
+    vrshr.u16   q11, q1, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q13, q7, #8
+    vraddhn.u16 d0, q0, q10
+    vraddhn.u16 d1, q1, q11
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d3, q7, q13
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+.macro pixman_composite_add_n_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_n_8_8_process_pixblock_tail_head
+    pixman_composite_add_n_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    fetch_mask_pixblock
+    cache_preload 32, 32
+    pixman_composite_add_n_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vpush       {d8-d15}
+    vld1.32     {d11[0]}, [DUMMY]
+    vdup.8      d11, d11[3]
+.endm
+
+.macro pixman_composite_add_n_8_8_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8_init, \
+    pixman_composite_add_n_8_8_cleanup, \
+    pixman_composite_add_n_8_8_process_pixblock_head, \
+    pixman_composite_add_n_8_8_process_pixblock_tail, \
+    pixman_composite_add_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8_8_8_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8, d24, d0
+    vmull.u8    q9, d25, d1
+    vmull.u8    q10, d26, d2
+    vmull.u8    q11, d27, d3
+    vrshr.u16   q0, q8, #8
+    vrshr.u16   q1, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d0, q0, q8
+    vraddhn.u16 d1, q1, q9
+    vraddhn.u16 d2, q12, q10
+    vraddhn.u16 d3, q13, q11
+    vqadd.u8    q14, q0, q2
+    vqadd.u8    q15, q1, q3
+.endm
+
+.macro pixman_composite_add_8_8_8_process_pixblock_tail
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_8_8_8_process_pixblock_tail_head
+    pixman_composite_add_8_8_8_process_pixblock_tail
+    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    fetch_mask_pixblock
+    fetch_src_pixblock
+    cache_preload 32, 32
+    pixman_composite_add_8_8_8_process_pixblock_head
+.endm
+
+.macro pixman_composite_add_8_8_8_init
+.endm
+
+.macro pixman_composite_add_8_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
+    FLAG_DST_READWRITE, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8_8_8_init, \
+    pixman_composite_add_8_8_8_cleanup, \
+    pixman_composite_add_8_8_8_process_pixblock_head, \
+    pixman_composite_add_8_8_8_process_pixblock_tail, \
+    pixman_composite_add_8_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* mask in {d24, d25, d26, d27} */
+    vmull.u8    q8,  d27, d0
+    vmull.u8    q9,  d27, d1
+    vmull.u8    q10, d27, d2
+    vmull.u8    q11, d27, d3
+    /* 1 cycle bubble */
+    vrsra.u16   q8,  q8,  #8
+    vrsra.u16   q9,  q9,  #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
+    /* 2 cycle bubble */
+    vrshrn.u16  d28, q8,  #8
+    vrshrn.u16  d29, q9,  #8
+    vrshrn.u16  d30, q10, #8
+    vrshrn.u16  d31, q11, #8
+    vqadd.u8    q14, q2,  q14
+    /* 1 cycle bubble */
+    vqadd.u8    q15, q3,  q15
+.endm
+
+.macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+        vrshrn.u16  d28, q8,  #8
+    fetch_mask_pixblock
+        vrshrn.u16  d29, q9,  #8
+    vmull.u8    q8,  d27, d0
+        vrshrn.u16  d30, q10, #8
+    vmull.u8    q9,  d27, d1
+        vrshrn.u16  d31, q11, #8
+    vmull.u8    q10, d27, d2
+        vqadd.u8    q14, q2,  q14
+    vmull.u8    q11, d27, d3
+        vqadd.u8    q15, q3,  q15
+    vrsra.u16   q8,  q8,  #8
+    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
+    vrsra.u16   q9,  q9,  #8
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vrsra.u16   q10, q10, #8
+
+    cache_preload 8, 8
+
+    vrsra.u16   q11, q11, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+generate_composite_function \
+    pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+.endm
+
+.macro pixman_composite_add_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_n_8_8888_init, \
+    pixman_composite_add_n_8_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_8888_n_8888_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vld1.32     {d27[0]}, [DUMMY]
+    vdup.8      d27, d27[3]
+.endm
+
+.macro pixman_composite_add_8888_n_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_add_8888_n_8888_init, \
+    pixman_composite_add_8888_n_8888_cleanup, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_head, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
+    pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    27  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    /* expecting source data in {d0, d1, d2, d3} */
+    /* destination data in {d4, d5, d6, d7} */
+    /* solid mask is in d15 */
+
+    /* 'in' */
+    vmull.u8    q8, d15, d3
+    vmull.u8    q6, d15, d2
+    vmull.u8    q5, d15, d1
+    vmull.u8    q4, d15, d0
+    vrshr.u16   q13, q8, #8
+    vrshr.u16   q12, q6, #8
+    vrshr.u16   q11, q5, #8
+    vrshr.u16   q10, q4, #8
+    vraddhn.u16 d3, q8, q13
+    vraddhn.u16 d2, q6, q12
+    vraddhn.u16 d1, q5, q11
+    vraddhn.u16 d0, q4, q10
+    vmvn.8      d24, d3  /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+    vmull.u8    q11, d24, d7
+.endm
+
+.macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_head
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
+.endm
+
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail
+    pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
+    vqadd.u8    q14, q0, q14
+    vqadd.u8    q15, q1, q15
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+.macro pixman_composite_over_8888_n_8888_init
+    add         DUMMY, sp, #48
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY]
+    vdup.8      d15, d15[3]
+.endm
+
+.macro pixman_composite_over_8888_n_8888_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_8888_n_8888_init, \
+    pixman_composite_over_8888_n_8888_cleanup, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail_head
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+generate_composite_function_single_scanline \
+    pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    12  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
+    vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
+    pixman_composite_over_8888_n_8888_process_pixblock_tail
+    fetch_src_pixblock
+    cache_preload 8, 8
+    fetch_mask_pixblock
+    pixman_composite_over_8888_n_8888_process_pixblock_head
+    vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_n_8888_process_pixblock_head, \
+    pixman_composite_over_8888_n_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0888_process_pixblock_head
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_0888_process_pixblock_tail_head
+    vst3.8 {d0, d1, d2}, [DST_W]!
+    fetch_src_pixblock
+    cache_preload 8, 8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0888_process_pixblock_head, \
+    pixman_composite_src_0888_0888_process_pixblock_tail, \
+    pixman_composite_src_0888_0888_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_head
+    vswp   d0, d2
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
+    vst4.8 {d0, d1, d2, d3}, [DST_W]!
+    fetch_src_pixblock
+    vswp   d0, d2
+    cache_preload 8, 8
+.endm
+
+.macro pixman_composite_src_0888_8888_rev_init
+    veor   d3, d3, d3
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    pixman_composite_src_0888_8888_rev_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_head, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
+    0, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_head
+    vshll.u8    q8, d1, #8
+    vshll.u8    q9, d2, #8
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
+    vshll.u8    q14, d0, #8
+    vsri.u16    q14, q8, #5
+    vsri.u16    q14, q9, #11
+.endm
+
+.macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
+        vshll.u8    q14, d0, #8
+    fetch_src_pixblock
+        vsri.u16    q14, q8, #5
+        vsri.u16    q14, q9, #11
+    vshll.u8    q8, d1, #8
+        vst1.16 {d28, d29}, [DST_W, :128]!
+    vshll.u8    q9, d2, #8
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
+    FLAG_DST_WRITEONLY, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_head, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
+    pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d30, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d28, q13, q10
+.endm
+
+.macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+        vraddhn.u16 d30, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d28, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+    pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
+    vrshr.u16   q11, q8, #8
+    vswp        d3, d31
+    vrshr.u16   q12, q9, #8
+    vrshr.u16   q13, q10, #8
+    vraddhn.u16 d28, q11, q8
+    vraddhn.u16 d29, q12, q9
+    vraddhn.u16 d30, q13, q10
+.endm
+
+.macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
+        vrshr.u16   q11, q8, #8
+        vswp        d3, d31
+        vrshr.u16   q12, q9, #8
+        vrshr.u16   q13, q10, #8
+    fetch_src_pixblock
+        vraddhn.u16 d28, q11, q8
+                                    PF add PF_X, PF_X, #8
+                                    PF tst PF_CTL, #0xF
+                                    PF addne PF_X, PF_X, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vraddhn.u16 d29, q12, q9
+        vraddhn.u16 d30, q13, q10
+    vmull.u8    q8, d3, d0
+    vmull.u8    q9, d3, d1
+    vmull.u8    q10, d3, d2
+        vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
+                                    PF cmp PF_X, ORIG_W
+                                    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+                                    PF subge PF_X, PF_X, ORIG_W
+                                    PF subges PF_CTL, PF_CTL, #0x10
+                                    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endm
+
+generate_composite_function \
+    pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    10, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
+    pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    0, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmvn.8      d7,  d15
+    vmull.u8    q6,  d15, d2
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vmull.u8    q8,  d7,  d4
+    vmull.u8    q9,  d7,  d5
+    vmull.u8    q13, d7,  d6
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8,  #8
+    vrshr.u16   q15, q9,  #8
+    vrshr.u16   q12, q13, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q13
+    vqadd.u8    q0,  q0,  q14
+    vqadd.u8    q1,  q1,  q15
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock
+    pixman_composite_over_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16    {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_over_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_over_0565_n_0565_init
+    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
+    vpush       {d8-d15}
+    vld1.32     {d15[0]}, [DUMMY]
+    vdup.8      d15, d15[3]
+.endm
+
+.macro pixman_composite_over_0565_n_0565_cleanup
+    vpop        {d8-d15}
+.endm
+
+generate_composite_function \
+    pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_over_0565_n_0565_init, \
+    pixman_composite_over_0565_n_0565_cleanup, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q4, d2, d1, d0
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* source pixel data is in      {d0, d1, d2, XX} */
+    /* destination pixel data is in {d4, d5, d6, XX} */
+    vmull.u8    q6,  d15, d2
+    vmull.u8    q5,  d15, d1
+    vmull.u8    q4,  d15, d0
+    vrshr.u16   q12, q6,  #8
+    vrshr.u16   q11, q5,  #8
+    vrshr.u16   q10, q4,  #8
+    vraddhn.u16 d2,  q6,  q12
+    vraddhn.u16 d1,  q5,  q11
+    vraddhn.u16 d0,  q4,  q10
+.endm
+
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail
+    vqadd.u8    q0,  q0,  q2
+    vqadd.u8    q1,  q1,  q3
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
+    fetch_mask_pixblock
+    pixman_composite_add_0565_8_0565_process_pixblock_tail
+    fetch_src_pixblock
+    vld1.16    {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_add_0565_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_add_0565_8_0565_process_pixblock_head, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_head
+    /* mask is in d15 */
+    convert_0565_to_x888 q5, d6, d5, d4
+    /* destination pixel data is in {d4, d5, d6, xx} */
+    vmvn.8      d24, d15 /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d24, d4
+    vmull.u8    q9, d24, d5
+    vmull.u8    q10, d24, d6
+.endm
+
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vraddhn.u16 d0, q14, q8
+    vraddhn.u16 d1, q15, q9
+    vraddhn.u16 d2, q12, q10
+    /* 32bpp result is in {d0, d1, d2, XX} */
+    convert_8888_to_0565 d2, d1, d0, q14, q15, q3
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
+    fetch_src_pixblock
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail
+    vld1.16    {d10, d11}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_0565_process_pixblock_head
+    vst1.16    {d28, d29}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_head, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10, /* dst_r_basereg */ \
+    15, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_head
+    /* src is in d0 */
+    /* destination pixel data is in {d4, d5, d6, d7} */
+    vmvn.8      d1, d0 /* get inverted alpha */
+    /* now do alpha blending */
+    vmull.u8    q8, d1, d4
+    vmull.u8    q9, d1, d5
+    vmull.u8    q10, d1, d6
+    vmull.u8    q11, d1, d7
+.endm
+
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    vrshr.u16   q14, q8, #8
+    vrshr.u16   q15, q9, #8
+    vrshr.u16   q12, q10, #8
+    vrshr.u16   q13, q11, #8
+    vraddhn.u16 d28, q14, q8
+    vraddhn.u16 d29, q15, q9
+    vraddhn.u16 d30, q12, q10
+    vraddhn.u16 d31, q13, q11
+    /* 32bpp result is in {d28, d29, d30, d31} */
+.endm
+
+/* TODO: expand macros and do better instructions scheduling */
+.macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
+    fetch_src_pixblock
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail
+    vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
+    cache_preload 8, 8
+    pixman_composite_out_reverse_8_8888_process_pixblock_head
+    vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
+.endm
+
+generate_composite_function \
+    pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_head, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
+    pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4, /* dst_r_basereg */ \
+    0, /* src_basereg   */ \
+    0   /* mask_basereg  */
+
+/******************************************************************************/
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_8888_process_pixblock_head, \
+    pixman_composite_over_8888_8888_process_pixblock_tail, \
+    pixman_composite_over_8888_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_over_8888_0565_process_pixblock_head, \
+    pixman_composite_over_8888_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    0,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_8888_0565_process_pixblock_head, \
+    pixman_composite_src_8888_0565_process_pixblock_tail, \
+    pixman_composite_src_8888_0565_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init, \
+    default_cleanup, \
+    pixman_composite_src_0565_8888_process_pixblock_head, \
+    pixman_composite_src_0565_8888_process_pixblock_tail, \
+    pixman_composite_src_0565_8888_process_pixblock_tail_head
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
+    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_8888_8_0565_process_pixblock_head, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
+    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    4,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    24  /* mask_basereg  */
+
+generate_composite_function_nearest_scanline \
+    pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
+    FLAG_DST_READWRITE, \
+    8, /* number of pixels, processed in a single block */ \
+    default_init_need_all_regs, \
+    default_cleanup_need_all_regs, \
+    pixman_composite_over_0565_8_0565_process_pixblock_head, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail, \
+    pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
+    28, /* dst_w_basereg */ \
+    10,  /* dst_r_basereg */ \
+    8,  /* src_basereg   */ \
+    15  /* mask_basereg  */
+
+/******************************************************************************/
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+    .func fname
+    .global fname
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+
+.macro bilinear_load_8888 reg1, reg2, tmp
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    vld1.32   {reg1}, [TMP1], STRIDE
+    vld1.32   {reg2}, [TMP1]
+.endm
+
+.macro bilinear_load_0565 reg1, reg2, tmp
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    vld1.32   {reg2[0]}, [TMP1], STRIDE
+    vld1.32   {reg2[1]}, [TMP1]
+    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+                    acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+    bilinear_load_8888 reg1, reg2, tmp1
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    bilinear_load_8888 reg3, reg4, tmp2
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+    bilinear_load_and_vertical_interpolate_two_8888 \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {acc2lo[1]}, [TMP1]
+    vld1.32   {acc2hi[1]}, [TMP2]
+    convert_0565_to_x888 acc2, reg3, reg2, reg1
+    vzip.u8   reg1, reg3
+    vzip.u8   reg2, reg4
+    vzip.u8   reg3, reg4
+    vzip.u8   reg1, reg2
+    vmull.u8  acc1, reg1, d28
+    vmlal.u8  acc1, reg2, d29
+    vmull.u8  acc2, reg3, d28
+    vmlal.u8  acc2, reg4, d29
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
+    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
+    vld1.32   {xacc2lo[1]}, [TMP1]
+    vld1.32   {xacc2hi[1]}, [TMP2]
+    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #1
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #1
+    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
+    vzip.u8   xreg1, xreg3
+    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
+    vzip.u8   xreg2, xreg4
+    vld1.32   {yacc2lo[1]}, [TMP1]
+    vzip.u8   xreg3, xreg4
+    vld1.32   {yacc2hi[1]}, [TMP2]
+    vzip.u8   xreg1, xreg2
+    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
+    vmull.u8  xacc1, xreg1, d28
+    vzip.u8   yreg1, yreg3
+    vmlal.u8  xacc1, xreg2, d29
+    vzip.u8   yreg2, yreg4
+    vmull.u8  xacc2, xreg3, d28
+    vzip.u8   yreg3, yreg4
+    vmlal.u8  xacc2, xreg4, d29
+    vzip.u8   yreg1, yreg2
+    vmull.u8  yacc1, yreg1, d28
+    vmlal.u8  yacc1, yreg2, d29
+    vmull.u8  yacc2, yreg3, d28
+    vmlal.u8  yacc2, yreg4, d29
+.endm
+
+.macro bilinear_store_8888 numpix, tmp1, tmp2
+.if numpix == 4
+    vst1.32   {d0, d1}, [OUT, :128]!
+.elseif numpix == 2
+    vst1.32   {d0}, [OUT, :64]!
+.elseif numpix == 1
+    vst1.32   {d0[0]}, [OUT, :32]!
+.else
+    .error bilinear_store_8888 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_store_0565 numpix, tmp1, tmp2
+    vuzp.u8 d0, d1
+    vuzp.u8 d2, d3
+    vuzp.u8 d1, d3
+    vuzp.u8 d0, d2
+    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
+.if numpix == 4
+    vst1.16   {d2}, [OUT, :64]!
+.elseif numpix == 2
+    vst1.32   {d2[0]}, [OUT, :32]!
+.elseif numpix == 1
+    vst1.16   {d2[0]}, [OUT, :16]!
+.else
+    .error bilinear_store_0565 numpix is unsupported
+.endif
+.endm
+
+.macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    bilinear_load_&src_fmt d0, d1, d2
+    vmull.u8  q1, d0, d28
+    vmlal.u8  q1, d1, d29
+    /* 5 cycles bubble */
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    /* 5 cycles bubble */
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    /* 3 cycles bubble */
+    vmovn.u16 d0, q0
+    /* 1 cycle bubble */
+    bilinear_store_&dst_fmt 1, q2, q3
+.endm
+
+.macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    bilinear_load_and_vertical_interpolate_two_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16  q12, q12, q13
+    vmovn.u16 d0, q0
+    bilinear_store_&dst_fmt 2, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    bilinear_load_and_vertical_interpolate_four_&src_fmt \
+                q1, q11, d0, d1, d20, d21, d22, d23 \
+                q3, q9,  d4, d5, d16, d17, d18, d19
+    pld       [TMP1, PF_OFFS]
+    sub       TMP1, TMP1, STRIDE
+    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q0, d2, d30
+    vmlal.u16 q0, d3, d30
+    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q10, d22, d31
+    vmlal.u16 q10, d23, d31
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q2, d6, d30
+    vmlal.u16 q2, d7, d30
+    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
+    pld       [TMP2, PF_OFFS]
+    vmlsl.u16 q8, d18, d31
+    vmlal.u16 q8, d19, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vmovn.u16 d0, q0
+    vmovn.u16 d1, q2
+    vadd.u16  q12, q12, q13
+    bilinear_store_&dst_fmt 4, q2, q3
+.endm
+
+.macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
+.else
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
+.endif
+.endm
+
+.macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
+.else
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
+.else
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+.endif
+.endm
+
+.macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+.ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
+    bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
+.else
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+.endif
+.endm
+
+.set BILINEAR_FLAG_UNROLL_4,          0
+.set BILINEAR_FLAG_UNROLL_8,          1
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline
+ * functions.
+ *
+ * Bilinear scanline scaler macro template uses the following arguments:
+ *  fname             - name of the function to generate
+ *  src_fmt           - source color format (8888 or 0565)
+ *  dst_fmt           - destination color format (8888 or 0565)
+ *  bpp_shift         - (1 << bpp_shift) is the size of source pixel in bytes
+ *  prefetch_distance - prefetch in the source image by that many
+ *                      pixels ahead
+ */
+
+.macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
+                                       src_bpp_shift, dst_bpp_shift, \
+                                       prefetch_distance, flags
+
+pixman_asm_function fname
+    OUT       .req      r0
+    TOP       .req      r1
+    BOTTOM    .req      r2
+    WT        .req      r3
+    WB        .req      r4
+    X         .req      r5
+    UX        .req      r6
+    WIDTH     .req      ip
+    TMP1      .req      r3
+    TMP2      .req      r4
+    PF_OFFS   .req      r7
+    TMP3      .req      r8
+    TMP4      .req      r9
+    STRIDE    .req      r2
+
+    mov       ip, sp
+    push      {r4, r5, r6, r7, r8, r9}
+    mov       PF_OFFS, #prefetch_distance
+    ldmia     ip, {WB, X, UX, WIDTH}
+    mul       PF_OFFS, PF_OFFS, UX
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpush     {d8-d15}
+.endif
+
+    sub       STRIDE, BOTTOM, TOP
+    .unreq    BOTTOM
+
+    cmp       WIDTH, #0
+    ble       3f
+
+    vdup.u16  q12, X
+    vdup.u16  q13, UX
+    vdup.u8   d28, WT
+    vdup.u8   d29, WB
+    vadd.u16  d25, d25, d26
+
+    /* ensure good destination alignment  */
+    cmp       WIDTH, #1
+    blt       0f
+    tst       OUT, #(1 << dst_bpp_shift)
+    beq       0f
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16  q12, q12, q13
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #1
+0:
+    vadd.u16  q13, q13, q13
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vadd.u16  q12, q12, q13
+
+    cmp       WIDTH, #2
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 1))
+    beq       0f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #2
+0:
+.if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
+/*********** 8 pixels per iteration *****************/
+    cmp       WIDTH, #4
+    blt       0f
+    tst       OUT, #(1 << (dst_bpp_shift + 2))
+    beq       0f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+    sub       WIDTH, WIDTH, #4
+0:
+    subs      WIDTH, WIDTH, #8
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+    bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    blt       5f
+0:
+    bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #8
+    bge       0b
+5:
+    bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
+1:
+    tst       WIDTH, #4
+    beq       2f
+    bilinear_interpolate_four_pixels src_fmt, dst_fmt
+2:
+.else
+/*********** 4 pixels per iteration *****************/
+    subs      WIDTH, WIDTH, #4
+    blt       1f
+    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
+    bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    blt       5f
+0:
+    bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
+    subs      WIDTH, WIDTH, #4
+    bge       0b
+5:
+    bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
+1:
+/****************************************************/
+.endif
+    /* handle the remaining trailing pixels */
+    tst       WIDTH, #2
+    beq       2f
+    bilinear_interpolate_two_pixels src_fmt, dst_fmt
+2:
+    tst       WIDTH, #1
+    beq       3f
+    bilinear_interpolate_last_pixel src_fmt, dst_fmt
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+    vpop      {d8-d15}
+.endif
+    pop       {r4, r5, r6, r7, r8, r9}
+    bx        lr
+
+    .unreq    OUT
+    .unreq    TOP
+    .unreq    WT
+    .unreq    WB
+    .unreq    X
+    .unreq    UX
+    .unreq    WIDTH
+    .unreq    TMP1
+    .unreq    TMP2
+    .unreq    PF_OFFS
+    .unreq    TMP3
+    .unreq    TMP4
+    .unreq    STRIDE
+.endfunc
+
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_four_pixels_8888_8888, 1
+
+.macro bilinear_interpolate_four_pixels_8888_8888_head
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+
+    vld1.32   {d22}, [TMP1], STRIDE
+    vld1.32   {d23}, [TMP1]
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    vmull.u8  q8, d22, d28
+    vmlal.u8  q8, d23, d29
+
+    vld1.32   {d22}, [TMP2], STRIDE
+    vld1.32   {d23}, [TMP2]
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmull.u8  q9, d22, d28
+    vmlal.u8  q9, d23, d29
+
+    vld1.32   {d22}, [TMP3], STRIDE
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail
+    vmlal.u16 q1, d19, d31
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q2, d20, d30
+    vmlal.u16 q2, d21, d30
+    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q3, d22, d31
+    vmlal.u16 q3, d23, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vmovn.u16 d6, q0
+    vmovn.u16 d7, q2
+    vadd.u16  q12, q12, q13
+    vst1.32   {d6, d7}, [OUT, :128]!
+.endm
+
+.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d6, q0
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+        vmovn.u16 d7, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+        vst1.32   {d6, d7}, [OUT, :128]!
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q1, d18, d31
+.endm
+
+/*****************************************************************************/
+
+.set have_bilinear_interpolate_eight_pixels_8888_0565, 1
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_head
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+    vld1.32   {d20}, [TMP1], STRIDE
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+    vld1.32   {d22}, [TMP2], STRIDE
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q1, d18, d31
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d8, q0
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+        vmovn.u16 d9, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q1, d18, d31
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail
+    vmlal.u16 q1, d19, d31
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q2, d20, d30
+    vmlal.u16 q2, d21, d30
+    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q3, d22, d31
+    vmlal.u16 q3, d23, d31
+    vadd.u16  q12, q12, q13
+    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vmovn.u16 d10, q0
+    vmovn.u16 d11, q2
+    vadd.u16  q12, q12, q13
+
+    vuzp.u8   d8, d9
+    vuzp.u8   d10, d11
+    vuzp.u8   d9, d11
+    vuzp.u8   d8, d10
+    vshll.u8  q6, d9, #8
+    vshll.u8  q5, d10, #8
+    vshll.u8  q7, d8, #8
+    vsri.u16  q5, q6, #5
+    vsri.u16  q5, q7, #11
+    vst1.32   {d10, d11}, [OUT, :128]!
+.endm
+
+.macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+            vuzp.u8 d8, d9
+        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d10, q0
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+        vmovn.u16 d11, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+            vuzp.u8 d10, d11
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+    vmlsl.u16 q1, d18, d31
+
+    mov       TMP1, X, asr #16
+    add       X, X, UX
+    add       TMP1, TOP, TMP1, asl #2
+    mov       TMP2, X, asr #16
+    add       X, X, UX
+    add       TMP2, TOP, TMP2, asl #2
+        vmlal.u16 q1, d19, d31
+            vuzp.u8 d9, d11
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+        vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
+            vuzp.u8 d8, d10
+        vmlsl.u16 q2, d20, d30
+        vmlal.u16 q2, d21, d30
+        vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
+    vld1.32   {d20}, [TMP1], STRIDE
+        vmlsl.u16 q3, d22, d31
+        vmlal.u16 q3, d23, d31
+    vld1.32   {d21}, [TMP1]
+    vmull.u8  q8, d20, d28
+    vmlal.u8  q8, d21, d29
+            vshll.u8  q6, d9, #8
+            vshll.u8  q5, d10, #8
+            vshll.u8  q7, d8, #8
+        vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
+            vsri.u16  q5, q6, #5
+        vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
+            vsri.u16  q5, q7, #11
+        vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
+    vld1.32   {d22}, [TMP2], STRIDE
+        vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
+        vadd.u16  q12, q12, q13
+    vld1.32   {d23}, [TMP2]
+    vmull.u8  q9, d22, d28
+    mov       TMP3, X, asr #16
+    add       X, X, UX
+    add       TMP3, TOP, TMP3, asl #2
+    mov       TMP4, X, asr #16
+    add       X, X, UX
+    add       TMP4, TOP, TMP4, asl #2
+    vmlal.u8  q9, d23, d29
+    vld1.32   {d22}, [TMP3], STRIDE
+        vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
+    vld1.32   {d23}, [TMP3]
+    vmull.u8  q10, d22, d28
+    vmlal.u8  q10, d23, d29
+        vmovn.u16 d8, q0
+    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
+        vmovn.u16 d9, q2
+    vmlsl.u16 q0, d16, d30
+    vmlal.u16 q0, d17, d30
+    pld       [TMP4, PF_OFFS]
+    vld1.32   {d16}, [TMP4], STRIDE
+        vadd.u16  q12, q12, q13
+    vld1.32   {d17}, [TMP4]
+    pld       [TMP4, PF_OFFS]
+    vmull.u8  q11, d16, d28
+    vmlal.u8  q11, d17, d29
+    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
+            vst1.32   {d10, d11}, [OUT, :128]!
+    vmlsl.u16 q1, d18, d31
+.endm
+/*****************************************************************************/
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
+    2, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
+    2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
+    1, 2, 28, BILINEAR_FLAG_UNROLL_4
+
+generate_bilinear_scanline_func \
+    pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
+    1, 1, 28, BILINEAR_FLAG_UNROLL_4
diff --git a/src/pixman/pixman/pixman-arm-neon-asm.h b/src/pixman/pixman/pixman-arm-neon-asm.h
new file mode 100644
index 0000000..d0d92d7
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-neon-asm.h
@@ -0,0 +1,1196 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ */
+
+/*
+ * This file contains a macro ('generate_composite_function') which can
+ * construct 2D image processing functions, based on a common template.
+ * Any combinations of source, destination and mask images with 8bpp,
+ * 16bpp, 24bpp, 32bpp color formats are supported.
+ *
+ * This macro takes care of:
+ *  - handling of leading and trailing unaligned pixels
+ *  - doing most of the work related to L2 cache preload
+ *  - encourages the use of software pipelining for better instructions
+ *    scheduling
+ *
+ * The user of this macro has to provide some configuration parameters
+ * (bit depths for the images, prefetch distance, etc.) and a set of
+ * macros, which should implement basic code chunks responsible for
+ * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
+ * examples.
+ *
+ * TODO:
+ *  - try overlapped pixel method (from Ian Rickards) when processing
+ *    exactly two blocks of pixels
+ *  - maybe add an option to do reverse scanline processing
+ */
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY,       0
+.set FLAG_DST_READWRITE,       1
+.set FLAG_DEINTERLEAVE_32BPP,  2
+
+/*
+ * Offset in stack where mask and source pointer/stride can be accessed
+ * from 'init' macro. This is useful for doing special handling for solid mask.
+ */
+.set ARGS_STACK_OFFSET,        40
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
+.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
+.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
+
+/*
+ * Definitions of supplementary pixld/pixst macros (for partial load/store of
+ * pixel data).
+ */
+
+.macro pixldst1 op, elem_size, reg1, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
+.if abits > 0
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
+.else
+    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
+.endif
+.endm
+
+.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
+    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
+    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
+.endm
+
+.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
+    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
+.endm
+
+.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
+.if numbytes == 32
+    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
+                              %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif numbytes == 16
+    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
+.elseif numbytes == 8
+    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
+.elseif numbytes == 4
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
+        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
+    .elseif elem_size == 16
+        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
+    .endif
+.elseif numbytes == 2
+    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
+        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
+    .else
+        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
+        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
+    .endif
+.elseif numbytes == 1
+    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixst numpix, bpp, basereg, mem_operand, abits=0
+.if bpp > 0
+.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
+                      %(basereg+6), %(basereg+7), mem_operand, abits
+.elseif (bpp == 24) && (numpix == 8)
+    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
+.elseif (bpp == 24) && (numpix == 4)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
+.elseif (bpp == 24) && (numpix == 2)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
+.elseif (bpp == 24) && (numpix == 1)
+    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
+.else
+    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
+.endif
+.endif
+.endm
+
+.macro pixld_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixld numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+.macro pixst_a numpix, bpp, basereg, mem_operand
+.if (bpp * numpix) <= 128
+    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
+.else
+    pixst numpix, bpp, basereg, mem_operand, 128
+.endif
+.endm
+
+/*
+ * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
+ * aliases to be defined)
+ */
+.macro pixld1_s elem_size, reg1, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #1
+    mov     TMP2, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[0]}, [TMP1, :16]
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[1]}, [TMP2, :16]
+    mov     TMP2, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP2, mem_operand, TMP2, asl #1
+    vld1.16 {d&reg1&[2]}, [TMP1, :16]
+    vld1.16 {d&reg1&[3]}, [TMP2, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    vld1.32 {d&reg1&[1]}, [TMP2, :32]
+.else
+    .error "unsupported"
+.endif
+.endm
+
+.macro pixld2_s elem_size, reg1, reg2, mem_operand
+.if 0 /* elem_size == 32 */
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    mov     TMP2, VX, asr #16
+    sub     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[0]}, [TMP1, :32]
+    mov     TMP1, VX, asr #16
+    add     VX, VX, UNIT_X, asl #1
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg2&[0]}, [TMP2, :32]
+    mov     TMP2, VX, asr #16
+    add     VX, VX, UNIT_X
+    add     TMP2, mem_operand, TMP2, asl #2
+    vld1.32 {d&reg1&[1]}, [TMP1, :32]
+    vld1.32 {d&reg2&[1]}, [TMP2, :32]
+.else
+    pixld1_s elem_size, reg1, mem_operand
+    pixld1_s elem_size, reg2, mem_operand
+.endif
+.endm
+
+.macro pixld0_s elem_size, reg1, idx, mem_operand
+.if elem_size == 16
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #1
+    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
+.elseif elem_size == 32
+    mov     TMP1, VX, asr #16
+    adds    VX, VX, UNIT_X
+5:  subpls  VX, VX, SRC_WIDTH_FIXED
+    bpl     5b
+    add     TMP1, mem_operand, TMP1, asl #2
+    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
+.endif
+.endm
+
+.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
+.if numbytes == 32
+    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
+    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
+    pixdeinterleave elem_size, %(basereg+4)
+.elseif numbytes == 16
+    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
+.elseif numbytes == 8
+    pixld1_s elem_size, %(basereg+1), mem_operand
+.elseif numbytes == 4
+    .if elem_size == 32
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .elseif elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 4, mem_operand
+        pixld0_s elem_size, %(basereg+0), 5, mem_operand
+        pixld0_s elem_size, %(basereg+0), 6, mem_operand
+        pixld0_s elem_size, %(basereg+0), 7, mem_operand
+    .endif
+.elseif numbytes == 2
+    .if elem_size == 16
+        pixld0_s elem_size, %(basereg+0), 1, mem_operand
+    .else
+        pixld0_s elem_size, %(basereg+0), 2, mem_operand
+        pixld0_s elem_size, %(basereg+0), 3, mem_operand
+    .endif
+.elseif numbytes == 1
+    pixld0_s elem_size, %(basereg+0), 1, mem_operand
+.else
+    .error "unsupported size: numbytes"
+.endif
+.endm
+
+.macro pixld_s numpix, bpp, basereg, mem_operand
+.if bpp > 0
+    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
+.endif
+.endm
+
+.macro vuzp8 reg1, reg2
+    vuzp.8 d&reg1, d&reg2
+.endm
+
+.macro vzip8 reg1, reg2
+    vzip.8 d&reg1, d&reg2
+.endm
+
+/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixdeinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vuzp8 %(basereg+0), %(basereg+1)
+    vuzp8 %(basereg+2), %(basereg+3)
+    vuzp8 %(basereg+1), %(basereg+3)
+    vuzp8 %(basereg+0), %(basereg+2)
+.endif
+.endm
+
+/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
+.macro pixinterleave bpp, basereg
+.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
+    vzip8 %(basereg+0), %(basereg+2)
+    vzip8 %(basereg+1), %(basereg+3)
+    vzip8 %(basereg+2), %(basereg+3)
+    vzip8 %(basereg+0), %(basereg+1)
+.endif
+.endm
+
+/*
+ * This is a macro for implementing cache preload. The main idea is that
+ * cache preload logic is mostly independent from the rest of pixels
+ * processing code. It starts at the top left pixel and moves forward
+ * across pixels and can jump across scanlines. Prefetch distance is
+ * handled in an 'incremental' way: it starts from 0 and advances to the
+ * optimal distance over time. After reaching optimal prefetch distance,
+ * it is kept constant. There are some checks which prevent prefetching
+ * unneeded pixel lines below the image (but it still can prefetch a bit
+ * more data on the right side of the image - not a big issue and may
+ * be actually helpful when rendering text glyphs). Additional trick is
+ * the use of LDR instruction for prefetch instead of PLD when moving to
+ * the next line, the point is that we have a high chance of getting TLB
+ * miss in this case, and PLD would be useless.
+ *
+ * This sounds like it may introduce a noticeable overhead (when working with
+ * fully cached data). But in reality, due to having a separate pipeline and
+ * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
+ * execute simultaneously with NEON and be completely shadowed by it. Thus
+ * we get no performance overhead at all (*). This looks like a very nice
+ * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
+ * but still can implement some rather advanced prefetch logic in software
+ * for almost zero cost!
+ *
+ * (*) The overhead of the prefetcher is visible when running some trivial
+ * pixels processing like simple copy. Anyway, having prefetch is a must
+ * when working with the graphics data.
+ */
+.macro PF a, x:vararg
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
+    a x
+.endif
+.endm
+
+.macro cache_preload std_increment, boost_increment
+.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
+.if regs_shortage
+    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
+.endif
+.if std_increment != 0
+    PF add PF_X, PF_X, #std_increment
+.endif
+    PF tst PF_CTL, #0xF
+    PF addne PF_X, PF_X, #boost_increment
+    PF subne PF_CTL, PF_CTL, #1
+    PF cmp PF_X, ORIG_W
+.if src_bpp_shift >= 0
+    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
+.endif
+.if dst_r_bpp != 0
+    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
+.endif
+.if mask_bpp_shift >= 0
+    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+.endif
+    PF subge PF_X, PF_X, ORIG_W
+    PF subges PF_CTL, PF_CTL, #0x10
+.if src_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
+.endif
+.if dst_r_bpp != 0
+    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
+.endif
+.if mask_bpp_shift >= 0
+    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+.endif
+.endif
+.endm
+
+.macro cache_preload_simple
+.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
+.if src_bpp > 0
+    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
+.endif
+.if dst_r_bpp > 0
+    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
+.endif
+.if mask_bpp > 0
+    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
+.endif
+.endif
+.endm
+
+.macro fetch_mask_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+.endm
+
+/*
+ * Macro which is used to process leading pixels until destination
+ * pointer is properly aligned (at 16 bytes boundary). When destination
+ * buffer uses 16bpp format, this is unnecessary, or even pointless.
+ */
+.macro ensure_destination_ptr_alignment process_pixblock_head, \
+                                        process_pixblock_tail, \
+                                        process_pixblock_tail_head
+.if dst_w_bpp != 24
+    tst         DST_R, #0xF
+    beq         2f
+
+.irp lowbit, 1, 2, 4, 8, 16
+local skip1
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_R, #lowbit
+    beq         1f
+.endif
+    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
+    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
+.if dst_r_bpp > 0
+    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
+.else
+    add         DST_R, DST_R, #lowbit
+.endif
+    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
+    sub         W, W, #(lowbit * 8 / dst_w_bpp)
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    process_pixblock_tail
+
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp lowbit, 1, 2, 4, 8, 16
+.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
+.if lowbit < 16 /* we don't need more than 16-byte alignment */
+    tst         DST_W, #lowbit
+    beq         1f
+.endif
+    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
+1:
+.endif
+.endr
+.endif
+2:
+.endm
+
+/*
+ * Special code for processing up to (pixblock_size - 1) remaining
+ * trailing pixels. As SIMD processing performs operation on
+ * pixblock_size pixels, anything smaller than this has to be loaded
+ * and stored in a special way. Loading and storing of pixel data is
+ * performed in such a way that we fill some 'slots' in the NEON
+ * registers (some slots naturally are unused), then perform compositing
+ * operation as usual. In the end, the data is taken from these 'slots'
+ * and saved to memory.
+ *
+ * cache_preload_flag - allows to suppress prefetch if
+ *                      set to 0
+ * dst_aligned_flag   - selects whether destination buffer
+ *                      is aligned
+ */
+.macro process_trailing_pixels cache_preload_flag, \
+                               dst_aligned_flag, \
+                               process_pixblock_head, \
+                               process_pixblock_tail, \
+                               process_pixblock_tail_head
+    tst         W, #(pixblock_size - 1)
+    beq         2f
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+    pixld_src   chunk_size, src_bpp, src_basereg, SRC
+    pixld       chunk_size, mask_bpp, mask_basereg, MASK
+.if dst_aligned_flag != 0
+    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.else
+    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
+.endif
+.if cache_preload_flag != 0
+    PF add      PF_X, PF_X, #chunk_size
+.endif
+1:
+.endif
+.endr
+    pixdeinterleave src_bpp, src_basereg
+    pixdeinterleave mask_bpp, mask_basereg
+    pixdeinterleave dst_r_bpp, dst_r_basereg
+
+    process_pixblock_head
+.if cache_preload_flag != 0
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+.endif
+    process_pixblock_tail
+    pixinterleave dst_w_bpp, dst_w_basereg
+.irp chunk_size, 16, 8, 4, 2, 1
+.if pixblock_size > chunk_size
+    tst         W, #chunk_size
+    beq         1f
+.if dst_aligned_flag != 0
+    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.else
+    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
+.endif
+1:
+.endif
+.endr
+2:
+.endm
+
+/*
+ * Macro, which performs all the needed operations to switch to the next
+ * scanline and start the next loop iteration unless all the scanlines
+ * are already processed.
+ */
+.macro advance_to_next_scanline start_of_loop_label
+.if regs_shortage
+    ldrd        W, [sp] /* load W and H (width and height) from stack */
+.else
+    mov         W, ORIG_W
+.endif
+    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
+.if src_bpp != 0
+    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
+.endif
+.if mask_bpp != 0
+    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
+.endif
+.if (dst_w_bpp != 24)
+    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
+.endif
+.if (src_bpp != 24) && (src_bpp != 0)
+    sub         SRC, SRC, W, lsl #src_bpp_shift
+.endif
+.if (mask_bpp != 24) && (mask_bpp != 0)
+    sub         MASK, MASK, W, lsl #mask_bpp_shift
+.endif
+    subs        H, H, #1
+    mov         DST_R, DST_W
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.endif
+    bge         start_of_loop_label
+.endm
+
+/*
+ * Registers are allocated in the following way by default:
+ * d0, d1, d2, d3     - reserved for loading source pixel data
+ * d4, d5, d6, d7     - reserved for loading destination pixel data
+ * d24, d25, d26, d27 - reserved for loading mask pixel data
+ * d28, d29, d30, d31 - final destination pixel data for writeback to memory
+ */
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags, \
+                                   pixblock_size_, \
+                                   prefetch_distance, \
+                                   init, \
+                                   cleanup, \
+                                   process_pixblock_head, \
+                                   process_pixblock_tail, \
+                                   process_pixblock_tail_head, \
+                                   dst_w_basereg_ = 28, \
+                                   dst_r_basereg_ = 4, \
+                                   src_basereg_   = 0, \
+                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    push        {r4-r12, lr}        /* save all registers */
+
+/*
+ * Select prefetch type for this function. If prefetch distance is
+ * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
+ * has to be used instead of ADVANCED.
+ */
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
+.if prefetch_distance == 0
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
+        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
+.endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+/*
+ * Assign symbolic names to registers
+ */
+    W           .req        r0      /* width (is updated during processing) */
+    H           .req        r1      /* height (is updated during processing) */
+    DST_W       .req        r2      /* destination buffer pointer for writes */
+    DST_STRIDE  .req        r3      /* destination image stride */
+    SRC         .req        r4      /* source buffer pointer */
+    SRC_STRIDE  .req        r5      /* source image stride */
+    DST_R       .req        r6      /* destination buffer pointer for reads */
+
+    MASK        .req        r7      /* mask pointer */
+    MASK_STRIDE .req        r8      /* mask stride */
+
+    PF_CTL      .req        r9      /* combined lines counter and prefetch */
+                                    /* distance increment counter */
+    PF_X        .req        r10     /* pixel index in a scanline for current */
+                                    /* pretetch position */
+    PF_SRC      .req        r11     /* pointer to source scanline start */
+                                    /* for prefetch purposes */
+    PF_DST      .req        r12     /* pointer to destination scanline start */
+                                    /* for prefetch purposes */
+    PF_MASK     .req        r14     /* pointer to mask scanline start */
+                                    /* for prefetch purposes */
+/*
+ * Check whether we have enough registers for all the local variables.
+ * If we don't have enough registers, original width and height are
+ * kept on top of stack (and 'regs_shortage' variable is set to indicate
+ * this for the rest of code). Even if there are enough registers, the
+ * allocation scheme may be a bit different depending on whether source
+ * or mask is not used.
+ */
+.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
+    ORIG_W      .req        r10     /* saved original width */
+    DUMMY       .req        r12     /* temporary register */
+    .set        regs_shortage, 0
+.elseif mask_bpp == 0
+    ORIG_W      .req        r7      /* saved original width */
+    DUMMY       .req        r8      /* temporary register */
+    .set        regs_shortage, 0
+.elseif src_bpp == 0
+    ORIG_W      .req        r4      /* saved original width */
+    DUMMY       .req        r5      /* temporary register */
+    .set        regs_shortage, 0
+.else
+    ORIG_W      .req        r1      /* saved original width */
+    DUMMY       .req        r1      /* temporary register */
+    .set        regs_shortage, 1
+.endif
+
+    .set mask_bpp_shift, -1
+.if src_bpp == 32
+    .set src_bpp_shift, 2
+.elseif src_bpp == 24
+    .set src_bpp_shift, 0
+.elseif src_bpp == 16
+    .set src_bpp_shift, 1
+.elseif src_bpp == 8
+    .set src_bpp_shift, 0
+.elseif src_bpp == 0
+    .set src_bpp_shift, -1
+.else
+    .error "requested src bpp (src_bpp) is not supported"
+.endif
+.if mask_bpp == 32
+    .set mask_bpp_shift, 2
+.elseif mask_bpp == 24
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 8
+    .set mask_bpp_shift, 0
+.elseif mask_bpp == 0
+    .set mask_bpp_shift, -1
+.else
+    .error "requested mask bpp (mask_bpp) is not supported"
+.endif
+.if dst_w_bpp == 32
+    .set dst_bpp_shift, 2
+.elseif dst_w_bpp == 24
+    .set dst_bpp_shift, 0
+.elseif dst_w_bpp == 16
+    .set dst_bpp_shift, 1
+.elseif dst_w_bpp == 8
+    .set dst_bpp_shift, 0
+.else
+    .error "requested dst bpp (dst_w_bpp) is not supported"
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+.if prefetch_distance < 0 || prefetch_distance > 15
+    .error "invalid prefetch distance (prefetch_distance)"
+.endif
+
+.if src_bpp > 0
+    ldr         SRC, [sp, #40]
+.endif
+.if mask_bpp > 0
+    ldr         MASK, [sp, #48]
+.endif
+    PF mov      PF_X, #0
+.if src_bpp > 0
+    ldr         SRC_STRIDE, [sp, #44]
+.endif
+.if mask_bpp > 0
+    ldr         MASK_STRIDE, [sp, #52]
+.endif
+    mov         DST_R, DST_W
+
+.if src_bpp == 24
+    sub         SRC_STRIDE, SRC_STRIDE, W
+    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
+.endif
+.if mask_bpp == 24
+    sub         MASK_STRIDE, MASK_STRIDE, W
+    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
+.endif
+.if dst_w_bpp == 24
+    sub         DST_STRIDE, DST_STRIDE, W
+    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
+.endif
+
+/*
+ * Setup advanced prefetcher initial state
+ */
+    PF mov      PF_SRC, SRC
+    PF mov      PF_DST, DST_R
+    PF mov      PF_MASK, MASK
+    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
+    PF mov      PF_CTL, H, lsl #4
+    PF add      PF_CTL, #(prefetch_distance - 0x10)
+
+    init
+.if regs_shortage
+    push        {r0, r1}
+.endif
+    subs        H, H, #1
+.if regs_shortage
+    str         H, [sp, #4] /* save updated height to stack */
+.else
+    mov         ORIG_W, W
+.endif
+    blt         9f
+    cmp         W, #(pixblock_size * 2)
+    blt         8f
+/*
+ * This is the start of the pipelined loop, which if optimized for
+ * long scanlines
+ */
+0:
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    PF add      PF_X, PF_X, #pixblock_size
+    process_pixblock_head
+    cache_preload 0, pixblock_size
+    cache_preload_simple
+    subs        W, W, #(pixblock_size * 2)
+    blt         2f
+1:
+    process_pixblock_tail_head
+    cache_preload_simple
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 1, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 0b
+
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+/*
+ * This is the start of the loop, designed to process images with small width
+ * (less than pixblock_size * 2 pixels). In this case neither pipelining
+ * nor prefetch are used.
+ */
+8:
+    /* Process exactly pixblock_size pixels if needed */
+    tst         W, #pixblock_size
+    beq         1f
+    pixld       pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    process_pixblock_tail
+    pixst       pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+1:
+    /* Process the remaining trailing pixels in the scanline */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+    advance_to_next_scanline 8b
+9:
+.if regs_shortage
+    pop         {r0, r1}
+.endif
+    cleanup
+    pop         {r4-r12, pc}  /* exit */
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      ORIG_W
+    .unreq      W
+    .unreq      H
+    .unreq      SRC_STRIDE
+    .unreq      DST_STRIDE
+    .unreq      MASK_STRIDE
+    .unreq      PF_CTL
+    .unreq      PF_X
+    .unreq      PF_SRC
+    .unreq      PF_DST
+    .unreq      PF_MASK
+    .unreq      DUMMY
+    .endfunc
+.endm
+
+/*
+ * A simplified variant of function generation template for a single
+ * scanline processing (for implementing pixman combine functions)
+ */
+.macro generate_composite_function_scanline        use_nearest_scaling, \
+                                                   fname, \
+                                                   src_bpp_, \
+                                                   mask_bpp_, \
+                                                   dst_w_bpp_, \
+                                                   flags, \
+                                                   pixblock_size_, \
+                                                   init, \
+                                                   cleanup, \
+                                                   process_pixblock_head, \
+                                                   process_pixblock_tail, \
+                                                   process_pixblock_tail_head, \
+                                                   dst_w_basereg_ = 28, \
+                                                   dst_r_basereg_ = 4, \
+                                                   src_basereg_   = 0, \
+                                                   mask_basereg_  = 24
+
+    .func fname
+    .global fname
+    /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+    .hidden fname
+    .type fname, %function
+#endif
+fname:
+    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+    .set src_bpp, src_bpp_
+    .set mask_bpp, mask_bpp_
+    .set dst_w_bpp, dst_w_bpp_
+    .set pixblock_size, pixblock_size_
+    .set dst_w_basereg, dst_w_basereg_
+    .set dst_r_basereg, dst_r_basereg_
+    .set src_basereg, src_basereg_
+    .set mask_basereg, mask_basereg_
+
+.if use_nearest_scaling != 0
+    /*
+     * Assign symbolic names to registers for nearest scaling
+     */
+    W           .req        r0
+    DST_W       .req        r1
+    SRC         .req        r2
+    VX          .req        r3
+    UNIT_X      .req        ip
+    MASK        .req        lr
+    TMP1        .req        r4
+    TMP2        .req        r5
+    DST_R       .req        r6
+    SRC_WIDTH_FIXED .req        r7
+
+    .macro pixld_src x:vararg
+        pixld_s x
+    .endm
+
+    ldr         UNIT_X, [sp]
+    push        {r4-r8, lr}
+    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
+    .if mask_bpp != 0
+    ldr         MASK, [sp, #(24 + 8)]
+    .endif
+.else
+    /*
+     * Assign symbolic names to registers
+     */
+    W           .req        r0      /* width (is updated during processing) */
+    DST_W       .req        r1      /* destination buffer pointer for writes */
+    SRC         .req        r2      /* source buffer pointer */
+    DST_R       .req        ip      /* destination buffer pointer for reads */
+    MASK        .req        r3      /* mask pointer */
+
+    .macro pixld_src x:vararg
+        pixld x
+    .endm
+.endif
+
+.if (((flags) & FLAG_DST_READWRITE) != 0)
+    .set dst_r_bpp, dst_w_bpp
+.else
+    .set dst_r_bpp, 0
+.endif
+.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
+    .set DEINTERLEAVE_32BPP_ENABLED, 1
+.else
+    .set DEINTERLEAVE_32BPP_ENABLED, 0
+.endif
+
+    .macro fetch_src_pixblock
+        pixld_src   pixblock_size, src_bpp, \
+                    (src_basereg - pixblock_size * src_bpp / 64), SRC
+    .endm
+
+    init
+    mov         DST_R, DST_W
+
+    cmp         W, #pixblock_size
+    blt         8f
+
+    ensure_destination_ptr_alignment process_pixblock_head, \
+                                     process_pixblock_tail, \
+                                     process_pixblock_tail_head
+
+    subs        W, W, #pixblock_size
+    blt         7f
+
+    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
+    pixld_a     pixblock_size, dst_r_bpp, \
+                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
+    fetch_src_pixblock
+    pixld       pixblock_size, mask_bpp, \
+                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
+    process_pixblock_head
+    subs        W, W, #pixblock_size
+    blt         2f
+1:
+    process_pixblock_tail_head
+    subs        W, W, #pixblock_size
+    bge         1b
+2:
+    process_pixblock_tail
+    pixst_a     pixblock_size, dst_w_bpp, \
+                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
+7:
+    /* Process the remaining trailing pixels in the scanline (dst aligned) */
+    process_trailing_pixels 0, 1, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+.if use_nearest_scaling != 0
+    pop         {r4-r8, pc}  /* exit */
+.else
+    bx          lr  /* exit */
+.endif
+8:
+    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
+    process_trailing_pixels 0, 0, \
+                            process_pixblock_head, \
+                            process_pixblock_tail, \
+                            process_pixblock_tail_head
+
+    cleanup
+
+.if use_nearest_scaling != 0
+    pop         {r4-r8, pc}  /* exit */
+
+    .unreq      DST_R
+    .unreq      SRC
+    .unreq      W
+    .unreq      VX
+    .unreq      UNIT_X
+    .unreq      TMP1
+    .unreq      TMP2
+    .unreq      DST_W
+    .unreq      MASK
+    .unreq      SRC_WIDTH_FIXED
+
+.else
+    bx          lr  /* exit */
+
+    .unreq      SRC
+    .unreq      MASK
+    .unreq      DST_R
+    .unreq      DST_W
+    .unreq      W
+.endif
+
+    .purgem     fetch_src_pixblock
+    .purgem     pixld_src
+
+    .endfunc
+.endm
+
+.macro generate_composite_function_single_scanline x:vararg
+    generate_composite_function_scanline 0, x
+.endm
+
+.macro generate_composite_function_nearest_scanline x:vararg
+    generate_composite_function_scanline 1, x
+.endm
+
+/* Default prologue/epilogue, nothing special needs to be done */
+
+.macro default_init
+.endm
+
+.macro default_cleanup
+.endm
+
+/*
+ * Prologue/epilogue variant which additionally saves/restores d8-d15
+ * registers (they need to be saved/restored by callee according to ABI).
+ * This is required if the code needs to use all the NEON registers.
+ */
+
+.macro default_init_need_all_regs
+    vpush       {d8-d15}
+.endm
+
+.macro default_cleanup_need_all_regs
+    vpop        {d8-d15}
+.endm
+
+/******************************************************************************/
+
+/*
+ * Conversion of 8 r5g6b6 pixels packed in 128-bit register (in)
+ * into a planar a8r8g8b8 format (with a, r, g, b color components
+ * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
+ *
+ * Warning: the conversion is destructive and the original
+ *          value (in) is lost.
+ */
+.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
+    vshrn.u16   out_r, in,    #8
+    vshrn.u16   out_g, in,    #3
+    vsli.u16    in,    in,    #5
+    vmov.u8     out_a, #255
+    vsri.u8     out_r, out_r, #5
+    vsri.u8     out_g, out_g, #6
+    vshrn.u16   out_b, in,    #2
+.endm
+
+.macro convert_0565_to_x888 in, out_r, out_g, out_b
+    vshrn.u16   out_r, in,    #8
+    vshrn.u16   out_g, in,    #3
+    vsli.u16    in,    in,    #5
+    vsri.u8     out_r, out_r, #5
+    vsri.u8     out_g, out_g, #6
+    vshrn.u16   out_b, in,    #2
+.endm
+
+/*
+ * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
+ * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b6
+ * pixels packed in 128-bit register (out). Requires two temporary 128-bit
+ * registers (tmp1, tmp2)
+ */
+.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
+    vshll.u8    tmp1, in_g, #8
+    vshll.u8    out, in_r, #8
+    vshll.u8    tmp2, in_b, #8
+    vsri.u16    out, tmp1, #5
+    vsri.u16    out, tmp2, #11
+.endm
+
+/*
+ * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
+ * returned in (out0, out1) registers pair. Requires one temporary
+ * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
+ * value from 'in' is lost
+ */
+.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
+    vshl.u16    out0, in,   #5  /* G top 6 bits */
+    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
+    vsri.u16    in,   in,   #5  /* R is ready in top bits */
+    vsri.u16    out0, out0, #6  /* G is ready in top bits */
+    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
+    vshr.u16    out1, in,   #8  /* R is in place */
+    vsri.u16    out0, tmp,  #8  /* G & B is in place */
+    vzip.u16    out0, out1      /* everything is in place */
+.endm
diff --git a/src/pixman/pixman/pixman-arm-neon.c b/src/pixman/pixman/pixman-arm-neon.c
new file mode 100644
index 0000000..60e9c78
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-neon.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright © 2009 ARM Ltd, Movial Creative Technologies Oy
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of ARM Ltd not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  ARM Ltd makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Ian Rickards (ian.rickards@arm.com)
+ * Author:  Jonathan Morton (jonathan.morton@movial.com)
+ * Author:  Markku Vire (markku.vire@movial.com)
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_x888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_0565,
+                                   uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0888,
+                                   uint8_t, 3, uint8_t, 3)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0565_8888,
+                                   uint16_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_8888_rev,
+                                   uint8_t, 3, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_0888_0565_rev,
+                                   uint8_t, 3, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_pixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, src_rpixbuf_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, add_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_0565,
+                                   uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_0565,
+                                   uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (neon, out_reverse_8_8888,
+                                   uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_0565,
+                                 uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, neon, over_reverse_n_8888,
+                                 uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, neon, in_n_8,
+                                 uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_0565,
+                                      uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_8888_ca,
+                                      uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8888_0565_ca,
+				      uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, over_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_0565,
+                                     uint32_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_0565_n_0565,
+                                     uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, add_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8_8_8,
+                                        uint8_t, 1, uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, add_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_8888,
+                                        uint32_t, 1, uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8888_8888,
+                                        uint32_t, 1, uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_8888_8_0565,
+                                        uint32_t, 1, uint8_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_MASK_DST (neon, over_0565_8_0565,
+                                        uint16_t, 1, uint8_t, 1, uint16_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_8888, OVER,
+                                        uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, OVER,
+                                        uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 8888_0565, SRC,
+                                        uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (neon, 0565_8888, SRC,
+                                        uint16_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_0565,
+                                           OVER, uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, neon, 0565_8_0565,
+                                           OVER, uint16_t, uint16_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_8888, SRC,
+                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
+                                         uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
+                                         uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
+                                         uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
+                                         uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
+                                         uint32_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
+                                            uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
+                                            uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
+                                            uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_0565, SRC,
+                                            uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OVER,
+                                            uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD,
+                                            uint32_t, uint32_t)
+
+void
+pixman_composite_src_n_8_asm_neon (int32_t   w,
+                                   int32_t   h,
+                                   uint8_t  *dst,
+                                   int32_t   dst_stride,
+                                   uint8_t   src);
+
+void
+pixman_composite_src_n_0565_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint16_t *dst,
+                                      int32_t   dst_stride,
+                                      uint16_t  src);
+
+void
+pixman_composite_src_n_8888_asm_neon (int32_t   w,
+                                      int32_t   h,
+                                      uint32_t *dst,
+                                      int32_t   dst_stride,
+                                      uint32_t  src);
+
+static pixman_bool_t
+arm_neon_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride,
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+	       uint32_t                 _xor)
+{
+    /* stride is always multiple of 32bit units in pixman */
+    uint32_t byte_stride = stride * sizeof(uint32_t);
+
+    switch (bpp)
+    {
+    case 8:
+	pixman_composite_src_n_8_asm_neon (
+		width,
+		height,
+		(uint8_t *)(((char *) bits) + y * byte_stride + x),
+		byte_stride,
+		_xor & 0xff);
+	return TRUE;
+    case 16:
+	pixman_composite_src_n_0565_asm_neon (
+		width,
+		height,
+		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+		byte_stride / 2,
+		_xor & 0xffff);
+	return TRUE;
+    case 32:
+	pixman_composite_src_n_8888_asm_neon (
+		width,
+		height,
+		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+		byte_stride / 4,
+		_xor);
+	return TRUE;
+    default:
+	return FALSE;
+    }
+}
+
+static pixman_bool_t
+arm_neon_blt (pixman_implementation_t *imp,
+              uint32_t *               src_bits,
+              uint32_t *               dst_bits,
+              int                      src_stride,
+              int                      dst_stride,
+              int                      src_bpp,
+              int                      dst_bpp,
+              int                      src_x,
+              int                      src_y,
+              int                      dest_x,
+              int                      dest_y,
+              int                      width,
+              int                      height)
+{
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    switch (src_bpp)
+    {
+    case 16:
+	pixman_composite_src_0565_0565_asm_neon (
+		width, height,
+		(uint16_t *)(((char *) dst_bits) +
+		dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2,
+		(uint16_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+	return TRUE;
+    case 32:
+	pixman_composite_src_8888_8888_asm_neon (
+		width, height,
+		(uint32_t *)(((char *) dst_bits) +
+		dest_y * dst_stride * 4 + dest_x * 4), dst_stride,
+		(uint32_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 4), src_stride);
+	return TRUE;
+    default:
+	return FALSE;
+    }
+}
+
+static const pixman_fast_path_t arm_neon_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     a8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     x8r8g8b8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     a8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     x8b8g8r8, neon_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     x8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     x8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     a8r8g8b8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     a8b8g8r8, neon_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  r8g8b8,   null,     r8g8b8,   neon_composite_src_0888_0888),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     x8r8g8b8, neon_composite_src_0888_8888_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  b8g8r8,   null,     r5g6b5,   neon_composite_src_0888_0565_rev),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8r8g8b8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8b8g8r8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8r8g8b8, neon_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8b8g8r8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8r8g8b8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8r8g8b8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8b8g8r8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8b8g8r8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8,       neon_composite_src_n_8_8),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, neon_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   neon_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, neon_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, neon_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   neon_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, neon_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   neon_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   neon_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   neon_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   neon_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, neon_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   neon_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   neon_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   neon_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   neon_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   neon_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, neon_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null,     a8r8g8b8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null,     a8b8g8r8, neon_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       neon_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, neon_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, neon_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       neon_composite_add_8_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   neon_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, neon_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, neon_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, neon_composite_add_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, neon_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, neon_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       neon_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, neon_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (IN,   solid,    null,     a8,       neon_composite_in_n_8),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, neon_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, r5g6b5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, b5g6r5,   neon_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8r8g8b8, neon_composite_out_reverse_8_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE,  a8,    null, a8b8g8r8, neon_composite_out_reverse_8_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, neon_8888_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, b5g6r5, neon_8888_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, b5g6r5, neon_8888_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, neon_0565_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, neon_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, neon_0565_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, neon_8888_8_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, neon_0565_8_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, neon_0565_8_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_0565),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8_x888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+#define BIND_COMBINE_U(name)                                             \
+void                                                                     \
+pixman_composite_scanline_##name##_mask_asm_neon (int32_t         w,     \
+                                                  const uint32_t *dst,   \
+                                                  const uint32_t *src,   \
+                                                  const uint32_t *mask); \
+                                                                         \
+void                                                                     \
+pixman_composite_scanline_##name##_asm_neon (int32_t         w,          \
+                                             const uint32_t *dst,        \
+                                             const uint32_t *src);       \
+                                                                         \
+static void                                                              \
+neon_combine_##name##_u (pixman_implementation_t *imp,                   \
+                         pixman_op_t              op,                    \
+                         uint32_t *               dest,                  \
+                         const uint32_t *         src,                   \
+                         const uint32_t *         mask,                  \
+                         int                      width)                 \
+{                                                                        \
+    if (mask)                                                            \
+	pixman_composite_scanline_##name##_mask_asm_neon (width, dest,   \
+	                                                  src, mask);    \
+    else                                                                 \
+	pixman_composite_scanline_##name##_asm_neon (width, dest, src);  \
+}
+
+BIND_COMBINE_U (over)
+BIND_COMBINE_U (add)
+BIND_COMBINE_U (out_reverse)
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (fallback, arm_neon_fast_paths);
+
+    imp->combine_32[PIXMAN_OP_OVER] = neon_combine_over_u;
+    imp->combine_32[PIXMAN_OP_ADD] = neon_combine_add_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = neon_combine_out_reverse_u;
+
+    imp->blt = arm_neon_blt;
+    imp->fill = arm_neon_fill;
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-arm-simd-asm-scaled.S b/src/pixman/pixman/pixman-arm-simd-asm-scaled.S
new file mode 100644
index 0000000..7110995
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-simd-asm-scaled.S
@@ -0,0 +1,165 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ * Copyright © 2010 Nokia Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+	.text
+	.arch armv6
+	.object_arch armv4
+	.arm
+	.altmacro
+	.p2align 2
+
+/* Supplementary macro for setting function attributes */
+.macro pixman_asm_function fname
+	.func fname
+	.global fname
+#ifdef __ELF__
+	.hidden fname
+	.type fname, %function
+#endif
+fname:
+.endm
+
+/*
+ * Note: This code is only using armv5te instructions (not even armv6),
+ *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
+ *       be split into a few variants, tuned for each microarchitecture.
+ *
+ * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
+ * have efficient write combining), it needs to be changed to use 16-byte
+ * aligned writes using STM instruction.
+ *
+ * Nearest scanline scaler macro template uses the following arguments:
+ *  fname                     - name of the function to generate
+ *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
+ *  t                         - type suffix for LDR/STR instructions
+ *  prefetch_distance         - prefetch in the source image by that many
+ *                              pixels ahead
+ *  prefetch_braking_distance - stop prefetching when that many pixels are
+ *                              remaining before the end of scanline
+ */
+
+.macro generate_nearest_scanline_func fname, bpp_shift, t,      \
+                                      prefetch_distance,        \
+                                      prefetch_braking_distance
+
+pixman_asm_function fname
+	W		.req	r0
+	DST		.req	r1
+	SRC		.req	r2
+	VX		.req	r3
+	UNIT_X		.req	ip
+	TMP1		.req	r4
+	TMP2		.req	r5
+	VXMASK		.req	r6
+	PF_OFFS		.req	r7
+	SRC_WIDTH_FIXED	.req	r8
+
+	ldr	UNIT_X, [sp]
+	push	{r4, r5, r6, r7, r8, r10}
+	mvn	VXMASK, #((1 << bpp_shift) - 1)
+	ldr	SRC_WIDTH_FIXED, [sp, #28]
+
+	/* define helper macro */
+	.macro	scale_2_pixels
+		ldr&t	TMP1, [SRC, TMP1]
+		and	TMP2, VXMASK, VX, asr #(16 - bpp_shift)
+		adds	VX, VX, UNIT_X
+		str&t	TMP1, [DST], #(1 << bpp_shift)
+9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		bpl	9b
+
+		ldr&t	TMP2, [SRC, TMP2]
+		and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+		adds	VX, VX, UNIT_X
+		str&t	TMP2, [DST], #(1 << bpp_shift)
+9:		subpls	VX, VX, SRC_WIDTH_FIXED
+		bpl	9b
+	.endm
+
+	/* now do the scaling */
+	and	TMP1, VXMASK, VX, asr #(16 - bpp_shift)
+	adds	VX, VX, UNIT_X
+9:	subpls	VX, VX, SRC_WIDTH_FIXED
+	bpl	9b
+	subs	W, W, #(8 + prefetch_braking_distance)
+	blt	2f
+	/* calculate prefetch offset */
+	mov	PF_OFFS, #prefetch_distance
+	mla	PF_OFFS, UNIT_X, PF_OFFS, VX
+1:	/* main loop, process 8 pixels per iteration with prefetch */
+	pld	[SRC, PF_OFFS, asr #(16 - bpp_shift)]
+	add	PF_OFFS, UNIT_X, lsl #3
+	scale_2_pixels
+	scale_2_pixels
+	scale_2_pixels
+	scale_2_pixels
+	subs	W, W, #8
+	bge	1b
+2:
+	subs	W, W, #(4 - 8 - prefetch_braking_distance)
+	blt	2f
+1:	/* process the remaining pixels */
+	scale_2_pixels
+	scale_2_pixels
+	subs	W, W, #4
+	bge	1b
+2:
+	tst	W, #2
+	beq	2f
+	scale_2_pixels
+2:
+	tst	W, #1
+	ldrne&t	TMP1, [SRC, TMP1]
+	strne&t	TMP1, [DST]
+	/* cleanup helper macro */
+	.purgem	scale_2_pixels
+	.unreq	DST
+	.unreq	SRC
+	.unreq	W
+	.unreq	VX
+	.unreq	UNIT_X
+	.unreq	TMP1
+	.unreq	TMP2
+	.unreq	VXMASK
+	.unreq	PF_OFFS
+	.unreq  SRC_WIDTH_FIXED
+	/* return */
+	pop	{r4, r5, r6, r7, r8, r10}
+	bx	lr
+.endfunc
+.endm
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32
+
+generate_nearest_scanline_func \
+    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32
diff --git a/src/pixman/pixman/pixman-arm-simd-asm.S b/src/pixman/pixman/pixman-arm-simd-asm.S
new file mode 100644
index 0000000..c209688
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-simd-asm.S
@@ -0,0 +1,613 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+/* Prevent the stack from becoming executable */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+	.text
+	.arch armv6
+	.object_arch armv4
+	.arm
+	.altmacro
+	.p2align 2
+
+#include "pixman-arm-simd-asm.h"
+
+/* A head macro should do all processing which results in an output of up to
+ * 16 bytes, as far as the final load instruction. The corresponding tail macro
+ * should complete the processing of the up-to-16 bytes. The calling macro will
+ * sometimes choose to insert a preload or a decrement of X between them.
+ *   cond           ARM condition code for code block
+ *   numbytes       Number of output bytes that should be generated this time
+ *   firstreg       First WK register in which to place output
+ *   unaligned_src  Whether to use non-wordaligned loads of source image
+ *   unaligned_mask Whether to use non-wordaligned loads of mask image
+ *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
+ */
+
+.macro blit_init
+        line_saved_regs STRIDE_D, STRIDE_S
+.endm
+
+.macro blit_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro blit_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    MASK
+    WK7     .req    STRIDE_M
+110:    pixld   , 16, 0, SRC, unaligned_src
+        pixld   , 16, 4, SRC, unaligned_src
+        pld     [SRC, SCRATCH]
+        pixst   , 16, 0, DST
+        pixst   , 16, 4, DST
+        subs    X, X, #32*8/src_bpp
+        bhs     110b
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    4, /* prefetch distance */ \
+    blit_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro, /* process tail */ \
+    blit_inner_loop
+
+generate_composite_function \
+    pixman_composite_src_0565_0565_asm_armv6, 16, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    4, /* prefetch distance */ \
+    blit_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro, /* process tail */ \
+    blit_inner_loop
+
+generate_composite_function \
+    pixman_composite_src_8_8_asm_armv6, 8, 0, 8, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    3, /* prefetch distance */ \
+    blit_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    blit_process_head, \
+    nop_macro, /* process tail */ \
+    blit_inner_loop
+
+/******************************************************************************/
+
+.macro src_n_8888_init
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro src_n_0565_init
+        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
+        orr     SRC, SRC, lsl #16
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro src_n_8_init
+        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
+        orr     SRC, SRC, lsl #8
+        orr     SRC, SRC, lsl #16
+        mov     STRIDE_S, SRC
+        mov     MASK, SRC
+        mov     STRIDE_M, SRC
+.endm
+
+.macro fill_process_tail  cond, numbytes, firstreg
+    WK4     .req    SRC
+    WK5     .req    STRIDE_S
+    WK6     .req    MASK
+    WK7     .req    STRIDE_M
+        pixst   cond, numbytes, 4, DST
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8888_asm_armv6, 0, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_8888_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+generate_composite_function \
+    pixman_composite_src_n_0565_asm_armv6, 0, 0, 16, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_0565_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+generate_composite_function \
+    pixman_composite_src_n_8_asm_armv6, 0, 0, 8, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH \
+    0, /* prefetch distance doesn't apply */ \
+    src_n_8_init \
+    nop_macro, /* newline */ \
+    nop_macro /* cleanup */ \
+    nop_macro /* process head */ \
+    fill_process_tail
+
+/******************************************************************************/
+
+.macro src_x888_8888_pixel, cond, reg
+        orr&cond WK&reg, WK&reg, #0xFF000000
+.endm
+
+.macro pixman_composite_src_x888_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+        pixld   cond, numbytes, firstreg, SRC, unaligned_src
+.endm
+
+.macro pixman_composite_src_x888_8888_process_tail   cond, numbytes, firstreg
+        src_x888_8888_pixel cond, %(firstreg+0)
+ .if numbytes >= 8
+        src_x888_8888_pixel cond, %(firstreg+1)
+  .if numbytes == 16
+        src_x888_8888_pixel cond, %(firstreg+2)
+        src_x888_8888_pixel cond, %(firstreg+3)
+  .endif
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_src_x888_8888_asm_armv6, 32, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    3, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    pixman_composite_src_x888_8888_process_head, \
+    pixman_composite_src_x888_8888_process_tail
+
+/******************************************************************************/
+
+.macro src_0565_8888_init
+        /* Hold loop invariants in MASK and STRIDE_M */
+        ldr     MASK, =0x07E007E0
+        mov     STRIDE_M, #0xFF000000
+        /* Set GE[3:0] to 1010 so SEL instructions do what we want */
+        ldr     SCRATCH, =0x80008000
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+.endm
+
+.macro src_0565_8888_2pixels, reg1, reg2
+        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
+        bic     WK&reg2, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK&reg1, WK&reg2, lsl #16          @ rrrrr000000bbbbb0000000000000000
+        mov     SCRATCH, SCRATCH, ror #19          @ GGGG0000ggggggggggg00000GGGGGGGG
+        bic     WK&reg2, WK&reg2, WK&reg1, lsr #16 @ RRRRR000000BBBBB0000000000000000
+        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ rrrrrrrrrr0bbbbbbbbbb00000000000
+        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ RRRRRRRRRR0BBBBBBBBBB00000000000
+        pkhtb   WK&reg1, WK&reg1, WK&reg1, asr #5  @ rrrrrrrr--------bbbbbbbb--------
+        sel     WK&reg1, WK&reg1, SCRATCH          @ rrrrrrrrggggggggbbbbbbbb--------
+        mov     SCRATCH, SCRATCH, ror #16          @ ggg00000GGGGGGGGGGGG0000gggggggg
+        pkhtb   WK&reg2, WK&reg2, WK&reg2, asr #5  @ RRRRRRRR--------BBBBBBBB--------
+        sel     WK&reg2, WK&reg2, SCRATCH          @ RRRRRRRRGGGGGGGGBBBBBBBB--------
+        orr     WK&reg1, STRIDE_M, WK&reg1, lsr #8 @ 11111111rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg2, STRIDE_M, WK&reg2, lsr #8 @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+.endm
+
+/* This version doesn't need STRIDE_M, but is one instruction longer.
+   It would however be preferable for an XRGB target, since we could knock off the last 2 instructions, but is that a common case?
+        and     SCRATCH, WK&reg1, MASK             @ 00000GGGGGG0000000000gggggg00000
+        bic     WK&reg1, WK&reg1, MASK             @ RRRRR000000BBBBBrrrrr000000bbbbb
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #6  @ 00000GGGGGGGGGGGG0000ggggggggggg
+        mov     WK&reg2, WK&reg1, lsr #16          @ 0000000000000000RRRRR000000BBBBB
+        mov     SCRATCH, SCRATCH, ror #27          @ GGGGGGGGGGGG0000ggggggggggg00000
+        bic     WK&reg1, WK&reg1, WK&reg2, lsl #16 @ 0000000000000000rrrrr000000bbbbb
+        mov     WK&reg2, WK&reg2, lsl #3           @ 0000000000000RRRRR000000BBBBB000
+        mov     WK&reg1, WK&reg1, lsl #3           @ 0000000000000rrrrr000000bbbbb000
+        orr     WK&reg2, WK&reg2, WK&reg2, lsr #5  @ 0000000000000RRRRRRRRRR0BBBBBBBB
+        orr     WK&reg1, WK&reg1, WK&reg1, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        pkhbt   WK&reg2, WK&reg2, WK&reg2, lsl #5  @ --------RRRRRRRR--------BBBBBBBB
+        pkhbt   WK&reg1, WK&reg1, WK&reg1, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK&reg2, SCRATCH, WK&reg2          @ --------RRRRRRRRGGGGGGGGBBBBBBBB
+        sel     WK&reg1, SCRATCH, WK&reg1          @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg2, WK&reg2, #0xFF000000      @ 11111111RRRRRRRRGGGGGGGGBBBBBBBB
+        orr     WK&reg1, WK&reg1, #0xFF000000      @ 11111111rrrrrrrrggggggggbbbbbbbb
+*/
+
+.macro src_0565_8888_1pixel, reg
+        bic     SCRATCH, WK&reg, MASK              @ 0000000000000000rrrrr000000bbbbb
+        and     WK&reg, WK&reg, MASK               @ 000000000000000000000gggggg00000
+        mov     SCRATCH, SCRATCH, lsl #3           @ 0000000000000rrrrr000000bbbbb000
+        mov     WK&reg, WK&reg, lsl #5             @ 0000000000000000gggggg0000000000
+        orr     SCRATCH, SCRATCH, SCRATCH, lsr #5  @ 0000000000000rrrrrrrrrr0bbbbbbbb
+        orr     WK&reg, WK&reg, WK&reg, lsr #6     @ 000000000000000gggggggggggg00000
+        pkhbt   SCRATCH, SCRATCH, SCRATCH, lsl #5  @ --------rrrrrrrr--------bbbbbbbb
+        sel     WK&reg, WK&reg, SCRATCH            @ --------rrrrrrrrggggggggbbbbbbbb
+        orr     WK&reg, WK&reg, #0xFF000000        @ 11111111rrrrrrrrggggggggbbbbbbbb
+.endm
+
+.macro src_0565_8888_process_head   cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+ .if numbytes == 16
+        pixldst ld,, 8, firstreg, %(firstreg+2),,, SRC, unaligned_src
+ .elseif numbytes == 8
+        pixld   , 4, firstreg, SRC, unaligned_src
+ .elseif numbytes == 4
+        pixld   , 2, firstreg, SRC, unaligned_src
+ .endif
+.endm
+
+.macro src_0565_8888_process_tail   cond, numbytes, firstreg
+ .if numbytes == 16
+        src_0565_8888_2pixels firstreg, %(firstreg+1)
+        src_0565_8888_2pixels %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        src_0565_8888_2pixels firstreg, %(firstreg+1)
+ .else
+        src_0565_8888_1pixel firstreg
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_src_0565_8888_asm_armv6, 16, 0, 32, \
+    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
+    3, /* prefetch distance */ \
+    src_0565_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    src_0565_8888_process_head, \
+    src_0565_8888_process_tail
+
+/******************************************************************************/
+
+.macro add_8_8_8pixels  cond, dst1, dst2
+        uqadd8&cond  WK&dst1, WK&dst1, MASK
+        uqadd8&cond  WK&dst2, WK&dst2, STRIDE_M
+.endm
+
+.macro add_8_8_4pixels  cond, dst
+        uqadd8&cond  WK&dst, WK&dst, MASK
+.endm
+
+.macro add_8_8_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    MASK
+    WK5     .req    STRIDE_M
+ .if numbytes == 16
+        pixld   cond, 8, 4, SRC, unaligned_src
+        pixld   cond, 16, firstreg, DST, 0
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+        pixld   cond, 8, 4, SRC, unaligned_src
+ .else
+        pixld   cond, numbytes, 4, SRC, unaligned_src
+        pixld   cond, numbytes, firstreg, DST, 0
+ .endif
+    .unreq  WK4
+    .unreq  WK5
+.endm
+
+.macro add_8_8_process_tail  cond, numbytes, firstreg
+ .if numbytes == 16
+        add_8_8_8pixels cond, %(firstreg+2), %(firstreg+3)
+ .elseif numbytes == 8
+        add_8_8_8pixels cond, firstreg, %(firstreg+1)
+ .else
+        add_8_8_4pixels cond, firstreg
+ .endif
+.endm
+
+generate_composite_function \
+    pixman_composite_add_8_8_asm_armv6, 8, 0, 8, \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_PRESERVES_SCRATCH, \
+    2, /* prefetch distance */ \
+    nop_macro, /* init */ \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    add_8_8_process_head, \
+    add_8_8_process_tail
+
+/******************************************************************************/
+
+.macro over_8888_8888_init
+        /* Hold loop invariant in MASK */
+        ldr     MASK, =0x00800080
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, MASK, MASK
+        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+firstreg), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_8888_check_transparent  numbytes, reg0, reg1, reg2, reg3
+        /* Since these colours a premultiplied by alpha, only 0 indicates transparent (any other colour with 0 in the alpha byte is luminous) */
+        teq     WK&reg0, #0
+ .if numbytes > 4
+        teqeq   WK&reg1, #0
+  .if numbytes > 8
+        teqeq   WK&reg2, #0
+        teqeq   WK&reg3, #0
+  .endif
+ .endif
+.endm
+
+.macro over_8888_8888_prepare  next
+        mov     WK&next, WK&next, lsr #24
+.endm
+
+.macro over_8888_8888_1pixel src, dst, offset, next
+        /* src = destination component multiplier */
+        rsb     WK&src, WK&src, #255
+        /* Split even/odd bytes of dst into SCRATCH/dst */
+        uxtb16  SCRATCH, WK&dst
+        uxtb16  WK&dst, WK&dst, ror #8
+        /* Multiply through, adding 0.5 to the upper byte of result for rounding */
+        mla     SCRATCH, SCRATCH, WK&src, MASK
+        mla     WK&dst, WK&dst, WK&src, MASK
+        /* Where we would have had a stall between the result of the first MLA and the shifter input,
+         * reload the complete source pixel */
+        ldr     WK&src, [SRC, #offset]
+        /* Multiply by 257/256 to approximate 256/255 */
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        /* In this stall, start processing the next pixel */
+ .if offset < -4
+        mov     WK&next, WK&next, lsr #24
+ .endif
+        uxtab16 WK&dst, WK&dst, WK&dst, ror #8
+        /* Recombine even/odd bytes of multiplied destination */
+        mov     SCRATCH, SCRATCH, ror #8
+        sel     WK&dst, SCRATCH, WK&dst
+        /* Saturated add of source to multiplied destination */
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_D
+    WK5     .req    STRIDE_S
+    WK6     .req    STRIDE_M
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+firstreg), %(5+firstreg), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        over_8888_8888_prepare  %(4+firstreg)
+ .set PROCESS_REG, firstreg
+ .set PROCESS_OFF, -numbytes
+ .rept numbytes / 4
+        over_8888_8888_1pixel %(4+PROCESS_REG), %(0+PROCESS_REG), PROCESS_OFF, %(5+PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+  .set PROCESS_OFF, PROCESS_OFF+4
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_8888_asm_armv6, 32, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+    2, /* prefetch distance */ \
+    over_8888_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_8888_process_head, \
+    over_8888_8888_process_tail
+
+/******************************************************************************/
+
+/* Multiply each byte of a word by a byte.
+ * Useful when there aren't any obvious ways to fill the stalls with other instructions.
+ * word  Register containing 4 bytes
+ * byte  Register containing byte multiplier (bits 8-31 must be 0)
+ * tmp   Scratch register
+ * half  Register containing the constant 0x00800080
+ * GE[3:0] bits must contain 0101
+ */
+.macro mul_8888_8  word, byte, tmp, half
+        /* Split even/odd bytes of word apart */
+        uxtb16  tmp, word
+        uxtb16  word, word, ror #8
+        /* Multiply bytes together with rounding, then by 257/256 */
+        mla     tmp, tmp, byte, half
+        mla     word, word, byte, half /* 1 stall follows */
+        uxtab16 tmp, tmp, tmp, ror #8  /* 1 stall follows */
+        uxtab16 word, word, word, ror #8
+        /* Recombine bytes */
+        mov     tmp, tmp, ror #8
+        sel     word, tmp, word
+.endm
+
+/******************************************************************************/
+
+.macro over_8888_n_8888_init
+        /* Mask is constant */
+        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
+        /* Hold loop invariant in STRIDE_M */
+        ldr     STRIDE_M, =0x00800080
+        /* We only want the alpha bits of the constant mask */
+        mov     MASK, MASK, lsr #24
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, STRIDE_M, STRIDE_M
+        line_saved_regs Y, STRIDE_D, STRIDE_S, ORIG_W
+.endm
+
+.macro over_8888_n_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        pixld   , numbytes, %(4+(firstreg%2)), SRC, unaligned_src
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+.macro over_8888_n_8888_1pixel src, dst
+        mul_8888_8  WK&src, MASK, SCRATCH, STRIDE_M
+        sub     WK7, WK6, WK&src, lsr #24
+        mul_8888_8  WK&dst, WK7, SCRATCH, STRIDE_M
+        uqadd8  WK&dst, WK&dst, WK&src
+.endm
+
+.macro over_8888_n_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    Y
+    WK5     .req    STRIDE_D
+    WK6     .req    STRIDE_S
+    WK7     .req    ORIG_W
+        over_8888_8888_check_transparent numbytes, %(4+(firstreg%2)), %(5+(firstreg%2)), %(6+firstreg), %(7+firstreg)
+        beq     10f
+        mov     WK6, #255
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+  .if numbytes == 16 && PROCESS_REG == 2
+        /* We're using WK6 and WK7 as temporaries, so half way through
+         * 4 pixels, reload the second two source pixels but this time
+         * into WK4 and WK5 */
+        ldmdb   SRC, {WK4, WK5}
+  .endif
+        over_8888_n_8888_1pixel  %(4+(PROCESS_REG%2)), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+    .unreq  WK5
+    .unreq  WK6
+    .unreq  WK7
+.endm
+
+generate_composite_function \
+    pixman_composite_over_8888_n_8888_asm_armv6, 32, 0, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+    2, /* prefetch distance */ \
+    over_8888_n_8888_init, \
+    nop_macro, /* newline */ \
+    nop_macro, /* cleanup */ \
+    over_8888_n_8888_process_head, \
+    over_8888_n_8888_process_tail
+
+/******************************************************************************/
+
+.macro over_n_8_8888_init
+        /* Source is constant, but splitting it into even/odd bytes is a loop invariant */
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        /* Not enough registers to hold this constant, but we still use it here to set GE[3:0] */
+        ldr     SCRATCH, =0x00800080
+        uxtb16  STRIDE_S, SRC
+        uxtb16  SRC, SRC, ror #8
+        /* Set GE[3:0] to 0101 so SEL instructions do what we want */
+        uadd8   SCRATCH, SCRATCH, SCRATCH
+        line_saved_regs Y, STRIDE_D, STRIDE_M, ORIG_W
+.endm
+
+.macro over_n_8_8888_newline
+        ldr     STRIDE_D, =0x00800080
+        b       1f
+ .ltorg
+1:
+.endm
+
+.macro over_n_8_8888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
+    WK4     .req    STRIDE_M
+        pixld   , numbytes/4, 4, MASK, unaligned_mask
+        pixld   , numbytes, firstreg, DST, 0
+    .unreq  WK4
+.endm
+
+.macro over_n_8_8888_1pixel src, dst
+        uxtb    Y, WK4, ror #src*8
+        /* Trailing part of multiplication of source */
+        mla     SCRATCH, STRIDE_S, Y, STRIDE_D
+        mla     Y, SRC, Y, STRIDE_D
+        mov     ORIG_W, #255
+        uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8
+        uxtab16 Y, Y, Y, ror #8
+        mov     SCRATCH, SCRATCH, ror #8
+        sub     ORIG_W, ORIG_W, Y, lsr #24
+        sel     Y, SCRATCH, Y
+        /* Then multiply the destination */
+        mul_8888_8  WK&dst, ORIG_W, SCRATCH, STRIDE_D
+        uqadd8  WK&dst, WK&dst, Y
+.endm
+
+.macro over_n_8_8888_process_tail  cond, numbytes, firstreg
+    WK4     .req    STRIDE_M
+        teq     WK4, #0
+        beq     10f
+ .set PROCESS_REG, firstreg
+ .rept numbytes / 4
+        over_n_8_8888_1pixel  %(PROCESS_REG-firstreg), %(PROCESS_REG)
+  .set PROCESS_REG, PROCESS_REG+1
+ .endr
+        pixst   , numbytes, firstreg, DST
+10:
+    .unreq  WK4
+.endm
+
+generate_composite_function \
+    pixman_composite_over_n_8_8888_asm_armv6, 0, 8, 32 \
+    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS \
+    2, /* prefetch distance */ \
+    over_n_8_8888_init, \
+    over_n_8_8888_newline, \
+    nop_macro, /* cleanup */ \
+    over_n_8_8888_process_head, \
+    over_n_8_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/src/pixman/pixman/pixman-arm-simd-asm.h b/src/pixman/pixman/pixman-arm-simd-asm.h
new file mode 100644
index 0000000..6543606
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-simd-asm.h
@@ -0,0 +1,908 @@
+/*
+ * Copyright © 2012 Raspberry Pi Foundation
+ * Copyright © 2012 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Ben Avison (bavison@riscosopen.org)
+ *
+ */
+
+/*
+ * Because the alignment of pixel data to cachelines, and even the number of
+ * cachelines per row can vary from row to row, and because of the need to
+ * preload each scanline once and only once, this prefetch strategy treats
+ * each row of pixels independently. When a pixel row is long enough, there
+ * are three distinct phases of prefetch:
+ * * an inner loop section, where each time a cacheline of data is
+ *    processed, another cacheline is preloaded (the exact distance ahead is
+ *    determined empirically using profiling results from lowlevel-blt-bench)
+ * * a leading section, where enough cachelines are preloaded to ensure no
+ *    cachelines escape being preloaded when the inner loop starts
+ * * a trailing section, where a limited number (0 or more) of cachelines
+ *    are preloaded to deal with data (if any) that hangs off the end of the
+ *    last iteration of the inner loop, plus any trailing bytes that were not
+ *    enough to make up one whole iteration of the inner loop
+ * 
+ * There are (in general) three distinct code paths, selected between
+ * depending upon how long the pixel row is. If it is long enough that there
+ * is at least one iteration of the inner loop (as described above) then
+ * this is described as the "wide" case. If it is shorter than that, but
+ * there are still enough bytes output that there is at least one 16-byte-
+ * long, 16-byte-aligned write to the destination (the optimum type of
+ * write), then this is the "medium" case. If it is not even this long, then
+ * this is the "narrow" case, and there is no attempt to align writes to
+ * 16-byte boundaries. In the "medium" and "narrow" cases, all the
+ * cachelines containing data from the pixel row are prefetched up-front.
+ */
+
+/*
+ * Determine whether we put the arguments on the stack for debugging.
+ */
+#undef DEBUG_PARAMS
+
+/*
+ * Bit flags for 'generate_composite_function' macro which are used
+ * to tune generated functions behavior.
+ */
+.set FLAG_DST_WRITEONLY,         0
+.set FLAG_DST_READWRITE,         1
+.set FLAG_COND_EXEC,             0
+.set FLAG_BRANCH_OVER,           2
+.set FLAG_PROCESS_PRESERVES_PSR, 0
+.set FLAG_PROCESS_CORRUPTS_PSR,  4
+.set FLAG_PROCESS_DOESNT_STORE,  0
+.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
+.set FLAG_NO_SPILL_LINE_VARS,        0
+.set FLAG_SPILL_LINE_VARS_WIDE,      16
+.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
+.set FLAG_SPILL_LINE_VARS,           48
+.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
+.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
+
+/*
+ * Offset into stack where mask and source pointer/stride can be accessed.
+ */
+#ifdef DEBUG_PARAMS
+.set ARGS_STACK_OFFSET,        (9*4+9*4)
+#else
+.set ARGS_STACK_OFFSET,        (9*4)
+#endif
+
+/*
+ * Constants for selecting preferable prefetch type.
+ */
+.set PREFETCH_TYPE_NONE,       0
+.set PREFETCH_TYPE_STANDARD,   1
+
+/*
+ * Definitions of macros for load/store of pixel data.
+ */
+
+.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
+ .if numbytes == 16
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+        op&r&cond    WK&reg2, [base], #4
+        op&r&cond    WK&reg3, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+  .endif
+ .elseif numbytes == 8
+  .if unaligned == 1
+        op&r&cond    WK&reg0, [base], #4
+        op&r&cond    WK&reg1, [base], #4
+  .else
+        op&m&cond&ia base!, {WK&reg0,WK&reg1}
+  .endif
+ .elseif numbytes == 4
+        op&r&cond    WK&reg0, [base], #4
+ .elseif numbytes == 2
+        op&r&cond&h  WK&reg0, [base], #2
+ .elseif numbytes == 1
+        op&r&cond&b  WK&reg0, [base], #1
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
+ .if numbytes == 16
+        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
+ .elseif numbytes == 8
+        stm&cond&db base, {WK&reg0,WK&reg1}
+ .elseif numbytes == 4
+        str&cond    WK&reg0, [base, #-4]
+ .elseif numbytes == 2
+        str&cond&h  WK&reg0, [base, #-2]
+ .elseif numbytes == 1
+        str&cond&b  WK&reg0, [base, #-1]
+ .else
+  .error "unsupported size: numbytes"
+ .endif
+.endm
+
+.macro pixld cond, numbytes, firstreg, base, unaligned
+        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
+.endm
+
+.macro pixst cond, numbytes, firstreg, base
+ .if (flags) & FLAG_DST_READWRITE
+        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .else
+        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
+ .endif
+.endm
+
+.macro PF a, x:vararg
+ .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
+        a x
+ .endif
+.endm
+
+
+.macro preload_leading_step1  bpp, ptr, base
+/* If the destination is already 16-byte aligned, then we need to preload
+ * between 0 and prefetch_distance (inclusive) cache lines ahead so there
+ * are no gaps when the inner loop starts.
+ */
+ .if bpp > 0
+        PF  bic,    ptr, base, #31
+  .set OFFSET, 0
+  .rept prefetch_distance+1
+        PF  pld,    [ptr, #OFFSET]
+   .set OFFSET, OFFSET+32
+  .endr
+ .endif
+.endm
+
+.macro preload_leading_step2  bpp, bpp_shift, ptr, base
+/* However, if the destination is not 16-byte aligned, we may need to
+ * preload more cache lines than that. The question we need to ask is:
+ * are the bytes corresponding to the leading pixels more than the amount
+ * by which the source pointer will be rounded down for preloading, and if
+ * so, by how many cache lines? Effectively, we want to calculate
+ *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
+ *     inner_loop_offset = (src+leading_bytes)&31
+ *     extra_needed = leading_bytes - inner_loop_offset
+ * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
+ * possible when there are 4 src bytes for every 1 dst byte).
+ */
+ .if bpp > 0
+  .ifc base,DST
+        /* The test can be simplified further when preloading the destination */
+        PF  tst,    base, #16
+        PF  beq,    61f
+  .else
+   .if bpp/dst_w_bpp == 4
+        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
+        PF  and,    SCRATCH, SCRATCH, #31
+        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
+        PF  sub,    SCRATCH, SCRATCH, #1    /* so now ranges are -16..-1 / 0..31 / 32..63 */
+        PF  movs,   SCRATCH, SCRATCH, #32-6 /* so this sets         NC   /  nc   /   Nc   */
+        PF  bcs,    61f
+        PF  bpl,    60f
+        PF  pld,    [ptr, #32*(prefetch_distance+2)]
+   .else
+        PF  mov,    SCRATCH, base, lsl #32-5
+        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
+        PF  bls,    61f
+   .endif
+  .endif
+60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
+61:
+ .endif
+.endm
+
+#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
+.macro preload_middle   bpp, base, scratch_holds_offset
+ .if bpp > 0
+        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
+  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
+   .if scratch_holds_offset
+        PF  pld,    [base, SCRATCH]
+   .else
+        PF  bic,    SCRATCH, base, #31
+        PF  pld,    [SCRATCH, #32*prefetch_distance]
+   .endif
+  .endif
+ .endif
+.endm
+
+.macro preload_trailing  bpp, bpp_shift, base
+ .if bpp > 0
+  .if bpp*pix_per_block > 256
+        /* Calculations are more complex if more than one fetch per block */
+        PF  and,    WK1, base, #31
+        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
+        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
+        PF  bic,    SCRATCH, base, #31
+80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
+        PF  add,    SCRATCH, SCRATCH, #32
+        PF  subs,   WK1, WK1, #32
+        PF  bhi,    80b
+  .else
+        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
+        PF  mov,    SCRATCH, base, lsl #32-5
+        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
+        PF  adceqs, SCRATCH, SCRATCH, #0
+        /* The instruction above has two effects: ensures Z is only
+         * set if C was clear (so Z indicates that both shifted quantities
+         * were 0), and clears C if Z was set (so C indicates that the sum
+         * of the shifted quantities was greater and not equal to 32) */
+        PF  beq,    82f
+        PF  bic,    SCRATCH, base, #31
+        PF  bcc,    81f
+        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
+81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
+82:
+  .endif
+ .endif
+.endm
+
+
+.macro preload_line    narrow_case, bpp, bpp_shift, base
+/* "narrow_case" - just means that the macro was invoked from the "narrow"
+ *    code path rather than the "medium" one - because in the narrow case,
+ *    the row of pixels is known to output no more than 30 bytes, then
+ *    (assuming the source pixels are no wider than the the destination
+ *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
+ *    meaning there's no need for a loop.
+ * "bpp" - number of bits per pixel in the channel (source, mask or
+ *    destination) that's being preloaded, or 0 if this channel is not used
+ *    for reading
+ * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
+ * "base" - base address register of channel to preload (SRC, MASK or DST)
+ */
+ .if bpp > 0
+  .if narrow_case && (bpp <= dst_w_bpp)
+        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
+        PF  bic,    WK0, base, #31
+        PF  pld,    [WK0]
+        PF  add,    WK1, base, X, LSL #bpp_shift
+        PF  sub,    WK1, WK1, #1
+        PF  bic,    WK1, WK1, #31
+        PF  cmp,    WK1, WK0
+        PF  beq,    90f
+        PF  pld,    [WK1]
+90:
+  .else
+        PF  bic,    WK0, base, #31
+        PF  pld,    [WK0]
+        PF  add,    WK1, base, X, lsl #bpp_shift
+        PF  sub,    WK1, WK1, #1
+        PF  bic,    WK1, WK1, #31
+        PF  cmp,    WK1, WK0
+        PF  beq,    92f
+91:     PF  add,    WK0, WK0, #32
+        PF  cmp,    WK0, WK1
+        PF  pld,    [WK0]
+        PF  bne,    91b
+92:
+  .endif
+ .endif
+.endm
+
+
+.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
+ .if decrementx
+        sub&cond X, X, #8*numbytes/dst_w_bpp
+ .endif
+        process_tail  cond, numbytes, firstreg
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   cond, numbytes, firstreg, DST
+ .endif
+.endm
+
+.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & FLAG_BRANCH_OVER
+  .ifc cond,mi
+        bpl     100f
+  .endif
+  .ifc cond,cs
+        bcc     100f
+  .endif
+  .ifc cond,ne
+        beq     100f
+  .endif
+        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+100:
+ .else
+        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
+ .endif
+.endm
+
+.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
+        /* Can't interleave reads and writes */
+        test
+        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
+  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
+        test
+  .endif
+        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
+ .else
+        /* Can interleave reads and writes for better scheduling */
+        test
+        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
+        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
+  .if decrementx
+        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
+        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
+  .endif
+        process_tail  cond1, numbytes1, firstreg1
+        process_tail  cond2, numbytes2, firstreg2
+        pixst   cond1, numbytes1, firstreg1, DST
+        pixst   cond2, numbytes2, firstreg2, DST
+ .endif
+.endm
+
+
+.macro test_bits_1_0_ptr
+        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
+.endm
+
+.macro test_bits_3_2_ptr
+        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
+.endm
+
+.macro leading_15bytes  process_head, process_tail
+        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
+        /* Use unaligned loads in all cases for simplicity */
+ .if dst_w_bpp == 8
+        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
+ .elseif dst_w_bpp == 16
+        test_bits_1_0_ptr
+        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
+ .endif
+        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
+.endm
+
+.macro test_bits_3_2_pix
+        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
+.endm
+
+.macro test_bits_1_0_pix
+ .if dst_w_bpp == 8
+        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
+ .else
+        movs    SCRATCH, X, lsr #1
+ .endif
+.endm
+
+.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
+ .if dst_w_bpp == 16
+        test_bits_1_0_pix
+        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
+ .elseif dst_w_bpp == 8
+        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
+ .endif
+.endm
+
+
+.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
+110:
+ .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
+ .rept pix_per_block*dst_w_bpp/128
+        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
+  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        preload_middle  src_bpp, SRC, 1
+  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        preload_middle  mask_bpp, MASK, 1
+  .else
+        preload_middle  src_bpp, SRC, 0
+        preload_middle  mask_bpp, MASK, 0
+  .endif
+  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
+        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
+         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
+         * preloads for, to achieve staggered prefetches for multiple channels, because there are
+         * always two STMs per prefetch, so there is always an opposite STM on which to put the
+         * preload. Note, no need to BIC the base register here */
+        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
+  .endif
+        process_tail  , 16, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 16, 0, DST
+  .endif
+  .set SUBBLOCK, SUBBLOCK+1
+ .endr
+        subs    X, X, #pix_per_block
+        bhs     110b
+.endm
+
+.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
+        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
+ .if dst_r_bpp > 0
+        tst     DST, #16
+        bne     111f
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
+        b       112f
+111:
+ .endif
+        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
+112:
+        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
+ .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
+        PF  and,    WK0, X, #pix_per_block-1
+ .endif
+        preload_trailing  src_bpp, src_bpp_shift, SRC
+        preload_trailing  mask_bpp, mask_bpp_shift, MASK
+        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
+        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
+        /* The remainder of the line is handled identically to the medium case */
+        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
+.endm
+
+.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+120:
+        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
+        process_tail  , 16, 0
+ .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 16, 0, DST
+ .endif
+        subs    X, X, #128/dst_w_bpp
+        bhs     120b
+        /* Trailing pixels */
+        tst     X, #128/dst_w_bpp - 1
+        beq     exit_label
+        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
+        tst     X, #16*8/dst_w_bpp
+        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
+        /* Trailing pixels */
+        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
+        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
+.endm
+
+.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
+ /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
+ .if mask_bpp == 8 || mask_bpp == 16
+        tst     MASK, #3
+        bne     141f
+ .endif
+  .if src_bpp == 8 || src_bpp == 16
+        tst     SRC, #3
+        bne     140f
+  .endif
+        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
+  .if src_bpp == 8 || src_bpp == 16
+        b       exit_label
+140:
+        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
+  .endif
+ .if mask_bpp == 8 || mask_bpp == 16
+        b       exit_label
+141:
+  .if src_bpp == 8 || src_bpp == 16
+        tst     SRC, #3
+        bne     142f
+  .endif
+        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
+  .if src_bpp == 8 || src_bpp == 16
+        b       exit_label
+142:
+        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
+  .endif
+ .endif
+.endm
+
+
+.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
+ .if vars_spilled
+        /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
+        /* This is ldmia sp,{} */
+        .word   0xE89D0000 | LINE_SAVED_REGS
+ .endif
+        subs    Y, Y, #1
+ .if vars_spilled
+  .if (LINE_SAVED_REGS) & (1<<1)
+        str     Y, [sp]
+  .endif
+ .endif
+        add     DST, DST, STRIDE_D
+ .if src_bpp > 0
+        add     SRC, SRC, STRIDE_S
+ .endif
+ .if mask_bpp > 0
+        add     MASK, MASK, STRIDE_M
+ .endif
+ .if restore_x
+        mov     X, ORIG_W
+ .endif
+        bhs     loop_label
+ .ifc "last_one",""
+  .if vars_spilled
+        b       197f
+  .else
+        b       198f
+  .endif
+ .else
+  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
+        b       198f
+  .endif
+ .endif
+.endm
+
+
+.macro generate_composite_function fname, \
+                                   src_bpp_, \
+                                   mask_bpp_, \
+                                   dst_w_bpp_, \
+                                   flags_, \
+                                   prefetch_distance_, \
+                                   init, \
+                                   newline, \
+                                   cleanup, \
+                                   process_head, \
+                                   process_tail, \
+                                   process_inner_loop
+
+ .func fname
+ .global fname
+ /* For ELF format also set function visibility to hidden */
+#ifdef __ELF__
+ .hidden fname
+ .type fname, %function
+#endif
+
+/*
+ * Make some macro arguments globally visible and accessible
+ * from other macros
+ */
+ .set src_bpp, src_bpp_
+ .set mask_bpp, mask_bpp_
+ .set dst_w_bpp, dst_w_bpp_
+ .set flags, flags_
+ .set prefetch_distance, prefetch_distance_
+
+/*
+ * Select prefetch type for this function.
+ */
+ .if prefetch_distance == 0
+  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
+ .else
+  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
+ .endif
+
+ .if src_bpp == 32
+  .set src_bpp_shift, 2
+ .elseif src_bpp == 24
+  .set src_bpp_shift, 0
+ .elseif src_bpp == 16
+  .set src_bpp_shift, 1
+ .elseif src_bpp == 8
+  .set src_bpp_shift, 0
+ .elseif src_bpp == 0
+  .set src_bpp_shift, -1
+ .else
+  .error "requested src bpp (src_bpp) is not supported"
+ .endif
+
+ .if mask_bpp == 32
+  .set mask_bpp_shift, 2
+ .elseif mask_bpp == 24
+  .set mask_bpp_shift, 0
+ .elseif mask_bpp == 8
+  .set mask_bpp_shift, 0
+ .elseif mask_bpp == 0
+  .set mask_bpp_shift, -1
+ .else
+  .error "requested mask bpp (mask_bpp) is not supported"
+ .endif
+
+ .if dst_w_bpp == 32
+  .set dst_bpp_shift, 2
+ .elseif dst_w_bpp == 24
+  .set dst_bpp_shift, 0
+ .elseif dst_w_bpp == 16
+  .set dst_bpp_shift, 1
+ .elseif dst_w_bpp == 8
+  .set dst_bpp_shift, 0
+ .else
+  .error "requested dst bpp (dst_w_bpp) is not supported"
+ .endif
+
+ .if (((flags) & FLAG_DST_READWRITE) != 0)
+  .set dst_r_bpp, dst_w_bpp
+ .else
+  .set dst_r_bpp, 0
+ .endif
+
+ .set pix_per_block, 16*8/dst_w_bpp
+ .if src_bpp != 0
+  .if 32*8/src_bpp > pix_per_block
+   .set pix_per_block, 32*8/src_bpp
+  .endif
+ .endif
+ .if mask_bpp != 0
+  .if 32*8/mask_bpp > pix_per_block
+   .set pix_per_block, 32*8/mask_bpp
+  .endif
+ .endif
+ .if dst_r_bpp != 0
+  .if 32*8/dst_r_bpp > pix_per_block
+   .set pix_per_block, 32*8/dst_r_bpp
+  .endif
+ .endif
+
+/* The standard entry conditions set up by pixman-arm-common.h are:
+ * r0 = width (pixels)
+ * r1 = height (rows)
+ * r2 = pointer to top-left pixel of destination
+ * r3 = destination stride (pixels)
+ * [sp] = source pixel value, or pointer to top-left pixel of source
+ * [sp,#4] = 0 or source stride (pixels)
+ * The following arguments are unused for non-mask operations
+ * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
+ * [sp,#12] = 0 or mask stride (pixels)
+ */
+
+/*
+ * Assign symbolic names to registers
+ */
+    X           .req    r0  /* pixels to go on this line */
+    Y           .req    r1  /* lines to go */
+    DST         .req    r2  /* destination pixel pointer */
+    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
+    SRC         .req    r4  /* source pixel pointer */
+    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
+    MASK        .req    r6  /* mask pixel pointer (if applicable) */
+    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
+    WK0         .req    r8  /* pixel data registers */
+    WK1         .req    r9
+    WK2         .req    r10
+    WK3         .req    r11
+    SCRATCH     .req    r12
+    ORIG_W      .req    r14 /* width (pixels) */
+
+fname:
+        push    {r4-r11, lr}        /* save all registers */
+
+        subs    Y, Y, #1
+        blo     199f
+
+#ifdef DEBUG_PARAMS
+        sub     sp, sp, #9*4
+#endif
+
+ .if src_bpp > 0
+        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
+        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
+ .endif
+ .if mask_bpp > 0
+        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
+        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
+ .endif
+        
+#ifdef DEBUG_PARAMS
+        add     Y, Y, #1
+        stmia   sp, {r0-r7,pc}
+        sub     Y, Y, #1
+#endif
+
+        init
+        
+        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
+        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
+ .if src_bpp > 0
+        lsl     STRIDE_S, #src_bpp_shift
+        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
+ .endif
+ .if mask_bpp > 0
+        lsl     STRIDE_M, #mask_bpp_shift
+        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
+ .endif
+ 
+        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
+        cmp     X, #2*16*8/dst_w_bpp - 1
+        blo     170f
+ .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
+        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
+        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
+        blo     160f
+
+        /* Wide case */
+        /* Adjust X so that the decrement instruction can also test for
+         * inner loop termination. We want it to stop when there are
+         * (prefetch_distance+1) complete blocks to go. */
+        sub     X, X, #(prefetch_distance+2)*pix_per_block
+        mov     ORIG_W, X
+  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+  .endif
+151:    /* New line */
+        newline
+        preload_leading_step1  src_bpp, WK1, SRC
+        preload_leading_step1  mask_bpp, WK2, MASK
+        preload_leading_step1  dst_r_bpp, WK3, DST
+        
+        tst     DST, #15
+        beq     154f
+        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
+  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
+        PF  and,    WK0, WK0, #15
+  .endif
+
+        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
+        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
+        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
+
+        leading_15bytes  process_head, process_tail
+        
+154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
+ .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        and     SCRATCH, SRC, #31
+        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
+ .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
+        and     SCRATCH, MASK, #31
+        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
+ .endif
+ .ifc "process_inner_loop",""
+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
+ .else
+        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
+ .endif
+
+157:    /* Check for another line */
+        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
+ .endif
+
+ .ltorg
+
+160:    /* Medium case */
+        mov     ORIG_W, X
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+ .endif
+161:    /* New line */
+        newline
+        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
+        preload_line 0, mask_bpp, mask_bpp_shift, MASK
+        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
+        
+        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
+        tst     DST, #15
+        beq     164f
+        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
+        
+        leading_15bytes  process_head, process_tail
+        
+164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
+        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f
+        
+167:    /* Check for another line */
+        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
+
+ .ltorg
+
+170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
+ .if dst_w_bpp < 32
+        mov     ORIG_W, X
+ .endif
+ .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
+        /* This is stmdb sp!,{} */
+        .word   0xE92D0000 | LINE_SAVED_REGS
+ .endif
+171:    /* New line */
+        newline
+        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
+        preload_line 1, mask_bpp, mask_bpp_shift, MASK
+        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
+        
+ .if dst_w_bpp == 8
+        tst     DST, #3
+        beq     174f
+172:    subs    X, X, #1
+        blo     177f
+        process_head  , 1, 0, 1, 1, 0
+        process_tail  , 1, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 1, 0, DST
+  .endif
+        tst     DST, #3
+        bne     172b
+ .elseif dst_w_bpp == 16
+        tst     DST, #2
+        beq     174f
+        subs    X, X, #1
+        blo     177f
+        process_head  , 2, 0, 1, 1, 0
+        process_tail  , 2, 0
+  .if !((flags) & FLAG_PROCESS_DOES_STORE)
+        pixst   , 2, 0, DST
+  .endif
+ .endif
+
+174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
+        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f
+
+177:    /* Check for another line */
+        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
+
+197:
+ .if (flags) & FLAG_SPILL_LINE_VARS
+        add     sp, sp, #LINE_SAVED_REG_COUNT*4
+ .endif
+198:
+        cleanup
+
+#ifdef DEBUG_PARAMS
+        add     sp, sp, #9*4 /* junk the debug copy of arguments */
+#endif
+199:
+        pop     {r4-r11, pc}  /* exit */
+
+ .ltorg
+
+    .unreq  X
+    .unreq  Y
+    .unreq  DST
+    .unreq  STRIDE_D
+    .unreq  SRC
+    .unreq  STRIDE_S
+    .unreq  MASK
+    .unreq  STRIDE_M
+    .unreq  WK0
+    .unreq  WK1
+    .unreq  WK2
+    .unreq  WK3
+    .unreq  SCRATCH
+    .unreq  ORIG_W
+    .endfunc
+.endm
+
+.macro line_saved_regs  x:vararg
+ .set LINE_SAVED_REGS, 0
+ .set LINE_SAVED_REG_COUNT, 0
+ .irp SAVED_REG,x
+  .ifc "SAVED_REG","Y"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_D"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_S"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","STRIDE_M"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+  .ifc "SAVED_REG","ORIG_W"
+   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
+   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
+  .endif
+ .endr
+.endm
+
+.macro nop_macro x:vararg
+.endm
diff --git a/src/pixman/pixman/pixman-arm-simd.c b/src/pixman/pixman/pixman-arm-simd.c
new file mode 100644
index 0000000..af062e1
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm-simd.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright © 2008 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Jeff Muizelaar (jeff@infidigm.net)
+ *
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-arm-common.h"
+#include "pixman-inlines.h"
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8888_8888,
+		                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_x888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_0565,
+                                   uint16_t, 1, uint16_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, src_0565_8888,
+                                   uint16_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
+                                   uint8_t, 1, uint8_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
+                                   uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
+                                     uint32_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, armv6, over_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
+
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 0565_0565, SRC,
+                                        uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_NEAREST_SRC_DST (armv6, 8888_8888, SRC,
+                                        uint32_t, uint32_t)
+
+void
+pixman_composite_src_n_8888_asm_armv6 (int32_t   w,
+                                       int32_t   h,
+                                       uint32_t *dst,
+                                       int32_t   dst_stride,
+                                       uint32_t  src);
+
+void
+pixman_composite_src_n_0565_asm_armv6 (int32_t   w,
+                                       int32_t   h,
+                                       uint16_t *dst,
+                                       int32_t   dst_stride,
+                                       uint16_t  src);
+
+void
+pixman_composite_src_n_8_asm_armv6 (int32_t   w,
+                                    int32_t   h,
+                                    uint8_t  *dst,
+                                    int32_t   dst_stride,
+                                    uint8_t  src);
+
+static pixman_bool_t
+arm_simd_fill (pixman_implementation_t *imp,
+               uint32_t *               bits,
+               int                      stride, /* in 32-bit words */
+               int                      bpp,
+               int                      x,
+               int                      y,
+               int                      width,
+               int                      height,
+               uint32_t                 _xor)
+{
+    /* stride is always multiple of 32bit units in pixman */
+    uint32_t byte_stride = stride * sizeof(uint32_t);
+
+    switch (bpp)
+    {
+    case 8:
+	pixman_composite_src_n_8_asm_armv6 (
+		width,
+		height,
+		(uint8_t *)(((char *) bits) + y * byte_stride + x),
+		byte_stride,
+		_xor & 0xff);
+	return TRUE;
+    case 16:
+	pixman_composite_src_n_0565_asm_armv6 (
+		width,
+		height,
+		(uint16_t *)(((char *) bits) + y * byte_stride + x * 2),
+		byte_stride / 2,
+		_xor & 0xffff);
+	return TRUE;
+    case 32:
+	pixman_composite_src_n_8888_asm_armv6 (
+		width,
+		height,
+		(uint32_t *)(((char *) bits) + y * byte_stride + x * 4),
+		byte_stride / 4,
+		_xor);
+	return TRUE;
+    default:
+	return FALSE;
+    }
+}
+
+static pixman_bool_t
+arm_simd_blt (pixman_implementation_t *imp,
+              uint32_t *               src_bits,
+              uint32_t *               dst_bits,
+              int                      src_stride, /* in 32-bit words */
+              int                      dst_stride, /* in 32-bit words */
+              int                      src_bpp,
+              int                      dst_bpp,
+              int                      src_x,
+              int                      src_y,
+              int                      dest_x,
+              int                      dest_y,
+              int                      width,
+              int                      height)
+{
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    switch (src_bpp)
+    {
+    case 8:
+        pixman_composite_src_8_8_asm_armv6 (
+                width, height,
+                (uint8_t *)(((char *) dst_bits) +
+                dest_y * dst_stride * 4 + dest_x * 1), dst_stride * 4,
+                (uint8_t *)(((char *) src_bits) +
+                src_y * src_stride * 4 + src_x * 1), src_stride * 4);
+        return TRUE;
+    case 16:
+	pixman_composite_src_0565_0565_asm_armv6 (
+		width, height,
+		(uint16_t *)(((char *) dst_bits) +
+		dest_y * dst_stride * 4 + dest_x * 2), dst_stride * 2,
+		(uint16_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 2), src_stride * 2);
+	return TRUE;
+    case 32:
+	pixman_composite_src_8888_8888_asm_armv6 (
+		width, height,
+		(uint32_t *)(((char *) dst_bits) +
+		dest_y * dst_stride * 4 + dest_x * 4), dst_stride,
+		(uint32_t *)(((char *) src_bits) +
+		src_y * src_stride * 4 + src_x * 4), src_stride);
+	return TRUE;
+    default:
+	return FALSE;
+    }
+}
+
+static const pixman_fast_path_t arm_simd_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, armv6_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, armv6_composite_src_8888_8888),
+
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, armv6_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, armv6_composite_src_x888_8888),
+
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, a1r5g5b5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, a1b5g5r5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x1b5g5r5, null, x1b5g5r5, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, a4r4g4b4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, a4b4g4r4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x4r4g4b4, null, x4r4g4b4, armv6_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x4b4g4r4, null, x4b4g4r4, armv6_composite_src_0565_0565),
+
+    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, r3g3b2, null, r3g3b2, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, b2g3r3, null, b2g3r3, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, a2r2g2b2, null, a2r2g2b2, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, a2b2g2r2, null, a2b2g2r2, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, c8, null, c8, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, g8, null, g8, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, x4a4, null, x4a4, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, x4c4, null, x4c4, armv6_composite_src_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, x4g4, null, x4g4, armv6_composite_src_8_8),
+
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, a8r8g8b8, armv6_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, x8r8g8b8, armv6_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, a8b8g8r8, armv6_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, x8b8g8r8, armv6_composite_src_0565_8888),
+
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, armv6_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
+
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, armv6_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, armv6_composite_over_n_8_8888),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, armv6_0565_0565),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, b5g6r5, armv6_0565_0565),
+
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, armv6_8888_8888),
+    PIXMAN_ARM_SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, armv6_8888_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, arm_simd_fast_paths);
+
+    imp->blt = arm_simd_blt;
+    imp->fill = arm_simd_fill;
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-arm.c b/src/pixman/pixman/pixman-arm.c
new file mode 100644
index 0000000..23374e4
--- /dev/null
+++ b/src/pixman/pixman/pixman-arm.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+typedef enum
+{
+    ARM_V7		= (1 << 0),
+    ARM_V6		= (1 << 1),
+    ARM_VFP		= (1 << 2),
+    ARM_NEON		= (1 << 3),
+    ARM_IWMMXT		= (1 << 4)
+} arm_cpu_features_t;
+
+#if defined(USE_ARM_SIMD) || defined(USE_ARM_NEON) || defined(USE_ARM_IWMMXT)
+
+#if defined(_MSC_VER)
+
+/* Needed for EXCEPTION_ILLEGAL_INSTRUCTION */
+#include <windows.h>
+
+extern int pixman_msvc_try_arm_neon_op ();
+extern int pixman_msvc_try_arm_simd_op ();
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+    arm_cpu_features_t features = 0;
+
+    __try
+    {
+	pixman_msvc_try_arm_simd_op ();
+	features |= ARM_V6;
+    }
+    __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION)
+    {
+    }
+
+    __try
+    {
+	pixman_msvc_try_arm_neon_op ();
+	features |= ARM_NEON;
+    }
+    __except (GetExceptionCode () == EXCEPTION_ILLEGAL_INSTRUCTION)
+    {
+    }
+
+    return features;
+}
+
+#elif defined(__APPLE__) && defined(TARGET_OS_IPHONE) /* iOS */
+
+#include "TargetConditionals.h"
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+    arm_cpu_features_t features = 0;
+
+    features |= ARM_V6;
+
+    /* Detection of ARM NEON on iOS is fairly simple because iOS binaries
+     * contain separate executable images for each processor architecture.
+     * So all we have to do is detect the armv7 architecture build. The
+     * operating system automatically runs the armv7 binary for armv7 devices
+     * and the armv6 binary for armv6 devices.
+     */
+#if defined(__ARM_NEON__)
+    features |= ARM_NEON;
+#endif
+
+    return features;
+}
+
+#elif defined(__ANDROID__) || defined(ANDROID) /* Android */
+
+#include <cpu-features.h>
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+    arm_cpu_features_t features = 0;
+    AndroidCpuFamily cpu_family;
+    uint64_t cpu_features;
+
+    cpu_family = android_getCpuFamily();
+    cpu_features = android_getCpuFeatures();
+
+    if (cpu_family == ANDROID_CPU_FAMILY_ARM)
+    {
+	if (cpu_features & ANDROID_CPU_ARM_FEATURE_ARMv7)
+	    features |= ARM_V7;
+
+	if (cpu_features & ANDROID_CPU_ARM_FEATURE_VFPv3)
+	    features |= ARM_VFP;
+
+	if (cpu_features & ANDROID_CPU_ARM_FEATURE_NEON)
+	    features |= ARM_NEON;
+    }
+
+    return features;
+}
+
+#elif defined (__linux__) /* linux ELF */
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <elf.h>
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+    arm_cpu_features_t features = 0;
+    Elf32_auxv_t aux;
+    int fd;
+
+    fd = open ("/proc/self/auxv", O_RDONLY);
+    if (fd >= 0)
+    {
+	while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t))
+	{
+	    if (aux.a_type == AT_HWCAP)
+	    {
+		uint32_t hwcap = aux.a_un.a_val;
+
+		/* hardcode these values to avoid depending on specific
+		 * versions of the hwcap header, e.g. HWCAP_NEON
+		 */
+		if ((hwcap & 64) != 0)
+		    features |= ARM_VFP;
+		if ((hwcap & 512) != 0)
+		    features |= ARM_IWMMXT;
+		/* this flag is only present on kernel 2.6.29 */
+		if ((hwcap & 4096) != 0)
+		    features |= ARM_NEON;
+	    }
+	    else if (aux.a_type == AT_PLATFORM)
+	    {
+		const char *plat = (const char*) aux.a_un.a_val;
+
+		if (strncmp (plat, "v7l", 3) == 0)
+		    features |= (ARM_V7 | ARM_V6);
+		else if (strncmp (plat, "v6l", 3) == 0)
+		    features |= ARM_V6;
+	    }
+	}
+	close (fd);
+    }
+
+    return features;
+}
+
+#else /* Unknown */
+
+static arm_cpu_features_t
+detect_cpu_features (void)
+{
+    return 0;
+}
+
+#endif /* Linux elf */
+
+static pixman_bool_t
+have_feature (arm_cpu_features_t feature)
+{
+    static pixman_bool_t initialized;
+    static arm_cpu_features_t features;
+
+    if (!initialized)
+    {
+	features = detect_cpu_features();
+	initialized = TRUE;
+    }
+
+    return (features & feature) == feature;
+}
+
+#endif /* USE_ARM_SIMD || USE_ARM_NEON || USE_ARM_IWMMXT */
+
+pixman_implementation_t *
+_pixman_arm_get_implementations (pixman_implementation_t *imp)
+{
+#ifdef USE_ARM_SIMD
+    if (!_pixman_disabled ("arm-simd") && have_feature (ARM_V6))
+	imp = _pixman_implementation_create_arm_simd (imp);
+#endif
+
+#ifdef USE_ARM_IWMMXT
+    if (!_pixman_disabled ("arm-iwmmxt") && have_feature (ARM_IWMMXT))
+	imp = _pixman_implementation_create_mmx (imp);
+#endif
+
+#ifdef USE_ARM_NEON
+    if (!_pixman_disabled ("arm-neon") && have_feature (ARM_NEON))
+	imp = _pixman_implementation_create_arm_neon (imp);
+#endif
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-bits-image.c b/src/pixman/pixman/pixman-bits-image.c
new file mode 100644
index 0000000..dcdcc69
--- /dev/null
+++ b/src/pixman/pixman/pixman-bits-image.c
@@ -0,0 +1,1039 @@
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2008 André Tupinambá <andrelrt@gmail.com>
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static uint32_t *
+_pixman_image_get_scanline_generic_float (pixman_iter_t * iter,
+					  const uint32_t *mask)
+{
+    pixman_iter_get_scanline_t fetch_32 = iter->data;
+    uint32_t *buffer = iter->buffer;
+
+    fetch_32 (iter, NULL);
+
+    pixman_expand_to_float ((argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return iter->buffer;
+}
+
+/* Fetch functions */
+
+static force_inline uint32_t
+fetch_pixel_no_alpha (bits_image_t *image,
+		      int x, int y, pixman_bool_t check_bounds)
+{
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+	return 0;
+    }
+
+    return image->fetch_pixel_32 (image, x, y);
+}
+
+typedef uint32_t (* get_pixel_t) (bits_image_t *image,
+				  int x, int y, pixman_bool_t check_bounds);
+
+static force_inline uint32_t
+bits_image_fetch_pixel_nearest (bits_image_t   *image,
+				pixman_fixed_t  x,
+				pixman_fixed_t  y,
+				get_pixel_t	get_pixel)
+{
+    int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+    int y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+    {
+	repeat (image->common.repeat, &x0, image->width);
+	repeat (image->common.repeat, &y0, image->height);
+
+	return get_pixel (image, x0, y0, FALSE);
+    }
+    else
+    {
+	return get_pixel (image, x0, y0, TRUE);
+    }
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_bilinear (bits_image_t   *image,
+				 pixman_fixed_t  x,
+				 pixman_fixed_t  y,
+				 get_pixel_t	 get_pixel)
+{
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+    int x1, y1, x2, y2;
+    uint32_t tl, tr, bl, br;
+    int32_t distx, disty;
+
+    x1 = x - pixman_fixed_1 / 2;
+    y1 = y - pixman_fixed_1 / 2;
+
+    distx = pixman_fixed_to_bilinear_weight (x1);
+    disty = pixman_fixed_to_bilinear_weight (y1);
+
+    x1 = pixman_fixed_to_int (x1);
+    y1 = pixman_fixed_to_int (y1);
+    x2 = x1 + 1;
+    y2 = y1 + 1;
+
+    if (repeat_mode != PIXMAN_REPEAT_NONE)
+    {
+	repeat (repeat_mode, &x1, width);
+	repeat (repeat_mode, &y1, height);
+	repeat (repeat_mode, &x2, width);
+	repeat (repeat_mode, &y2, height);
+
+	tl = get_pixel (image, x1, y1, FALSE);
+	bl = get_pixel (image, x1, y2, FALSE);
+	tr = get_pixel (image, x2, y1, FALSE);
+	br = get_pixel (image, x2, y2, FALSE);
+    }
+    else
+    {
+	tl = get_pixel (image, x1, y1, TRUE);
+	tr = get_pixel (image, x2, y1, TRUE);
+	bl = get_pixel (image, x1, y2, TRUE);
+	br = get_pixel (image, x2, y2, TRUE);
+    }
+
+    return bilinear_interpolation (tl, tr, bl, br, distx, disty);
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_convolution (bits_image_t   *image,
+				    pixman_fixed_t  x,
+				    pixman_fixed_t  y,
+				    get_pixel_t     get_pixel)
+{
+    pixman_fixed_t *params = image->common.filter_params;
+    int x_off = (params[0] - pixman_fixed_1) >> 1;
+    int y_off = (params[1] - pixman_fixed_1) >> 1;
+    int32_t cwidth = pixman_fixed_to_int (params[0]);
+    int32_t cheight = pixman_fixed_to_int (params[1]);
+    int32_t i, j, x1, x2, y1, y2;
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+    int srtot, sgtot, sbtot, satot;
+
+    params += 2;
+
+    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+    x2 = x1 + cwidth;
+    y2 = y1 + cheight;
+
+    srtot = sgtot = sbtot = satot = 0;
+
+    for (i = y1; i < y2; ++i)
+    {
+	for (j = x1; j < x2; ++j)
+	{
+	    int rx = j;
+	    int ry = i;
+
+	    pixman_fixed_t f = *params;
+
+	    if (f)
+	    {
+		uint32_t pixel;
+
+		if (repeat_mode != PIXMAN_REPEAT_NONE)
+		{
+		    repeat (repeat_mode, &rx, width);
+		    repeat (repeat_mode, &ry, height);
+
+		    pixel = get_pixel (image, rx, ry, FALSE);
+		}
+		else
+		{
+		    pixel = get_pixel (image, rx, ry, TRUE);
+		}
+
+		srtot += (int)RED_8 (pixel) * f;
+		sgtot += (int)GREEN_8 (pixel) * f;
+		sbtot += (int)BLUE_8 (pixel) * f;
+		satot += (int)ALPHA_8 (pixel) * f;
+	    }
+
+	    params++;
+	}
+    }
+
+    satot = (satot + 0x8000) >> 16;
+    srtot = (srtot + 0x8000) >> 16;
+    sgtot = (sgtot + 0x8000) >> 16;
+    sbtot = (sbtot + 0x8000) >> 16;
+
+    satot = CLIP (satot, 0, 0xff);
+    srtot = CLIP (srtot, 0, 0xff);
+    sgtot = CLIP (sgtot, 0, 0xff);
+    sbtot = CLIP (sbtot, 0, 0xff);
+
+    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
+}
+
+static uint32_t
+bits_image_fetch_pixel_separable_convolution (bits_image_t *image,
+                                              pixman_fixed_t x,
+                                              pixman_fixed_t y,
+                                              get_pixel_t    get_pixel)
+{
+    pixman_fixed_t *params = image->common.filter_params;
+    pixman_repeat_t repeat_mode = image->common.repeat;
+    int width = image->width;
+    int height = image->height;
+    int cwidth = pixman_fixed_to_int (params[0]);
+    int cheight = pixman_fixed_to_int (params[1]);
+    int x_phase_bits = pixman_fixed_to_int (params[2]);
+    int y_phase_bits = pixman_fixed_to_int (params[3]);
+    int x_phase_shift = 16 - x_phase_bits;
+    int y_phase_shift = 16 - y_phase_bits;
+    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
+    int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
+    pixman_fixed_t *y_params;
+    int srtot, sgtot, sbtot, satot;
+    int32_t x1, x2, y1, y2;
+    int32_t px, py;
+    int i, j;
+
+    /* Round x and y to the middle of the closest phase before continuing. This
+     * ensures that the convolution matrix is aligned right, since it was
+     * positioned relative to a particular phase (and not relative to whatever
+     * exact fraction we happen to get here).
+     */
+    x = ((x >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1);
+    y = ((y >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1);
+
+    px = (x & 0xffff) >> x_phase_shift;
+    py = (y & 0xffff) >> y_phase_shift;
+
+    y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight;
+
+    x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+    y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+    x2 = x1 + cwidth;
+    y2 = y1 + cheight;
+
+    srtot = sgtot = sbtot = satot = 0;
+
+    for (i = y1; i < y2; ++i)
+    {
+        pixman_fixed_48_16_t fy = *y_params++;
+        pixman_fixed_t *x_params = params + 4 + px * cwidth;
+
+        if (fy)
+        {
+            for (j = x1; j < x2; ++j)
+            {
+                pixman_fixed_t fx = *x_params++;
+		int rx = j;
+		int ry = i;
+
+                if (fx)
+                {
+                    pixman_fixed_t f;
+                    uint32_t pixel;
+
+                    if (repeat_mode != PIXMAN_REPEAT_NONE)
+                    {
+                        repeat (repeat_mode, &rx, width);
+                        repeat (repeat_mode, &ry, height);
+
+                        pixel = get_pixel (image, rx, ry, FALSE);
+                    }
+                    else
+                    {
+                        pixel = get_pixel (image, rx, ry, TRUE);
+		    }
+
+                    f = (fy * fx + 0x8000) >> 16;
+
+                    srtot += (int)RED_8 (pixel) * f;
+                    sgtot += (int)GREEN_8 (pixel) * f;
+                    sbtot += (int)BLUE_8 (pixel) * f;
+                    satot += (int)ALPHA_8 (pixel) * f;
+                }
+            }
+	}
+    }
+
+    satot = (satot + 0x8000) >> 16;
+    srtot = (srtot + 0x8000) >> 16;
+    sgtot = (sgtot + 0x8000) >> 16;
+    sbtot = (sbtot + 0x8000) >> 16;
+
+    satot = CLIP (satot, 0, 0xff);
+    srtot = CLIP (srtot, 0, 0xff);
+    sgtot = CLIP (sgtot, 0, 0xff);
+    sbtot = CLIP (sbtot, 0, 0xff);
+
+    return ((satot << 24) | (srtot << 16) | (sgtot <<  8) | (sbtot));
+}
+
+static force_inline uint32_t
+bits_image_fetch_pixel_filtered (bits_image_t *image,
+				 pixman_fixed_t x,
+				 pixman_fixed_t y,
+				 get_pixel_t    get_pixel)
+{
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+	return bits_image_fetch_pixel_nearest (image, x, y, get_pixel);
+	break;
+
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+	return bits_image_fetch_pixel_bilinear (image, x, y, get_pixel);
+	break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+	return bits_image_fetch_pixel_convolution (image, x, y, get_pixel);
+	break;
+
+    case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
+        return bits_image_fetch_pixel_separable_convolution (image, x, y, get_pixel);
+        break;
+
+    default:
+        break;
+    }
+
+    return 0;
+}
+
+static uint32_t *
+bits_image_fetch_affine_no_alpha (pixman_iter_t *  iter,
+				  const uint32_t * mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             offset = iter->x;
+    int             line   = iter->y++;
+    int             width  = iter->width;
+    uint32_t *      buffer = iter->buffer;
+
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer;
+
+	ux = image->common.transform->matrix[0][0];
+	uy = image->common.transform->matrix[1][0];
+    }
+    else
+    {
+	ux = pixman_fixed_1;
+	uy = 0;
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	if (!mask || mask[i])
+	{
+	    buffer[i] = bits_image_fetch_pixel_filtered (
+		&image->bits, x, y, fetch_pixel_no_alpha);
+	}
+
+	x += ux;
+	y += uy;
+    }
+
+    return buffer;
+}
+
+/* General fetcher */
+static force_inline uint32_t
+fetch_pixel_general (bits_image_t *image, int x, int y, pixman_bool_t check_bounds)
+{
+    uint32_t pixel;
+
+    if (check_bounds &&
+	(x < 0 || x >= image->width || y < 0 || y >= image->height))
+    {
+	return 0;
+    }
+
+    pixel = image->fetch_pixel_32 (image, x, y);
+
+    if (image->common.alpha_map)
+    {
+	uint32_t pixel_a;
+
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	if (x < 0 || x >= image->common.alpha_map->width ||
+	    y < 0 || y >= image->common.alpha_map->height)
+	{
+	    pixel_a = 0;
+	}
+	else
+	{
+	    pixel_a = image->common.alpha_map->fetch_pixel_32 (
+		image->common.alpha_map, x, y);
+
+	    pixel_a = ALPHA_8 (pixel_a);
+	}
+
+	pixel &= 0x00ffffff;
+	pixel |= (pixel_a << 24);
+    }
+
+    return pixel;
+}
+
+static uint32_t *
+bits_image_fetch_general (pixman_iter_t  *iter,
+			  const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             offset = iter->x;
+    int             line   = iter->y++;
+    int             width  = iter->width;
+    uint32_t *      buffer = iter->buffer;
+
+    pixman_fixed_t x, y, w;
+    pixman_fixed_t ux, uy, uw;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return buffer;
+
+	ux = image->common.transform->matrix[0][0];
+	uy = image->common.transform->matrix[1][0];
+	uw = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	ux = pixman_fixed_1;
+	uy = 0;
+	uw = 0;
+    }
+
+    x = v.vector[0];
+    y = v.vector[1];
+    w = v.vector[2];
+
+    for (i = 0; i < width; ++i)
+    {
+	pixman_fixed_t x0, y0;
+
+	if (!mask || mask[i])
+	{
+	    if (w != 0)
+	    {
+		x0 = ((pixman_fixed_48_16_t)x << 16) / w;
+		y0 = ((pixman_fixed_48_16_t)y << 16) / w;
+	    }
+	    else
+	    {
+		x0 = 0;
+		y0 = 0;
+	    }
+
+	    buffer[i] = bits_image_fetch_pixel_filtered (
+		&image->bits, x0, y0, fetch_pixel_general);
+	}
+
+	x += ux;
+	y += uy;
+	w += uw;
+    }
+
+    return buffer;
+}
+
+static void
+replicate_pixel_32 (bits_image_t *   bits,
+		    int              x,
+		    int              y,
+		    int              width,
+		    uint32_t *       buffer)
+{
+    uint32_t color;
+    uint32_t *end;
+
+    color = bits->fetch_pixel_32 (bits, x, y);
+
+    end = buffer + width;
+    while (buffer < end)
+	*(buffer++) = color;
+}
+
+static void
+replicate_pixel_float (bits_image_t *   bits,
+		       int              x,
+		       int              y,
+		       int              width,
+		       uint32_t *       b)
+{
+    argb_t color;
+    argb_t *buffer = (argb_t *)b;
+    argb_t *end;
+
+    color = bits->fetch_pixel_float (bits, x, y);
+
+    end = buffer + width;
+    while (buffer < end)
+	*(buffer++) = color;
+}
+
+static void
+bits_image_fetch_untransformed_repeat_none (bits_image_t *image,
+                                            pixman_bool_t wide,
+                                            int           x,
+                                            int           y,
+                                            int           width,
+                                            uint32_t *    buffer)
+{
+    uint32_t w;
+
+    if (y < 0 || y >= image->height)
+    {
+	memset (buffer, 0, width * (wide? sizeof (argb_t) : 4));
+	return;
+    }
+
+    if (x < 0)
+    {
+	w = MIN (width, -x);
+
+	memset (buffer, 0, w * (wide ? sizeof (argb_t) : 4));
+
+	width -= w;
+	buffer += w * (wide? 4 : 1);
+	x += w;
+    }
+
+    if (x < image->width)
+    {
+	w = MIN (width, image->width - x);
+
+	if (wide)
+	    image->fetch_scanline_float (image, x, y, w, buffer, NULL);
+	else
+	    image->fetch_scanline_32 (image, x, y, w, buffer, NULL);
+
+	width -= w;
+	buffer += w * (wide? 4 : 1);
+	x += w;
+    }
+
+    memset (buffer, 0, width * (wide ? sizeof (argb_t) : 4));
+}
+
+static void
+bits_image_fetch_untransformed_repeat_normal (bits_image_t *image,
+                                              pixman_bool_t wide,
+                                              int           x,
+                                              int           y,
+                                              int           width,
+                                              uint32_t *    buffer)
+{
+    uint32_t w;
+
+    while (y < 0)
+	y += image->height;
+
+    while (y >= image->height)
+	y -= image->height;
+
+    if (image->width == 1)
+    {
+	if (wide)
+	    replicate_pixel_float (image, 0, y, width, buffer);
+	else
+	    replicate_pixel_32 (image, 0, y, width, buffer);
+
+	return;
+    }
+
+    while (width)
+    {
+	while (x < 0)
+	    x += image->width;
+	while (x >= image->width)
+	    x -= image->width;
+
+	w = MIN (width, image->width - x);
+
+	if (wide)
+	    image->fetch_scanline_float (image, x, y, w, buffer, NULL);
+	else
+	    image->fetch_scanline_32 (image, x, y, w, buffer, NULL);
+
+	buffer += w * (wide? 4 : 1);
+	x += w;
+	width -= w;
+    }
+}
+
+static uint32_t *
+bits_image_fetch_untransformed_32 (pixman_iter_t * iter,
+				   const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *      buffer = iter->buffer;
+
+    if (image->common.repeat == PIXMAN_REPEAT_NONE)
+    {
+	bits_image_fetch_untransformed_repeat_none (
+	    &image->bits, FALSE, x, y, width, buffer);
+    }
+    else
+    {
+	bits_image_fetch_untransformed_repeat_normal (
+	    &image->bits, FALSE, x, y, width, buffer);
+    }
+
+    iter->y++;
+    return buffer;
+}
+
+static uint32_t *
+bits_image_fetch_untransformed_float (pixman_iter_t * iter,
+				      const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *      buffer = iter->buffer;
+
+    if (image->common.repeat == PIXMAN_REPEAT_NONE)
+    {
+	bits_image_fetch_untransformed_repeat_none (
+	    &image->bits, TRUE, x, y, width, buffer);
+    }
+    else
+    {
+	bits_image_fetch_untransformed_repeat_normal (
+	    &image->bits, TRUE, x, y, width, buffer);
+    }
+
+    iter->y++;
+    return buffer;
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;
+    uint32_t			flags;
+    pixman_iter_get_scanline_t	get_scanline_32;
+    pixman_iter_get_scanline_t  get_scanline_float;
+} fetcher_info_t;
+
+static const fetcher_info_t fetcher_info[] =
+{
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP			|
+       FAST_PATH_ID_TRANSFORM			|
+       FAST_PATH_NO_CONVOLUTION_FILTER		|
+       FAST_PATH_NO_PAD_REPEAT			|
+       FAST_PATH_NO_REFLECT_REPEAT),
+      bits_image_fetch_untransformed_32,
+      bits_image_fetch_untransformed_float
+    },
+
+    /* Affine, no alpha */
+    { PIXMAN_any,
+      (FAST_PATH_NO_ALPHA_MAP | FAST_PATH_HAS_TRANSFORM | FAST_PATH_AFFINE_TRANSFORM),
+      bits_image_fetch_affine_no_alpha,
+      _pixman_image_get_scanline_generic_float
+    },
+
+    /* General */
+    { PIXMAN_any,
+      0,
+      bits_image_fetch_general,
+      _pixman_image_get_scanline_generic_float
+    },
+
+    { PIXMAN_null },
+};
+
+static void
+bits_image_property_changed (pixman_image_t *image)
+{
+    _pixman_bits_image_setup_accessors (&image->bits);
+}
+
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    pixman_format_code_t format = image->common.extended_format_code;
+    uint32_t flags = image->common.flags;
+    const fetcher_info_t *info;
+
+    for (info = fetcher_info; info->format != PIXMAN_null; ++info)
+    {
+	if ((info->format == format || info->format == PIXMAN_any)	&&
+	    (info->flags & flags) == info->flags)
+	{
+	    if (iter->iter_flags & ITER_NARROW)
+	    {
+		iter->get_scanline = info->get_scanline_32;
+	    }
+	    else
+	    {
+		iter->data = info->get_scanline_32;
+		iter->get_scanline = info->get_scanline_float;
+	    }
+	    return;
+	}
+    }
+
+    /* Just in case we somehow didn't find a scanline function */
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+}
+
+static uint32_t *
+dest_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *	    buffer = iter->buffer;
+
+    image->bits.fetch_scanline_32 (&image->bits, x, y, width, buffer, mask);
+    if (image->common.alpha_map)
+    {
+	uint32_t *alpha;
+
+	if ((alpha = malloc (width * sizeof (uint32_t))))
+	{
+	    int i;
+
+	    x -= image->common.alpha_origin_x;
+	    y -= image->common.alpha_origin_y;
+
+	    image->common.alpha_map->fetch_scanline_32 (
+		image->common.alpha_map, x, y, width, alpha, mask);
+
+	    for (i = 0; i < width; ++i)
+	    {
+		buffer[i] &= ~0xff000000;
+		buffer[i] |= (alpha[i] & 0xff000000);
+	    }
+
+	    free (alpha);
+	}
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+dest_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    argb_t *	    buffer = (argb_t *)iter->buffer;
+
+    image->fetch_scanline_float (
+	image, x, y, width, (uint32_t *)buffer, mask);
+    if (image->common.alpha_map)
+    {
+	argb_t *alpha;
+
+	if ((alpha = malloc (width * sizeof (argb_t))))
+	{
+	    int i;
+
+	    x -= image->common.alpha_origin_x;
+	    y -= image->common.alpha_origin_y;
+
+	    image->common.alpha_map->fetch_scanline_float (
+		image->common.alpha_map, x, y, width, (uint32_t *)alpha, mask);
+
+	    for (i = 0; i < width; ++i)
+		buffer[i].a = alpha[i].a;
+
+	    free (alpha);
+	}
+    }
+
+    return iter->buffer;
+}
+
+static void
+dest_write_back_narrow (pixman_iter_t *iter)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    const uint32_t *buffer = iter->buffer;
+
+    image->store_scanline_32 (image, x, y, width, buffer);
+
+    if (image->common.alpha_map)
+    {
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	image->common.alpha_map->store_scanline_32 (
+	    image->common.alpha_map, x, y, width, buffer);
+    }
+
+    iter->y++;
+}
+
+static void
+dest_write_back_wide (pixman_iter_t *iter)
+{
+    bits_image_t *  image  = &iter->image->bits;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    const uint32_t *buffer = iter->buffer;
+
+    image->store_scanline_float (image, x, y, width, buffer);
+
+    if (image->common.alpha_map)
+    {
+	x -= image->common.alpha_origin_x;
+	y -= image->common.alpha_origin_y;
+
+	image->common.alpha_map->store_scanline_float (
+	    image->common.alpha_map, x, y, width, buffer);
+    }
+
+    iter->y++;
+}
+
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->iter_flags & ITER_NARROW)
+    {
+	if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+	    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+	{
+	    iter->get_scanline = _pixman_iter_get_scanline_noop;
+	}
+	else
+	{
+	    iter->get_scanline = dest_get_scanline_narrow;
+	}
+	
+	iter->write_back = dest_write_back_narrow;
+    }
+    else
+    {
+	iter->get_scanline = dest_get_scanline_wide;
+	iter->write_back = dest_write_back_wide;
+    }
+}
+
+static uint32_t *
+create_bits (pixman_format_code_t format,
+             int                  width,
+             int                  height,
+             int *		  rowstride_bytes,
+	     pixman_bool_t	  clear)
+{
+    int stride;
+    size_t buf_size;
+    int bpp;
+
+    /* what follows is a long-winded way, avoiding any possibility of integer
+     * overflows, of saying:
+     * stride = ((width * bpp + 0x1f) >> 5) * sizeof (uint32_t);
+     */
+
+    bpp = PIXMAN_FORMAT_BPP (format);
+    if (_pixman_multiply_overflows_int (width, bpp))
+	return NULL;
+
+    stride = width * bpp;
+    if (_pixman_addition_overflows_int (stride, 0x1f))
+	return NULL;
+
+    stride += 0x1f;
+    stride >>= 5;
+
+    stride *= sizeof (uint32_t);
+
+    if (_pixman_multiply_overflows_size (height, stride))
+	return NULL;
+
+    buf_size = (size_t)height * stride;
+
+    if (rowstride_bytes)
+	*rowstride_bytes = stride;
+
+    if (clear)
+	return calloc (buf_size, 1);
+    else
+	return malloc (buf_size);
+}
+
+pixman_bool_t
+_pixman_bits_image_init (pixman_image_t *     image,
+                         pixman_format_code_t format,
+                         int                  width,
+                         int                  height,
+                         uint32_t *           bits,
+                         int                  rowstride,
+			 pixman_bool_t	      clear)
+{
+    uint32_t *free_me = NULL;
+
+    if (!bits && width && height)
+    {
+	int rowstride_bytes;
+
+	free_me = bits = create_bits (format, width, height, &rowstride_bytes, clear);
+
+	if (!bits)
+	    return FALSE;
+
+	rowstride = rowstride_bytes / (int) sizeof (uint32_t);
+    }
+
+    _pixman_image_init (image);
+
+    image->type = BITS;
+    image->bits.format = format;
+    image->bits.width = width;
+    image->bits.height = height;
+    image->bits.bits = bits;
+    image->bits.free_me = free_me;
+    image->bits.read_func = NULL;
+    image->bits.write_func = NULL;
+    image->bits.rowstride = rowstride;
+    image->bits.indexed = NULL;
+
+    image->common.property_changed = bits_image_property_changed;
+
+    _pixman_image_reset_clip_region (image);
+
+    return TRUE;
+}
+
+static pixman_image_t *
+create_bits_image_internal (pixman_format_code_t format,
+			    int                  width,
+			    int                  height,
+			    uint32_t *           bits,
+			    int                  rowstride_bytes,
+			    pixman_bool_t	 clear)
+{
+    pixman_image_t *image;
+
+    /* must be a whole number of uint32_t's
+     */
+    return_val_if_fail (
+	bits == NULL || (rowstride_bytes % sizeof (uint32_t)) == 0, NULL);
+
+    return_val_if_fail (PIXMAN_FORMAT_BPP (format) >= PIXMAN_FORMAT_DEPTH (format), NULL);
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+	return NULL;
+
+    if (!_pixman_bits_image_init (image, format, width, height, bits,
+				  rowstride_bytes / (int) sizeof (uint32_t),
+				  clear))
+    {
+	free (image);
+	return NULL;
+    }
+
+    return image;
+}
+
+/* If bits is NULL, a buffer will be allocated and initialized to 0 */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_bits (pixman_format_code_t format,
+                          int                  width,
+                          int                  height,
+                          uint32_t *           bits,
+                          int                  rowstride_bytes)
+{
+    return create_bits_image_internal (
+	format, width, height, bits, rowstride_bytes, TRUE);
+}
+
+
+/* If bits is NULL, a buffer will be allocated and _not_ initialized */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_bits_no_clear (pixman_format_code_t format,
+				   int                  width,
+				   int                  height,
+				   uint32_t *           bits,
+				   int                  rowstride_bytes)
+{
+    return create_bits_image_internal (
+	format, width, height, bits, rowstride_bytes, FALSE);
+}
diff --git a/src/pixman/pixman/pixman-combine-float.c b/src/pixman/pixman/pixman-combine-float.c
new file mode 100644
index 0000000..5ea739f
--- /dev/null
+++ b/src/pixman/pixman/pixman-combine-float.c
@@ -0,0 +1,1016 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2010, 2012 Soren Sandmann Pedersen
+ * Copyright © 2010, 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann Pedersen (sandmann@cs.au.dk)
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+#include <float.h>
+
+#include "pixman-private.h"
+
+/* Workaround for http://gcc.gnu.org/PR54965 */
+/* GCC 4.6 has problems with force_inline, so just use normal inline instead */
+#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 6)
+#undef force_inline
+#define force_inline __inline__
+#endif
+
+typedef float (* combine_channel_t) (float sa, float s, float da, float d);
+
+static force_inline void
+combine_inner (pixman_bool_t component,
+	       float *dest, const float *src, const float *mask, int n_pixels,
+	       combine_channel_t combine_a, combine_channel_t combine_c)
+{
+    int i;
+
+    if (!mask)
+    {
+	for (i = 0; i < 4 * n_pixels; i += 4)
+	{
+	    float sa = src[i + 0];
+	    float sr = src[i + 1];
+	    float sg = src[i + 2];
+	    float sb = src[i + 3];
+	    
+	    float da = dest[i + 0];
+	    float dr = dest[i + 1];
+	    float dg = dest[i + 2];
+	    float db = dest[i + 3];					
+	    
+	    dest[i + 0] = combine_a (sa, sa, da, da);
+	    dest[i + 1] = combine_c (sa, sr, da, dr);
+	    dest[i + 2] = combine_c (sa, sg, da, dg);
+	    dest[i + 3] = combine_c (sa, sb, da, db);
+	}
+    }
+    else
+    {
+	for (i = 0; i < 4 * n_pixels; i += 4)
+	{
+	    float sa, sr, sg, sb;
+	    float ma, mr, mg, mb;
+	    float da, dr, dg, db;
+	    
+	    sa = src[i + 0];
+	    sr = src[i + 1];
+	    sg = src[i + 2];
+	    sb = src[i + 3];
+	    
+	    if (component)
+	    {
+		ma = mask[i + 0];
+		mr = mask[i + 1];
+		mg = mask[i + 2];
+		mb = mask[i + 3];
+
+		sr *= mr;
+		sg *= mg;
+		sb *= mb;
+
+		ma *= sa;
+		mr *= sa;
+		mg *= sa;
+		mb *= sa;
+		
+		sa = ma;
+	    }
+	    else
+	    {
+		ma = mask[i + 0];
+
+		sa *= ma;
+		sr *= ma;
+		sg *= ma;
+		sb *= ma;
+
+		ma = mr = mg = mb = sa;
+	    }
+	    
+	    da = dest[i + 0];
+	    dr = dest[i + 1];
+	    dg = dest[i + 2];
+	    db = dest[i + 3];
+	    
+	    dest[i + 0] = combine_a (ma, sa, da, da);
+	    dest[i + 1] = combine_c (mr, sr, da, dr);
+	    dest[i + 2] = combine_c (mg, sg, da, dg);
+	    dest[i + 3] = combine_c (mb, sb, da, db);
+	}
+    }
+}
+
+#define MAKE_COMBINER(name, component, combine_a, combine_c)		\
+    static void								\
+    combine_ ## name ## _float (pixman_implementation_t *imp,		\
+				pixman_op_t              op,		\
+				float                   *dest,		\
+				const float             *src,		\
+				const float             *mask,		\
+				int		         n_pixels)	\
+    {									\
+	combine_inner (component, dest, src, mask, n_pixels,		\
+		       combine_a, combine_c);				\
+    }
+
+#define MAKE_COMBINERS(name, combine_a, combine_c)			\
+    MAKE_COMBINER(name ## _ca, TRUE, combine_a, combine_c)		\
+    MAKE_COMBINER(name ## _u, FALSE, combine_a, combine_c)
+
+
+/*
+ * Porter/Duff operators
+ */
+typedef enum
+{
+    ZERO,
+    ONE,
+    SRC_ALPHA,
+    DEST_ALPHA,
+    INV_SA,
+    INV_DA,
+    SA_OVER_DA,
+    DA_OVER_SA,
+    INV_SA_OVER_DA,
+    INV_DA_OVER_SA,
+    ONE_MINUS_SA_OVER_DA,
+    ONE_MINUS_DA_OVER_SA,
+    ONE_MINUS_INV_DA_OVER_SA,
+    ONE_MINUS_INV_SA_OVER_DA
+} combine_factor_t;
+
+#define CLAMP(f)					\
+    (((f) < 0)? 0 : (((f) > 1.0) ? 1.0 : (f)))
+
+static force_inline float
+get_factor (combine_factor_t factor, float sa, float da)
+{
+    float f = -1;
+
+    switch (factor)
+    {
+    case ZERO:
+	f = 0.0f;
+	break;
+
+    case ONE:
+	f = 1.0f;
+	break;
+
+    case SRC_ALPHA:
+	f = sa;
+	break;
+
+    case DEST_ALPHA:
+	f = da;
+	break;
+
+    case INV_SA:
+	f = 1 - sa;
+	break;
+
+    case INV_DA:
+	f = 1 - da;
+	break;
+
+    case SA_OVER_DA:
+	if (FLOAT_IS_ZERO (da))
+	    f = 1.0f;
+	else
+	    f = CLAMP (sa / da);
+	break;
+
+    case DA_OVER_SA:
+	if (FLOAT_IS_ZERO (sa))
+	    f = 1.0f;
+	else
+	    f = CLAMP (da / sa);
+	break;
+
+    case INV_SA_OVER_DA:
+	if (FLOAT_IS_ZERO (da))
+	    f = 1.0f;
+	else
+	    f = CLAMP ((1.0f - sa) / da);
+	break;
+
+    case INV_DA_OVER_SA:
+	if (FLOAT_IS_ZERO (sa))
+	    f = 1.0f;
+	else
+	    f = CLAMP ((1.0f - da) / sa);
+	break;
+
+    case ONE_MINUS_SA_OVER_DA:
+	if (FLOAT_IS_ZERO (da))
+	    f = 0.0f;
+	else
+	    f = CLAMP (1.0f - sa / da);
+	break;
+
+    case ONE_MINUS_DA_OVER_SA:
+	if (FLOAT_IS_ZERO (sa))
+	    f = 0.0f;
+	else
+	    f = CLAMP (1.0f - da / sa);
+	break;
+
+    case ONE_MINUS_INV_DA_OVER_SA:
+	if (FLOAT_IS_ZERO (sa))
+	    f = 0.0f;
+	else
+	    f = CLAMP (1.0f - (1.0f - da) / sa);
+	break;
+
+    case ONE_MINUS_INV_SA_OVER_DA:
+	if (FLOAT_IS_ZERO (da))
+	    f = 0.0f;
+	else
+	    f = CLAMP (1.0f - (1.0f - sa) / da);
+	break;
+    }
+
+    return f;
+}
+
+#define MAKE_PD_COMBINERS(name, a, b)					\
+    static float force_inline						\
+    pd_combine_ ## name (float sa, float s, float da, float d)		\
+    {									\
+	const float fa = get_factor (a, sa, da);			\
+	const float fb = get_factor (b, sa, da);			\
+									\
+	return MIN (1.0f, s * fa + d * fb);				\
+    }									\
+    									\
+    MAKE_COMBINERS(name, pd_combine_ ## name, pd_combine_ ## name)
+
+MAKE_PD_COMBINERS (clear,			ZERO,				ZERO)
+MAKE_PD_COMBINERS (src,				ONE,				ZERO)
+MAKE_PD_COMBINERS (dst,				ZERO,				ONE)
+MAKE_PD_COMBINERS (over,			ONE,				INV_SA)
+MAKE_PD_COMBINERS (over_reverse,		INV_DA,				ONE)
+MAKE_PD_COMBINERS (in,				DEST_ALPHA,			ZERO)
+MAKE_PD_COMBINERS (in_reverse,			ZERO,				SRC_ALPHA)
+MAKE_PD_COMBINERS (out,				INV_DA,				ZERO)
+MAKE_PD_COMBINERS (out_reverse,			ZERO,				INV_SA)
+MAKE_PD_COMBINERS (atop,			DEST_ALPHA,			INV_SA)
+MAKE_PD_COMBINERS (atop_reverse,		INV_DA,				SRC_ALPHA)
+MAKE_PD_COMBINERS (xor,				INV_DA,				INV_SA)
+MAKE_PD_COMBINERS (add,				ONE,				ONE)
+
+MAKE_PD_COMBINERS (saturate,			INV_DA_OVER_SA,			ONE)
+
+MAKE_PD_COMBINERS (disjoint_clear,		ZERO,				ZERO)
+MAKE_PD_COMBINERS (disjoint_src,		ONE,				ZERO)
+MAKE_PD_COMBINERS (disjoint_dst,		ZERO,				ONE)
+MAKE_PD_COMBINERS (disjoint_over,		ONE,				INV_SA_OVER_DA)
+MAKE_PD_COMBINERS (disjoint_over_reverse,	INV_DA_OVER_SA,			ONE)
+MAKE_PD_COMBINERS (disjoint_in,			ONE_MINUS_INV_DA_OVER_SA,	ZERO)
+MAKE_PD_COMBINERS (disjoint_in_reverse,		ZERO,				ONE_MINUS_INV_SA_OVER_DA)
+MAKE_PD_COMBINERS (disjoint_out,		INV_DA_OVER_SA,			ZERO)
+MAKE_PD_COMBINERS (disjoint_out_reverse,	ZERO,				INV_SA_OVER_DA)
+MAKE_PD_COMBINERS (disjoint_atop,		ONE_MINUS_INV_DA_OVER_SA,	INV_SA_OVER_DA)
+MAKE_PD_COMBINERS (disjoint_atop_reverse,	INV_DA_OVER_SA,			ONE_MINUS_INV_SA_OVER_DA)
+MAKE_PD_COMBINERS (disjoint_xor,		INV_DA_OVER_SA,			INV_SA_OVER_DA)
+
+MAKE_PD_COMBINERS (conjoint_clear,		ZERO,				ZERO)
+MAKE_PD_COMBINERS (conjoint_src,		ONE,				ZERO)
+MAKE_PD_COMBINERS (conjoint_dst,		ZERO,				ONE)
+MAKE_PD_COMBINERS (conjoint_over,		ONE,				ONE_MINUS_SA_OVER_DA)
+MAKE_PD_COMBINERS (conjoint_over_reverse,	ONE_MINUS_DA_OVER_SA,		ONE)
+MAKE_PD_COMBINERS (conjoint_in,			DA_OVER_SA,			ZERO)
+MAKE_PD_COMBINERS (conjoint_in_reverse,		ZERO,				SA_OVER_DA)
+MAKE_PD_COMBINERS (conjoint_out,		ONE_MINUS_DA_OVER_SA,		ZERO)
+MAKE_PD_COMBINERS (conjoint_out_reverse,	ZERO,				ONE_MINUS_SA_OVER_DA)
+MAKE_PD_COMBINERS (conjoint_atop,		DA_OVER_SA,			ONE_MINUS_SA_OVER_DA)
+MAKE_PD_COMBINERS (conjoint_atop_reverse,	ONE_MINUS_DA_OVER_SA,		SA_OVER_DA)
+MAKE_PD_COMBINERS (conjoint_xor,		ONE_MINUS_DA_OVER_SA,		ONE_MINUS_SA_OVER_DA)
+
+/*
+ * PDF blend modes:
+ *
+ * The following blend modes have been taken from the PDF ISO 32000
+ * specification, which at this point in time is available from
+ * http://www.adobe.com/devnet/acrobat/pdfs/PDF32000_2008.pdf
+ * The relevant chapters are 11.3.5 and 11.3.6.
+ * The formula for computing the final pixel color given in 11.3.6 is:
+ * αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
+ * with B() being the blend function.
+ * Note that OVER is a special case of this operation, using B(Cb, Cs) = Cs
+ *
+ * These blend modes should match the SVG filter draft specification, as
+ * it has been designed to mirror ISO 32000. Note that at the current point
+ * no released draft exists that shows this, as the formulas have not been
+ * updated yet after the release of ISO 32000.
+ *
+ * The default implementation here uses the PDF_SEPARABLE_BLEND_MODE and
+ * PDF_NON_SEPARABLE_BLEND_MODE macros, which take the blend function as an
+ * argument. Note that this implementation operates on premultiplied colors,
+ * while the PDF specification does not. Therefore the code uses the formula
+ * ar.Cra = (1 – as) . Dca + (1 – ad) . Sca + B(Dca, ad, Sca, as)
+ */
+
+#define MAKE_SEPARABLE_PDF_COMBINERS(name)				\
+    static force_inline float						\
+    combine_ ## name ## _a (float sa, float s, float da, float d)	\
+    {									\
+	return da + sa - da * sa;					\
+    }									\
+    									\
+    static force_inline float						\
+    combine_ ## name ## _c (float sa, float s, float da, float d)	\
+    {									\
+	float f = (1 - sa) * d + (1 - da) * s;				\
+									\
+	return f + blend_ ## name (sa, s, da, d);			\
+    }									\
+    									\
+    MAKE_COMBINERS (name, combine_ ## name ## _a, combine_ ## name ## _c)
+
+static force_inline float
+blend_multiply (float sa, float s, float da, float d)
+{
+    return d * s;
+}
+
+static force_inline float
+blend_screen (float sa, float s, float da, float d)
+{
+    return d * sa + s * da - s * d;
+}
+
+static force_inline float
+blend_overlay (float sa, float s, float da, float d)
+{
+    if (2 * d < da)
+	return 2 * s * d;
+    else
+	return sa * da - 2 * (da - d) * (sa - s);
+}
+
+static force_inline float
+blend_darken (float sa, float s, float da, float d)
+{
+    s = s * da;
+    d = d * sa;
+
+    if (s > d)
+	return d;
+    else
+	return s;
+}
+
+static force_inline float
+blend_lighten (float sa, float s, float da, float d)
+{
+    s = s * da;
+    d = d * sa;
+
+    if (s > d)
+	return s;
+    else
+	return d;
+}
+
+static force_inline float
+blend_color_dodge (float sa, float s, float da, float d)
+{
+    if (FLOAT_IS_ZERO (d))
+	return 0.0f;
+    else if (d * sa >= sa * da - s * da)
+	return sa * da;
+    else if (FLOAT_IS_ZERO (sa - s))
+	return sa * da;
+    else
+	return sa * sa * d / (sa - s);
+}
+
+static force_inline float
+blend_color_burn (float sa, float s, float da, float d)
+{
+    if (d >= da)
+	return sa * da;
+    else if (sa * (da - d) >= s * da)
+	return 0.0f;
+    else if (FLOAT_IS_ZERO (s))
+	return 0.0f;
+    else
+	return sa * (da - sa * (da - d) / s);
+}
+
+static force_inline float
+blend_hard_light (float sa, float s, float da, float d)
+{
+    if (2 * s < sa)
+	return 2 * s * d;
+    else
+	return sa * da - 2 * (da - d) * (sa - s);
+}
+
+static force_inline float
+blend_soft_light (float sa, float s, float da, float d)
+{
+    if (2 * s < sa)
+    {
+	if (FLOAT_IS_ZERO (da))
+	    return d * sa;
+	else
+	    return d * sa - d * (da - d) * (sa - 2 * s) / da;
+    }
+    else
+    {
+	if (FLOAT_IS_ZERO (da))
+	{
+	    return 0.0f;
+	}
+	else
+	{
+	    if (4 * d <= da)
+		return d * sa + (2 * s - sa) * d * ((16 * d / da - 12) * d / da + 3);
+	    else
+		return d * sa + (sqrtf (d * da) - d) * (2 * s - sa);
+	}
+    }
+}
+
+static force_inline float
+blend_difference (float sa, float s, float da, float d)
+{
+    float dsa = d * sa;
+    float sda = s * da;
+
+    if (sda < dsa)
+	return dsa - sda;
+    else
+	return sda - dsa;
+}
+
+static force_inline float
+blend_exclusion (float sa, float s, float da, float d)
+{
+    return s * da + d * sa - 2 * d * s;
+}
+
+MAKE_SEPARABLE_PDF_COMBINERS (multiply)
+MAKE_SEPARABLE_PDF_COMBINERS (screen)
+MAKE_SEPARABLE_PDF_COMBINERS (overlay)
+MAKE_SEPARABLE_PDF_COMBINERS (darken)
+MAKE_SEPARABLE_PDF_COMBINERS (lighten)
+MAKE_SEPARABLE_PDF_COMBINERS (color_dodge)
+MAKE_SEPARABLE_PDF_COMBINERS (color_burn)
+MAKE_SEPARABLE_PDF_COMBINERS (hard_light)
+MAKE_SEPARABLE_PDF_COMBINERS (soft_light)
+MAKE_SEPARABLE_PDF_COMBINERS (difference)
+MAKE_SEPARABLE_PDF_COMBINERS (exclusion)
+
+/*
+ * PDF nonseperable blend modes.
+ *
+ * These are implemented using the following functions to operate in Hsl
+ * space, with Cmax, Cmid, Cmin referring to the max, mid and min value
+ * of the red, green and blue components.
+ *
+ * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
+ *
+ * clip_color (C):
+ *   l = LUM (C)
+ *   min = Cmin
+ *   max = Cmax
+ *   if n < 0.0
+ *     C = l + (((C – l) × l) ⁄     (l – min))
+ *   if x > 1.0
+ *     C = l + (((C – l) × (1 – l)) (max – l))
+ *   return C
+ *
+ * set_lum (C, l):
+ *   d = l – LUM (C)
+ *   C += d
+ *   return clip_color (C)
+ *
+ * SAT (C) = CH_MAX (C) - CH_MIN (C)
+ *
+ * set_sat (C, s):
+ *  if Cmax > Cmin
+ *    Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
+ *    Cmax = s
+ *  else
+ *    Cmid = Cmax = 0.0
+ *  Cmin = 0.0
+ *  return C
+ */
+
+/* For premultiplied colors, we need to know what happens when C is
+ * multiplied by a real number. LUM and SAT are linear:
+ *
+ *    LUM (r × C) = r × LUM (C)		SAT (r × C) = r × SAT (C)
+ *
+ * If we extend clip_color with an extra argument a and change
+ *
+ *        if x >= 1.0
+ *
+ * into
+ *
+ *        if x >= a
+ *
+ * then clip_color is also linear:
+ *
+ *     r * clip_color (C, a) = clip_color (r_c, ra);
+ *
+ * for positive r.
+ *
+ * Similarly, we can extend set_lum with an extra argument that is just passed
+ * on to clip_color:
+ *
+ *     r × set_lum ( C, l, a)
+ *
+ *   = r × clip_color ( C + l - LUM (C), a)
+ *
+ *   = clip_color ( r * C + r × l - LUM (r × C), r * a)
+ *
+ *   = set_lum ( r * C, r * l, r * a)
+ *
+ * Finally, set_sat:
+ *
+ *     r * set_sat (C, s) = set_sat (x * C, r * s)
+ *
+ * The above holds for all non-zero x because they x'es in the fraction for
+ * C_mid cancel out. Specifically, it holds for x = r:
+ *
+ *     r * set_sat (C, s) = set_sat (r_c, rs)
+ *
+ *
+ *
+ *
+ * So, for the non-separable PDF blend modes, we have (using s, d for
+ * non-premultiplied colors, and S, D for premultiplied:
+ *
+ *   Color:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (S/a_s, LUM (D/a_d), 1)
+ *   = set_lum (S * a_d, a_s * LUM (D), a_s * a_d)
+ *
+ *
+ *   Luminosity:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (D/a_d, LUM(S/a_s), 1)
+ *   = set_lum (a_s * D, a_d * LUM(S), a_s * a_d)
+ *
+ *
+ *   Saturation:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (D/a_d, SAT (S/a_s)), LUM (D/a_d), 1)
+ *   = set_lum (a_s * a_d * set_sat (D/a_d, SAT (S/a_s)),
+ *                                        a_s * LUM (D), a_s * a_d)
+ *   = set_lum (set_sat (a_s * D, a_d * SAT (S), a_s * LUM (D), a_s * a_d))
+ *
+ *   Hue:
+ *
+ *     a_s * a_d * B(s, d)
+ *   = a_s * a_d * set_lum (set_sat (S/a_s, SAT (D/a_d)), LUM (D/a_d), 1)
+ *   = set_lum (set_sat (a_d * S, a_s * SAT (D)), a_s * LUM (D), a_s * a_d)
+ *
+ */
+
+typedef struct
+{
+    float	r;
+    float	g;
+    float	b;
+} rgb_t;
+
+static force_inline float
+minf (float a, float b)
+{
+    return a < b? a : b;
+}
+
+static force_inline float
+maxf (float a, float b)
+{
+    return a > b? a : b;
+}
+
+static force_inline float
+channel_min (const rgb_t *c)
+{
+    return minf (minf (c->r, c->g), c->b);
+}
+
+static force_inline float
+channel_max (const rgb_t *c)
+{
+    return maxf (maxf (c->r, c->g), c->b);
+}
+
+static force_inline float
+get_lum (const rgb_t *c)
+{
+    return c->r * 0.3f + c->g * 0.59f + c->b * 0.11f;
+}
+
+static force_inline float
+get_sat (const rgb_t *c)
+{
+    return channel_max (c) - channel_min (c);
+}
+
+static void
+clip_color (rgb_t *color, float a)
+{
+    float l = get_lum (color);
+    float n = channel_min (color);
+    float x = channel_max (color);
+    float t;
+
+    if (n < 0.0f)
+    {
+	t = l - n;
+	if (FLOAT_IS_ZERO (t))
+	{
+	    color->r = 0.0f;
+	    color->g = 0.0f;
+	    color->b = 0.0f;
+	}
+	else
+	{
+	    color->r = l + (((color->r - l) * l) / t);
+	    color->g = l + (((color->g - l) * l) / t);
+	    color->b = l + (((color->b - l) * l) / t);
+	}
+    }
+    if (x > a)
+    {
+	t = x - l;
+	if (FLOAT_IS_ZERO (t))
+	{
+	    color->r = a;
+	    color->g = a;
+	    color->b = a;
+	}
+	else
+	{
+	    color->r = l + (((color->r - l) * (a - l) / t));
+	    color->g = l + (((color->g - l) * (a - l) / t));
+	    color->b = l + (((color->b - l) * (a - l) / t));
+	}
+    }
+}
+
+static void
+set_lum (rgb_t *color, float sa, float l)
+{
+    float d = l - get_lum (color);
+
+    color->r = color->r + d;
+    color->g = color->g + d;
+    color->b = color->b + d;
+
+    clip_color (color, sa);
+}
+
+static void
+set_sat (rgb_t *src, float sat)
+{
+    float *max, *mid, *min;
+    float t;
+
+    if (src->r > src->g)
+    {
+	if (src->r > src->b)
+	{
+	    max = &(src->r);
+
+	    if (src->g > src->b)
+	    {
+		mid = &(src->g);
+		min = &(src->b);
+	    }
+	    else
+	    {
+		mid = &(src->b);
+		min = &(src->g);
+	    }
+	}
+	else
+	{
+	    max = &(src->b);
+	    mid = &(src->r);
+	    min = &(src->g);
+	}
+    }
+    else
+    {
+	if (src->r > src->b)
+	{
+	    max = &(src->g);
+	    mid = &(src->r);
+	    min = &(src->b);
+	}
+	else
+	{
+	    min = &(src->r);
+
+	    if (src->g > src->b)
+	    {
+		max = &(src->g);
+		mid = &(src->b);
+	    }
+	    else
+	    {
+		max = &(src->b);
+		mid = &(src->g);
+	    }
+	}
+    }
+
+    t = *max - *min;
+
+    if (FLOAT_IS_ZERO (t))
+    {
+	*mid = *max = 0.0f;
+    }
+    else
+    {
+	*mid = ((*mid - *min) * sat) / t;
+	*max = sat;
+    }
+
+    *min = 0.0f;
+}
+
+/*
+ * Hue:
+ * B(Cb, Cs) = set_lum (set_sat (Cs, SAT (Cb)), LUM (Cb))
+ */
+static force_inline void
+blend_hsl_hue (rgb_t *res,
+	       const rgb_t *dest, float da,
+	       const rgb_t *src, float sa)
+{
+    res->r = src->r * da;
+    res->g = src->g * da;
+    res->b = src->b * da;
+
+    set_sat (res, get_sat (dest) * sa);
+    set_lum (res, sa * da, get_lum (dest) * sa);
+}
+
+/*
+ * Saturation:
+ * B(Cb, Cs) = set_lum (set_sat (Cb, SAT (Cs)), LUM (Cb))
+ */
+static force_inline void
+blend_hsl_saturation (rgb_t *res,
+		      const rgb_t *dest, float da,
+		      const rgb_t *src, float sa)
+{
+    res->r = dest->r * sa;
+    res->g = dest->g * sa;
+    res->b = dest->b * sa;
+
+    set_sat (res, get_sat (src) * da);
+    set_lum (res, sa * da, get_lum (dest) * sa);
+}
+
+/*
+ * Color:
+ * B(Cb, Cs) = set_lum (Cs, LUM (Cb))
+ */
+static force_inline void
+blend_hsl_color (rgb_t *res,
+		 const rgb_t *dest, float da,
+		 const rgb_t *src, float sa)
+{
+    res->r = src->r * da;
+    res->g = src->g * da;
+    res->b = src->b * da;
+
+    set_lum (res, sa * da, get_lum (dest) * sa);
+}
+
+/*
+ * Luminosity:
+ * B(Cb, Cs) = set_lum (Cb, LUM (Cs))
+ */
+static force_inline void
+blend_hsl_luminosity (rgb_t *res,
+		      const rgb_t *dest, float da,
+		      const rgb_t *src, float sa)
+{
+    res->r = dest->r * sa;
+    res->g = dest->g * sa;
+    res->b = dest->b * sa;
+
+    set_lum (res, sa * da, get_lum (src) * da);
+}
+
+#define MAKE_NON_SEPARABLE_PDF_COMBINERS(name)				\
+    static void								\
+    combine_ ## name ## _u_float (pixman_implementation_t *imp,		\
+				  pixman_op_t              op,		\
+				  float                   *dest,	\
+				  const float             *src,		\
+				  const float             *mask,	\
+				  int		           n_pixels)	\
+    {									\
+    	int i;								\
+									\
+	for (i = 0; i < 4 * n_pixels; i += 4)				\
+	{								\
+	    float sa, da;						\
+	    rgb_t sc, dc, rc;						\
+									\
+	    sa = src[i + 0];						\
+	    sc.r = src[i + 1];						\
+	    sc.g = src[i + 2];						\
+	    sc.b = src[i + 3];						\
+									\
+	    da = dest[i + 0];						\
+	    dc.r = dest[i + 1];						\
+	    dc.g = dest[i + 2];						\
+	    dc.b = dest[i + 3];						\
+									\
+	    if (mask)							\
+	    {								\
+		float ma = mask[i + 0];					\
+									\
+		/* Component alpha is not supported for HSL modes */	\
+		sa *= ma;						\
+		sc.r *= ma;						\
+		sc.g *= ma;						\
+		sc.g *= ma;						\
+	    }								\
+									\
+	    blend_ ## name (&rc, &dc, da, &sc, sa);			\
+									\
+	    dest[i + 0] = sa + da - sa * da;				\
+	    dest[i + 1] = (1 - sa) * dc.r + (1 - da) * sc.r + rc.r;	\
+	    dest[i + 2] = (1 - sa) * dc.g + (1 - da) * sc.g + rc.g;	\
+	    dest[i + 3] = (1 - sa) * dc.b + (1 - da) * sc.b + rc.b;	\
+	}								\
+    }
+
+MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_hue)
+MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_saturation)
+MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_color)
+MAKE_NON_SEPARABLE_PDF_COMBINERS(hsl_luminosity)
+
+void
+_pixman_setup_combiner_functions_float (pixman_implementation_t *imp)
+{
+    /* Unified alpha */
+    imp->combine_float[PIXMAN_OP_CLEAR] = combine_clear_u_float;
+    imp->combine_float[PIXMAN_OP_SRC] = combine_src_u_float;
+    imp->combine_float[PIXMAN_OP_DST] = combine_dst_u_float;
+    imp->combine_float[PIXMAN_OP_OVER] = combine_over_u_float;
+    imp->combine_float[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_IN] = combine_in_u_float;
+    imp->combine_float[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_OUT] = combine_out_u_float;
+    imp->combine_float[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_ATOP] = combine_atop_u_float;
+    imp->combine_float[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_XOR] = combine_xor_u_float;
+    imp->combine_float[PIXMAN_OP_ADD] = combine_add_u_float;
+    imp->combine_float[PIXMAN_OP_SATURATE] = combine_saturate_u_float;
+
+    /* Disjoint, unified */
+    imp->combine_float[PIXMAN_OP_DISJOINT_CLEAR] = combine_disjoint_clear_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_SRC] = combine_disjoint_src_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_DST] = combine_disjoint_dst_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_disjoint_over_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u_float;
+
+    /* Conjoint, unified */
+    imp->combine_float[PIXMAN_OP_CONJOINT_CLEAR] = combine_conjoint_clear_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_SRC] = combine_conjoint_src_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_DST] = combine_conjoint_dst_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u_float;
+    imp->combine_float[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u_float;
+
+    /* PDF operators, unified */
+    imp->combine_float[PIXMAN_OP_MULTIPLY] = combine_multiply_u_float;
+    imp->combine_float[PIXMAN_OP_SCREEN] = combine_screen_u_float;
+    imp->combine_float[PIXMAN_OP_OVERLAY] = combine_overlay_u_float;
+    imp->combine_float[PIXMAN_OP_DARKEN] = combine_darken_u_float;
+    imp->combine_float[PIXMAN_OP_LIGHTEN] = combine_lighten_u_float;
+    imp->combine_float[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u_float;
+    imp->combine_float[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u_float;
+    imp->combine_float[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u_float;
+    imp->combine_float[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u_float;
+    imp->combine_float[PIXMAN_OP_DIFFERENCE] = combine_difference_u_float;
+    imp->combine_float[PIXMAN_OP_EXCLUSION] = combine_exclusion_u_float;
+
+    imp->combine_float[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u_float;
+    imp->combine_float[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u_float;
+    imp->combine_float[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u_float;
+    imp->combine_float[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u_float;
+
+    /* Component alpha combiners */
+    imp->combine_float_ca[PIXMAN_OP_CLEAR] = combine_clear_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_SRC] = combine_src_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DST] = combine_dst_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OVER] = combine_over_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_IN] = combine_in_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OUT] = combine_out_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_ATOP] = combine_atop_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_XOR] = combine_xor_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_ADD] = combine_add_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca_float;
+
+    /* Disjoint CA */
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_disjoint_clear_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_SRC] = combine_disjoint_src_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_DST] = combine_disjoint_dst_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_disjoint_over_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca_float;
+
+    /* Conjoint CA */
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_conjoint_clear_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_SRC] = combine_conjoint_src_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_DST] = combine_conjoint_dst_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca_float;
+
+    /* PDF operators CA */
+    imp->combine_float_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_SCREEN] = combine_screen_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DARKEN] = combine_darken_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca_float;
+    imp->combine_float_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca_float;
+
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_float_ca[PIXMAN_OP_HSL_HUE] = combine_dst_u_float;
+    imp->combine_float_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst_u_float;
+    imp->combine_float_ca[PIXMAN_OP_HSL_COLOR] = combine_dst_u_float;
+    imp->combine_float_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst_u_float;
+}
diff --git a/src/pixman/pixman/pixman-combine32.c b/src/pixman/pixman/pixman-combine32.c
new file mode 100644
index 0000000..450114a
--- /dev/null
+++ b/src/pixman/pixman/pixman-combine32.c
@@ -0,0 +1,2581 @@
+/*
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+
+/* component alpha helper functions */
+
+static void
+combine_mask_ca (uint32_t *src, uint32_t *mask)
+{
+    uint32_t a = *mask;
+
+    uint32_t x;
+    uint16_t xa;
+
+    if (!a)
+    {
+	*(src) = 0;
+	return;
+    }
+
+    x = *(src);
+    if (a == ~0)
+    {
+	x = x >> A_SHIFT;
+	x |= x << G_SHIFT;
+	x |= x << R_SHIFT;
+	*(mask) = x;
+	return;
+    }
+
+    xa = x >> A_SHIFT;
+    UN8x4_MUL_UN8x4 (x, a);
+    *(src) = x;
+    
+    UN8x4_MUL_UN8 (a, xa);
+    *(mask) = a;
+}
+
+static void
+combine_mask_value_ca (uint32_t *src, const uint32_t *mask)
+{
+    uint32_t a = *mask;
+    uint32_t x;
+
+    if (!a)
+    {
+	*(src) = 0;
+	return;
+    }
+
+    if (a == ~0)
+	return;
+
+    x = *(src);
+    UN8x4_MUL_UN8x4 (x, a);
+    *(src) = x;
+}
+
+static void
+combine_mask_alpha_ca (const uint32_t *src, uint32_t *mask)
+{
+    uint32_t a = *(mask);
+    uint32_t x;
+
+    if (!a)
+	return;
+
+    x = *(src) >> A_SHIFT;
+    if (x == MASK)
+	return;
+
+    if (a == ~0)
+    {
+	x |= x << G_SHIFT;
+	x |= x << R_SHIFT;
+	*(mask) = x;
+	return;
+    }
+
+    UN8x4_MUL_UN8 (a, x);
+    *(mask) = a;
+}
+
+/*
+ * There are two ways of handling alpha -- either as a single unified value or
+ * a separate value for each component, hence each macro must have two
+ * versions.  The unified alpha version has a 'u' at the end of the name,
+ * the component version has a 'ca'.  Similarly, functions which deal with
+ * this difference will have two versions using the same convention.
+ */
+
+static force_inline uint32_t
+combine_mask (const uint32_t *src, const uint32_t *mask, int i)
+{
+    uint32_t s, m;
+
+    if (mask)
+    {
+	m = *(mask + i) >> A_SHIFT;
+
+	if (!m)
+	    return 0;
+    }
+
+    s = *(src + i);
+
+    if (mask)
+	UN8x4_MUL_UN8 (s, m);
+
+    return s;
+}
+
+static void
+combine_clear (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
+               int                      width)
+{
+    memset (dest, 0, width * sizeof (uint32_t));
+}
+
+static void
+combine_dst (pixman_implementation_t *imp,
+	     pixman_op_t	      op,
+	     uint32_t *		      dest,
+	     const uint32_t *	      src,
+	     const uint32_t *         mask,
+	     int		      width)
+{
+    return;
+}
+
+static void
+combine_src_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
+               int                      width)
+{
+    int i;
+
+    if (!mask)
+    {
+	memcpy (dest, src, width * sizeof (uint32_t));
+    }
+    else
+    {
+	for (i = 0; i < width; ++i)
+	{
+	    uint32_t s = combine_mask (src, mask, i);
+
+	    *(dest + i) = s;
+	}
+    }
+}
+
+static void
+combine_over_u (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                uint32_t *               dest,
+                const uint32_t *         src,
+                const uint32_t *         mask,
+                int                      width)
+{
+    int i;
+
+    if (!mask)
+    {
+	for (i = 0; i < width; ++i)
+	{
+	    uint32_t s = *(src + i);
+	    uint32_t a = ALPHA_8 (s);
+	    if (a == 0xFF)
+	    {
+		*(dest + i) = s;
+	    }
+	    else if (s)
+	    {
+		uint32_t d = *(dest + i);
+		uint32_t ia = a ^ 0xFF;
+		UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+		*(dest + i) = d;
+	    }
+	}
+    }
+    else
+    {
+	for (i = 0; i < width; ++i)
+	{
+	    uint32_t m = ALPHA_8 (*(mask + i));
+	    if (m == 0xFF)
+	    {
+		uint32_t s = *(src + i);
+		uint32_t a = ALPHA_8 (s);
+		if (a == 0xFF)
+		{
+		    *(dest + i) = s;
+		}
+		else if (s)
+		{
+		    uint32_t d = *(dest + i);
+		    uint32_t ia = a ^ 0xFF;
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+		    *(dest + i) = d;
+		}
+	    }
+	    else if (m)
+	    {
+		uint32_t s = *(src + i);
+		if (s)
+		{
+		    uint32_t d = *(dest + i);
+		    UN8x4_MUL_UN8 (s, m);
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, ALPHA_8 (~s), s);
+		    *(dest + i) = d;
+		}
+	    }
+	}
+    }
+}
+
+static void
+combine_over_reverse_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
+                        int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t ia = ALPHA_8 (~*(dest + i));
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_in_u (pixman_implementation_t *imp,
+              pixman_op_t              op,
+              uint32_t *               dest,
+              const uint32_t *         src,
+              const uint32_t *         mask,
+              int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t a = ALPHA_8 (*(dest + i));
+	UN8x4_MUL_UN8 (s, a);
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_in_reverse_u (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               dest,
+                      const uint32_t *         src,
+                      const uint32_t *         mask,
+                      int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t a = ALPHA_8 (s);
+	UN8x4_MUL_UN8 (d, a);
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_out_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t a = ALPHA_8 (~*(dest + i));
+	UN8x4_MUL_UN8 (s, a);
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_out_reverse_u (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       uint32_t *               dest,
+                       const uint32_t *         src,
+                       const uint32_t *         mask,
+                       int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t a = ALPHA_8 (~s);
+	UN8x4_MUL_UN8 (d, a);
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_atop_u (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                uint32_t *               dest,
+                const uint32_t *         src,
+                const uint32_t *         mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_atop_reverse_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
+                        int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t src_a = ALPHA_8 (s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_xor_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t src_ia = ALPHA_8 (~s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_add_u (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               uint32_t *               dest,
+               const uint32_t *         src,
+               const uint32_t *         mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	UN8x4_ADD_UN8x4 (d, s);
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_saturate_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint16_t sa, da;
+
+	sa = s >> A_SHIFT;
+	da = ~d >> A_SHIFT;
+	if (sa > da)
+	{
+	    sa = DIV_UN8 (da, sa);
+	    UN8x4_MUL_UN8 (s, sa);
+	}
+	;
+	UN8x4_ADD_UN8x4 (d, s);
+	*(dest + i) = d;
+    }
+}
+
+
+/*
+ * PDF blend modes:
+ *
+ * The following blend modes have been taken from the PDF ISO 32000
+ * specification, which at this point in time is available from
+ *
+ *     http://www.adobe.com/devnet/pdf/pdf_reference.html
+ *
+ * The specific documents of interest are the PDF spec itself:
+ *
+ *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf
+ *
+ * chapters 11.3.5 and 11.3.6 and a later supplement for Adobe Acrobat
+ * 9.1 and Reader 9.1:
+ *
+ *     http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/adobe_supplement_iso32000_1.pdf
+ *
+ * that clarifies the specifications for blend modes ColorDodge and
+ * ColorBurn.
+ *
+ * The formula for computing the final pixel color given in 11.3.6 is:
+ *
+ *     αr × Cr = (1 – αs) × αb × Cb + (1 – αb) × αs × Cs + αb × αs × B(Cb, Cs)
+ *
+ * with B() is the blend function. When B(Cb, Cs) = Cs, this formula
+ * reduces to the regular OVER operator.
+ *
+ * Cs and Cb are not premultiplied, so in our implementation we instead
+ * use:
+ *
+ *     cr = (1 – αs) × cb  +  (1 – αb) × cs  +  αb × αs × B (cb/αb, cs/αs)
+ *
+ * where cr, cs, and cb are premultiplied colors, and where the
+ *
+ *     αb × αs × B(cb/αb, cs/αs)
+ *
+ * part is first arithmetically simplified under the assumption that αb
+ * and αs are not 0, and then updated to produce a meaningful result when
+ * they are.
+ *
+ * For all the blend mode operators, the alpha channel is given by
+ *
+ *     αr = αs + αb + αb × αs
+ */
+
+/*
+ * Multiply
+ *
+ *      ad * as * B(d / ad, s / as)
+ *    = ad * as * d/ad * s/as
+ *    = d * s
+ *
+ */
+static void
+combine_multiply_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t ss = s;
+	uint32_t src_ia = ALPHA_8 (~s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (ss, dest_ia, d, src_ia);
+	UN8x4_MUL_UN8x4 (d, s);
+	UN8x4_ADD_UN8x4 (d, ss);
+
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_multiply_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t m = *(mask + i);
+	uint32_t s = *(src + i);
+	uint32_t d = *(dest + i);
+	uint32_t r = d;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	combine_mask_ca (&s, &m);
+
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (r, ~m, s, dest_ia);
+	UN8x4_MUL_UN8x4 (d, s);
+	UN8x4_ADD_UN8x4 (r, d);
+
+	*(dest + i) = r;
+    }
+}
+
+#define PDF_SEPARABLE_BLEND_MODE(name)					\
+    static void								\
+    combine_ ## name ## _u (pixman_implementation_t *imp,		\
+			    pixman_op_t              op,		\
+                            uint32_t *               dest,		\
+			    const uint32_t *         src,		\
+			    const uint32_t *         mask,		\
+			    int                      width)		\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    uint32_t s = combine_mask (src, mask, i);			\
+	    uint32_t d = *(dest + i);					\
+	    uint8_t sa = ALPHA_8 (s);					\
+	    uint8_t isa = ~sa;						\
+	    uint8_t da = ALPHA_8 (d);					\
+	    uint8_t ida = ~da;						\
+	    uint32_t result;						\
+									\
+	    result = d;							\
+	    UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (result, isa, s, ida);	\
+	    								\
+	    *(dest + i) = result +					\
+		(DIV_ONE_UN8 (sa * (uint32_t)da) << A_SHIFT) +		\
+		(blend_ ## name (RED_8 (d), da, RED_8 (s), sa) << R_SHIFT) + \
+		(blend_ ## name (GREEN_8 (d), da, GREEN_8 (s), sa) << G_SHIFT) + \
+		(blend_ ## name (BLUE_8 (d), da, BLUE_8 (s), sa));	\
+	}								\
+    }									\
+    									\
+    static void								\
+    combine_ ## name ## _ca (pixman_implementation_t *imp,		\
+			     pixman_op_t              op,		\
+                             uint32_t *               dest,		\
+			     const uint32_t *         src,		\
+			     const uint32_t *         mask,		\
+			     int                      width)		\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    uint32_t m = *(mask + i);					\
+	    uint32_t s = *(src + i);					\
+	    uint32_t d = *(dest + i);					\
+	    uint8_t da = ALPHA_8 (d);					\
+	    uint8_t ida = ~da;						\
+	    uint32_t result;						\
+            								\
+	    combine_mask_ca (&s, &m);					\
+            								\
+	    result = d;							\
+	    UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (result, ~m, s, ida);     \
+            								\
+	    result +=							\
+	        (DIV_ONE_UN8 (ALPHA_8 (m) * (uint32_t)da) << A_SHIFT) +	\
+	        (blend_ ## name (RED_8 (d), da, RED_8 (s), RED_8 (m)) << R_SHIFT) + \
+	        (blend_ ## name (GREEN_8 (d), da, GREEN_8 (s), GREEN_8 (m)) << G_SHIFT) + \
+	        (blend_ ## name (BLUE_8 (d), da, BLUE_8 (s), BLUE_8 (m))); \
+	    								\
+	    *(dest + i) = result;					\
+	}								\
+    }
+
+/*
+ * Screen
+ *
+ *      ad * as * B(d/ad, s/as)
+ *    = ad * as * (d/ad + s/as - s/as * d/ad)
+ *    = ad * s + as * d - s * d
+ */
+static inline uint32_t
+blend_screen (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    return DIV_ONE_UN8 (s * ad + d * as - s * d);
+}
+
+PDF_SEPARABLE_BLEND_MODE (screen)
+
+/*
+ * Overlay
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * Hardlight (s, d)
+ *   = if (d / ad < 0.5)
+ *         as * ad * Multiply (s/as, 2 * d/ad)
+ *     else
+ *         as * ad * Screen (s/as, 2 * d / ad - 1)
+ *   = if (d < 0.5 * ad)
+ *         as * ad * s/as * 2 * d /ad
+ *     else
+ *         as * ad * (s/as + 2 * d / ad - 1 - s / as * (2 * d / ad - 1))
+ *   = if (2 * d < ad)
+ *         2 * s * d
+ *     else
+ *         ad * s + 2 * as * d - as * ad - ad * s * (2 * d / ad - 1)
+ *   = if (2 * d < ad)
+ *         2 * s * d
+ *     else
+ *         as * ad - 2 * (ad - d) * (as - s)
+ */
+static inline uint32_t
+blend_overlay (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    uint32_t r;
+
+    if (2 * d < ad)
+	r = 2 * s * d;
+    else
+	r = as * ad - 2 * (ad - d) * (as - s);
+
+    return DIV_ONE_UN8 (r);
+}
+
+PDF_SEPARABLE_BLEND_MODE (overlay)
+
+/*
+ * Darken
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * MIN(d/ad, s/as)
+ *   = MIN (as * d, ad * s)
+ */
+static inline uint32_t
+blend_darken (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    s = ad * s;
+    d = as * d;
+
+    return DIV_ONE_UN8 (s > d ? d : s);
+}
+
+PDF_SEPARABLE_BLEND_MODE (darken)
+
+/*
+ * Lighten
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = ad * as * MAX(d/ad, s/as)
+ *   = MAX (as * d, ad * s)
+ */
+static inline uint32_t
+blend_lighten (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    s = ad * s;
+    d = as * d;
+    
+    return DIV_ONE_UN8 (s > d ? s : d);
+}
+
+PDF_SEPARABLE_BLEND_MODE (lighten)
+
+/*
+ * Color dodge
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if d/ad = 0
+ *         ad * as * 0
+ *     else if (d/ad >= (1 - s/as)
+ *         ad * as * 1
+ *     else
+ *         ad * as * ((d/ad) / (1 - s/as))
+ *   = if d = 0
+ *         0
+ *     elif as * d >= ad * (as - s)
+ *         ad * as
+ *     else
+ *         as * (as * d / (as - s))
+ *
+ */
+static inline uint32_t
+blend_color_dodge (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    if (d == 0)
+        return 0;
+    else if (as * d >= ad * (as - s))
+	return DIV_ONE_UN8 (as * ad);
+    else if (as - s == 0)
+        return DIV_ONE_UN8 (as * ad);
+    else
+        return DIV_ONE_UN8 (as * ((d * as) / ((as - s))));
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_dodge)
+
+/*
+ * Color burn
+ *
+ * We modify the first clause "if d = 1" to "if d >= 1" since with
+ * premultiplied colors d > 1 can actually happen.
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if d/ad >= 1
+ *         ad * as * 1
+ *     elif (1 - d/ad) >= s/as
+ *         ad * as * 0
+ *     else
+ *         ad * as * (1 - ((1 - d/ad) / (s/as)))
+ *   = if d >= ad
+ *         ad * as
+ *     elif as * ad - as * d >= ad * s
+ *         0
+ *     else
+ *         ad * as  - as * as * (ad - d) / s
+ */
+static inline uint32_t
+blend_color_burn (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    if (d >= ad)
+	return DIV_ONE_UN8 (ad * as);
+    else if (as * ad - as * d >= ad * s)
+	return 0;
+    else if (s == 0)
+	return 0;
+    else
+	return DIV_ONE_UN8 (ad * as - (as * as * (ad - d)) / s);
+}
+
+PDF_SEPARABLE_BLEND_MODE (color_burn)
+
+/*
+ * Hard light
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if (s/as <= 0.5)
+ *         ad * as * Multiply (d/ad, 2 * s/as)
+ *     else
+ *         ad * as * Screen (d/ad, 2 * s/as - 1)
+ *   = if 2 * s <= as
+ *         ad * as * d/ad * 2 * s / as
+ *     else
+ *         ad * as * (d/ad + (2 * s/as - 1) + d/ad * (2 * s/as - 1))
+ *   = if 2 * s <= as
+ *         2 * s * d
+ *     else
+ *         as * ad - 2 * (ad - d) * (as - s)
+ */
+static inline uint32_t
+blend_hard_light (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    if (2 * s < as)
+	return DIV_ONE_UN8 (2 * s * d);
+    else
+	return DIV_ONE_UN8 (as * ad - 2 * (ad - d) * (as - s));
+}
+
+PDF_SEPARABLE_BLEND_MODE (hard_light)
+
+/*
+ * Soft light
+ *
+ *     ad * as * B(d/ad, s/as)
+ *   = if (s/as <= 0.5)
+ *         ad * as * (d/ad - (1 - 2 * s/as) * d/ad * (1 - d/ad))
+ *     else if (d/ad <= 0.25)
+ *         ad * as * (d/ad + (2 * s/as - 1) * ((((16 * d/ad - 12) * d/ad + 4) * d/ad) - d/ad))
+ *     else
+ *         ad * as * (d/ad + (2 * s/as - 1) * sqrt (d/ad))
+ *   = if (2 * s <= as)
+ *         d * as - d * (ad - d) * (as - 2 * s) / ad;
+ *     else if (4 * d <= ad)
+ *         (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3);
+ *     else
+ *         d * as + (sqrt (d * ad) - d) * (2 * s - as);
+ */
+static inline uint32_t
+blend_soft_light (uint32_t d_org,
+		  uint32_t ad_org,
+		  uint32_t s_org,
+		  uint32_t as_org)
+{
+    double d = d_org * (1.0 / MASK);
+    double ad = ad_org * (1.0 / MASK);
+    double s = s_org * (1.0 / MASK);
+    double as = as_org * (1.0 / MASK);
+    double r;
+
+    if (2 * s < as)
+    {
+	if (ad == 0)
+	    r = d * as;
+	else
+	    r = d * as - d * (ad - d) * (as - 2 * s) / ad;
+    }
+    else if (ad == 0)
+    {
+	r = 0;
+    }
+    else if (4 * d <= ad)
+    {
+	r = d * as +
+	    (2 * s - as) * d * ((16 * d / ad - 12) * d / ad + 3);
+    }
+    else
+    {
+	r = d * as + (sqrt (d * ad) - d) * (2 * s - as);
+    }
+    return r * MASK + 0.5;
+}
+
+PDF_SEPARABLE_BLEND_MODE (soft_light)
+
+/*
+ * Difference
+ *
+ *     ad * as * B(s/as, d/ad)
+ *   = ad * as * abs (s/as - d/ad)
+ *   = if (s/as <= d/ad)
+ *         ad * as * (d/ad - s/as)
+ *     else
+ *         ad * as * (s/as - d/ad)
+ *   = if (ad * s <= as * d)
+ *        as * d - ad * s
+ *     else
+ *        ad * s - as * d
+ */
+static inline uint32_t
+blend_difference (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    uint32_t das = d * as;
+    uint32_t sad = s * ad;
+
+    if (sad < das)
+	return DIV_ONE_UN8 (das - sad);
+    else
+	return DIV_ONE_UN8 (sad - das);
+}
+
+PDF_SEPARABLE_BLEND_MODE (difference)
+
+/*
+ * Exclusion
+ *
+ *     ad * as * B(s/as, d/ad)
+ *   = ad * as * (d/ad + s/as - 2 * d/ad * s/as)
+ *   = as * d + ad * s - 2 * s * d
+ */
+
+/* This can be made faster by writing it directly and not using
+ * PDF_SEPARABLE_BLEND_MODE, but that's a performance optimization */
+
+static inline uint32_t
+blend_exclusion (uint32_t d, uint32_t ad, uint32_t s, uint32_t as)
+{
+    return DIV_ONE_UN8 (s * ad + d * as - 2 * d * s);
+}
+
+PDF_SEPARABLE_BLEND_MODE (exclusion)
+
+#undef PDF_SEPARABLE_BLEND_MODE
+
+/*
+ * PDF nonseperable blend modes are implemented using the following functions
+ * to operate in Hsl space, with Cmax, Cmid, Cmin referring to the max, mid
+ * and min value of the red, green and blue components.
+ *
+ * LUM (C) = 0.3 × Cred + 0.59 × Cgreen + 0.11 × Cblue
+ *
+ * clip_color (C):
+ *     l = LUM (C)
+ *     min = Cmin
+ *     max = Cmax
+ *     if n < 0.0
+ *         C = l + (((C – l) × l) ⁄ (l – min))
+ *     if x > 1.0
+ *         C = l + (((C – l) × (1 – l) ) ⁄ (max – l))
+ *     return C
+ *
+ * set_lum (C, l):
+ *     d = l – LUM (C)
+ *     C += d
+ *     return clip_color (C)
+ *
+ * SAT (C) = CH_MAX (C) - CH_MIN (C)
+ *
+ * set_sat (C, s):
+ *     if Cmax > Cmin
+ *         Cmid = ( ( ( Cmid – Cmin ) × s ) ⁄ ( Cmax – Cmin ) )
+ *         Cmax = s
+ *     else
+ *         Cmid = Cmax = 0.0
+ *         Cmin = 0.0
+ *     return C
+ */
+
+/* For premultiplied colors, we need to know what happens when C is
+ * multiplied by a real number. LUM and SAT are linear:
+ *
+ *     LUM (r × C) = r × LUM (C)	SAT (r * C) = r * SAT (C)
+ *
+ * If we extend clip_color with an extra argument a and change
+ *
+ *     if x >= 1.0
+ *
+ * into
+ *
+ *     if x >= a
+ *
+ * then clip_color is also linear:
+ *
+ *     r * clip_color (C, a) = clip_color (r * C, r * a);
+ *
+ * for positive r.
+ *
+ * Similarly, we can extend set_lum with an extra argument that is just passed
+ * on to clip_color:
+ *
+ *       r * set_lum (C, l, a)
+ *
+ *     = r × clip_color (C + l - LUM (C), a)
+ *
+ *     = clip_color (r * C + r × l - r * LUM (C), r * a)
+ *
+ *     = set_lum (r * C, r * l, r * a)
+ *
+ * Finally, set_sat:
+ *
+ *       r * set_sat (C, s) = set_sat (x * C, r * s)
+ *
+ * The above holds for all non-zero x, because the x'es in the fraction for
+ * C_mid cancel out. Specifically, it holds for x = r:
+ *
+ *       r * set_sat (C, s) = set_sat (r * C, r * s)
+ *
+ */
+
+#define CH_MIN(c) (c[0] < c[1] ? (c[0] < c[2] ? c[0] : c[2]) : (c[1] < c[2] ? c[1] : c[2]))
+#define CH_MAX(c) (c[0] > c[1] ? (c[0] > c[2] ? c[0] : c[2]) : (c[1] > c[2] ? c[1] : c[2]))
+#define LUM(c) ((c[0] * 30 + c[1] * 59 + c[2] * 11) / 100)
+#define SAT(c) (CH_MAX (c) - CH_MIN (c))
+
+#define PDF_NON_SEPARABLE_BLEND_MODE(name)				\
+    static void								\
+    combine_ ## name ## _u (pixman_implementation_t *imp,		\
+			    pixman_op_t		     op,		\
+                            uint32_t *               dest,		\
+			    const uint32_t *         src,		\
+			    const uint32_t *         mask,		\
+			    int                      width)		\
+    {									\
+	int i;								\
+	for (i = 0; i < width; ++i)					\
+	{								\
+	    uint32_t s = combine_mask (src, mask, i);			\
+	    uint32_t d = *(dest + i);					\
+	    uint8_t sa = ALPHA_8 (s);					\
+	    uint8_t isa = ~sa;						\
+	    uint8_t da = ALPHA_8 (d);					\
+	    uint8_t ida = ~da;						\
+	    uint32_t result;						\
+	    uint32_t sc[3], dc[3], c[3];				\
+            								\
+	    result = d;							\
+	    UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (result, isa, s, ida);	\
+	    dc[0] = RED_8 (d);						\
+	    sc[0] = RED_8 (s);						\
+	    dc[1] = GREEN_8 (d);					\
+	    sc[1] = GREEN_8 (s);					\
+	    dc[2] = BLUE_8 (d);						\
+	    sc[2] = BLUE_8 (s);						\
+	    blend_ ## name (c, dc, da, sc, sa);				\
+            								\
+	    *(dest + i) = result +					\
+		(DIV_ONE_UN8 (sa * (uint32_t)da) << A_SHIFT) +		\
+		(DIV_ONE_UN8 (c[0]) << R_SHIFT) +			\
+		(DIV_ONE_UN8 (c[1]) << G_SHIFT) +			\
+		(DIV_ONE_UN8 (c[2]));					\
+	}								\
+    }
+
+static void
+set_lum (uint32_t dest[3], uint32_t src[3], uint32_t sa, uint32_t lum)
+{
+    double a, l, min, max;
+    double tmp[3];
+
+    a = sa * (1.0 / MASK);
+
+    l = lum * (1.0 / MASK);
+    tmp[0] = src[0] * (1.0 / MASK);
+    tmp[1] = src[1] * (1.0 / MASK);
+    tmp[2] = src[2] * (1.0 / MASK);
+
+    l = l - LUM (tmp);
+    tmp[0] += l;
+    tmp[1] += l;
+    tmp[2] += l;
+
+    /* clip_color */
+    l = LUM (tmp);
+    min = CH_MIN (tmp);
+    max = CH_MAX (tmp);
+
+    if (min < 0)
+    {
+	if (l - min == 0.0)
+	{
+	    tmp[0] = 0;
+	    tmp[1] = 0;
+	    tmp[2] = 0;
+	}
+	else
+	{
+	    tmp[0] = l + (tmp[0] - l) * l / (l - min);
+	    tmp[1] = l + (tmp[1] - l) * l / (l - min);
+	    tmp[2] = l + (tmp[2] - l) * l / (l - min);
+	}
+    }
+    if (max > a)
+    {
+	if (max - l == 0.0)
+	{
+	    tmp[0] = a;
+	    tmp[1] = a;
+	    tmp[2] = a;
+	}
+	else
+	{
+	    tmp[0] = l + (tmp[0] - l) * (a - l) / (max - l);
+	    tmp[1] = l + (tmp[1] - l) * (a - l) / (max - l);
+	    tmp[2] = l + (tmp[2] - l) * (a - l) / (max - l);
+	}
+    }
+
+    dest[0] = tmp[0] * MASK + 0.5;
+    dest[1] = tmp[1] * MASK + 0.5;
+    dest[2] = tmp[2] * MASK + 0.5;
+}
+
+static void
+set_sat (uint32_t dest[3], uint32_t src[3], uint32_t sat)
+{
+    int id[3];
+    uint32_t min, max;
+
+    if (src[0] > src[1])
+    {
+	if (src[0] > src[2])
+	{
+	    id[0] = 0;
+	    if (src[1] > src[2])
+	    {
+		id[1] = 1;
+		id[2] = 2;
+	    }
+	    else
+	    {
+		id[1] = 2;
+		id[2] = 1;
+	    }
+	}
+	else
+	{
+	    id[0] = 2;
+	    id[1] = 0;
+	    id[2] = 1;
+	}
+    }
+    else
+    {
+	if (src[0] > src[2])
+	{
+	    id[0] = 1;
+	    id[1] = 0;
+	    id[2] = 2;
+	}
+	else
+	{
+	    id[2] = 0;
+	    if (src[1] > src[2])
+	    {
+		id[0] = 1;
+		id[1] = 2;
+	    }
+	    else
+	    {
+		id[0] = 2;
+		id[1] = 1;
+	    }
+	}
+    }
+
+    max = dest[id[0]];
+    min = dest[id[2]];
+    if (max > min)
+    {
+	dest[id[1]] = (dest[id[1]] - min) * sat / (max - min);
+	dest[id[0]] = sat;
+	dest[id[2]] = 0;
+    }
+    else
+    {
+	dest[0] = dest[1] = dest[2] = 0;
+    }
+}
+
+/* Hue:
+ *
+ *       as * ad * B(s/as, d/as)
+ *     = as * ad * set_lum (set_sat (s/as, SAT (d/ad)), LUM (d/ad), 1)
+ *     = set_lum (set_sat (ad * s, as * SAT (d)), as * LUM (d), as * ad)
+ *
+ */
+static inline void
+blend_hsl_hue (uint32_t r[3],
+               uint32_t d[3],
+               uint32_t ad,
+               uint32_t s[3],
+               uint32_t as)
+{
+    r[0] = s[0] * ad;
+    r[1] = s[1] * ad;
+    r[2] = s[2] * ad;
+    set_sat (r, r, SAT (d) * as);
+    set_lum (r, r, as * ad, LUM (d) * as);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_hue)
+
+/* 
+ * Saturation
+ *
+ *     as * ad * B(s/as, d/ad)
+ *   = as * ad * set_lum (set_sat (d/ad, SAT (s/as)), LUM (d/ad), 1)
+ *   = set_lum (as * ad * set_sat (d/ad, SAT (s/as)),
+ *                                       as * LUM (d), as * ad)
+ *   = set_lum (set_sat (as * d, ad * SAT (s), as * LUM (d), as * ad))
+ */
+static inline void
+blend_hsl_saturation (uint32_t r[3],
+                      uint32_t d[3],
+                      uint32_t ad,
+                      uint32_t s[3],
+                      uint32_t as)
+{
+    r[0] = d[0] * as;
+    r[1] = d[1] * as;
+    r[2] = d[2] * as;
+    set_sat (r, r, SAT (s) * ad);
+    set_lum (r, r, as * ad, LUM (d) * as);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_saturation)
+
+/* 
+ * Color
+ *
+ *     as * ad * B(s/as, d/as)
+ *   = as * ad * set_lum (s/as, LUM (d/ad), 1)
+ *   = set_lum (s * ad, as * LUM (d), as * ad)
+ */
+static inline void
+blend_hsl_color (uint32_t r[3],
+                 uint32_t d[3],
+                 uint32_t ad,
+                 uint32_t s[3],
+                 uint32_t as)
+{
+    r[0] = s[0] * ad;
+    r[1] = s[1] * ad;
+    r[2] = s[2] * ad;
+    set_lum (r, r, as * ad, LUM (d) * as);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_color)
+
+/*
+ * Luminosity
+ *
+ *     as * ad * B(s/as, d/ad)
+ *   = as * ad * set_lum (d/ad, LUM (s/as), 1)
+ *   = set_lum (as * d, ad * LUM (s), as * ad)
+ */
+static inline void
+blend_hsl_luminosity (uint32_t r[3],
+                      uint32_t d[3],
+                      uint32_t ad,
+                      uint32_t s[3],
+                      uint32_t as)
+{
+    r[0] = d[0] * as;
+    r[1] = d[1] * as;
+    r[2] = d[2] * as;
+    set_lum (r, r, as * ad, LUM (s) * ad);
+}
+
+PDF_NON_SEPARABLE_BLEND_MODE (hsl_luminosity)
+
+#undef SAT
+#undef LUM
+#undef CH_MAX
+#undef CH_MIN
+#undef PDF_NON_SEPARABLE_BLEND_MODE
+
+/* All of the disjoint/conjoint composing functions
+ *
+ * The four entries in the first column indicate what source contributions
+ * come from each of the four areas of the picture -- areas covered by neither
+ * A nor B, areas covered only by A, areas covered only by B and finally
+ * areas covered by both A and B.
+ *
+ * Disjoint			Conjoint
+ * Fa		Fb		Fa		Fb
+ * (0,0,0,0)	0		0		0		0
+ * (0,A,0,A)	1		0		1		0
+ * (0,0,B,B)	0		1		0		1
+ * (0,A,B,A)	1		min((1-a)/b,1)	1		max(1-a/b,0)
+ * (0,A,B,B)	min((1-b)/a,1)	1		max(1-b/a,0)	1
+ * (0,0,0,A)	max(1-(1-b)/a,0) 0		min(1,b/a)	0
+ * (0,0,0,B)	0		max(1-(1-a)/b,0) 0		min(a/b,1)
+ * (0,A,0,0)	min(1,(1-b)/a)	0		max(1-b/a,0)	0
+ * (0,0,B,0)	0		min(1,(1-a)/b)	0		max(1-a/b,0)
+ * (0,0,B,A)	max(1-(1-b)/a,0) min(1,(1-a)/b)	 min(1,b/a)	max(1-a/b,0)
+ * (0,A,0,B)	min(1,(1-b)/a)	max(1-(1-a)/b,0) max(1-b/a,0)	min(1,a/b)
+ * (0,A,B,0)	min(1,(1-b)/a)	min(1,(1-a)/b)	max(1-b/a,0)	max(1-a/b,0)
+ *
+ * See  http://marc.info/?l=xfree-render&m=99792000027857&w=2  for more
+ * information about these operators.
+ */
+
+#define COMBINE_A_OUT 1
+#define COMBINE_A_IN  2
+#define COMBINE_B_OUT 4
+#define COMBINE_B_IN  8
+
+#define COMBINE_CLEAR   0
+#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* portion covered by a but not b */
+static uint8_t
+combine_disjoint_out_part (uint8_t a, uint8_t b)
+{
+    /* min (1, (1-b) / a) */
+
+    b = ~b;                 /* 1 - b */
+    if (b >= a)             /* 1 - b >= a -> (1-b)/a >= 1 */
+	return MASK;        /* 1 */
+    return DIV_UN8 (b, a);     /* (1-b) / a */
+}
+
+/* portion covered by both a and b */
+static uint8_t
+combine_disjoint_in_part (uint8_t a, uint8_t b)
+{
+    /* max (1-(1-b)/a,0) */
+    /*  = - min ((1-b)/a - 1, 0) */
+    /*  = 1 - min (1, (1-b)/a) */
+
+    b = ~b;                 /* 1 - b */
+    if (b >= a)             /* 1 - b >= a -> (1-b)/a >= 1 */
+	return 0;           /* 1 - 1 */
+    return ~DIV_UN8(b, a);    /* 1 - (1-b) / a */
+}
+
+/* portion covered by a but not b */
+static uint8_t
+combine_conjoint_out_part (uint8_t a, uint8_t b)
+{
+    /* max (1-b/a,0) */
+    /* = 1-min(b/a,1) */
+
+    /* min (1, (1-b) / a) */
+
+    if (b >= a)             /* b >= a -> b/a >= 1 */
+	return 0x00;        /* 0 */
+    return ~DIV_UN8(b, a);    /* 1 - b/a */
+}
+
+/* portion covered by both a and b */
+static uint8_t
+combine_conjoint_in_part (uint8_t a, uint8_t b)
+{
+    /* min (1,b/a) */
+
+    if (b >= a)             /* b >= a -> b/a >= 1 */
+	return MASK;        /* 1 */
+    return DIV_UN8 (b, a);     /* b/a */
+}
+
+#define GET_COMP(v, i)   ((uint16_t) (uint8_t) ((v) >> i))
+
+#define ADD(x, y, i, t)							\
+    ((t) = GET_COMP (x, i) + GET_COMP (y, i),				\
+     (uint32_t) ((uint8_t) ((t) | (0 - ((t) >> G_SHIFT)))) << (i))
+
+#define GENERIC(x, y, i, ax, ay, t, u, v)				\
+    ((t) = (MUL_UN8 (GET_COMP (y, i), ay, (u)) +			\
+            MUL_UN8 (GET_COMP (x, i), ax, (v))),			\
+     (uint32_t) ((uint8_t) ((t) |					\
+                           (0 - ((t) >> G_SHIFT)))) << (i))
+
+static void
+combine_disjoint_general_u (uint32_t *      dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int            width,
+                            uint8_t        combine)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t m, n, o, p;
+	uint16_t Fa, Fb, t, u, v;
+	uint8_t sa = s >> A_SHIFT;
+	uint8_t da = d >> A_SHIFT;
+
+	switch (combine & COMBINE_A)
+	{
+	default:
+	    Fa = 0;
+	    break;
+
+	case COMBINE_A_OUT:
+	    Fa = combine_disjoint_out_part (sa, da);
+	    break;
+
+	case COMBINE_A_IN:
+	    Fa = combine_disjoint_in_part (sa, da);
+	    break;
+
+	case COMBINE_A:
+	    Fa = MASK;
+	    break;
+	}
+
+	switch (combine & COMBINE_B)
+	{
+	default:
+	    Fb = 0;
+	    break;
+
+	case COMBINE_B_OUT:
+	    Fb = combine_disjoint_out_part (da, sa);
+	    break;
+
+	case COMBINE_B_IN:
+	    Fb = combine_disjoint_in_part (da, sa);
+	    break;
+
+	case COMBINE_B:
+	    Fb = MASK;
+	    break;
+	}
+	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
+	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
+	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
+	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
+	s = m | n | o | p;
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_disjoint_over_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint16_t a = s >> A_SHIFT;
+
+	if (s != 0x00)
+	{
+	    uint32_t d = *(dest + i);
+	    a = combine_disjoint_out_part (d >> A_SHIFT, a);
+	    UN8x4_MUL_UN8_ADD_UN8x4 (d, a, s);
+
+	    *(dest + i) = d;
+	}
+    }
+}
+
+static void
+combine_disjoint_in_u (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       uint32_t *                dest,
+                       const uint32_t *          src,
+                       const uint32_t *          mask,
+                       int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
+                               pixman_op_t              op,
+                               uint32_t *                dest,
+                               const uint32_t *          src,
+                               const uint32_t *          mask,
+                               int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_disjoint_out_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
+                                pixman_op_t              op,
+                                uint32_t *                dest,
+                                const uint32_t *          src,
+                                const uint32_t *          mask,
+                                int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_disjoint_atop_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 uint32_t *                dest,
+                                 const uint32_t *          src,
+                                 const uint32_t *          mask,
+                                 int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_disjoint_xor_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                      width)
+{
+    combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+combine_conjoint_general_u (uint32_t *      dest,
+                            const uint32_t *src,
+                            const uint32_t *mask,
+                            int            width,
+                            uint8_t        combine)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = combine_mask (src, mask, i);
+	uint32_t d = *(dest + i);
+	uint32_t m, n, o, p;
+	uint16_t Fa, Fb, t, u, v;
+	uint8_t sa = s >> A_SHIFT;
+	uint8_t da = d >> A_SHIFT;
+
+	switch (combine & COMBINE_A)
+	{
+	default:
+	    Fa = 0;
+	    break;
+
+	case COMBINE_A_OUT:
+	    Fa = combine_conjoint_out_part (sa, da);
+	    break;
+
+	case COMBINE_A_IN:
+	    Fa = combine_conjoint_in_part (sa, da);
+	    break;
+
+	case COMBINE_A:
+	    Fa = MASK;
+	    break;
+	}
+
+	switch (combine & COMBINE_B)
+	{
+	default:
+	    Fb = 0;
+	    break;
+
+	case COMBINE_B_OUT:
+	    Fb = combine_conjoint_out_part (da, sa);
+	    break;
+
+	case COMBINE_B_IN:
+	    Fb = combine_conjoint_in_part (da, sa);
+	    break;
+
+	case COMBINE_B:
+	    Fb = MASK;
+	    break;
+	}
+
+	m = GENERIC (s, d, 0, Fa, Fb, t, u, v);
+	n = GENERIC (s, d, G_SHIFT, Fa, Fb, t, u, v);
+	o = GENERIC (s, d, R_SHIFT, Fa, Fb, t, u, v);
+	p = GENERIC (s, d, A_SHIFT, Fa, Fb, t, u, v);
+
+	s = m | n | o | p;
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_conjoint_over_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 uint32_t *                dest,
+                                 const uint32_t *          src,
+                                 const uint32_t *          mask,
+                                 int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+combine_conjoint_in_u (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       uint32_t *                dest,
+                       const uint32_t *          src,
+                       const uint32_t *          mask,
+                       int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
+                               pixman_op_t              op,
+                               uint32_t *                dest,
+                               const uint32_t *          src,
+                               const uint32_t *          mask,
+                               int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_conjoint_out_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
+                                pixman_op_t              op,
+                                uint32_t *                dest,
+                                const uint32_t *          src,
+                                const uint32_t *          mask,
+                                int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_conjoint_atop_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 uint32_t *                dest,
+                                 const uint32_t *          src,
+                                 const uint32_t *          mask,
+                                 int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_conjoint_xor_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                      width)
+{
+    combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+
+/* Component alpha combiners */
+
+static void
+combine_clear_ca (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  uint32_t *                dest,
+                  const uint32_t *          src,
+                  const uint32_t *          mask,
+                  int                      width)
+{
+    memset (dest, 0, width * sizeof(uint32_t));
+}
+
+static void
+combine_src_ca (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                uint32_t *                dest,
+                const uint32_t *          src,
+                const uint32_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = *(src + i);
+	uint32_t m = *(mask + i);
+
+	combine_mask_value_ca (&s, &m);
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_over_ca (pixman_implementation_t *imp,
+                 pixman_op_t              op,
+                 uint32_t *                dest,
+                 const uint32_t *          src,
+                 const uint32_t *          mask,
+                 int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = *(src + i);
+	uint32_t m = *(mask + i);
+	uint32_t a;
+
+	combine_mask_ca (&s, &m);
+
+	a = ~m;
+	if (a)
+	{
+	    uint32_t d = *(dest + i);
+	    UN8x4_MUL_UN8x4_ADD_UN8x4 (d, a, s);
+	    s = d;
+	}
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_over_reverse_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t d = *(dest + i);
+	uint32_t a = ~d >> A_SHIFT;
+
+	if (a)
+	{
+	    uint32_t s = *(src + i);
+	    uint32_t m = *(mask + i);
+
+	    UN8x4_MUL_UN8x4 (s, m);
+	    UN8x4_MUL_UN8_ADD_UN8x4 (s, a, d);
+
+	    *(dest + i) = s;
+	}
+    }
+}
+
+static void
+combine_in_ca (pixman_implementation_t *imp,
+               pixman_op_t              op,
+               uint32_t *                dest,
+               const uint32_t *          src,
+               const uint32_t *          mask,
+               int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t d = *(dest + i);
+	uint16_t a = d >> A_SHIFT;
+	uint32_t s = 0;
+
+	if (a)
+	{
+	    uint32_t m = *(mask + i);
+
+	    s = *(src + i);
+	    combine_mask_value_ca (&s, &m);
+
+	    if (a != MASK)
+		UN8x4_MUL_UN8 (s, a);
+	}
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_in_reverse_ca (pixman_implementation_t *imp,
+                       pixman_op_t              op,
+                       uint32_t *                dest,
+                       const uint32_t *          src,
+                       const uint32_t *          mask,
+                       int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = *(src + i);
+	uint32_t m = *(mask + i);
+	uint32_t a;
+
+	combine_mask_alpha_ca (&s, &m);
+
+	a = m;
+	if (a != ~0)
+	{
+	    uint32_t d = 0;
+
+	    if (a)
+	    {
+		d = *(dest + i);
+		UN8x4_MUL_UN8x4 (d, a);
+	    }
+
+	    *(dest + i) = d;
+	}
+    }
+}
+
+static void
+combine_out_ca (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                uint32_t *                dest,
+                const uint32_t *          src,
+                const uint32_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t d = *(dest + i);
+	uint16_t a = ~d >> A_SHIFT;
+	uint32_t s = 0;
+
+	if (a)
+	{
+	    uint32_t m = *(mask + i);
+
+	    s = *(src + i);
+	    combine_mask_value_ca (&s, &m);
+
+	    if (a != MASK)
+		UN8x4_MUL_UN8 (s, a);
+	}
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_out_reverse_ca (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = *(src + i);
+	uint32_t m = *(mask + i);
+	uint32_t a;
+
+	combine_mask_alpha_ca (&s, &m);
+
+	a = ~m;
+	if (a != ~0)
+	{
+	    uint32_t d = 0;
+
+	    if (a)
+	    {
+		d = *(dest + i);
+		UN8x4_MUL_UN8x4 (d, a);
+	    }
+
+	    *(dest + i) = d;
+	}
+    }
+}
+
+static void
+combine_atop_ca (pixman_implementation_t *imp,
+                 pixman_op_t              op,
+                 uint32_t *                dest,
+                 const uint32_t *          src,
+                 const uint32_t *          mask,
+                 int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t d = *(dest + i);
+	uint32_t s = *(src + i);
+	uint32_t m = *(mask + i);
+	uint32_t ad;
+	uint16_t as = d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+
+	ad = ~m;
+
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ad, s, as);
+
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_atop_reverse_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t d = *(dest + i);
+	uint32_t s = *(src + i);
+	uint32_t m = *(mask + i);
+	uint32_t ad;
+	uint16_t as = ~d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+
+	ad = m;
+
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ad, s, as);
+
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_xor_ca (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                uint32_t *                dest,
+                const uint32_t *          src,
+                const uint32_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t d = *(dest + i);
+	uint32_t s = *(src + i);
+	uint32_t m = *(mask + i);
+	uint32_t ad;
+	uint16_t as = ~d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+
+	ad = ~m;
+
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ad, s, as);
+
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_add_ca (pixman_implementation_t *imp,
+                pixman_op_t              op,
+                uint32_t *                dest,
+                const uint32_t *          src,
+                const uint32_t *          mask,
+                int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s = *(src + i);
+	uint32_t m = *(mask + i);
+	uint32_t d = *(dest + i);
+
+	combine_mask_value_ca (&s, &m);
+
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*(dest + i) = d;
+    }
+}
+
+static void
+combine_saturate_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *                dest,
+                     const uint32_t *          src,
+                     const uint32_t *          mask,
+                     int                      width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s, d;
+	uint16_t sa, sr, sg, sb, da;
+	uint16_t t, u, v;
+	uint32_t m, n, o, p;
+
+	d = *(dest + i);
+	s = *(src + i);
+	m = *(mask + i);
+
+	combine_mask_ca (&s, &m);
+
+	sa = (m >> A_SHIFT);
+	sr = (m >> R_SHIFT) & MASK;
+	sg = (m >> G_SHIFT) & MASK;
+	sb =  m             & MASK;
+	da = ~d >> A_SHIFT;
+
+	if (sb <= da)
+	    m = ADD (s, d, 0, t);
+	else
+	    m = GENERIC (s, d, 0, (da << G_SHIFT) / sb, MASK, t, u, v);
+
+	if (sg <= da)
+	    n = ADD (s, d, G_SHIFT, t);
+	else
+	    n = GENERIC (s, d, G_SHIFT, (da << G_SHIFT) / sg, MASK, t, u, v);
+
+	if (sr <= da)
+	    o = ADD (s, d, R_SHIFT, t);
+	else
+	    o = GENERIC (s, d, R_SHIFT, (da << G_SHIFT) / sr, MASK, t, u, v);
+
+	if (sa <= da)
+	    p = ADD (s, d, A_SHIFT, t);
+	else
+	    p = GENERIC (s, d, A_SHIFT, (da << G_SHIFT) / sa, MASK, t, u, v);
+
+	*(dest + i) = m | n | o | p;
+    }
+}
+
+static void
+combine_disjoint_general_ca (uint32_t *      dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int            width,
+                             uint8_t        combine)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s, d;
+	uint32_t m, n, o, p;
+	uint32_t Fa, Fb;
+	uint16_t t, u, v;
+	uint32_t sa;
+	uint8_t da;
+
+	s = *(src + i);
+	m = *(mask + i);
+	d = *(dest + i);
+	da = d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+
+	sa = m;
+
+	switch (combine & COMBINE_A)
+	{
+	default:
+	    Fa = 0;
+	    break;
+
+	case COMBINE_A_OUT:
+	    m = (uint32_t)combine_disjoint_out_part ((uint8_t) (sa >> 0), da);
+	    n = (uint32_t)combine_disjoint_out_part ((uint8_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (uint32_t)combine_disjoint_out_part ((uint8_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (uint32_t)combine_disjoint_out_part ((uint8_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A_IN:
+	    m = (uint32_t)combine_disjoint_in_part ((uint8_t) (sa >> 0), da);
+	    n = (uint32_t)combine_disjoint_in_part ((uint8_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (uint32_t)combine_disjoint_in_part ((uint8_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (uint32_t)combine_disjoint_in_part ((uint8_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A:
+	    Fa = ~0;
+	    break;
+	}
+
+	switch (combine & COMBINE_B)
+	{
+	default:
+	    Fb = 0;
+	    break;
+
+	case COMBINE_B_OUT:
+	    m = (uint32_t)combine_disjoint_out_part (da, (uint8_t) (sa >> 0));
+	    n = (uint32_t)combine_disjoint_out_part (da, (uint8_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (uint32_t)combine_disjoint_out_part (da, (uint8_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (uint32_t)combine_disjoint_out_part (da, (uint8_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B_IN:
+	    m = (uint32_t)combine_disjoint_in_part (da, (uint8_t) (sa >> 0));
+	    n = (uint32_t)combine_disjoint_in_part (da, (uint8_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (uint32_t)combine_disjoint_in_part (da, (uint8_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (uint32_t)combine_disjoint_in_part (da, (uint8_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B:
+	    Fb = ~0;
+	    break;
+	}
+	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
+	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
+	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
+	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
+
+	s = m | n | o | p;
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_disjoint_over_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *                dest,
+                          const uint32_t *          src,
+                          const uint32_t *          mask,
+                          int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_disjoint_in_ca (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
+                                pixman_op_t              op,
+                                uint32_t *                dest,
+                                const uint32_t *          src,
+                                const uint32_t *          mask,
+                                int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_disjoint_out_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 uint32_t *                dest,
+                                 const uint32_t *          src,
+                                 const uint32_t *          mask,
+                                 int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_disjoint_atop_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *                dest,
+                          const uint32_t *          src,
+                          const uint32_t *          mask,
+                          int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  uint32_t *                dest,
+                                  const uint32_t *          src,
+                                  const uint32_t *          mask,
+                                  int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_disjoint_xor_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+combine_conjoint_general_ca (uint32_t *      dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int            width,
+                             uint8_t        combine)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint32_t s, d;
+	uint32_t m, n, o, p;
+	uint32_t Fa, Fb;
+	uint16_t t, u, v;
+	uint32_t sa;
+	uint8_t da;
+
+	s = *(src + i);
+	m = *(mask + i);
+	d = *(dest + i);
+	da = d >> A_SHIFT;
+
+	combine_mask_ca (&s, &m);
+
+	sa = m;
+
+	switch (combine & COMBINE_A)
+	{
+	default:
+	    Fa = 0;
+	    break;
+
+	case COMBINE_A_OUT:
+	    m = (uint32_t)combine_conjoint_out_part ((uint8_t) (sa >> 0), da);
+	    n = (uint32_t)combine_conjoint_out_part ((uint8_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (uint32_t)combine_conjoint_out_part ((uint8_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (uint32_t)combine_conjoint_out_part ((uint8_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A_IN:
+	    m = (uint32_t)combine_conjoint_in_part ((uint8_t) (sa >> 0), da);
+	    n = (uint32_t)combine_conjoint_in_part ((uint8_t) (sa >> G_SHIFT), da) << G_SHIFT;
+	    o = (uint32_t)combine_conjoint_in_part ((uint8_t) (sa >> R_SHIFT), da) << R_SHIFT;
+	    p = (uint32_t)combine_conjoint_in_part ((uint8_t) (sa >> A_SHIFT), da) << A_SHIFT;
+	    Fa = m | n | o | p;
+	    break;
+
+	case COMBINE_A:
+	    Fa = ~0;
+	    break;
+	}
+
+	switch (combine & COMBINE_B)
+	{
+	default:
+	    Fb = 0;
+	    break;
+
+	case COMBINE_B_OUT:
+	    m = (uint32_t)combine_conjoint_out_part (da, (uint8_t) (sa >> 0));
+	    n = (uint32_t)combine_conjoint_out_part (da, (uint8_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (uint32_t)combine_conjoint_out_part (da, (uint8_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (uint32_t)combine_conjoint_out_part (da, (uint8_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B_IN:
+	    m = (uint32_t)combine_conjoint_in_part (da, (uint8_t) (sa >> 0));
+	    n = (uint32_t)combine_conjoint_in_part (da, (uint8_t) (sa >> G_SHIFT)) << G_SHIFT;
+	    o = (uint32_t)combine_conjoint_in_part (da, (uint8_t) (sa >> R_SHIFT)) << R_SHIFT;
+	    p = (uint32_t)combine_conjoint_in_part (da, (uint8_t) (sa >> A_SHIFT)) << A_SHIFT;
+	    Fb = m | n | o | p;
+	    break;
+
+	case COMBINE_B:
+	    Fb = ~0;
+	    break;
+	}
+	m = GENERIC (s, d, 0, GET_COMP (Fa, 0), GET_COMP (Fb, 0), t, u, v);
+	n = GENERIC (s, d, G_SHIFT, GET_COMP (Fa, G_SHIFT), GET_COMP (Fb, G_SHIFT), t, u, v);
+	o = GENERIC (s, d, R_SHIFT, GET_COMP (Fa, R_SHIFT), GET_COMP (Fb, R_SHIFT), t, u, v);
+	p = GENERIC (s, d, A_SHIFT, GET_COMP (Fa, A_SHIFT), GET_COMP (Fb, A_SHIFT), t, u, v);
+
+	s = m | n | o | p;
+
+	*(dest + i) = s;
+    }
+}
+
+static void
+combine_conjoint_over_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *                dest,
+                          const uint32_t *          src,
+                          const uint32_t *          mask,
+                          int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  uint32_t *                dest,
+                                  const uint32_t *          src,
+                                  const uint32_t *          mask,
+                                  int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+combine_conjoint_in_ca (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,
+                                pixman_op_t              op,
+                                uint32_t *                dest,
+                                const uint32_t *          src,
+                                const uint32_t *          mask,
+                                int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+combine_conjoint_out_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                 pixman_op_t              op,
+                                 uint32_t *                dest,
+                                 const uint32_t *          src,
+                                 const uint32_t *          mask,
+                                 int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+combine_conjoint_atop_ca (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *                dest,
+                          const uint32_t *          src,
+                          const uint32_t *          mask,
+                          int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  uint32_t *                dest,
+                                  const uint32_t *          src,
+                                  const uint32_t *          mask,
+                                  int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+combine_conjoint_xor_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    combine_conjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+void
+_pixman_setup_combiner_functions_32 (pixman_implementation_t *imp)
+{
+    /* Unified alpha */
+    imp->combine_32[PIXMAN_OP_CLEAR] = combine_clear;
+    imp->combine_32[PIXMAN_OP_SRC] = combine_src_u;
+    imp->combine_32[PIXMAN_OP_DST] = combine_dst;
+    imp->combine_32[PIXMAN_OP_OVER] = combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE] = combine_saturate_u;
+
+    /* Disjoint, unified */
+    imp->combine_32[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear;
+    imp->combine_32[PIXMAN_OP_DISJOINT_SRC] = combine_src_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_DST] = combine_dst;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_u;
+
+    /* Conjoint, unified */
+    imp->combine_32[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear;
+    imp->combine_32[PIXMAN_OP_CONJOINT_SRC] = combine_src_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_DST] = combine_dst;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_u;
+
+    imp->combine_32[PIXMAN_OP_MULTIPLY] = combine_multiply_u;
+    imp->combine_32[PIXMAN_OP_SCREEN] = combine_screen_u;
+    imp->combine_32[PIXMAN_OP_OVERLAY] = combine_overlay_u;
+    imp->combine_32[PIXMAN_OP_DARKEN] = combine_darken_u;
+    imp->combine_32[PIXMAN_OP_LIGHTEN] = combine_lighten_u;
+    imp->combine_32[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_u;
+    imp->combine_32[PIXMAN_OP_COLOR_BURN] = combine_color_burn_u;
+    imp->combine_32[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_u;
+    imp->combine_32[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_u;
+    imp->combine_32[PIXMAN_OP_DIFFERENCE] = combine_difference_u;
+    imp->combine_32[PIXMAN_OP_EXCLUSION] = combine_exclusion_u;
+    imp->combine_32[PIXMAN_OP_HSL_HUE] = combine_hsl_hue_u;
+    imp->combine_32[PIXMAN_OP_HSL_SATURATION] = combine_hsl_saturation_u;
+    imp->combine_32[PIXMAN_OP_HSL_COLOR] = combine_hsl_color_u;
+    imp->combine_32[PIXMAN_OP_HSL_LUMINOSITY] = combine_hsl_luminosity_u;
+
+    /* Component alpha combiners */
+    imp->combine_32_ca[PIXMAN_OP_CLEAR] = combine_clear_ca;
+    imp->combine_32_ca[PIXMAN_OP_SRC] = combine_src_ca;
+    /* dest */
+    imp->combine_32_ca[PIXMAN_OP_OVER] = combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = combine_add_ca;
+    imp->combine_32_ca[PIXMAN_OP_SATURATE] = combine_saturate_ca;
+
+    /* Disjoint CA */
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_CLEAR] = combine_clear_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_SRC] = combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_DST] = combine_dst;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER] = combine_disjoint_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = combine_saturate_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN] = combine_disjoint_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = combine_disjoint_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT] = combine_disjoint_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = combine_disjoint_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP] = combine_disjoint_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = combine_disjoint_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_XOR] = combine_disjoint_xor_ca;
+
+    /* Conjoint CA */
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_CLEAR] = combine_clear_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_SRC] = combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_DST] = combine_dst;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER] = combine_conjoint_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = combine_conjoint_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN] = combine_conjoint_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = combine_conjoint_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT] = combine_conjoint_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = combine_conjoint_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP] = combine_conjoint_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = combine_conjoint_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_XOR] = combine_conjoint_xor_ca;
+
+    imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = combine_multiply_ca;
+    imp->combine_32_ca[PIXMAN_OP_SCREEN] = combine_screen_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVERLAY] = combine_overlay_ca;
+    imp->combine_32_ca[PIXMAN_OP_DARKEN] = combine_darken_ca;
+    imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = combine_lighten_ca;
+    imp->combine_32_ca[PIXMAN_OP_COLOR_DODGE] = combine_color_dodge_ca;
+    imp->combine_32_ca[PIXMAN_OP_COLOR_BURN] = combine_color_burn_ca;
+    imp->combine_32_ca[PIXMAN_OP_HARD_LIGHT] = combine_hard_light_ca;
+    imp->combine_32_ca[PIXMAN_OP_SOFT_LIGHT] = combine_soft_light_ca;
+    imp->combine_32_ca[PIXMAN_OP_DIFFERENCE] = combine_difference_ca;
+    imp->combine_32_ca[PIXMAN_OP_EXCLUSION] = combine_exclusion_ca;
+
+    /* It is not clear that these make sense, so make them noops for now */
+    imp->combine_32_ca[PIXMAN_OP_HSL_HUE] = combine_dst;
+    imp->combine_32_ca[PIXMAN_OP_HSL_SATURATION] = combine_dst;
+    imp->combine_32_ca[PIXMAN_OP_HSL_COLOR] = combine_dst;
+    imp->combine_32_ca[PIXMAN_OP_HSL_LUMINOSITY] = combine_dst;
+}
diff --git a/src/pixman/pixman/pixman-combine32.h b/src/pixman/pixman/pixman-combine32.h
new file mode 100644
index 0000000..cdd56a6
--- /dev/null
+++ b/src/pixman/pixman/pixman-combine32.h
@@ -0,0 +1,272 @@
+#define COMPONENT_SIZE 8
+#define MASK 0xff
+#define ONE_HALF 0x80
+
+#define A_SHIFT 8 * 3
+#define R_SHIFT 8 * 2
+#define G_SHIFT 8
+#define A_MASK 0xff000000
+#define R_MASK 0xff0000
+#define G_MASK 0xff00
+
+#define RB_MASK 0xff00ff
+#define AG_MASK 0xff00ff00
+#define RB_ONE_HALF 0x800080
+#define RB_MASK_PLUS_ONE 0x10000100
+
+#define ALPHA_8(x) ((x) >> A_SHIFT)
+#define RED_8(x) (((x) >> R_SHIFT) & MASK)
+#define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
+#define BLUE_8(x) ((x) & MASK)
+
+/*
+ * ARMv6 has UQADD8 instruction, which implements unsigned saturated
+ * addition for 8-bit values packed in 32-bit registers. It is very useful
+ * for UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which would
+ * otherwise need a lot of arithmetic operations to simulate this operation).
+ * Since most of the major ARM linux distros are built for ARMv7, we are
+ * much less dependent on runtime CPU detection and can get practical
+ * benefits from conditional compilation here for a lot of users.
+ */
+
+#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \
+    !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__))
+#if defined(__ARM_ARCH_6__)   || defined(__ARM_ARCH_6J__)  || \
+    defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  || \
+    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \
+    defined(__ARM_ARCH_6M__)  || defined(__ARM_ARCH_7__)   || \
+    defined(__ARM_ARCH_7A__)  || defined(__ARM_ARCH_7R__)  || \
+    defined(__ARM_ARCH_7M__)  || defined(__ARM_ARCH_7EM__)
+
+static force_inline uint32_t
+un8x4_add_un8x4 (uint32_t x, uint32_t y)
+{
+    uint32_t t;
+    asm ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y));
+    return t;
+}
+
+#define UN8x4_ADD_UN8x4(x, y) \
+    ((x) = un8x4_add_un8x4 ((x), (y)))
+
+#define UN8_rb_ADD_UN8_rb(x, y, t) \
+    ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t))
+
+#define ADD_UN8(x, y, t) \
+    ((t) = (x), un8x4_add_un8x4 ((t), (y)))
+
+#endif
+#endif
+
+/*****************************************************************************/
+
+/*
+ * Helper macros.
+ */
+
+#define MUL_UN8(a, b, t)						\
+    ((t) = (a) * (uint16_t)(b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT ))
+
+#define DIV_UN8(a, b)							\
+    (((uint16_t) (a) * MASK + ((b) / 2)) / (b))
+
+#ifndef ADD_UN8
+#define ADD_UN8(x, y, t)				     \
+    ((t) = (x) + (y),					     \
+     (uint32_t) (uint8_t) ((t) | (0 - ((t) >> G_SHIFT))))
+#endif
+
+#define DIV_ONE_UN8(x)							\
+    (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
+
+/*
+ * The methods below use some tricks to be able to do two color
+ * components at the same time.
+ */
+
+/*
+ * x_rb = (x_rb * a) / 255
+ */
+#define UN8_rb_MUL_UN8(x, a, t)						\
+    do									\
+    {									\
+	t  = ((x) & RB_MASK) * (a);					\
+	t += RB_ONE_HALF;						\
+	x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
+	x &= RB_MASK;							\
+    } while (0)
+
+/*
+ * x_rb = min (x_rb + y_rb, 255)
+ */
+#ifndef UN8_rb_ADD_UN8_rb
+#define UN8_rb_ADD_UN8_rb(x, y, t)					\
+    do									\
+    {									\
+	t = ((x) + (y));						\
+	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\
+	x = (t & RB_MASK);						\
+    } while (0)
+#endif
+
+/*
+ * x_rb = (x_rb * a_rb) / 255
+ */
+#define UN8_rb_MUL_UN8_rb(x, a, t)					\
+    do									\
+    {									\
+	t  = (x & MASK) * (a & MASK);					\
+	t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);			\
+	t += RB_ONE_HALF;						\
+	t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
+	x = t & RB_MASK;						\
+    } while (0)
+
+/*
+ * x_c = (x_c * a) / 255
+ */
+#define UN8x4_MUL_UN8(x, a)						\
+    do									\
+    {									\
+	uint32_t r1__, r2__, t__;					\
+									\
+	r1__ = (x);							\
+	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
+									\
+	r2__ = (x) >> G_SHIFT;						\
+	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a) / 255 + y_c
+ */
+#define UN8x4_MUL_UN8_ADD_UN8x4(x, a, y)				\
+    do									\
+    {									\
+	uint32_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (y) & RB_MASK;						\
+	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
+	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
+									\
+	r2__ = (x) >> G_SHIFT;						\
+	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
+	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
+	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a + y_c * b) / 255
+ */
+#define UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8(x, a, y, b)			\
+    do									\
+    {									\
+	uint32_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (y);							\
+	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
+	UN8_rb_MUL_UN8 (r2__, (b), t__);				\
+	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
+									\
+	r2__ = ((x) >> G_SHIFT);					\
+	r3__ = ((y) >> G_SHIFT);					\
+	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
+	UN8_rb_MUL_UN8 (r3__, (b), t__);				\
+	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c) / 255
+ */
+#define UN8x4_MUL_UN8x4(x, a)						\
+    do									\
+    {									\
+	uint32_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (a);							\
+	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
+									\
+	r2__ = (x) >> G_SHIFT;						\
+	r3__ = (a) >> G_SHIFT;						\
+	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c) / 255 + y_c
+ */
+#define UN8x4_MUL_UN8x4_ADD_UN8x4(x, a, y)				\
+    do									\
+    {									\
+	uint32_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (a);							\
+	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
+	r2__ = (y) & RB_MASK;						\
+	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
+									\
+	r2__ = ((x) >> G_SHIFT);					\
+	r3__ = ((a) >> G_SHIFT);					\
+	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
+	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
+	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
+									\
+	(x) = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+ * x_c = (x_c * a_c + y_c * b) / 255
+ */
+#define UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8(x, a, y, b)			\
+    do									\
+    {									\
+	uint32_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x);							\
+	r2__ = (a);							\
+	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
+	r2__ = (y);							\
+	UN8_rb_MUL_UN8 (r2__, (b), t__);				\
+	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
+									\
+	r2__ = (x) >> G_SHIFT;						\
+	r3__ = (a) >> G_SHIFT;						\
+	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
+	r3__ = (y) >> G_SHIFT;						\
+	UN8_rb_MUL_UN8 (r3__, (b), t__);				\
+	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
+									\
+	x = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+
+/*
+  x_c = min(x_c + y_c, 255)
+*/
+#ifndef UN8x4_ADD_UN8x4
+#define UN8x4_ADD_UN8x4(x, y)						\
+    do									\
+    {									\
+	uint32_t r1__, r2__, r3__, t__;					\
+									\
+	r1__ = (x) & RB_MASK;						\
+	r2__ = (y) & RB_MASK;						\
+	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
+									\
+	r2__ = ((x) >> G_SHIFT) & RB_MASK;				\
+	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
+	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
+									\
+	x = r1__ | (r2__ << G_SHIFT);					\
+    } while (0)
+#endif
diff --git a/src/pixman/pixman/pixman-compiler.h b/src/pixman/pixman/pixman-compiler.h
new file mode 100644
index 0000000..2489adc
--- /dev/null
+++ b/src/pixman/pixman/pixman-compiler.h
@@ -0,0 +1,232 @@
+/* Pixman uses some non-standard compiler features. This file ensures
+ * they exist
+ *
+ * The features are:
+ *
+ *    FUNC	     must be defined to expand to the current function
+ *    PIXMAN_EXPORT  should be defined to whatever is required to
+ *                   export functions from a shared library
+ *    limits	     limits for various types must be defined
+ *    inline         must be defined
+ *    force_inline   must be defined
+ */
+#if defined (__GNUC__)
+#  define FUNC     ((const char*) (__PRETTY_FUNCTION__))
+#elif defined (__sun) || (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#  define FUNC     ((const char*) (__func__))
+#else
+#  define FUNC     ((const char*) ("???"))
+#endif
+
+#if defined (__GNUC__)
+#  define unlikely(expr) __builtin_expect ((expr), 0)
+#else
+#  define unlikely(expr)  (expr)
+#endif
+
+#if defined (__GNUC__)
+#  define MAYBE_UNUSED  __attribute__((unused))
+#else
+#  define MAYBE_UNUSED
+#endif
+
+#ifndef INT16_MIN
+# define INT16_MIN              (-32767-1)
+#endif
+
+#ifndef INT16_MAX
+# define INT16_MAX              (32767)
+#endif
+
+#ifndef INT32_MIN
+# define INT32_MIN              (-2147483647-1)
+#endif
+
+#ifndef INT32_MAX
+# define INT32_MAX              (2147483647)
+#endif
+
+#ifndef UINT32_MIN
+# define UINT32_MIN             (0)
+#endif
+
+#ifndef UINT32_MAX
+# define UINT32_MAX             (4294967295U)
+#endif
+
+#ifndef INT64_MIN
+# define INT64_MIN              (-9223372036854775807-1)
+#endif
+
+#ifndef INT64_MAX
+# define INT64_MAX              (9223372036854775807)
+#endif
+
+#ifndef SIZE_MAX
+# define SIZE_MAX               ((size_t)-1)
+#endif
+
+
+#ifndef M_PI
+# define M_PI			3.14159265358979323846
+#endif
+
+#ifdef _MSC_VER
+/* 'inline' is available only in C++ in MSVC */
+#   define inline __inline
+#   define force_inline __forceinline
+#   define noinline __declspec(noinline)
+#elif defined __GNUC__ || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
+#   define inline __inline__
+#   define force_inline __inline__ __attribute__ ((__always_inline__))
+#   define noinline __attribute__((noinline))
+#else
+#   ifndef force_inline
+#      define force_inline inline
+#   endif
+#   ifndef noinline
+#      define noinline
+#   endif
+#endif
+
+/* GCC visibility */
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
+#   define PIXMAN_EXPORT __attribute__ ((visibility("default")))
+/* Sun Studio 8 visibility */
+#elif defined(__SUNPRO_C) && (__SUNPRO_C >= 0x550)
+#   define PIXMAN_EXPORT __global
+#else
+#   define PIXMAN_EXPORT
+#endif
+
+/* member offsets */
+#define CONTAINER_OF(type, member, data)				\
+    ((type *)(((uint8_t *)data) - offsetof (type, member)))
+
+/* TLS */
+#if defined(PIXMAN_NO_TLS)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(TLS)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static TLS type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(__MINGW32__)
+
+#   define _NO_W32_PSEUDO_MODIFIERS
+#   include <windows.h>
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static volatile int tls_ ## name ## _initialized = 0;		\
+    static void *tls_ ## name ## _mutex = NULL;				\
+    static unsigned tls_ ## name ## _index;				\
+									\
+    static type *							\
+    tls_ ## name ## _alloc (void)					\
+    {									\
+        type *value = calloc (1, sizeof (type));			\
+        if (value)							\
+            TlsSetValue (tls_ ## name ## _index, value);		\
+        return value;							\
+    }									\
+									\
+    static force_inline type *						\
+    tls_ ## name ## _get (void)						\
+    {									\
+	type *value;							\
+	if (!tls_ ## name ## _initialized)				\
+	{								\
+	    if (!tls_ ## name ## _mutex)				\
+	    {								\
+		void *mutex = CreateMutexA (NULL, 0, NULL);		\
+		if (InterlockedCompareExchangePointer (			\
+			&tls_ ## name ## _mutex, mutex, NULL) != NULL)	\
+		{							\
+		    CloseHandle (mutex);				\
+		}							\
+	    }								\
+	    WaitForSingleObject (tls_ ## name ## _mutex, 0xFFFFFFFF);	\
+	    if (!tls_ ## name ## _initialized)				\
+	    {								\
+		tls_ ## name ## _index = TlsAlloc ();			\
+		tls_ ## name ## _initialized = 1;			\
+	    }								\
+	    ReleaseMutex (tls_ ## name ## _mutex);			\
+	}								\
+	if (tls_ ## name ## _index == 0xFFFFFFFF)			\
+	    return NULL;						\
+	value = TlsGetValue (tls_ ## name ## _index);			\
+	if (!value)							\
+	    value = tls_ ## name ## _alloc ();				\
+	return value;							\
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    tls_ ## name ## _get ()
+
+#elif defined(_MSC_VER)
+
+#   define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static __declspec(thread) type name
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    (&name)
+
+#elif defined(HAVE_PTHREADS)
+
+#include <pthread.h>
+
+#  define PIXMAN_DEFINE_THREAD_LOCAL(type, name)			\
+    static pthread_once_t tls_ ## name ## _once_control = PTHREAD_ONCE_INIT; \
+    static pthread_key_t tls_ ## name ## _key;				\
+									\
+    static void								\
+    tls_ ## name ## _destroy_value (void *value)			\
+    {									\
+	free (value);							\
+    }									\
+									\
+    static void								\
+    tls_ ## name ## _make_key (void)					\
+    {									\
+	pthread_key_create (&tls_ ## name ## _key,			\
+			    tls_ ## name ## _destroy_value);		\
+    }									\
+									\
+    static type *							\
+    tls_ ## name ## _alloc (void)					\
+    {									\
+	type *value = calloc (1, sizeof (type));			\
+	if (value)							\
+	    pthread_setspecific (tls_ ## name ## _key, value);		\
+	return value;							\
+    }									\
+									\
+    static force_inline type *						\
+    tls_ ## name ## _get (void)						\
+    {									\
+	type *value = NULL;						\
+	if (pthread_once (&tls_ ## name ## _once_control,		\
+			  tls_ ## name ## _make_key) == 0)		\
+	{								\
+	    value = pthread_getspecific (tls_ ## name ## _key);		\
+	    if (!value)							\
+		value = tls_ ## name ## _alloc ();			\
+	}								\
+	return value;							\
+    }
+
+#   define PIXMAN_GET_THREAD_LOCAL(name)				\
+    tls_ ## name ## _get ()
+
+#else
+
+#    error "Unknown thread local support for this system. Pixman will not work with multiple threads. Define PIXMAN_NO_TLS to acknowledge and accept this limitation and compile pixman without thread-safety support."
+
+#endif
diff --git a/src/pixman/pixman/pixman-conical-gradient.c b/src/pixman/pixman/pixman-conical-gradient.c
new file mode 100644
index 0000000..8bb46ae
--- /dev/null
+++ b/src/pixman/pixman/pixman-conical-gradient.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+static force_inline double
+coordinates_to_parameter (double x, double y, double angle)
+{
+    double t;
+
+    t = atan2 (y, x) + angle;
+
+    while (t < 0)
+	t += 2 * M_PI;
+
+    while (t >= 2 * M_PI)
+	t -= 2 * M_PI;
+
+    return 1 - t * (1 / (2 * M_PI)); /* Scale t to [0, 1] and
+				      * make rotation CCW
+				      */
+}
+
+static uint32_t *
+conical_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+
+    gradient_t *gradient = (gradient_t *)image;
+    conical_gradient_t *conical = (conical_gradient_t *)image;
+    uint32_t       *end = buffer + width;
+    pixman_gradient_walker_t walker;
+    pixman_bool_t affine = TRUE;
+    double cx = 1.;
+    double cy = 0.;
+    double cz = 0.;
+    double rx = x + 0.5;
+    double ry = y + 0.5;
+    double rz = 1.;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    if (image->common.transform)
+    {
+	pixman_vector_t v;
+
+	/* reference point is the center of the pixel */
+	v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+	v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+	v.vector[2] = pixman_fixed_1;
+
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer;
+
+	cx = image->common.transform->matrix[0][0] / 65536.;
+	cy = image->common.transform->matrix[1][0] / 65536.;
+	cz = image->common.transform->matrix[2][0] / 65536.;
+
+	rx = v.vector[0] / 65536.;
+	ry = v.vector[1] / 65536.;
+	rz = v.vector[2] / 65536.;
+
+	affine =
+	    image->common.transform->matrix[2][0] == 0 &&
+	    v.vector[2] == pixman_fixed_1;
+    }
+
+    if (affine)
+    {
+	rx -= conical->center.x / 65536.;
+	ry -= conical->center.y / 65536.;
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+		double t = coordinates_to_parameter (rx, ry, conical->angle);
+
+		*buffer = _pixman_gradient_walker_pixel (
+		    &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+	    }
+
+	    ++buffer;
+
+	    rx += cx;
+	    ry += cy;
+	}
+    }
+    else
+    {
+	while (buffer < end)
+	{
+	    double x, y;
+
+	    if (!mask || *mask++)
+	    {
+		double t;
+
+		if (rz != 0)
+		{
+		    x = rx / rz;
+		    y = ry / rz;
+		}
+		else
+		{
+		    x = y = 0.;
+		}
+
+		x -= conical->center.x / 65536.;
+		y -= conical->center.y / 65536.;
+
+		t = coordinates_to_parameter (x, y, conical->angle);
+
+		*buffer = _pixman_gradient_walker_pixel (
+		    &walker, (pixman_fixed_48_16_t)pixman_double_to_fixed (t));
+	    }
+
+	    ++buffer;
+
+	    rx += cx;
+	    ry += cy;
+	    rz += cz;
+	}
+    }
+
+    iter->y++;
+    return iter->buffer;
+}
+
+static uint32_t *
+conical_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *buffer = conical_get_scanline_narrow (iter, NULL);
+
+    pixman_expand_to_float (
+	(argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer;
+}
+
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->iter_flags & ITER_NARROW)
+	iter->get_scanline = conical_get_scanline_narrow;
+    else
+	iter->get_scanline = conical_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_conical_gradient (const pixman_point_fixed_t *  center,
+                                      pixman_fixed_t                angle,
+                                      const pixman_gradient_stop_t *stops,
+                                      int                           n_stops)
+{
+    pixman_image_t *image = _pixman_image_allocate ();
+    conical_gradient_t *conical;
+
+    if (!image)
+	return NULL;
+
+    conical = &image->conical;
+
+    if (!_pixman_init_gradient (&conical->common, stops, n_stops))
+    {
+	free (image);
+	return NULL;
+    }
+
+    angle = MOD (angle, pixman_int_to_fixed (360));
+
+    image->type = CONICAL;
+
+    conical->center = *center;
+    conical->angle = (pixman_fixed_to_double (angle) / 180.0) * M_PI;
+
+    return image;
+}
+
diff --git a/src/pixman/pixman/pixman-edge-accessors.c b/src/pixman/pixman/pixman-edge-accessors.c
new file mode 100644
index 0000000..ea3a31e
--- /dev/null
+++ b/src/pixman/pixman/pixman-edge-accessors.c
@@ -0,0 +1,4 @@
+
+#define PIXMAN_FB_ACCESSORS
+
+#include "pixman-edge.c"
diff --git a/src/pixman/pixman/pixman-edge-imp.h b/src/pixman/pixman/pixman-edge-imp.h
new file mode 100644
index 0000000..a4698ed
--- /dev/null
+++ b/src/pixman/pixman/pixman-edge-imp.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef rasterize_span
+#endif
+
+static void
+RASTERIZE_EDGES (pixman_image_t  *image,
+		pixman_edge_t	*l,
+		pixman_edge_t	*r,
+		pixman_fixed_t		t,
+		pixman_fixed_t		b)
+{
+    pixman_fixed_t  y = t;
+    uint32_t  *line;
+    uint32_t *buf = (image)->bits.bits;
+    int stride = (image)->bits.rowstride;
+    int width = (image)->bits.width;
+
+    line = buf + pixman_fixed_to_int (y) * stride;
+
+    for (;;)
+    {
+	pixman_fixed_t	lx;
+	pixman_fixed_t      rx;
+	int	lxi;
+	int rxi;
+
+	lx = l->x;
+	rx = r->x;
+#if N_BITS == 1
+	/* For the non-antialiased case, round the coordinates up, in effect
+	 * sampling just slightly to the left of the pixel. This is so that
+	 * when the sample point lies exactly on the line, we round towards
+	 * north-west.
+	 *
+	 * (The AA case does a similar  adjustment in RENDER_SAMPLES_X)
+	 */
+	lx += X_FRAC_FIRST(1) - pixman_fixed_e;
+	rx += X_FRAC_FIRST(1) - pixman_fixed_e;
+#endif
+	/* clip X */
+	if (lx < 0)
+	    lx = 0;
+	if (pixman_fixed_to_int (rx) >= width)
+#if N_BITS == 1
+	    rx = pixman_int_to_fixed (width);
+#else
+	    /* Use the last pixel of the scanline, covered 100%.
+	     * We can't use the first pixel following the scanline,
+	     * because accessing it could result in a buffer overrun.
+	     */
+	    rx = pixman_int_to_fixed (width) - 1;
+#endif
+
+	/* Skip empty (or backwards) sections */
+	if (rx > lx)
+	{
+
+	    /* Find pixel bounds for span */
+	    lxi = pixman_fixed_to_int (lx);
+	    rxi = pixman_fixed_to_int (rx);
+
+#if N_BITS == 1
+	    {
+
+#define LEFT_MASK(x)							\
+		(((x) & 0x1f) ?						\
+		 SCREEN_SHIFT_RIGHT (0xffffffff, (x) & 0x1f) : 0)
+#define RIGHT_MASK(x)							\
+		(((32 - (x)) & 0x1f) ?					\
+		 SCREEN_SHIFT_LEFT (0xffffffff, (32 - (x)) & 0x1f) : 0)
+		
+#define MASK_BITS(x,w,l,n,r) {						\
+		    n = (w);						\
+		    r = RIGHT_MASK ((x) + n);				\
+		    l = LEFT_MASK (x);					\
+		    if (l) {						\
+			n -= 32 - ((x) & 0x1f);				\
+			if (n < 0) {					\
+			    n = 0;					\
+			    l &= r;					\
+			    r = 0;					\
+			}						\
+		    }							\
+		    n >>= 5;						\
+		}
+		
+		uint32_t  *a = line;
+		uint32_t  startmask;
+		uint32_t  endmask;
+		int	    nmiddle;
+		int	    width = rxi - lxi;
+		int	    x = lxi;
+		
+		a += x >> 5;
+		x &= 0x1f;
+		
+		MASK_BITS (x, width, startmask, nmiddle, endmask);
+
+		if (startmask) {
+		    WRITE(image, a, READ(image, a) | startmask);
+		    a++;
+		}
+		while (nmiddle--)
+		    WRITE(image, a++, 0xffffffff);
+		if (endmask)
+		    WRITE(image, a, READ(image, a) | endmask);
+	    }
+#else
+	    {
+		DEFINE_ALPHA(line,lxi);
+		int	    lxs;
+		int     rxs;
+
+		/* Sample coverage for edge pixels */
+		lxs = RENDER_SAMPLES_X (lx, N_BITS);
+		rxs = RENDER_SAMPLES_X (rx, N_BITS);
+
+		/* Add coverage across row */
+		if (lxi == rxi)
+		{
+		    ADD_ALPHA (rxs - lxs);
+		}
+		else
+		{
+		    int	xi;
+
+		    ADD_ALPHA (N_X_FRAC(N_BITS) - lxs);
+		    STEP_ALPHA;
+		    for (xi = lxi + 1; xi < rxi; xi++)
+		    {
+			ADD_ALPHA (N_X_FRAC(N_BITS));
+			STEP_ALPHA;
+		    }
+		    ADD_ALPHA (rxs);
+		}
+	    }
+#endif
+	}
+
+	if (y == b)
+	    break;
+
+#if N_BITS > 1
+	if (pixman_fixed_frac (y) != Y_FRAC_LAST(N_BITS))
+	{
+	    RENDER_EDGE_STEP_SMALL (l);
+	    RENDER_EDGE_STEP_SMALL (r);
+	    y += STEP_Y_SMALL(N_BITS);
+	}
+	else
+#endif
+	{
+	    RENDER_EDGE_STEP_BIG (l);
+	    RENDER_EDGE_STEP_BIG (r);
+	    y += STEP_Y_BIG(N_BITS);
+	    line += stride;
+	}
+    }
+}
+
+#undef rasterize_span
diff --git a/src/pixman/pixman/pixman-edge.c b/src/pixman/pixman/pixman-edge.c
new file mode 100644
index 0000000..ad6dfc4
--- /dev/null
+++ b/src/pixman/pixman/pixman-edge.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#include "pixman-private.h"
+#include "pixman-accessor.h"
+
+/*
+ * Step across a small sample grid gap
+ */
+#define RENDER_EDGE_STEP_SMALL(edge)					\
+    {									\
+	edge->x += edge->stepx_small;					\
+	edge->e += edge->dx_small;					\
+	if (edge->e > 0)						\
+	{								\
+	    edge->e -= edge->dy;					\
+	    edge->x += edge->signdx;					\
+	}								\
+    }
+
+/*
+ * Step across a large sample grid gap
+ */
+#define RENDER_EDGE_STEP_BIG(edge)					\
+    {									\
+	edge->x += edge->stepx_big;					\
+	edge->e += edge->dx_big;					\
+	if (edge->e > 0)						\
+	{								\
+	    edge->e -= edge->dy;					\
+	    edge->x += edge->signdx;					\
+	}								\
+    }
+
+#ifdef PIXMAN_FB_ACCESSORS
+#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_accessors
+#else
+#define PIXMAN_RASTERIZE_EDGES pixman_rasterize_edges_no_accessors
+#endif
+
+/*
+ * 4 bit alpha
+ */
+
+#define N_BITS  4
+#define RASTERIZE_EDGES rasterize_edges_4
+
+#ifndef WORDS_BIGENDIAN
+#define SHIFT_4(o)      ((o) << 2)
+#else
+#define SHIFT_4(o)      ((1 - (o)) << 2)
+#endif
+
+#define GET_4(x, o)      (((x) >> SHIFT_4 (o)) & 0xf)
+#define PUT_4(x, o, v)							\
+    (((x) & ~(0xf << SHIFT_4 (o))) | (((v) & 0xf) << SHIFT_4 (o)))
+
+#define DEFINE_ALPHA(line, x)						\
+    uint8_t   *__ap = (uint8_t *) line + ((x) >> 1);			\
+    int __ao = (x) & 1
+
+#define STEP_ALPHA      ((__ap += __ao), (__ao ^= 1))
+
+#define ADD_ALPHA(a)							\
+    {									\
+        uint8_t __o = READ (image, __ap);				\
+        uint8_t __a = (a) + GET_4 (__o, __ao);				\
+        WRITE (image, __ap, PUT_4 (__o, __ao, __a | (0 - ((__a) >> 4)))); \
+    }
+
+#include "pixman-edge-imp.h"
+
+#undef ADD_ALPHA
+#undef STEP_ALPHA
+#undef DEFINE_ALPHA
+#undef RASTERIZE_EDGES
+#undef N_BITS
+
+
+/*
+ * 1 bit alpha
+ */
+
+#define N_BITS 1
+#define RASTERIZE_EDGES rasterize_edges_1
+
+#include "pixman-edge-imp.h"
+
+#undef RASTERIZE_EDGES
+#undef N_BITS
+
+/*
+ * 8 bit alpha
+ */
+
+static force_inline uint8_t
+clip255 (int x)
+{
+    if (x > 255)
+	return 255;
+
+    return x;
+}
+
+#define ADD_SATURATE_8(buf, val, length)				\
+    do									\
+    {									\
+        int i__ = (length);						\
+        uint8_t *buf__ = (buf);						\
+        int val__ = (val);						\
+									\
+        while (i__--)							\
+        {								\
+            WRITE (image, (buf__), clip255 (READ (image, (buf__)) + (val__))); \
+            (buf__)++;							\
+	}								\
+    } while (0)
+
+/*
+ * We want to detect the case where we add the same value to a long
+ * span of pixels.  The triangles on the end are filled in while we
+ * count how many sub-pixel scanlines contribute to the middle section.
+ *
+ *                 +--------------------------+
+ *  fill_height =|   \                      /
+ *                     +------------------+
+ *                      |================|
+ *                   fill_start       fill_end
+ */
+static void
+rasterize_edges_8 (pixman_image_t *image,
+                   pixman_edge_t * l,
+                   pixman_edge_t * r,
+                   pixman_fixed_t  t,
+                   pixman_fixed_t  b)
+{
+    pixman_fixed_t y = t;
+    uint32_t  *line;
+    int fill_start = -1, fill_end = -1;
+    int fill_size = 0;
+    uint32_t *buf = (image)->bits.bits;
+    int stride = (image)->bits.rowstride;
+    int width = (image)->bits.width;
+
+    line = buf + pixman_fixed_to_int (y) * stride;
+
+    for (;;)
+    {
+        uint8_t *ap = (uint8_t *) line;
+        pixman_fixed_t lx, rx;
+        int lxi, rxi;
+
+        /* clip X */
+        lx = l->x;
+        if (lx < 0)
+	    lx = 0;
+
+        rx = r->x;
+
+        if (pixman_fixed_to_int (rx) >= width)
+	{
+	    /* Use the last pixel of the scanline, covered 100%.
+	     * We can't use the first pixel following the scanline,
+	     * because accessing it could result in a buffer overrun.
+	     */
+	    rx = pixman_int_to_fixed (width) - 1;
+	}
+
+        /* Skip empty (or backwards) sections */
+        if (rx > lx)
+        {
+            int lxs, rxs;
+
+            /* Find pixel bounds for span. */
+            lxi = pixman_fixed_to_int (lx);
+            rxi = pixman_fixed_to_int (rx);
+
+            /* Sample coverage for edge pixels */
+            lxs = RENDER_SAMPLES_X (lx, 8);
+            rxs = RENDER_SAMPLES_X (rx, 8);
+
+            /* Add coverage across row */
+            if (lxi == rxi)
+            {
+                WRITE (image, ap + lxi,
+		       clip255 (READ (image, ap + lxi) + rxs - lxs));
+	    }
+            else
+            {
+                WRITE (image, ap + lxi,
+		       clip255 (READ (image, ap + lxi) + N_X_FRAC (8) - lxs));
+
+                /* Move forward so that lxi/rxi is the pixel span */
+                lxi++;
+
+                /* Don't bother trying to optimize the fill unless
+		 * the span is longer than 4 pixels. */
+                if (rxi - lxi > 4)
+                {
+                    if (fill_start < 0)
+                    {
+                        fill_start = lxi;
+                        fill_end = rxi;
+                        fill_size++;
+		    }
+                    else
+                    {
+                        if (lxi >= fill_end || rxi < fill_start)
+                        {
+                            /* We're beyond what we saved, just fill it */
+                            ADD_SATURATE_8 (ap + fill_start,
+                                            fill_size * N_X_FRAC (8),
+                                            fill_end - fill_start);
+                            fill_start = lxi;
+                            fill_end = rxi;
+                            fill_size = 1;
+			}
+                        else
+                        {
+                            /* Update fill_start */
+                            if (lxi > fill_start)
+                            {
+                                ADD_SATURATE_8 (ap + fill_start,
+                                                fill_size * N_X_FRAC (8),
+                                                lxi - fill_start);
+                                fill_start = lxi;
+			    }
+                            else if (lxi < fill_start)
+                            {
+                                ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8),
+                                                fill_start - lxi);
+			    }
+
+                            /* Update fill_end */
+                            if (rxi < fill_end)
+                            {
+                                ADD_SATURATE_8 (ap + rxi,
+                                                fill_size * N_X_FRAC (8),
+                                                fill_end - rxi);
+                                fill_end = rxi;
+			    }
+                            else if (fill_end < rxi)
+                            {
+                                ADD_SATURATE_8 (ap + fill_end,
+                                                N_X_FRAC (8),
+                                                rxi - fill_end);
+			    }
+                            fill_size++;
+			}
+		    }
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + lxi, N_X_FRAC (8), rxi - lxi);
+		}
+
+                WRITE (image, ap + rxi, clip255 (READ (image, ap + rxi) + rxs));
+	    }
+	}
+
+        if (y == b)
+        {
+            /* We're done, make sure we clean up any remaining fill. */
+            if (fill_start != fill_end)
+            {
+                if (fill_size == N_Y_FRAC (8))
+                {
+                    MEMSET_WRAPPED (image, ap + fill_start,
+				    0xff, fill_end - fill_start);
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
+                                    fill_end - fill_start);
+		}
+	    }
+            break;
+	}
+
+        if (pixman_fixed_frac (y) != Y_FRAC_LAST (8))
+        {
+            RENDER_EDGE_STEP_SMALL (l);
+            RENDER_EDGE_STEP_SMALL (r);
+            y += STEP_Y_SMALL (8);
+	}
+        else
+        {
+            RENDER_EDGE_STEP_BIG (l);
+            RENDER_EDGE_STEP_BIG (r);
+            y += STEP_Y_BIG (8);
+            if (fill_start != fill_end)
+            {
+                if (fill_size == N_Y_FRAC (8))
+                {
+                    MEMSET_WRAPPED (image, ap + fill_start,
+				    0xff, fill_end - fill_start);
+		}
+                else
+                {
+                    ADD_SATURATE_8 (ap + fill_start, fill_size * N_X_FRAC (8),
+                                    fill_end - fill_start);
+		}
+		
+                fill_start = fill_end = -1;
+                fill_size = 0;
+	    }
+	    
+            line += stride;
+	}
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+static
+#endif
+void
+PIXMAN_RASTERIZE_EDGES (pixman_image_t *image,
+                        pixman_edge_t * l,
+                        pixman_edge_t * r,
+                        pixman_fixed_t  t,
+                        pixman_fixed_t  b)
+{
+    switch (PIXMAN_FORMAT_BPP (image->bits.format))
+    {
+    case 1:
+	rasterize_edges_1 (image, l, r, t, b);
+	break;
+
+    case 4:
+	rasterize_edges_4 (image, l, r, t, b);
+	break;
+
+    case 8:
+	rasterize_edges_8 (image, l, r, t, b);
+	break;
+
+    default:
+        break;
+    }
+}
+
+#ifndef PIXMAN_FB_ACCESSORS
+
+PIXMAN_EXPORT void
+pixman_rasterize_edges (pixman_image_t *image,
+                        pixman_edge_t * l,
+                        pixman_edge_t * r,
+                        pixman_fixed_t  t,
+                        pixman_fixed_t  b)
+{
+    return_if_fail (image->type == BITS);
+    return_if_fail (PIXMAN_FORMAT_TYPE (image->bits.format) == PIXMAN_TYPE_A);
+    
+    if (image->bits.read_func || image->bits.write_func)
+	pixman_rasterize_edges_accessors (image, l, r, t, b);
+    else
+	pixman_rasterize_edges_no_accessors (image, l, r, t, b);
+}
+
+#endif
diff --git a/src/pixman/pixman/pixman-fast-path.c b/src/pixman/pixman/pixman-fast-path.c
new file mode 100644
index 0000000..c6e43de
--- /dev/null
+++ b/src/pixman/pixman/pixman-fast-path.c
@@ -0,0 +1,3292 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static force_inline uint32_t
+fetch_24 (uint8_t *a)
+{
+    if (((uintptr_t)a) & 1)
+    {
+#ifdef WORDS_BIGENDIAN
+	return (*a << 16) | (*(uint16_t *)(a + 1));
+#else
+	return *a | (*(uint16_t *)(a + 1) << 8);
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+	return (*(uint16_t *)a << 8) | *(a + 2);
+#else
+	return *(uint16_t *)a | (*(a + 2) << 16);
+#endif
+    }
+}
+
+static force_inline void
+store_24 (uint8_t *a,
+          uint32_t v)
+{
+    if (((uintptr_t)a) & 1)
+    {
+#ifdef WORDS_BIGENDIAN
+	*a = (uint8_t) (v >> 16);
+	*(uint16_t *)(a + 1) = (uint16_t) (v);
+#else
+	*a = (uint8_t) (v);
+	*(uint16_t *)(a + 1) = (uint16_t) (v >> 8);
+#endif
+    }
+    else
+    {
+#ifdef WORDS_BIGENDIAN
+	*(uint16_t *)a = (uint16_t)(v >> 8);
+	*(a + 2) = (uint8_t)v;
+#else
+	*(uint16_t *)a = (uint16_t)v;
+	*(a + 2) = (uint8_t)(v >> 16);
+#endif
+    }
+}
+
+static force_inline uint32_t
+over (uint32_t src,
+      uint32_t dest)
+{
+    uint32_t a = ~src >> 24;
+
+    UN8x4_MUL_UN8_ADD_UN8x4 (dest, a, src);
+
+    return dest;
+}
+
+static force_inline uint32_t
+in (uint32_t x,
+    uint8_t  y)
+{
+    uint16_t a = y;
+
+    UN8x4_MUL_UN8 (x, a);
+
+    return x;
+}
+
+/*
+ * Naming convention:
+ *
+ *  op_src_mask_dest
+ */
+static void
+fast_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line;
+    uint32_t    *dst, *dst_line;
+    uint8_t     *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    uint8_t m;
+    uint32_t s, d;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	src = src_line;
+	src_line += src_stride;
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	w = width;
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m)
+	    {
+		s = *src | 0xff000000;
+
+		if (m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    d = in (s, m);
+		    *dst = over (d, *dst);
+		}
+	    }
+	    src++;
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint16_t t;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    if (srca == 0xff)
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    while (w--)
+	    {
+		m = *mask++;
+
+		if (m == 0)
+		    *dst = 0;
+		else if (m != 0xff)
+		    *dst = MUL_UN8 (m, *dst, t);
+
+		dst++;
+	    }
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    while (w--)
+	    {
+		m = *mask++;
+		m = MUL_UN8 (m, srca, t);
+
+		if (m == 0)
+		    *dst = 0;
+		else if (m != 0xff)
+		    *dst = MUL_UN8 (m, *dst, t);
+
+		dst++;
+	    }
+	}
+    }
+}
+
+static void
+fast_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+
+	    if (s == 0)
+		*dst = 0;
+	    else if (s != 0xff)
+		*dst = MUL_UN8 (s, *dst, t);
+
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)
+	    {
+		if (srca == 0xff)
+		    *dst = src;
+		else
+		    *dst = over (src, *dst);
+	    }
+	    else if (m)
+	    {
+		d = in (src, m);
+		*dst = over (d, *dst);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+				   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+
+	    if (ma)
+	    {
+		d = *dst;
+		s = src;
+
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (s, ma, d);
+
+		*dst = s;
+	    }
+
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca, s;
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+	    if (ma == 0xffffffff)
+	    {
+		if (srca == 0xff)
+		    *dst = src;
+		else
+		    *dst = over (src, *dst);
+	    }
+	    else if (ma)
+	    {
+		d = *dst;
+		s = src;
+
+		UN8x4_MUL_UN8x4 (s, ma);
+		UN8x4_MUL_UN8 (ma, srca);
+		ma = ~ma;
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+		*dst = d;
+	    }
+
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8_0888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)
+	    {
+		if (srca == 0xff)
+		{
+		    d = src;
+		}
+		else
+		{
+		    d = fetch_24 (dst);
+		    d = over (src, d);
+		}
+		store_24 (dst, d);
+	    }
+	    else if (m)
+	    {
+		d = over (in (src, m), fetch_24 (dst));
+		store_24 (dst, d);
+	    }
+	    dst += 3;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint8_t     *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    m = *mask++;
+	    if (m == 0xff)
+	    {
+		if (srca == 0xff)
+		{
+		    d = src;
+		}
+		else
+		{
+		    d = *dst;
+		    d = over (src, convert_0565_to_0888 (d));
+		}
+		*dst = convert_8888_to_0565 (d);
+	    }
+	    else if (m)
+	    {
+		d = *dst;
+		d = over (in (src, m), convert_0565_to_0888 (d));
+		*dst = convert_8888_to_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  src, srca, s;
+    uint16_t  src16;
+    uint16_t *dst_line, *dst;
+    uint32_t  d;
+    uint32_t *mask_line, *mask, ma;
+    int dst_stride, mask_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    src16 = convert_8888_to_0565 (src);
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    ma = *mask++;
+	    if (ma == 0xffffffff)
+	    {
+		if (srca == 0xff)
+		{
+		    *dst = src16;
+		}
+		else
+		{
+		    d = *dst;
+		    d = over (src, convert_0565_to_0888 (d));
+		    *dst = convert_8888_to_0565 (d);
+		}
+	    }
+	    else if (ma)
+	    {
+		d = *dst;
+		d = convert_0565_to_0888 (d);
+
+		s = src;
+
+		UN8x4_MUL_UN8x4 (s, ma);
+		UN8x4_MUL_UN8 (ma, srca);
+		ma = ~ma;
+		UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ma, s);
+
+		*dst = convert_8888_to_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    uint8_t a;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+	    if (a == 0xff)
+		*dst = s;
+	    else if (s)
+		*dst = over (s, *dst);
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_src_x888_8888 (pixman_implementation_t *imp,
+			      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	    *dst++ = (*src++) | 0xff000000;
+    }
+}
+
+#if 0
+static void
+fast_composite_over_8888_0888 (pixman_implementation_t *imp,
+			       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 3);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+	    if (a)
+	    {
+		if (a == 0xff)
+		    d = s;
+		else
+		    d = over (s, fetch_24 (dst));
+
+		store_24 (dst, d);
+	    }
+	    dst += 3;
+	}
+    }
+}
+#endif
+
+static void
+fast_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t d;
+    uint32_t    *src_line, *src, s;
+    uint8_t a;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+	    if (s)
+	    {
+		if (a == 0xff)
+		{
+		    d = s;
+		}
+		else
+		{
+		    d = *dst;
+		    d = over (s, convert_0565_to_0888 (d));
+		}
+		*dst = convert_8888_to_0565 (d);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s, d;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    if (s)
+	    {
+		if (s != 0xff)
+		{
+		    d = *dst;
+		    t = d + s;
+		    s = t | (0 - (t >> 8));
+		}
+		*dst = s;
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_0565_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t	d;
+    uint16_t    *src_line, *src;
+    uint32_t	s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    if (s)
+	    {
+		d = *dst;
+		s = convert_0565_to_8888 (s);
+		if (d)
+		{
+		    d = convert_0565_to_8888 (d);
+		    UN8x4_ADD_UN8x4 (s, d);
+		}
+		*dst = convert_8888_to_0565 (s);
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t s, d;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    if (s)
+	    {
+		if (s != 0xffffffff)
+		{
+		    d = *dst;
+		    if (d)
+			UN8x4_ADD_UN8x4 (s, d);
+		}
+		*dst = s;
+	    }
+	    dst++;
+	}
+    }
+}
+
+static void
+fast_composite_add_n_8_8 (pixman_implementation_t *imp,
+			  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    sa = (src >> 24);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w--)
+	{
+	    uint16_t tmp;
+	    uint16_t a;
+	    uint32_t m, d;
+	    uint32_t r;
+
+	    a = *mask++;
+	    d = *dst;
+
+	    m = MUL_UN8 (sa, a, tmp);
+	    r = ADD_UN8 (m, d, tmp);
+
+	    *dst++ = r;
+	}
+    }
+}
+
+#ifdef WORDS_BIGENDIAN
+#define CREATE_BITMASK(n) (0x80000000 >> (n))
+#define UPDATE_BITMASK(n) ((n) >> 1)
+#else
+#define CREATE_BITMASK(n) (1 << (n))
+#define UPDATE_BITMASK(n) ((n) << 1)
+#endif
+
+#define TEST_BIT(p, n)					\
+    (*((p) + ((n) >> 5)) & CREATE_BITMASK ((n) & 31))
+#define SET_BIT(p, n)							\
+    do { *((p) + ((n) >> 5)) |= CREATE_BITMASK ((n) & 31); } while (0);
+
+static void
+fast_composite_add_1_1 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     *dst_line, *dst;
+    uint32_t     *src_line, *src;
+    int           dst_stride, src_stride;
+    int32_t       w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, src_y, uint32_t,
+                           src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, 0, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    /*
+	     * TODO: improve performance by processing uint32_t data instead
+	     *       of individual bits
+	     */
+	    if (TEST_BIT (src, src_x + w))
+		SET_BIT (dst, dest_x + w);
+	}
+    }
+}
+
+static void
+fast_composite_over_n_1_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     src, srca;
+    uint32_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+
+    if (width <= 0)
+	return;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;
+
+    if (srca == 0xff)
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = src;
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = over (src, *dst);
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+}
+
+static void
+fast_composite_over_n_1_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     src, srca;
+    uint16_t    *dst, *dst_line;
+    uint32_t    *mask, *mask_line;
+    int          mask_stride, dst_stride;
+    uint32_t     bitcache, bitmask;
+    int32_t      w;
+    uint32_t     d;
+    uint16_t     src565;
+
+    if (width <= 0)
+	return;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t,
+                           dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, 0, mask_y, uint32_t,
+                           mask_stride, mask_line, 1);
+    mask_line += mask_x >> 5;
+
+    if (srca == 0xff)
+    {
+	src565 = convert_8888_to_0565 (src);
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		    *dst = src565;
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    dst = dst_line;
+	    dst_line += dst_stride;
+	    mask = mask_line;
+	    mask_line += mask_stride;
+	    w = width;
+
+	    bitcache = *mask++;
+	    bitmask = CREATE_BITMASK (mask_x & 31);
+
+	    while (w--)
+	    {
+		if (bitmask == 0)
+		{
+		    bitcache = *mask++;
+		    bitmask = CREATE_BITMASK (0);
+		}
+		if (bitcache & bitmask)
+		{
+		    d = over (src, convert_0565_to_0888 (*dst));
+		    *dst = convert_8888_to_0565 (d);
+		}
+		bitmask = UPDATE_BITMASK (bitmask);
+		dst++;
+	    }
+	}
+    }
+}
+
+/*
+ * Simple bitblt
+ */
+
+static void
+fast_composite_solid_fill (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (dest_image->bits.format == PIXMAN_a1)
+    {
+	src = src >> 31;
+    }
+    else if (dest_image->bits.format == PIXMAN_a8)
+    {
+	src = src >> 24;
+    }
+    else if (dest_image->bits.format == PIXMAN_r5g6b5 ||
+             dest_image->bits.format == PIXMAN_b5g6r5)
+    {
+	src = convert_8888_to_0565 (src);
+    }
+
+    pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+                 PIXMAN_FORMAT_BPP (dest_image->bits.format),
+                 dest_x, dest_y,
+                 width, height,
+                 src);
+}
+
+static void
+fast_composite_src_memcpy (pixman_implementation_t *imp,
+			   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;
+    uint32_t n_bytes = width * bpp;
+    int dst_stride, src_stride;
+    uint8_t    *dst;
+    uint8_t    *src;
+
+    src_stride = src_image->bits.rowstride * 4;
+    dst_stride = dest_image->bits.rowstride * 4;
+
+    src = (uint8_t *)src_image->bits.bits + src_y * src_stride + src_x * bpp;
+    dst = (uint8_t *)dest_image->bits.bits + dest_y * dst_stride + dest_x * bpp;
+
+    while (height--)
+    {
+	memcpy (dst, src, n_bytes);
+
+	dst += dst_stride;
+	src += src_stride;
+    }
+}
+
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, SRC, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (x888_8888_cover, x888, 8888, uint32_t, uint32_t, SRC, COVER)
+FAST_NEAREST (x888_8888_pad, x888, 8888, uint32_t, uint32_t, SRC, PAD)
+FAST_NEAREST (x888_8888_normal, x888, 8888, uint32_t, uint32_t, SRC, NORMAL)
+FAST_NEAREST (8888_8888_cover, 8888, 8888, uint32_t, uint32_t, OVER, COVER)
+FAST_NEAREST (8888_8888_none, 8888, 8888, uint32_t, uint32_t, OVER, NONE)
+FAST_NEAREST (8888_8888_pad, 8888, 8888, uint32_t, uint32_t, OVER, PAD)
+FAST_NEAREST (8888_8888_normal, 8888, 8888, uint32_t, uint32_t, OVER, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, SRC, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, SRC, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, SRC, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (565_565_normal, 0565, 0565, uint16_t, uint16_t, SRC, NORMAL)
+FAST_NEAREST (8888_565_cover, 8888, 0565, uint32_t, uint16_t, OVER, COVER)
+FAST_NEAREST (8888_565_none, 8888, 0565, uint32_t, uint16_t, OVER, NONE)
+FAST_NEAREST (8888_565_pad, 8888, 0565, uint32_t, uint16_t, OVER, PAD)
+FAST_NEAREST (8888_565_normal, 8888, 0565, uint32_t, uint16_t, OVER, NORMAL)
+
+#define REPEAT_MIN_WIDTH    32
+
+static void
+fast_composite_tiled_repeat (pixman_implementation_t *imp,
+			     pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    pixman_composite_func_t func;
+    pixman_format_code_t mask_format;
+    uint32_t src_flags, mask_flags;
+    int32_t sx, sy;
+    int32_t width_remain;
+    int32_t num_pixels;
+    int32_t src_width;
+    int32_t i, j;
+    pixman_image_t extended_src_image;
+    uint32_t extended_src[REPEAT_MIN_WIDTH * 2];
+    pixman_bool_t need_src_extension;
+    uint32_t *src_line;
+    int32_t src_stride;
+    int32_t src_bpp;
+    pixman_composite_info_t info2 = *info;
+
+    src_flags = (info->src_flags & ~FAST_PATH_NORMAL_REPEAT) |
+		    FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+
+    if (mask_image)
+    {
+	mask_format = mask_image->common.extended_format_code;
+	mask_flags = info->mask_flags;
+    }
+    else
+    {
+	mask_format = PIXMAN_null;
+	mask_flags = FAST_PATH_IS_OPAQUE;
+    }
+
+    _pixman_implementation_lookup_composite (
+	imp->toplevel, info->op,
+	src_image->common.extended_format_code, src_flags,
+	mask_format, mask_flags,
+	dest_image->common.extended_format_code, info->dest_flags,
+	&imp, &func);
+
+    src_bpp = PIXMAN_FORMAT_BPP (src_image->bits.format);
+
+    if (src_image->bits.width < REPEAT_MIN_WIDTH		&&
+	(src_bpp == 32 || src_bpp == 16 || src_bpp == 8)	&&
+	!src_image->bits.indexed)
+    {
+	sx = src_x;
+	sx = MOD (sx, src_image->bits.width);
+	sx += width;
+	src_width = 0;
+
+	while (src_width < REPEAT_MIN_WIDTH && src_width <= sx)
+	    src_width += src_image->bits.width;
+
+	src_stride = (src_width * (src_bpp >> 3) + 3) / (int) sizeof (uint32_t);
+
+	/* Initialize/validate stack-allocated temporary image */
+	_pixman_bits_image_init (&extended_src_image, src_image->bits.format,
+				 src_width, 1, &extended_src[0], src_stride,
+				 FALSE);
+	_pixman_image_validate (&extended_src_image);
+
+	info2.src_image = &extended_src_image;
+	need_src_extension = TRUE;
+    }
+    else
+    {
+	src_width = src_image->bits.width;
+	need_src_extension = FALSE;
+    }
+
+    sx = src_x;
+    sy = src_y;
+
+    while (--height >= 0)
+    {
+	sx = MOD (sx, src_width);
+	sy = MOD (sy, src_image->bits.height);
+
+	if (need_src_extension)
+	{
+	    if (src_bpp == 32)
+	    {
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint32_t, src_stride, src_line, 1);
+
+		for (i = 0; i < src_width; )
+		{
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			extended_src[i] = src_line[j];
+		}
+	    }
+	    else if (src_bpp == 16)
+	    {
+		uint16_t *src_line_16;
+
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint16_t, src_stride,
+				       src_line_16, 1);
+		src_line = (uint32_t*)src_line_16;
+
+		for (i = 0; i < src_width; )
+		{
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			((uint16_t*)extended_src)[i] = ((uint16_t*)src_line)[j];
+		}
+	    }
+	    else if (src_bpp == 8)
+	    {
+		uint8_t *src_line_8;
+
+		PIXMAN_IMAGE_GET_LINE (src_image, 0, sy, uint8_t, src_stride,
+				       src_line_8, 1);
+		src_line = (uint32_t*)src_line_8;
+
+		for (i = 0; i < src_width; )
+		{
+		    for (j = 0; j < src_image->bits.width; j++, i++)
+			((uint8_t*)extended_src)[i] = ((uint8_t*)src_line)[j];
+		}
+	    }
+
+	    info2.src_y = 0;
+	}
+	else
+	{
+	    info2.src_y = sy;
+	}
+
+	width_remain = width;
+
+	while (width_remain > 0)
+	{
+	    num_pixels = src_width - sx;
+
+	    if (num_pixels > width_remain)
+		num_pixels = width_remain;
+
+	    info2.src_x = sx;
+	    info2.width = num_pixels;
+	    info2.height = 1;
+
+	    func (imp, &info2);
+
+	    width_remain -= num_pixels;
+	    info2.mask_x += num_pixels;
+	    info2.dest_x += num_pixels;
+	    sx = 0;
+	}
+
+	sx = src_x;
+	sy++;
+	info2.mask_x = info->mask_x;
+	info2.mask_y++;
+	info2.dest_x = info->dest_x;
+	info2.dest_y++;
+    }
+
+    if (need_src_extension)
+	_pixman_image_fini (&extended_src_image);
+}
+
+/* Use more unrolling for src_0565_0565 because it is typically CPU bound */
+static force_inline void
+scaled_nearest_scanline_565_565_SRC (uint16_t *       dst,
+				     const uint16_t * src,
+				     int32_t          w,
+				     pixman_fixed_t   vx,
+				     pixman_fixed_t   unit_x,
+				     pixman_fixed_t   max_vx,
+				     pixman_bool_t    fully_transparent_src)
+{
+    uint16_t tmp1, tmp2, tmp3, tmp4;
+    while ((w -= 4) >= 0)
+    {
+	tmp1 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	tmp2 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	tmp3 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	tmp4 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	*dst++ = tmp1;
+	*dst++ = tmp2;
+	*dst++ = tmp3;
+	*dst++ = tmp4;
+    }
+    if (w & 2)
+    {
+	tmp1 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	tmp2 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	*dst++ = tmp1;
+	*dst++ = tmp2;
+    }
+    if (w & 1)
+	*dst = *(src + pixman_fixed_to_int (vx));
+}
+
+FAST_NEAREST_MAINLOOP (565_565_cover_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, COVER)
+FAST_NEAREST_MAINLOOP (565_565_none_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, NONE)
+FAST_NEAREST_MAINLOOP (565_565_pad_SRC,
+		       scaled_nearest_scanline_565_565_SRC,
+		       uint16_t, uint16_t, PAD)
+
+static force_inline uint32_t
+fetch_nearest (pixman_repeat_t src_repeat,
+	       pixman_format_code_t format,
+	       uint32_t *src, int x, int src_width)
+{
+    if (repeat (src_repeat, &x, src_width))
+    {
+	if (format == PIXMAN_x8r8g8b8 || format == PIXMAN_x8b8g8r8)
+	    return *(src + x) | 0xff000000;
+	else
+	    return *(src + x);
+    }
+    else
+    {
+	return 0;
+    }
+}
+
+static force_inline void
+combine_over (uint32_t s, uint32_t *dst)
+{
+    if (s)
+    {
+	uint8_t ia = 0xff - (s >> 24);
+
+	if (ia)
+	    UN8x4_MUL_UN8_ADD_UN8x4 (*dst, ia, s);
+	else
+	    *dst = s;
+    }
+}
+
+static force_inline void
+combine_src (uint32_t s, uint32_t *dst)
+{
+    *dst = s;
+}
+
+static void
+fast_composite_scaled_nearest (pixman_implementation_t *imp,
+			       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t       *dst_line;
+    uint32_t       *src_line;
+    int             dst_stride, src_stride;
+    int		    src_width, src_height;
+    pixman_repeat_t src_repeat;
+    pixman_fixed_t unit_x, unit_y;
+    pixman_format_code_t src_format;
+    pixman_vector_t v;
+    pixman_fixed_t vy;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be
+     * transformed from destination space to source space
+     */
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, uint32_t, src_stride, src_line, 1);
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))
+	return;
+
+    unit_x = src_image->common.transform->matrix[0][0];
+    unit_y = src_image->common.transform->matrix[1][1];
+
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */
+    v.vector[0] -= pixman_fixed_e;
+    v.vector[1] -= pixman_fixed_e;
+
+    src_height = src_image->bits.height;
+    src_width = src_image->bits.width;
+    src_repeat = src_image->common.repeat;
+    src_format = src_image->bits.format;
+
+    vy = v.vector[1];
+    while (height--)
+    {
+        pixman_fixed_t vx = v.vector[0];
+	int y = pixman_fixed_to_int (vy);
+	uint32_t *dst = dst_line;
+
+	dst_line += dst_stride;
+
+        /* adjust the y location by a unit vector in the y direction
+         * this is equivalent to transforming y+1 of the destination point to source space */
+        vy += unit_y;
+
+	if (!repeat (src_repeat, &y, src_height))
+	{
+	    if (op == PIXMAN_OP_SRC)
+		memset (dst, 0, sizeof (*dst) * width);
+	}
+	else
+	{
+	    int w = width;
+
+	    uint32_t *src = src_line + y * src_stride;
+
+	    while (w >= 2)
+	    {
+		uint32_t s1, s2;
+		int x1, x2;
+
+		x1 = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		x2 = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		w -= 2;
+
+		s1 = fetch_nearest (src_repeat, src_format, src, x1, src_width);
+		s2 = fetch_nearest (src_repeat, src_format, src, x2, src_width);
+
+		if (op == PIXMAN_OP_OVER)
+		{
+		    combine_over (s1, dst++);
+		    combine_over (s2, dst++);
+		}
+		else
+		{
+		    combine_src (s1, dst++);
+		    combine_src (s2, dst++);
+		}
+	    }
+
+	    while (w--)
+	    {
+		uint32_t s;
+		int x;
+
+		x = pixman_fixed_to_int (vx);
+		vx += unit_x;
+
+		s = fetch_nearest (src_repeat, src_format, src, x, src_width);
+
+		if (op == PIXMAN_OP_OVER)
+		    combine_over (s, dst++);
+		else
+		    combine_src (s, dst++);
+	    }
+	}
+    }
+}
+
+#define CACHE_LINE_SIZE 64
+
+#define FAST_SIMPLE_ROTATE(suffix, pix_type)                                  \
+                                                                              \
+static void                                                                   \
+blt_rotated_90_trivial_##suffix (pix_type       *dst,                         \
+				 int             dst_stride,                  \
+				 const pix_type *src,                         \
+				 int             src_stride,                  \
+				 int             w,                           \
+				 int             h)                           \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+	const pix_type *s = src + (h - y - 1);                                \
+	pix_type *d = dst + dst_stride * y;                                   \
+	for (x = 0; x < w; x++)                                               \
+	{                                                                     \
+	    *d++ = *s;                                                        \
+	    s += src_stride;                                                  \
+	}                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_270_trivial_##suffix (pix_type       *dst,                        \
+				  int             dst_stride,                 \
+				  const pix_type *src,                        \
+				  int             src_stride,                 \
+				  int             w,                          \
+				  int             h)                          \
+{                                                                             \
+    int x, y;                                                                 \
+    for (y = 0; y < h; y++)                                                   \
+    {                                                                         \
+	const pix_type *s = src + src_stride * (w - 1) + y;                   \
+	pix_type *d = dst + dst_stride * y;                                   \
+	for (x = 0; x < w; x++)                                               \
+	{                                                                     \
+	    *d++ = *s;                                                        \
+	    s -= src_stride;                                                  \
+	}                                                                     \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_90_##suffix (pix_type       *dst,                                 \
+			 int             dst_stride,                          \
+			 const pix_type *src,                                 \
+			 int             src_stride,                          \
+			 int             W,                                   \
+			 int             H)                                   \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * split processing into handling destination as TILE_SIZExH cache line   \
+     * aligned vertical stripes (optimistically assuming that destination     \
+     * stride is a multiple of cache line, if not - it will be just a bit     \
+     * slower)                                                                \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (leading_pixels > W)                                               \
+	    leading_pixels = W;                                               \
+                                                                              \
+	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst,                                                              \
+	    dst_stride,                                                       \
+	    src,                                                              \
+	    src_stride,                                                       \
+	    leading_pixels,                                                   \
+	    H);                                                               \
+	                                                                      \
+	dst += leading_pixels;                                                \
+	src += leading_pixels * src_stride;                                   \
+	W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+	trailing_pixels = (((uintptr_t)(dst + W) &                            \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (trailing_pixels > W)                                              \
+	    trailing_pixels = W;                                              \
+	W -= trailing_pixels;                                                 \
+    }                                                                         \
+                                                                              \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+	/* aligned middle part TILE_SIZExH */                                 \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst + x,                                                          \
+	    dst_stride,                                                       \
+	    src + src_stride * x,                                             \
+	    src_stride,                                                       \
+	    TILE_SIZE,                                                        \
+	    H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+	blt_rotated_90_trivial_##suffix (                                     \
+	    dst + W,                                                          \
+	    dst_stride,                                                       \
+	    src + W * src_stride,                                             \
+	    src_stride,                                                       \
+	    trailing_pixels,                                                  \
+	    H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+blt_rotated_270_##suffix (pix_type       *dst,                                \
+			  int             dst_stride,                         \
+			  const pix_type *src,                                \
+			  int             src_stride,                         \
+			  int             W,                                  \
+			  int             H)                                  \
+{                                                                             \
+    int x;                                                                    \
+    int leading_pixels = 0, trailing_pixels = 0;                              \
+    const int TILE_SIZE = CACHE_LINE_SIZE / sizeof(pix_type);                 \
+                                                                              \
+    /*                                                                        \
+     * split processing into handling destination as TILE_SIZExH cache line   \
+     * aligned vertical stripes (optimistically assuming that destination     \
+     * stride is a multiple of cache line, if not - it will be just a bit     \
+     * slower)                                                                \
+     */                                                                       \
+                                                                              \
+    if ((uintptr_t)dst & (CACHE_LINE_SIZE - 1))                               \
+    {                                                                         \
+	leading_pixels = TILE_SIZE - (((uintptr_t)dst &                       \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (leading_pixels > W)                                               \
+	    leading_pixels = W;                                               \
+                                                                              \
+	/* unaligned leading part NxH (where N < TILE_SIZE) */                \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst,                                                              \
+	    dst_stride,                                                       \
+	    src + src_stride * (W - leading_pixels),                          \
+	    src_stride,                                                       \
+	    leading_pixels,                                                   \
+	    H);                                                               \
+	                                                                      \
+	dst += leading_pixels;                                                \
+	W -= leading_pixels;                                                  \
+    }                                                                         \
+                                                                              \
+    if ((uintptr_t)(dst + W) & (CACHE_LINE_SIZE - 1))                         \
+    {                                                                         \
+	trailing_pixels = (((uintptr_t)(dst + W) &                            \
+			    (CACHE_LINE_SIZE - 1)) / sizeof(pix_type));       \
+	if (trailing_pixels > W)                                              \
+	    trailing_pixels = W;                                              \
+	W -= trailing_pixels;                                                 \
+	src += trailing_pixels * src_stride;                                  \
+    }                                                                         \
+                                                                              \
+    for (x = 0; x < W; x += TILE_SIZE)                                        \
+    {                                                                         \
+	/* aligned middle part TILE_SIZExH */                                 \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst + x,                                                          \
+	    dst_stride,                                                       \
+	    src + src_stride * (W - x - TILE_SIZE),                           \
+	    src_stride,                                                       \
+	    TILE_SIZE,                                                        \
+	    H);                                                               \
+    }                                                                         \
+                                                                              \
+    if (trailing_pixels)                                                      \
+    {                                                                         \
+	/* unaligned trailing part NxH (where N < TILE_SIZE) */               \
+	blt_rotated_270_trivial_##suffix (                                    \
+	    dst + W,                                                          \
+	    dst_stride,                                                       \
+	    src - trailing_pixels * src_stride,                               \
+	    src_stride,                                                       \
+	    trailing_pixels,                                                  \
+	    H);                                                               \
+    }                                                                         \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+fast_composite_rotate_90_##suffix (pixman_implementation_t *imp,              \
+				   pixman_composite_info_t *info)	      \
+{									      \
+    PIXMAN_COMPOSITE_ARGS (info);					      \
+    pix_type       *dst_line;						      \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+			   dst_stride, dst_line, 1);                          \
+    src_x_t = -src_y + pixman_fixed_to_int (                                  \
+				src_image->common.transform->matrix[0][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e) - height;\
+    src_y_t = src_x + pixman_fixed_to_int (                                   \
+				src_image->common.transform->matrix[1][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+			   src_stride, src_line, 1);                          \
+    blt_rotated_90_##suffix (dst_line, dst_stride, src_line, src_stride,      \
+			     width, height);                                  \
+}                                                                             \
+                                                                              \
+static void                                                                   \
+fast_composite_rotate_270_##suffix (pixman_implementation_t *imp,             \
+				    pixman_composite_info_t *info)            \
+{                                                                             \
+    PIXMAN_COMPOSITE_ARGS (info);					      \
+    pix_type       *dst_line;						      \
+    pix_type       *src_line;                                                 \
+    int             dst_stride, src_stride;                                   \
+    int             src_x_t, src_y_t;                                         \
+                                                                              \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, pix_type,              \
+			   dst_stride, dst_line, 1);                          \
+    src_x_t = src_y + pixman_fixed_to_int (                                   \
+				src_image->common.transform->matrix[0][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e);         \
+    src_y_t = -src_x + pixman_fixed_to_int (                                  \
+				src_image->common.transform->matrix[1][2] +   \
+				pixman_fixed_1 / 2 - pixman_fixed_e) - width; \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x_t, src_y_t, pix_type,             \
+			   src_stride, src_line, 1);                          \
+    blt_rotated_270_##suffix (dst_line, dst_stride, src_line, src_stride,     \
+			      width, height);                                 \
+}
+
+FAST_SIMPLE_ROTATE (8, uint8_t)
+FAST_SIMPLE_ROTATE (565, uint16_t)
+FAST_SIMPLE_ROTATE (8888, uint32_t)
+
+static const pixman_fast_path_t c_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, fast_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r8g8b8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b8g8r8, fast_composite_over_n_8_0888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, fast_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8r8g8b8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, a8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, x8b8g8r8, fast_composite_over_n_1_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, r5g6b5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a1, b5g6r5,   fast_composite_over_n_1_0565),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, fast_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, fast_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, fast_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, fast_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, fast_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, fast_composite_add_0565_0565),
+    PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, fast_composite_add_0565_0565),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, fast_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, fast_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a1, null, a1, fast_composite_add_1_1),
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, fast_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, fast_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8r8g8b8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, x8b8g8r8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a1, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, a8, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, solid, null, r5g6b5, fast_composite_solid_fill),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, fast_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8a8, null, b8g8r8a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8x8, null, b8g8r8x8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, r8g8b8, null, r8g8b8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8, null, b8g8r8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, x8r8g8b8, r5g6b5, 8888_565),
+    SIMPLE_NEAREST_FAST_PATH (SRC, a8r8g8b8, r5g6b5, 8888_565),
+
+    SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, r5g6b5, 565_565),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, x888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, x888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, 8888_8888),
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, 8888_8888),
+
+    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, 8888_565),
+
+#define NEAREST_FAST_PATH(op,s,d)		\
+    {   PIXMAN_OP_ ## op,			\
+	PIXMAN_ ## s, SCALED_NEAREST_FLAGS,	\
+	PIXMAN_null, 0,				\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,	\
+	fast_composite_scaled_nearest,		\
+    }
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (SRC, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (SRC, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, x8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8),
+
+    NEAREST_FAST_PATH (OVER, x8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8),
+    NEAREST_FAST_PATH (OVER, x8b8g8r8, a8b8g8r8),
+    NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8),
+
+#define SIMPLE_ROTATE_FLAGS(angle)					  \
+    (FAST_PATH_ROTATE_ ## angle ## _TRANSFORM	|			  \
+     FAST_PATH_NEAREST_FILTER			|			  \
+     FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	|			  \
+     FAST_PATH_STANDARD_FLAGS)
+
+#define SIMPLE_ROTATE_FAST_PATH(op,s,d,suffix)				  \
+    {   PIXMAN_OP_ ## op,						  \
+	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (90),				  \
+	PIXMAN_null, 0,							  \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
+	fast_composite_rotate_90_##suffix,				  \
+    },									  \
+    {   PIXMAN_OP_ ## op,						  \
+	PIXMAN_ ## s, SIMPLE_ROTATE_FLAGS (270),			  \
+	PIXMAN_null, 0,							  \
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				  \
+	fast_composite_rotate_270_##suffix,				  \
+    }
+
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, 8888),
+    SIMPLE_ROTATE_FAST_PATH (SRC, r5g6b5, r5g6b5, 565),
+    SIMPLE_ROTATE_FAST_PATH (SRC, a8, a8, 8),
+
+    /* Simple repeat fast path entry. */
+    {	PIXMAN_OP_any,
+	PIXMAN_any,
+	(FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE |
+	 FAST_PATH_NORMAL_REPEAT),
+	PIXMAN_any, 0,
+	PIXMAN_any, FAST_PATH_STD_DEST_FLAGS,
+	fast_composite_tiled_repeat
+    },
+
+    {   PIXMAN_OP_NONE	},
+};
+
+#ifdef WORDS_BIGENDIAN
+#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (32 - (offs) - (n)))
+#else
+#define A1_FILL_MASK(n, offs) (((1U << (n)) - 1) << (offs))
+#endif
+
+static force_inline void
+pixman_fill1_line (uint32_t *dst, int offs, int width, int v)
+{
+    if (offs)
+    {
+	int leading_pixels = 32 - offs;
+	if (leading_pixels >= width)
+	{
+	    if (v)
+		*dst |= A1_FILL_MASK (width, offs);
+	    else
+		*dst &= ~A1_FILL_MASK (width, offs);
+	    return;
+	}
+	else
+	{
+	    if (v)
+		*dst++ |= A1_FILL_MASK (leading_pixels, offs);
+	    else
+		*dst++ &= ~A1_FILL_MASK (leading_pixels, offs);
+	    width -= leading_pixels;
+	}
+    }
+    while (width >= 32)
+    {
+	if (v)
+	    *dst++ = 0xFFFFFFFF;
+	else
+	    *dst++ = 0;
+	width -= 32;
+    }
+    if (width > 0)
+    {
+	if (v)
+	    *dst |= A1_FILL_MASK (width, 0);
+	else
+	    *dst &= ~A1_FILL_MASK (width, 0);
+    }
+}
+
+static void
+pixman_fill1 (uint32_t *bits,
+              int       stride,
+              int       x,
+              int       y,
+              int       width,
+              int       height,
+              uint32_t  filler)
+{
+    uint32_t *dst = bits + y * stride + (x >> 5);
+    int offs = x & 31;
+
+    if (filler & 1)
+    {
+	while (height--)
+	{
+	    pixman_fill1_line (dst, offs, width, 1);
+	    dst += stride;
+	}
+    }
+    else
+    {
+	while (height--)
+	{
+	    pixman_fill1_line (dst, offs, width, 0);
+	    dst += stride;
+	}
+    }
+}
+
+static void
+pixman_fill8 (uint32_t *bits,
+              int       stride,
+              int       x,
+              int       y,
+              int       width,
+              int       height,
+              uint32_t  filler)
+{
+    int byte_stride = stride * (int) sizeof (uint32_t);
+    uint8_t *dst = (uint8_t *) bits;
+    uint8_t v = filler & 0xff;
+    int i;
+
+    dst = dst + y * byte_stride + x;
+
+    while (height--)
+    {
+	for (i = 0; i < width; ++i)
+	    dst[i] = v;
+
+	dst += byte_stride;
+    }
+}
+
+static void
+pixman_fill16 (uint32_t *bits,
+               int       stride,
+               int       x,
+               int       y,
+               int       width,
+               int       height,
+               uint32_t  filler)
+{
+    int short_stride =
+	(stride * (int)sizeof (uint32_t)) / (int)sizeof (uint16_t);
+    uint16_t *dst = (uint16_t *)bits;
+    uint16_t v = filler & 0xffff;
+    int i;
+
+    dst = dst + y * short_stride + x;
+
+    while (height--)
+    {
+	for (i = 0; i < width; ++i)
+	    dst[i] = v;
+
+	dst += short_stride;
+    }
+}
+
+static void
+pixman_fill32 (uint32_t *bits,
+               int       stride,
+               int       x,
+               int       y,
+               int       width,
+               int       height,
+               uint32_t  filler)
+{
+    int i;
+
+    bits = bits + y * stride + x;
+
+    while (height--)
+    {
+	for (i = 0; i < width; ++i)
+	    bits[i] = filler;
+
+	bits += stride;
+    }
+}
+
+static pixman_bool_t
+fast_path_fill (pixman_implementation_t *imp,
+                uint32_t *               bits,
+                int                      stride,
+                int                      bpp,
+                int                      x,
+                int                      y,
+                int                      width,
+                int                      height,
+                uint32_t		 filler)
+{
+    switch (bpp)
+    {
+    case 1:
+	pixman_fill1 (bits, stride, x, y, width, height, filler);
+	break;
+
+    case 8:
+	pixman_fill8 (bits, stride, x, y, width, height, filler);
+	break;
+
+    case 16:
+	pixman_fill16 (bits, stride, x, y, width, height, filler);
+	break;
+
+    case 32:
+	pixman_fill32 (bits, stride, x, y, width, height, filler);
+	break;
+
+    default:
+	return FALSE;
+    }
+
+    return TRUE;
+}
+
+/*****************************************************************************/
+
+static uint32_t *
+fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int32_t w = iter->width;
+    uint32_t *dst = iter->buffer;
+    const uint16_t *src = (const uint16_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    /* Align the source buffer at 4 bytes boundary */
+    if (w > 0 && ((uintptr_t)src & 3))
+    {
+	*dst++ = convert_0565_to_8888 (*src++);
+	w--;
+    }
+    /* Process two pixels per iteration */
+    while ((w -= 2) >= 0)
+    {
+	uint32_t sr, sb, sg, t0, t1;
+	uint32_t s = *(const uint32_t *)src;
+	src += 2;
+	sr = (s >> 8) & 0x00F800F8;
+	sb = (s << 3) & 0x00F800F8;
+	sg = (s >> 3) & 0x00FC00FC;
+	sr |= sr >> 5;
+	sb |= sb >> 5;
+	sg |= sg >> 6;
+	t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) |
+	     (sb & 0xFF) | 0xFF000000;
+	t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) |
+	     (sb >> 16) | 0xFF000000;
+#ifdef WORDS_BIGENDIAN
+	*dst++ = t1;
+	*dst++ = t0;
+#else
+	*dst++ = t0;
+	*dst++ = t1;
+#endif
+    }
+    if (w & 1)
+    {
+	*dst = convert_0565_to_8888 (*src);
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
+{
+    iter->bits += iter->stride;
+    return iter->buffer;
+}
+
+/* Helper function for a workaround, which tries to ensure that 0x1F001F
+ * constant is always allocated in a register on RISC architectures.
+ */
+static force_inline uint32_t
+convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F)
+{
+    uint32_t a, b;
+    a = (s >> 3) & x1F001F;
+    b = s & 0xFC00;
+    a |= a >> 5;
+    a |= b >> 5;
+    return a;
+}
+
+static void
+fast_write_back_r5g6b5 (pixman_iter_t *iter)
+{
+    int32_t w = iter->width;
+    uint16_t *dst = (uint16_t *)(iter->bits - iter->stride);
+    const uint32_t *src = iter->buffer;
+    /* Workaround to ensure that x1F001F variable is allocated in a register */
+    static volatile uint32_t volatile_x1F001F = 0x1F001F;
+    uint32_t x1F001F = volatile_x1F001F;
+
+    while ((w -= 4) >= 0)
+    {
+	uint32_t s1 = *src++;
+	uint32_t s2 = *src++;
+	uint32_t s3 = *src++;
+	uint32_t s4 = *src++;
+	*dst++ = convert_8888_to_0565_workaround (s1, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s2, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s3, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s4, x1F001F);
+    }
+    if (w & 2)
+    {
+	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+    }
+    if (w & 1)
+    {
+	*dst = convert_8888_to_0565_workaround (*src, x1F001F);
+    }
+}
+
+typedef struct
+{
+    int		y;
+    uint64_t *	buffer;
+} line_t;
+
+typedef struct
+{
+    line_t		lines[2];
+    pixman_fixed_t	y;
+    pixman_fixed_t	x;
+    uint64_t		data[1];
+} bilinear_info_t;
+
+static void
+fetch_horizontal (bits_image_t *image, line_t *line,
+		  int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
+{
+    uint32_t *bits = image->bits + y * image->rowstride;
+    int i;
+
+    for (i = 0; i < n; ++i)
+    {
+	int x0 = pixman_fixed_to_int (x);
+	int x1 = x0 + 1;
+	int32_t dist_x;
+
+	uint32_t left = *(bits + x0);
+	uint32_t right = *(bits + x1);
+
+	dist_x = pixman_fixed_to_bilinear_weight (x);
+	dist_x <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
+#if SIZEOF_LONG <= 4
+	{
+	    uint32_t lag, rag, ag;
+	    uint32_t lrb, rrb, rb;
+
+	    lag = (left & 0xff00ff00) >> 8;
+	    rag = (right & 0xff00ff00) >> 8;
+	    ag = (lag << 8) + dist_x * (rag - lag);
+
+	    lrb = (left & 0x00ff00ff);
+	    rrb = (right & 0x00ff00ff);
+	    rb = (lrb << 8) + dist_x * (rrb - lrb);
+
+	    *((uint32_t *)(line->buffer + i)) = ag;
+	    *((uint32_t *)(line->buffer + i) + 1) = rb;
+	}
+#else
+	{
+	    uint64_t lagrb, ragrb;
+	    uint32_t lag, rag;
+	    uint32_t lrb, rrb;
+
+	    lag = (left & 0xff00ff00);
+	    lrb = (left & 0x00ff00ff);
+	    rag = (right & 0xff00ff00);
+	    rrb = (right & 0x00ff00ff);
+	    lagrb = (((uint64_t)lag) << 24) | lrb;
+	    ragrb = (((uint64_t)rag) << 24) | rrb;
+
+	    line->buffer[i] = (lagrb << 8) + dist_x * (ragrb - lagrb);
+	}
+#endif
+
+	x += ux;
+    }
+
+    line->y = y;
+}
+
+static uint32_t *
+fast_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_fixed_t fx, ux;
+    bilinear_info_t *info = iter->data;
+    line_t *line0, *line1;
+    int y0, y1;
+    int32_t dist_y;
+    int i;
+
+    fx = info->x;
+    ux = iter->image->common.transform->matrix[0][0];
+
+    y0 = pixman_fixed_to_int (info->y);
+    y1 = y0 + 1;
+    dist_y = pixman_fixed_to_bilinear_weight (info->y);
+    dist_y <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
+    line0 = &info->lines[y0 & 0x01];
+    line1 = &info->lines[y1 & 0x01];
+
+    if (line0->y != y0)
+    {
+	fetch_horizontal (
+	    &iter->image->bits, line0, y0, fx, ux, iter->width);
+    }
+
+    if (line1->y != y1)
+    {
+	fetch_horizontal (
+	    &iter->image->bits, line1, y1, fx, ux, iter->width);
+    }
+
+    for (i = 0; i < iter->width; ++i)
+    {
+#if SIZEOF_LONG <= 4
+	uint32_t ta, tr, tg, tb;
+	uint32_t ba, br, bg, bb;
+	uint32_t tag, trb;
+	uint32_t bag, brb;
+	uint32_t a, r, g, b;
+
+	tag = *((uint32_t *)(line0->buffer + i));
+	trb = *((uint32_t *)(line0->buffer + i) + 1);
+	bag = *((uint32_t *)(line1->buffer + i));
+	brb = *((uint32_t *)(line1->buffer + i) + 1);
+
+	ta = tag >> 16;
+	ba = bag >> 16;
+	a = (ta << 8) + dist_y * (ba - ta);
+
+	tr = trb >> 16;
+	br = brb >> 16;
+	r = (tr << 8) + dist_y * (br - tr);
+
+	tg = tag & 0xffff;
+	bg = bag & 0xffff;
+	g = (tg << 8) + dist_y * (bg - tg);
+	
+	tb = trb & 0xffff;
+	bb = brb & 0xffff;
+	b = (tb << 8) + dist_y * (bb - tb);
+
+	a = (a <<  8) & 0xff000000;
+	r = (r <<  0) & 0x00ff0000;
+	g = (g >>  8) & 0x0000ff00;
+	b = (b >> 16) & 0x000000ff;
+#else
+	uint64_t top = line0->buffer[i];
+	uint64_t bot = line1->buffer[i];
+	uint64_t tar = (top & 0xffff0000ffff0000ULL) >> 16;
+	uint64_t bar = (bot & 0xffff0000ffff0000ULL) >> 16;
+	uint64_t tgb = (top & 0x0000ffff0000ffffULL);
+	uint64_t bgb = (bot & 0x0000ffff0000ffffULL);
+	uint64_t ar, gb;
+	uint32_t a, r, g, b;
+
+	ar = (tar << 8) + dist_y * (bar - tar);
+	gb = (tgb << 8) + dist_y * (bgb - tgb);
+
+	a = ((ar >> 24) & 0xff000000);
+	r = ((ar >>  0) & 0x00ff0000);
+	g = ((gb >> 40) & 0x0000ff00);
+	b = ((gb >> 16) & 0x000000ff);
+#endif
+
+	iter->buffer[i] = a | r | g | b;
+    }
+
+    info->y += iter->image->common.transform->matrix[1][1];
+
+    return iter->buffer;
+}
+
+static void
+bilinear_cover_iter_fini (pixman_iter_t *iter)
+{
+    free (iter->data);
+}
+
+static void
+fast_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
+{
+    int width = iter->width;
+    bilinear_info_t *info;
+    pixman_vector_t v;
+
+    /* Reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (iter->image->common.transform, &v))
+	goto fail;
+
+    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t));
+    if (!info)
+	goto fail;
+
+    info->x = v.vector[0] - pixman_fixed_1 / 2;
+    info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+    /* It is safe to set the y coordinates to -1 initially
+     * because COVER_CLIP_BILINEAR ensures that we will only
+     * be asked to fetch lines in the [0, height) interval
+     */
+    info->lines[0].y = -1;
+    info->lines[0].buffer = &(info->data[0]);
+    info->lines[1].y = -1;
+    info->lines[1].buffer = &(info->data[width]);
+
+    iter->get_scanline = fast_fetch_bilinear_cover;
+    iter->fini = bilinear_cover_iter_fini;
+
+    iter->data = info;
+    return;
+
+fail:
+    /* Something went wrong, either a bad matrix or OOM; in such cases,
+     * we don't guarantee any particular rendering.
+     */
+    _pixman_log_error (
+	FUNC, "Allocation failure or bad matrix, skipping rendering\n");
+    
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+    iter->fini = NULL;
+}
+
+static uint32_t *
+bits_image_fetch_bilinear_no_repeat_8888 (pixman_iter_t *iter,
+					  const uint32_t *mask)
+{
+
+    pixman_image_t * ima = iter->image;
+    int              offset = iter->x;
+    int              line = iter->y++;
+    int              width = iter->width;
+    uint32_t *       buffer = iter->buffer;
+
+    bits_image_t *bits = &ima->bits;
+    pixman_fixed_t x_top, x_bottom, x;
+    pixman_fixed_t ux_top, ux_bottom, ux;
+    pixman_vector_t v;
+    uint32_t top_mask, bottom_mask;
+    uint32_t *top_row;
+    uint32_t *bottom_row;
+    uint32_t *end;
+    uint32_t zero[2] = { 0, 0 };
+    uint32_t one = 1;
+    int y, y1, y2;
+    int disty;
+    int mask_inc;
+    int w;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (bits->common.transform, &v))
+	return iter->buffer;
+
+    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+    y = v.vector[1] - pixman_fixed_1/2;
+    disty = pixman_fixed_to_bilinear_weight (y);
+
+    /* Load the pointers to the first and second lines from the source
+     * image that bilinear code must read.
+     *
+     * The main trick in this code is about the check if any line are
+     * outside of the image;
+     *
+     * When I realize that a line (any one) is outside, I change
+     * the pointer to a dummy area with zeros. Once I change this, I
+     * must be sure the pointer will not change, so I set the
+     * variables to each pointer increments inside the loop.
+     */
+    y1 = pixman_fixed_to_int (y);
+    y2 = y1 + 1;
+
+    if (y1 < 0 || y1 >= bits->height)
+    {
+	top_row = zero;
+	x_top = 0;
+	ux_top = 0;
+    }
+    else
+    {
+	top_row = bits->bits + y1 * bits->rowstride;
+	x_top = x;
+	ux_top = ux;
+    }
+
+    if (y2 < 0 || y2 >= bits->height)
+    {
+	bottom_row = zero;
+	x_bottom = 0;
+	ux_bottom = 0;
+    }
+    else
+    {
+	bottom_row = bits->bits + y2 * bits->rowstride;
+	x_bottom = x;
+	ux_bottom = ux;
+    }
+
+    /* Instead of checking whether the operation uses the mast in
+     * each loop iteration, verify this only once and prepare the
+     * variables to make the code smaller inside the loop.
+     */
+    if (!mask)
+    {
+        mask_inc = 0;
+        mask = &one;
+    }
+    else
+    {
+        /* If have a mask, prepare the variables to check it */
+        mask_inc = 1;
+    }
+
+    /* If both are zero, then the whole thing is zero */
+    if (top_row == zero && bottom_row == zero)
+    {
+	memset (buffer, 0, width * sizeof (uint32_t));
+	return iter->buffer;
+    }
+    else if (bits->format == PIXMAN_x8r8g8b8)
+    {
+	if (top_row == zero)
+	{
+	    top_mask = 0;
+	    bottom_mask = 0xff000000;
+	}
+	else if (bottom_row == zero)
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0;
+	}
+	else
+	{
+	    top_mask = 0xff000000;
+	    bottom_mask = 0xff000000;
+	}
+    }
+    else
+    {
+	top_mask = 0;
+	bottom_mask = 0;
+    }
+
+    end = buffer + width;
+
+    /* Zero fill to the left of the image */
+    while (buffer < end && x < pixman_fixed_minus_1)
+    {
+	*buffer++ = 0;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Left edge
+     */
+    while (buffer < end && x < 0)
+    {
+	uint32_t tr, br;
+	int32_t distx;
+
+	tr = top_row[pixman_fixed_to_int (x_top) + 1] | top_mask;
+	br = bottom_row[pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	distx = pixman_fixed_to_bilinear_weight (x);
+
+	*buffer++ = bilinear_interpolation (0, tr, 0, br, distx, disty);
+
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Main part */
+    w = pixman_int_to_fixed (bits->width - 1);
+
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, tr, bl, br;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    tr = top_row [pixman_fixed_to_int (x_top) + 1] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+	    br = bottom_row [pixman_fixed_to_int (x_bottom) + 1] | bottom_mask;
+
+	    distx = pixman_fixed_to_bilinear_weight (x);
+
+	    *buffer = bilinear_interpolation (tl, tr, bl, br, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Right Edge */
+    w = pixman_int_to_fixed (bits->width);
+    while (buffer < end  &&  x < w)
+    {
+	if (*mask)
+	{
+	    uint32_t tl, bl;
+	    int32_t distx;
+
+	    tl = top_row [pixman_fixed_to_int (x_top)] | top_mask;
+	    bl = bottom_row [pixman_fixed_to_int (x_bottom)] | bottom_mask;
+
+	    distx = pixman_fixed_to_bilinear_weight (x);
+
+	    *buffer = bilinear_interpolation (tl, 0, bl, 0, distx, disty);
+	}
+
+	buffer++;
+	x += ux;
+	x_top += ux_top;
+	x_bottom += ux_bottom;
+	mask += mask_inc;
+    }
+
+    /* Zero fill to the left of the image */
+    while (buffer < end)
+	*buffer++ = 0;
+
+    return iter->buffer;
+}
+
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+static force_inline void
+bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
+					       int              offset,
+					       int              line,
+					       int              width,
+					       uint32_t *       buffer,
+					       const uint32_t * mask,
+
+					       convert_pixel_t	convert_pixel,
+					       pixman_format_code_t	format,
+					       pixman_repeat_t	repeat_mode)
+{
+    bits_image_t *bits = &image->bits;
+    pixman_fixed_t *params = image->common.filter_params;
+    int cwidth = pixman_fixed_to_int (params[0]);
+    int cheight = pixman_fixed_to_int (params[1]);
+    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
+    int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
+    int x_phase_bits = pixman_fixed_to_int (params[2]);
+    int y_phase_bits = pixman_fixed_to_int (params[3]);
+    int x_phase_shift = 16 - x_phase_bits;
+    int y_phase_shift = 16 - y_phase_bits;
+    pixman_fixed_t vx, vy;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int k;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    vx = v.vector[0];
+    vy = v.vector[1];
+
+    for (k = 0; k < width; ++k)
+    {
+	pixman_fixed_t *y_params;
+	int satot, srtot, sgtot, sbtot;
+	pixman_fixed_t x, y;
+	int32_t x1, x2, y1, y2;
+	int32_t px, py;
+	int i, j;
+
+	if (mask && !mask[k])
+	    goto next;
+
+	/* Round x and y to the middle of the closest phase before continuing. This
+	 * ensures that the convolution matrix is aligned right, since it was
+	 * positioned relative to a particular phase (and not relative to whatever
+	 * exact fraction we happen to get here).
+	 */
+	x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1);
+	y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1);
+
+	px = (x & 0xffff) >> x_phase_shift;
+	py = (y & 0xffff) >> y_phase_shift;
+
+	x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+	y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+	x2 = x1 + cwidth;
+	y2 = y1 + cheight;
+
+	satot = srtot = sgtot = sbtot = 0;
+
+	y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight;
+
+	for (i = y1; i < y2; ++i)
+	{
+	    pixman_fixed_t fy = *y_params++;
+
+	    if (fy)
+	    {
+		pixman_fixed_t *x_params = params + 4 + px * cwidth;
+
+		for (j = x1; j < x2; ++j)
+		{
+		    pixman_fixed_t fx = *x_params++;
+		    int rx = j;
+		    int ry = i;
+		    
+		    if (fx)
+		    {
+			pixman_fixed_t f;
+			uint32_t pixel, mask;
+			uint8_t *row;
+
+			mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+			if (repeat_mode != PIXMAN_REPEAT_NONE)
+			{
+			    repeat (repeat_mode, &rx, bits->width);
+			    repeat (repeat_mode, &ry, bits->height);
+
+			    row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+			    pixel = convert_pixel (row, rx) | mask;
+			}
+			else
+			{
+			    if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height)
+			    {
+				pixel = 0;
+			    }
+			    else
+			    {
+				row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+				pixel = convert_pixel (row, rx) | mask;
+			    }
+			}
+
+			f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16;
+			srtot += (int)RED_8 (pixel) * f;
+			sgtot += (int)GREEN_8 (pixel) * f;
+			sbtot += (int)BLUE_8 (pixel) * f;
+			satot += (int)ALPHA_8 (pixel) * f;
+		    }
+		}
+	    }
+	}
+
+	satot = (satot + 0x8000) >> 16;
+	srtot = (srtot + 0x8000) >> 16;
+	sgtot = (sgtot + 0x8000) >> 16;
+	sbtot = (sbtot + 0x8000) >> 16;
+
+	satot = CLIP (satot, 0, 0xff);
+	srtot = CLIP (srtot, 0, 0xff);
+	sgtot = CLIP (sgtot, 0, 0xff);
+	sbtot = CLIP (sbtot, 0, 0xff);
+
+	buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0);
+
+    next:
+	vx += ux;
+	vy += uy;
+    }
+}
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static force_inline void
+bits_image_fetch_bilinear_affine (pixman_image_t * image,
+				  int              offset,
+				  int              line,
+				  int              width,
+				  uint32_t *       buffer,
+				  const uint32_t * mask,
+
+				  convert_pixel_t	convert_pixel,
+				  pixman_format_code_t	format,
+				  pixman_repeat_t	repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	int x1, y1, x2, y2;
+	uint32_t tl, tr, bl, br;
+	int32_t distx, disty;
+	int width = image->bits.width;
+	int height = image->bits.height;
+	const uint8_t *row1;
+	const uint8_t *row2;
+
+	if (mask && !mask[i])
+	    goto next;
+
+	x1 = x - pixman_fixed_1 / 2;
+	y1 = y - pixman_fixed_1 / 2;
+
+	distx = pixman_fixed_to_bilinear_weight (x1);
+	disty = pixman_fixed_to_bilinear_weight (y1);
+
+	y1 = pixman_fixed_to_int (y1);
+	y2 = y1 + 1;
+	x1 = pixman_fixed_to_int (x1);
+	x2 = x1 + 1;
+
+	if (repeat_mode != PIXMAN_REPEAT_NONE)
+	{
+	    uint32_t mask;
+
+	    mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+	    repeat (repeat_mode, &x1, width);
+	    repeat (repeat_mode, &y1, height);
+	    repeat (repeat_mode, &x2, width);
+	    repeat (repeat_mode, &y2, height);
+
+	    row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+	    row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+	    tl = convert_pixel (row1, x1) | mask;
+	    tr = convert_pixel (row1, x2) | mask;
+	    bl = convert_pixel (row2, x1) | mask;
+	    br = convert_pixel (row2, x2) | mask;
+	}
+	else
+	{
+	    uint32_t mask1, mask2;
+	    int bpp;
+
+	    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+	     * which means if you use it in expressions, those
+	     * expressions become unsigned themselves. Since
+	     * the variables below can be negative in some cases,
+	     * that will lead to crashes on 64 bit architectures.
+	     *
+	     * So this line makes sure bpp is signed
+	     */
+	    bpp = PIXMAN_FORMAT_BPP (format);
+
+	    if (x1 >= width || x2 < 0 || y1 >= height || y2 < 0)
+	    {
+		buffer[i] = 0;
+		goto next;
+	    }
+
+	    if (y2 == 0)
+	    {
+		row1 = zero;
+		mask1 = 0;
+	    }
+	    else
+	    {
+		row1 = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+		row1 += bpp / 8 * x1;
+
+		mask1 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+
+	    if (y1 == height - 1)
+	    {
+		row2 = zero;
+		mask2 = 0;
+	    }
+	    else
+	    {
+		row2 = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+		row2 += bpp / 8 * x1;
+
+		mask2 = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+	    }
+
+	    if (x2 == 0)
+	    {
+		tl = 0;
+		bl = 0;
+	    }
+	    else
+	    {
+		tl = convert_pixel (row1, 0) | mask1;
+		bl = convert_pixel (row2, 0) | mask2;
+	    }
+
+	    if (x1 == width - 1)
+	    {
+		tr = 0;
+		br = 0;
+	    }
+	    else
+	    {
+		tr = convert_pixel (row1, 1) | mask1;
+		br = convert_pixel (row2, 1) | mask2;
+	    }
+	}
+
+	buffer[i] = bilinear_interpolation (
+	    tl, tr, bl, br, distx, disty);
+
+    next:
+	x += ux;
+	y += uy;
+    }
+}
+
+static force_inline void
+bits_image_fetch_nearest_affine (pixman_image_t * image,
+				 int              offset,
+				 int              line,
+				 int              width,
+				 uint32_t *       buffer,
+				 const uint32_t * mask,
+				 
+				 convert_pixel_t	convert_pixel,
+				 pixman_format_code_t	format,
+				 pixman_repeat_t	repeat_mode)
+{
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    bits_image_t *bits = &image->bits;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i)
+    {
+	int width, height, x0, y0;
+	const uint8_t *row;
+
+	if (mask && !mask[i])
+	    goto next;
+	
+	width = image->bits.width;
+	height = image->bits.height;
+	x0 = pixman_fixed_to_int (x - pixman_fixed_e);
+	y0 = pixman_fixed_to_int (y - pixman_fixed_e);
+
+	if (repeat_mode == PIXMAN_REPEAT_NONE &&
+	    (y0 < 0 || y0 >= height || x0 < 0 || x0 >= width))
+	{
+	    buffer[i] = 0;
+	}
+	else
+	{
+	    uint32_t mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+	    if (repeat_mode != PIXMAN_REPEAT_NONE)
+	    {
+		repeat (repeat_mode, &x0, width);
+		repeat (repeat_mode, &y0, height);
+	    }
+
+	    row = (uint8_t *)bits->bits + bits->rowstride * 4 * y0;
+
+	    buffer[i] = convert_pixel (row, x0) | mask;
+	}
+
+    next:
+	x += ux;
+	y += uy;
+    }
+}
+
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+    return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+    return *(((uint32_t *)row) + x);
+}
+
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+    return *(row + x) << 24;
+}
+
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+    return convert_0565_to_0888 (*((uint16_t *)row + x));
+}
+
+#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode)  \
+    static uint32_t *							\
+    bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t   *iter, \
+							    const uint32_t * mask) \
+    {									\
+	bits_image_fetch_separable_convolution_affine (                 \
+	    iter->image,                                                \
+	    iter->x, iter->y++,                                         \
+	    iter->width,                                                \
+	    iter->buffer, mask,                                         \
+	    convert_ ## format,                                         \
+	    PIXMAN_ ## format,                                          \
+	    repeat_mode);                                               \
+									\
+	return iter->buffer;                                            \
+    }
+
+#define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
+    static uint32_t *							\
+    bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t   *iter,	\
+					       const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_bilinear_affine (iter->image,			\
+					  iter->x, iter->y++,		\
+					  iter->width,			\
+					  iter->buffer, mask,		\
+					  convert_ ## format,		\
+					  PIXMAN_ ## format,		\
+					  repeat_mode);			\
+	return iter->buffer;						\
+    }
+
+#define MAKE_NEAREST_FETCHER(name, format, repeat_mode)			\
+    static uint32_t *							\
+    bits_image_fetch_nearest_affine_ ## name (pixman_iter_t   *iter,	\
+					      const uint32_t * mask)	\
+    {									\
+	bits_image_fetch_nearest_affine (iter->image,			\
+					 iter->x, iter->y++,		\
+					 iter->width,			\
+					 iter->buffer, mask,		\
+					 convert_ ## format,		\
+					 PIXMAN_ ## format,		\
+					 repeat_mode);			\
+	return iter->buffer;						\
+    }
+
+#define MAKE_FETCHERS(name, format, repeat_mode)			\
+    MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
+    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)			\
+    MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode)
+
+MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8r8g8b8, a8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8r8g8b8,  a8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_x8r8g8b8,     x8r8g8b8, PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_x8r8g8b8,    x8r8g8b8, PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_x8r8g8b8, x8r8g8b8, PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_x8r8g8b8,  x8r8g8b8, PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_a8,           a8,       PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_a8,          a8,       PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_a8,	 a8,       PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_a8,	 a8,       PIXMAN_REPEAT_NORMAL)
+MAKE_FETCHERS (pad_r5g6b5,       r5g6b5,   PIXMAN_REPEAT_PAD)
+MAKE_FETCHERS (none_r5g6b5,      r5g6b5,   PIXMAN_REPEAT_NONE)
+MAKE_FETCHERS (reflect_r5g6b5,   r5g6b5,   PIXMAN_REPEAT_REFLECT)
+MAKE_FETCHERS (normal_r5g6b5,    r5g6b5,   PIXMAN_REPEAT_NORMAL)
+
+#define IMAGE_FLAGS							\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t fast_iters[] = 
+{
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW | ITER_SRC,
+      _pixman_iter_init_bits_stride, fast_fetch_r5g6b5, NULL },
+
+    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST,
+      _pixman_iter_init_bits_stride,
+      fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
+    
+    { PIXMAN_r5g6b5, FAST_PATH_STD_DEST_FLAGS,
+      ITER_NARROW | ITER_DEST | ITER_IGNORE_RGB | ITER_IGNORE_ALPHA,
+      _pixman_iter_init_bits_stride,
+      fast_dest_fetch_noop, fast_write_back_r5g6b5 },
+
+    { PIXMAN_a8r8g8b8,
+      (FAST_PATH_STANDARD_FLAGS			|
+       FAST_PATH_SCALE_TRANSFORM		|
+       FAST_PATH_BILINEAR_FILTER		|
+       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
+      ITER_NARROW | ITER_SRC,
+      fast_bilinear_cover_iter_init,
+      NULL, NULL
+    },
+
+#define FAST_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_X_UNIT_POSITIVE		|				\
+     FAST_PATH_Y_UNIT_ZERO		|				\
+     FAST_PATH_NONE_REPEAT		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+    { PIXMAN_a8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      ITER_NARROW | ITER_SRC,
+      NULL, bits_image_fetch_bilinear_no_repeat_8888, NULL
+    },
+
+    { PIXMAN_x8r8g8b8,
+      FAST_BILINEAR_FLAGS,
+      ITER_NARROW | ITER_SRC,
+      NULL, bits_image_fetch_bilinear_no_repeat_8888, NULL
+    },
+
+#define GENERAL_BILINEAR_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_BILINEAR_FILTER)
+
+#define GENERAL_NEAREST_FLAGS						\
+    (FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_HAS_TRANSFORM		|				\
+     FAST_PATH_AFFINE_TRANSFORM		|				\
+     FAST_PATH_NEAREST_FILTER)
+
+#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS				\
+    (FAST_PATH_NO_ALPHA_MAP            |				\
+     FAST_PATH_NO_ACCESSORS            |				\
+     FAST_PATH_HAS_TRANSFORM           |				\
+     FAST_PATH_AFFINE_TRANSFORM        |				\
+     FAST_PATH_SEPARABLE_CONVOLUTION_FILTER)
+    
+#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)   \
+    { PIXMAN_ ## format,						\
+      GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
+      ITER_NARROW | ITER_SRC,						\
+      NULL, bits_image_fetch_separable_convolution_affine_ ## name, NULL \
+    },
+
+#define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      ITER_NARROW | ITER_SRC,						\
+      NULL, bits_image_fetch_bilinear_affine_ ## name, NULL,		\
+    },
+
+#define NEAREST_AFFINE_FAST_PATH(name, format, repeat)			\
+    { PIXMAN_ ## format,						\
+      GENERAL_NEAREST_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
+      ITER_NARROW | ITER_SRC,						\
+      NULL, bits_image_fetch_nearest_affine_ ## name, NULL		\
+    },
+
+#define AFFINE_FAST_PATHS(name, format, repeat)				\
+    SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)	\
+    BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
+    NEAREST_AFFINE_FAST_PATH(name, format, repeat)
+    
+    AFFINE_FAST_PATHS (pad_a8r8g8b8, a8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_a8r8g8b8, a8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8r8g8b8, a8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_x8r8g8b8, x8r8g8b8, PAD)
+    AFFINE_FAST_PATHS (none_x8r8g8b8, x8r8g8b8, NONE)
+    AFFINE_FAST_PATHS (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+    AFFINE_FAST_PATHS (normal_x8r8g8b8, x8r8g8b8, NORMAL)
+    AFFINE_FAST_PATHS (pad_a8, a8, PAD)
+    AFFINE_FAST_PATHS (none_a8, a8, NONE)
+    AFFINE_FAST_PATHS (reflect_a8, a8, REFLECT)
+    AFFINE_FAST_PATHS (normal_a8, a8, NORMAL)
+    AFFINE_FAST_PATHS (pad_r5g6b5, r5g6b5, PAD)
+    AFFINE_FAST_PATHS (none_r5g6b5, r5g6b5, NONE)
+    AFFINE_FAST_PATHS (reflect_r5g6b5, r5g6b5, REFLECT)
+    AFFINE_FAST_PATHS (normal_r5g6b5, r5g6b5, NORMAL)
+
+    { PIXMAN_null },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
+
+    imp->fill = fast_path_fill;
+    imp->iter_info = fast_iters;
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-filter.c b/src/pixman/pixman/pixman-filter.c
new file mode 100644
index 0000000..b2bf53f
--- /dev/null
+++ b/src/pixman/pixman/pixman-filter.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright 2012, Red Hat, Inc.
+ * Copyright 2012, Soren Sandmann
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann <soren.sandmann@gmail.com>
+ */
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+typedef double (* kernel_func_t) (double x);
+
+typedef struct
+{
+    pixman_kernel_t	kernel;
+    kernel_func_t	func;
+    double		width;
+} filter_info_t;
+
+static double
+impulse_kernel (double x)
+{
+    return (x == 0.0)? 1.0 : 0.0;
+}
+
+static double
+box_kernel (double x)
+{
+    return 1;
+}
+
+static double
+linear_kernel (double x)
+{
+    return 1 - fabs (x);
+}
+
+static double
+gaussian_kernel (double x)
+{
+#define SQRT2 (1.4142135623730950488016887242096980785696718753769480)
+#define SIGMA (SQRT2 / 2.0)
+    
+    return exp (- x * x / (2 * SIGMA * SIGMA)) / (SIGMA * sqrt (2.0 * M_PI));
+}
+
+static double
+sinc (double x)
+{
+    if (x == 0.0)
+	return 1.0;
+    else
+	return sin (M_PI * x) / (M_PI * x);
+}
+
+static double
+lanczos (double x, int n)
+{
+    return sinc (x) * sinc (x * (1.0 / n));
+}
+
+static double
+lanczos2_kernel (double x)
+{
+    return lanczos (x, 2);
+}
+
+static double
+lanczos3_kernel (double x)
+{
+    return lanczos (x, 3);
+}
+
+static double
+nice_kernel (double x)
+{
+    return lanczos3_kernel (x * 0.75);
+}
+
+static double
+general_cubic (double x, double B, double C)
+{
+    double ax = fabs(x);
+
+    if (ax < 1)
+    {
+	return ((12 - 9 * B - 6 * C) * ax * ax * ax +
+		(-18 + 12 * B + 6 * C) * ax * ax + (6 - 2 * B)) / 6;
+    }
+    else if (ax >= 1 && ax < 2)
+    {
+	return ((-B - 6 * C) * ax * ax * ax +
+		(6 * B + 30 * C) * ax * ax + (-12 * B - 48 * C) *
+		ax + (8 * B + 24 * C)) / 6;
+    }
+    else
+    {
+	return 0;
+    }
+}
+
+static double
+cubic_kernel (double x)
+{
+    /* This is the Mitchell-Netravali filter.
+     *
+     * (0.0, 0.5) would give us the Catmull-Rom spline,
+     * but that one seems to be indistinguishable from Lanczos2.
+     */
+    return general_cubic (x, 1/3.0, 1/3.0);
+}
+
+static const filter_info_t filters[] =
+{
+    { PIXMAN_KERNEL_IMPULSE,	        impulse_kernel,   0.0 },
+    { PIXMAN_KERNEL_BOX,	        box_kernel,       1.0 },
+    { PIXMAN_KERNEL_LINEAR,	        linear_kernel,    2.0 },
+    { PIXMAN_KERNEL_CUBIC,		cubic_kernel,     4.0 },
+    { PIXMAN_KERNEL_GAUSSIAN,	        gaussian_kernel,  6 * SIGMA },
+    { PIXMAN_KERNEL_LANCZOS2,	        lanczos2_kernel,  4.0 },
+    { PIXMAN_KERNEL_LANCZOS3,	        lanczos3_kernel,  6.0 },
+    { PIXMAN_KERNEL_LANCZOS3_STRETCHED, nice_kernel,      8.0 },
+};
+
+/* This function scales @kernel2 by @scale, then
+ * aligns @x1 in @kernel1 with @x2 in @kernel2 and
+ * and integrates the product of the kernels across @width.
+ *
+ * This function assumes that the intervals are within
+ * the kernels in question. E.g., the caller must not
+ * try to integrate a linear kernel ouside of [-1:1]
+ */
+static double
+integral (pixman_kernel_t kernel1, double x1,
+	  pixman_kernel_t kernel2, double scale, double x2,
+	  double width)
+{
+    /* If the integration interval crosses zero, break it into
+     * two separate integrals. This ensures that filters such
+     * as LINEAR that are not differentiable at 0 will still
+     * integrate properly.
+     */
+    if (x1 < 0 && x1 + width > 0)
+    {
+	return
+	    integral (kernel1, x1, kernel2, scale, x2, - x1) +
+	    integral (kernel1, 0, kernel2, scale, x2 - x1, width + x1);
+    }
+    else if (x2 < 0 && x2 + width > 0)
+    {
+	return
+	    integral (kernel1, x1, kernel2, scale, x2, - x2) +
+	    integral (kernel1, x1 - x2, kernel2, scale, 0, width + x2);
+    }
+    else if (kernel1 == PIXMAN_KERNEL_IMPULSE)
+    {
+	assert (width == 0.0);
+	return filters[kernel2].func (x2 * scale);
+    }
+    else if (kernel2 == PIXMAN_KERNEL_IMPULSE)
+    {
+	assert (width == 0.0);
+	return filters[kernel1].func (x1);
+    }
+    else
+    {
+	/* Integration via Simpson's rule */
+#define N_SEGMENTS 128
+#define SAMPLE(a1, a2)							\
+	(filters[kernel1].func ((a1)) * filters[kernel2].func ((a2) * scale))
+	
+	double s = 0.0;
+	double h = width / (double)N_SEGMENTS;
+	int i;
+
+	s = SAMPLE (x1, x2);
+
+	for (i = 1; i < N_SEGMENTS; i += 2)
+	{
+	    double a1 = x1 + h * i;
+	    double a2 = x2 + h * i;
+
+	    s += 2 * SAMPLE (a1, a2);
+
+	    if (i >= 2 && i < N_SEGMENTS - 1)
+		s += 4 * SAMPLE (a1, a2);
+	}
+
+	s += SAMPLE (x1 + width, x2 + width);
+	
+	return h * s * (1.0 / 3.0);
+    }
+}
+
+static pixman_fixed_t *
+create_1d_filter (int             *width,
+		  pixman_kernel_t  reconstruct,
+		  pixman_kernel_t  sample,
+		  double           scale,
+		  int              n_phases)
+{
+    pixman_fixed_t *params, *p;
+    double step;
+    double size;
+    int i;
+
+    size = scale * filters[sample].width + filters[reconstruct].width;
+    *width = ceil (size);
+
+    p = params = malloc (*width * n_phases * sizeof (pixman_fixed_t));
+    if (!params)
+        return NULL;
+
+    step = 1.0 / n_phases;
+
+    for (i = 0; i < n_phases; ++i)
+    {
+        double frac = step / 2.0 + i * step;
+	pixman_fixed_t new_total;
+        int x, x1, x2;
+	double total;
+
+	/* Sample convolution of reconstruction and sampling
+	 * filter. See rounding.txt regarding the rounding
+	 * and sample positions.
+	 */
+
+	x1 = ceil (frac - *width / 2.0 - 0.5);
+        x2 = x1 + *width;
+
+	total = 0;
+        for (x = x1; x < x2; ++x)
+        {
+	    double pos = x + 0.5 - frac;
+	    double rlow = - filters[reconstruct].width / 2.0;
+	    double rhigh = rlow + filters[reconstruct].width;
+	    double slow = pos - scale * filters[sample].width / 2.0;
+	    double shigh = slow + scale * filters[sample].width;
+	    double c = 0.0;
+	    double ilow, ihigh;
+
+	    if (rhigh >= slow && rlow <= shigh)
+	    {
+		ilow = MAX (slow, rlow);
+		ihigh = MIN (shigh, rhigh);
+
+		c = integral (reconstruct, ilow,
+			      sample, 1.0 / scale, ilow - pos,
+			      ihigh - ilow);
+	    }
+
+	    total += c;
+            *p++ = (pixman_fixed_t)(c * 65536.0 + 0.5);
+        }
+
+	/* Normalize */
+	p -= *width;
+        total = 1 / total;
+        new_total = 0;
+	for (x = x1; x < x2; ++x)
+	{
+	    pixman_fixed_t t = (*p) * total + 0.5;
+
+	    new_total += t;
+	    *p++ = t;
+	}
+
+	if (new_total != pixman_fixed_1)
+	    *(p - *width / 2) += (pixman_fixed_1 - new_total);
+    }
+
+    return params;
+}
+
+/* Create the parameter list for a SEPARABLE_CONVOLUTION filter
+ * with the given kernels and scale parameters
+ */
+PIXMAN_EXPORT pixman_fixed_t *
+pixman_filter_create_separable_convolution (int             *n_values,
+					    pixman_fixed_t   scale_x,
+					    pixman_fixed_t   scale_y,
+					    pixman_kernel_t  reconstruct_x,
+					    pixman_kernel_t  reconstruct_y,
+					    pixman_kernel_t  sample_x,
+					    pixman_kernel_t  sample_y,
+					    int              subsample_bits_x,
+					    int	             subsample_bits_y)
+{
+    double sx = fabs (pixman_fixed_to_double (scale_x));
+    double sy = fabs (pixman_fixed_to_double (scale_y));
+    pixman_fixed_t *horz = NULL, *vert = NULL, *params = NULL;
+    int subsample_x, subsample_y;
+    int width, height;
+
+    subsample_x = (1 << subsample_bits_x);
+    subsample_y = (1 << subsample_bits_y);
+
+    horz = create_1d_filter (&width, reconstruct_x, sample_x, sx, subsample_x);
+    vert = create_1d_filter (&height, reconstruct_y, sample_y, sy, subsample_y);
+
+    if (!horz || !vert)
+        goto out;
+    
+    *n_values = 4 + width * subsample_x + height * subsample_y;
+    
+    params = malloc (*n_values * sizeof (pixman_fixed_t));
+    if (!params)
+        goto out;
+
+    params[0] = pixman_int_to_fixed (width);
+    params[1] = pixman_int_to_fixed (height);
+    params[2] = pixman_int_to_fixed (subsample_bits_x);
+    params[3] = pixman_int_to_fixed (subsample_bits_y);
+
+    memcpy (params + 4, horz,
+	    width * subsample_x * sizeof (pixman_fixed_t));
+    memcpy (params + 4 + width * subsample_x, vert,
+	    height * subsample_y * sizeof (pixman_fixed_t));
+
+out:
+    free (horz);
+    free (vert);
+
+    return params;
+}
diff --git a/src/pixman/pixman/pixman-general.c b/src/pixman/pixman/pixman-general.c
new file mode 100644
index 0000000..a653fa7
--- /dev/null
+++ b/src/pixman/pixman/pixman-general.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *             2008 Aaron Plattner, NVIDIA Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pixman-private.h"
+
+static void
+general_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *info)
+{
+    pixman_image_t *image = iter->image;
+
+    switch (image->type)
+    {
+    case BITS:
+        if ((iter->iter_flags & ITER_SRC) == ITER_SRC)
+            _pixman_bits_image_src_iter_init (image, iter);
+        else
+            _pixman_bits_image_dest_iter_init (image, iter);
+        break;
+
+    case LINEAR:
+        _pixman_linear_gradient_iter_init (image, iter);
+        break;
+
+    case RADIAL:
+	_pixman_radial_gradient_iter_init (image, iter);
+        break;
+
+    case CONICAL:
+	_pixman_conical_gradient_iter_init (image, iter);
+        break;
+
+    case SOLID:
+        _pixman_log_error (FUNC, "Solid image not handled by noop");
+        break;
+
+    default:
+	_pixman_log_error (FUNC, "Pixman bug: unknown image type\n");
+        break;
+    }
+}
+
+static const pixman_iter_info_t general_iters[] =
+{
+    { PIXMAN_any, 0, 0, general_iter_init, NULL, NULL },
+    { PIXMAN_null },
+};
+
+typedef struct op_info_t op_info_t;
+struct op_info_t
+{
+    uint8_t src, dst;
+};
+
+#define ITER_IGNORE_BOTH						\
+    (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_LOCALIZED_ALPHA)
+
+static const op_info_t op_flags[PIXMAN_N_OPERATORS] =
+{
+    /* Src                   Dst                   */
+    { ITER_IGNORE_BOTH,      ITER_IGNORE_BOTH      }, /* CLEAR */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_BOTH      }, /* SRC */
+    { ITER_IGNORE_BOTH,      ITER_LOCALIZED_ALPHA  }, /* DST */
+    { 0,                     ITER_LOCALIZED_ALPHA  }, /* OVER */
+    { ITER_LOCALIZED_ALPHA,  0                     }, /* OVER_REVERSE */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* IN */
+    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* IN_REVERSE */
+    { ITER_LOCALIZED_ALPHA,  ITER_IGNORE_RGB       }, /* OUT */
+    { ITER_IGNORE_RGB,       ITER_LOCALIZED_ALPHA  }, /* OUT_REVERSE */
+    { 0,                     0                     }, /* ATOP */
+    { 0,                     0                     }, /* ATOP_REVERSE */
+    { 0,                     0                     }, /* XOR */
+    { ITER_LOCALIZED_ALPHA,  ITER_LOCALIZED_ALPHA  }, /* ADD */
+    { 0,                     0                     }, /* SATURATE */
+};
+
+#define SCANLINE_BUFFER_LENGTH 8192
+
+static void
+general_composite_rect  (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t stack_scanline_buffer[3 * SCANLINE_BUFFER_LENGTH];
+    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;
+    uint8_t *src_buffer, *mask_buffer, *dest_buffer;
+    pixman_iter_t src_iter, mask_iter, dest_iter;
+    pixman_combine_32_func_t compose;
+    pixman_bool_t component_alpha;
+    iter_flags_t width_flag, src_iter_flags;
+    int Bpp;
+    int i;
+
+    if ((src_image->common.flags & FAST_PATH_NARROW_FORMAT)		    &&
+	(!mask_image || mask_image->common.flags & FAST_PATH_NARROW_FORMAT) &&
+	(dest_image->common.flags & FAST_PATH_NARROW_FORMAT))
+    {
+	width_flag = ITER_NARROW;
+	Bpp = 4;
+    }
+    else
+    {
+	width_flag = ITER_WIDE;
+	Bpp = 16;
+    }
+
+#define ALIGN(addr)							\
+    ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+    src_buffer = ALIGN (scanline_buffer);
+    mask_buffer = ALIGN (src_buffer + width * Bpp);
+    dest_buffer = ALIGN (mask_buffer + width * Bpp);
+
+    if (ALIGN (dest_buffer + width * Bpp) >
+	    scanline_buffer + sizeof (stack_scanline_buffer))
+    {
+	scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);
+
+	if (!scanline_buffer)
+	    return;
+
+	src_buffer = ALIGN (scanline_buffer);
+	mask_buffer = ALIGN (src_buffer + width * Bpp);
+	dest_buffer = ALIGN (mask_buffer + width * Bpp);
+    }
+
+    if (width_flag == ITER_WIDE)
+    {
+	/* To make sure there aren't any NANs in the buffers */
+	memset (src_buffer, 0, width * Bpp);
+	memset (mask_buffer, 0, width * Bpp);
+	memset (dest_buffer, 0, width * Bpp);
+    }
+    
+    /* src iter */
+    src_iter_flags = width_flag | op_flags[op].src | ITER_SRC;
+
+    _pixman_implementation_iter_init (imp->toplevel, &src_iter, src_image,
+                                      src_x, src_y, width, height,
+                                      src_buffer, src_iter_flags,
+                                      info->src_flags);
+
+    /* mask iter */
+    if ((src_iter_flags & (ITER_IGNORE_ALPHA | ITER_IGNORE_RGB)) ==
+	(ITER_IGNORE_ALPHA | ITER_IGNORE_RGB))
+    {
+	/* If it doesn't matter what the source is, then it doesn't matter
+	 * what the mask is
+	 */
+	mask_image = NULL;
+    }
+
+    component_alpha =
+        mask_image			      &&
+        mask_image->common.type == BITS       &&
+        mask_image->common.component_alpha    &&
+        PIXMAN_FORMAT_RGB (mask_image->bits.format);
+
+    _pixman_implementation_iter_init (
+	imp->toplevel, &mask_iter,
+	mask_image, mask_x, mask_y, width, height, mask_buffer,
+	ITER_SRC | width_flag | (component_alpha? 0 : ITER_IGNORE_RGB),
+	info->mask_flags);
+
+    /* dest iter */
+    _pixman_implementation_iter_init (
+	imp->toplevel, &dest_iter, dest_image, dest_x, dest_y, width, height,
+	dest_buffer, ITER_DEST | width_flag | op_flags[op].dst, info->dest_flags);
+
+    compose = _pixman_implementation_lookup_combiner (
+	imp->toplevel, op, component_alpha, width_flag != ITER_WIDE);
+
+    for (i = 0; i < height; ++i)
+    {
+	uint32_t *s, *m, *d;
+
+	m = mask_iter.get_scanline (&mask_iter, NULL);
+	s = src_iter.get_scanline (&src_iter, m);
+	d = dest_iter.get_scanline (&dest_iter, NULL);
+
+	compose (imp->toplevel, op, d, s, m, width);
+
+	dest_iter.write_back (&dest_iter);
+    }
+
+    if (src_iter.fini)
+	src_iter.fini (&src_iter);
+    if (mask_iter.fini)
+	mask_iter.fini (&mask_iter);
+    if (dest_iter.fini)
+	dest_iter.fini (&dest_iter);
+    
+    if (scanline_buffer != (uint8_t *) stack_scanline_buffer)
+	free (scanline_buffer);
+}
+
+static const pixman_fast_path_t general_fast_path[] =
+{
+    { PIXMAN_OP_any, PIXMAN_any, 0, PIXMAN_any,	0, PIXMAN_any, 0, general_composite_rect },
+    { PIXMAN_OP_NONE }
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_general (void)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (NULL, general_fast_path);
+
+    _pixman_setup_combiner_functions_32 (imp);
+    _pixman_setup_combiner_functions_float (imp);
+
+    imp->iter_info = general_iters;
+
+    return imp;
+}
+
diff --git a/src/pixman/pixman/pixman-glyph.c b/src/pixman/pixman/pixman-glyph.c
new file mode 100644
index 0000000..96a349a
--- /dev/null
+++ b/src/pixman/pixman/pixman-glyph.c
@@ -0,0 +1,676 @@
+/*
+ * Copyright 2010, 2012, Soren Sandmann <sandmann@cs.au.dk>
+ * Copyright 2010, 2011, 2012, Red Hat, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann <sandmann@cs.au.dk>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+typedef struct glyph_metrics_t glyph_metrics_t;
+typedef struct glyph_t glyph_t;
+
+#define TOMBSTONE ((glyph_t *)0x1)
+
+/* XXX: These numbers are arbitrary---we've never done any measurements.
+ */
+#define N_GLYPHS_HIGH_WATER  (16384)
+#define N_GLYPHS_LOW_WATER   (8192)
+#define HASH_SIZE (2 * N_GLYPHS_HIGH_WATER)
+#define HASH_MASK (HASH_SIZE - 1)
+
+struct glyph_t
+{
+    void *		font_key;
+    void *		glyph_key;
+    int			origin_x;
+    int			origin_y;
+    pixman_image_t *	image;
+    pixman_link_t	mru_link;
+};
+
+struct pixman_glyph_cache_t
+{
+    int			n_glyphs;
+    int			n_tombstones;
+    int			freeze_count;
+    pixman_list_t	mru;
+    glyph_t *		glyphs[HASH_SIZE];
+};
+
+static void
+free_glyph (glyph_t *glyph)
+{
+    pixman_list_unlink (&glyph->mru_link);
+    pixman_image_unref (glyph->image);
+    free (glyph);
+}
+
+static unsigned int
+hash (const void *font_key, const void *glyph_key)
+{
+    size_t key = (size_t)font_key + (size_t)glyph_key;
+
+    /* This hash function is based on one found on Thomas Wang's
+     * web page at
+     *
+     *    http://www.concentric.net/~Ttwang/tech/inthash.htm
+     *
+     */
+    key = (key << 15) - key - 1;
+    key = key ^ (key >> 12);
+    key = key + (key << 2);
+    key = key ^ (key >> 4);
+    key = key + (key << 3) + (key << 11);
+    key = key ^ (key >> 16);
+
+    return key;
+}
+
+static glyph_t *
+lookup_glyph (pixman_glyph_cache_t *cache,
+	      void                 *font_key,
+	      void                 *glyph_key)
+{
+    unsigned idx;
+    glyph_t *g;
+
+    idx = hash (font_key, glyph_key);
+    while ((g = cache->glyphs[idx++ & HASH_MASK]))
+    {
+	if (g != TOMBSTONE			&&
+	    g->font_key == font_key		&&
+	    g->glyph_key == glyph_key)
+	{
+	    return g;
+	}
+    }
+
+    return NULL;
+}
+
+static void
+insert_glyph (pixman_glyph_cache_t *cache,
+	      glyph_t              *glyph)
+{
+    unsigned idx;
+    glyph_t **loc;
+
+    idx = hash (glyph->font_key, glyph->glyph_key);
+
+    /* Note: we assume that there is room in the table. If there isn't,
+     * this will be an infinite loop.
+     */
+    do
+    {
+	loc = &cache->glyphs[idx++ & HASH_MASK];
+    } while (*loc && *loc != TOMBSTONE);
+
+    if (*loc == TOMBSTONE)
+	cache->n_tombstones--;
+    cache->n_glyphs++;
+
+    *loc = glyph;
+}
+
+static void
+remove_glyph (pixman_glyph_cache_t *cache,
+	      glyph_t              *glyph)
+{
+    unsigned idx;
+
+    idx = hash (glyph->font_key, glyph->glyph_key);
+    while (cache->glyphs[idx & HASH_MASK] != glyph)
+	idx++;
+
+    cache->glyphs[idx & HASH_MASK] = TOMBSTONE;
+    cache->n_tombstones++;
+    cache->n_glyphs--;
+
+    /* Eliminate tombstones if possible */
+    if (cache->glyphs[(idx + 1) & HASH_MASK] == NULL)
+    {
+	while (cache->glyphs[idx & HASH_MASK] == TOMBSTONE)
+	{
+	    cache->glyphs[idx & HASH_MASK] = NULL;
+	    cache->n_tombstones--;
+	    idx--;
+	}
+    }
+}
+
+static void
+clear_table (pixman_glyph_cache_t *cache)
+{
+    int i;
+
+    for (i = 0; i < HASH_SIZE; ++i)
+    {
+	glyph_t *glyph = cache->glyphs[i];
+
+	if (glyph && glyph != TOMBSTONE)
+	    free_glyph (glyph);
+
+	cache->glyphs[i] = NULL;
+    }
+
+    cache->n_glyphs = 0;
+    cache->n_tombstones = 0;
+}
+
+PIXMAN_EXPORT pixman_glyph_cache_t *
+pixman_glyph_cache_create (void)
+{
+    pixman_glyph_cache_t *cache;
+
+    if (!(cache = malloc (sizeof *cache)))
+	return NULL;
+
+    memset (cache->glyphs, 0, sizeof (cache->glyphs));
+    cache->n_glyphs = 0;
+    cache->n_tombstones = 0;
+    cache->freeze_count = 0;
+
+    pixman_list_init (&cache->mru);
+
+    return cache;
+}
+
+PIXMAN_EXPORT void
+pixman_glyph_cache_destroy (pixman_glyph_cache_t *cache)
+{
+    return_if_fail (cache->freeze_count == 0);
+
+    clear_table (cache);
+
+    free (cache);
+}
+
+PIXMAN_EXPORT void
+pixman_glyph_cache_freeze (pixman_glyph_cache_t  *cache)
+{
+    cache->freeze_count++;
+}
+
+PIXMAN_EXPORT void
+pixman_glyph_cache_thaw (pixman_glyph_cache_t  *cache)
+{
+    if (--cache->freeze_count == 0					&&
+	cache->n_glyphs + cache->n_tombstones > N_GLYPHS_HIGH_WATER)
+    {
+	if (cache->n_tombstones > N_GLYPHS_HIGH_WATER)
+	{
+	    /* More than half the entries are
+	     * tombstones. Just dump the whole table.
+	     */
+	    clear_table (cache);
+	}
+
+	while (cache->n_glyphs > N_GLYPHS_LOW_WATER)
+	{
+	    glyph_t *glyph = CONTAINER_OF (glyph_t, mru_link, cache->mru.tail);
+
+	    remove_glyph (cache, glyph);
+	    free_glyph (glyph);
+	}
+    }
+}
+
+PIXMAN_EXPORT const void *
+pixman_glyph_cache_lookup (pixman_glyph_cache_t  *cache,
+			   void                  *font_key,
+			   void                  *glyph_key)
+{
+    return lookup_glyph (cache, font_key, glyph_key);
+}
+
+PIXMAN_EXPORT const void *
+pixman_glyph_cache_insert (pixman_glyph_cache_t  *cache,
+			   void                  *font_key,
+			   void                  *glyph_key,
+			   int			  origin_x,
+			   int                    origin_y,
+			   pixman_image_t        *image)
+{
+    glyph_t *glyph;
+    int32_t width, height;
+
+    return_val_if_fail (cache->freeze_count > 0, NULL);
+    return_val_if_fail (image->type == BITS, NULL);
+
+    width = image->bits.width;
+    height = image->bits.height;
+
+    if (cache->n_glyphs >= HASH_SIZE)
+	return NULL;
+
+    if (!(glyph = malloc (sizeof *glyph)))
+	return NULL;
+
+    glyph->font_key = font_key;
+    glyph->glyph_key = glyph_key;
+    glyph->origin_x = origin_x;
+    glyph->origin_y = origin_y;
+
+    if (!(glyph->image = pixman_image_create_bits (
+	      image->bits.format, width, height, NULL, -1)))
+    {
+	free (glyph);
+	return NULL;
+    }
+
+    pixman_image_composite32 (PIXMAN_OP_SRC,
+			      image, NULL, glyph->image, 0, 0, 0, 0, 0, 0,
+			      width, height);
+
+    if (PIXMAN_FORMAT_A   (glyph->image->bits.format) != 0	&&
+	PIXMAN_FORMAT_RGB (glyph->image->bits.format) != 0)
+    {
+	pixman_image_set_component_alpha (glyph->image, TRUE);
+    }
+
+    pixman_list_prepend (&cache->mru, &glyph->mru_link);
+
+    _pixman_image_validate (glyph->image);
+    insert_glyph (cache, glyph);
+
+    return glyph;
+}
+
+PIXMAN_EXPORT void
+pixman_glyph_cache_remove (pixman_glyph_cache_t  *cache,
+			   void                  *font_key,
+			   void                  *glyph_key)
+{
+    glyph_t *glyph;
+
+    if ((glyph = lookup_glyph (cache, font_key, glyph_key)))
+    {
+	remove_glyph (cache, glyph);
+
+	free_glyph (glyph);
+    }
+}
+
+PIXMAN_EXPORT void
+pixman_glyph_get_extents (pixman_glyph_cache_t *cache,
+			  int                   n_glyphs,
+			  pixman_glyph_t       *glyphs,
+			  pixman_box32_t       *extents)
+{
+    int i;
+
+    extents->x1 = extents->y1 = INT32_MAX;
+    extents->x2 = extents->y2 = INT32_MIN;
+
+    for (i = 0; i < n_glyphs; ++i)
+    {
+	glyph_t *glyph = (glyph_t *)glyphs[i].glyph;
+	int x1, y1, x2, y2;
+
+	x1 = glyphs[i].x - glyph->origin_x;
+	y1 = glyphs[i].y - glyph->origin_y;
+	x2 = glyphs[i].x - glyph->origin_x + glyph->image->bits.width;
+	y2 = glyphs[i].y - glyph->origin_y + glyph->image->bits.height;
+
+	if (x1 < extents->x1)
+	    extents->x1 = x1;
+	if (y1 < extents->y1)
+	    extents->y1 = y1;
+	if (x2 > extents->x2)
+	    extents->x2 = x2;
+	if (y2 > extents->y2)
+	    extents->y2 = y2;
+    }
+}
+
+/* This function returns a format that is suitable for use as a mask for the
+ * set of glyphs in question.
+ */
+PIXMAN_EXPORT pixman_format_code_t
+pixman_glyph_get_mask_format (pixman_glyph_cache_t *cache,
+			      int		    n_glyphs,
+			      const pixman_glyph_t *glyphs)
+{
+    pixman_format_code_t format = PIXMAN_a1;
+    int i;
+
+    for (i = 0; i < n_glyphs; ++i)
+    {
+	const glyph_t *glyph = glyphs[i].glyph;
+	pixman_format_code_t glyph_format = glyph->image->bits.format;
+
+	if (PIXMAN_FORMAT_TYPE (glyph_format) == PIXMAN_TYPE_A)
+	{
+	    if (PIXMAN_FORMAT_A (glyph_format) > PIXMAN_FORMAT_A (format))
+		format = glyph_format;
+	}
+	else
+	{
+	    return PIXMAN_a8r8g8b8;
+	}
+    }
+
+    return format;
+}
+
+static pixman_bool_t
+box32_intersect (pixman_box32_t *dest,
+		 const pixman_box32_t *box1,
+		 const pixman_box32_t *box2)
+{
+    dest->x1 = MAX (box1->x1, box2->x1);
+    dest->y1 = MAX (box1->y1, box2->y1);
+    dest->x2 = MIN (box1->x2, box2->x2);
+    dest->y2 = MIN (box1->y2, box2->y2);
+
+    return dest->x2 > dest->x1 && dest->y2 > dest->y1;
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+PIXMAN_EXPORT void
+pixman_composite_glyphs_no_mask (pixman_op_t            op,
+				 pixman_image_t        *src,
+				 pixman_image_t        *dest,
+				 int32_t                src_x,
+				 int32_t                src_y,
+				 int32_t                dest_x,
+				 int32_t                dest_y,
+				 pixman_glyph_cache_t  *cache,
+				 int                    n_glyphs,
+				 const pixman_glyph_t  *glyphs)
+{
+    pixman_region32_t region;
+    pixman_format_code_t glyph_format = PIXMAN_null;
+    uint32_t glyph_flags = 0;
+    pixman_format_code_t dest_format;
+    uint32_t dest_flags;
+    pixman_composite_func_t func = NULL;
+    pixman_implementation_t *implementation = NULL;
+    pixman_composite_info_t info;
+    int i;
+
+    _pixman_image_validate (src);
+    _pixman_image_validate (dest);
+    
+    dest_format = dest->common.extended_format_code;
+    dest_flags = dest->common.flags;
+    
+    pixman_region32_init (&region);
+    if (!_pixman_compute_composite_region32 (
+	    &region,
+	    src, NULL, dest,
+	    src_x - dest_x, src_y - dest_y, 0, 0, 0, 0,
+	    dest->bits.width, dest->bits.height))
+    {
+	goto out;
+    }
+
+    info.op = op;
+    info.src_image = src;
+    info.dest_image = dest;
+    info.src_flags = src->common.flags;
+    info.dest_flags = dest->common.flags;
+
+    for (i = 0; i < n_glyphs; ++i)
+    {
+	glyph_t *glyph = (glyph_t *)glyphs[i].glyph;
+	pixman_image_t *glyph_img = glyph->image;
+	pixman_box32_t glyph_box;
+	pixman_box32_t *pbox;
+	uint32_t extra = FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+	pixman_box32_t composite_box;
+	int n;
+
+	glyph_box.x1 = dest_x + glyphs[i].x - glyph->origin_x;
+	glyph_box.y1 = dest_y + glyphs[i].y - glyph->origin_y;
+	glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width;
+	glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height;
+	
+	pbox = pixman_region32_rectangles (&region, &n);
+	
+	info.mask_image = glyph_img;
+
+	while (n--)
+	{
+	    if (box32_intersect (&composite_box, pbox, &glyph_box))
+	    {
+		if (glyph_img->common.extended_format_code != glyph_format	||
+		    glyph_img->common.flags != glyph_flags)
+		{
+		    glyph_format = glyph_img->common.extended_format_code;
+		    glyph_flags = glyph_img->common.flags;
+
+		    _pixman_implementation_lookup_composite (
+			get_implementation(), op,
+			src->common.extended_format_code, src->common.flags,
+			glyph_format, glyph_flags | extra,
+			dest_format, dest_flags,
+			&implementation, &func);
+		}
+
+		info.src_x = src_x + composite_box.x1 - dest_x;
+		info.src_y = src_y + composite_box.y1 - dest_y;
+		info.mask_x = composite_box.x1 - (dest_x + glyphs[i].x - glyph->origin_x);
+		info.mask_y = composite_box.y1 - (dest_y + glyphs[i].y - glyph->origin_y);
+		info.dest_x = composite_box.x1;
+		info.dest_y = composite_box.y1;
+		info.width = composite_box.x2 - composite_box.x1;
+		info.height = composite_box.y2 - composite_box.y1;
+
+		info.mask_flags = glyph_flags;
+
+		func (implementation, &info);
+	    }
+
+	    pbox++;
+	}
+	pixman_list_move_to_front (&cache->mru, &glyph->mru_link);
+    }
+
+out:
+    pixman_region32_fini (&region);
+}
+
+static void
+add_glyphs (pixman_glyph_cache_t *cache,
+	    pixman_image_t *dest,
+	    int off_x, int off_y,
+	    int n_glyphs, const pixman_glyph_t *glyphs)
+{
+    pixman_format_code_t glyph_format = PIXMAN_null;
+    uint32_t glyph_flags = 0;
+    pixman_composite_func_t func = NULL;
+    pixman_implementation_t *implementation = NULL;
+    pixman_format_code_t dest_format;
+    uint32_t dest_flags;
+    pixman_box32_t dest_box;
+    pixman_composite_info_t info;
+    pixman_image_t *white_img = NULL;
+    pixman_bool_t white_src = FALSE;
+    int i;
+
+    _pixman_image_validate (dest);
+
+    dest_format = dest->common.extended_format_code;
+    dest_flags = dest->common.flags;
+
+    info.op = PIXMAN_OP_ADD;
+    info.dest_image = dest;
+    info.src_x = 0;
+    info.src_y = 0;
+    info.dest_flags = dest_flags;
+
+    dest_box.x1 = 0;
+    dest_box.y1 = 0;
+    dest_box.x2 = dest->bits.width;
+    dest_box.y2 = dest->bits.height;
+
+    for (i = 0; i < n_glyphs; ++i)
+    {
+	glyph_t *glyph = (glyph_t *)glyphs[i].glyph;
+	pixman_image_t *glyph_img = glyph->image;
+	pixman_box32_t glyph_box;
+	pixman_box32_t composite_box;
+
+	if (glyph_img->common.extended_format_code != glyph_format	||
+	    glyph_img->common.flags != glyph_flags)
+	{
+	    pixman_format_code_t src_format, mask_format;
+
+	    glyph_format = glyph_img->common.extended_format_code;
+	    glyph_flags = glyph_img->common.flags;
+
+	    if (glyph_format == dest->bits.format)
+	    {
+		src_format = glyph_format;
+		mask_format = PIXMAN_null;
+		info.src_flags = glyph_flags | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+		info.mask_flags = FAST_PATH_IS_OPAQUE;
+		info.mask_image = NULL;
+		white_src = FALSE;
+	    }
+	    else
+	    {
+		if (!white_img)
+		{
+		    static const pixman_color_t white = { 0xffff, 0xffff, 0xffff, 0xffff };
+
+		    if (!(white_img = pixman_image_create_solid_fill (&white)))
+			goto out;
+
+		    _pixman_image_validate (white_img);
+		}
+
+		src_format = PIXMAN_solid;
+		mask_format = glyph_format;
+		info.src_flags = white_img->common.flags;
+		info.mask_flags = glyph_flags | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+		info.src_image = white_img;
+		white_src = TRUE;
+	    }
+
+	    _pixman_implementation_lookup_composite (
+		get_implementation(), PIXMAN_OP_ADD,
+		src_format, info.src_flags,
+		mask_format, info.mask_flags,
+		dest_format, dest_flags,
+		&implementation, &func);
+	}
+
+	glyph_box.x1 = glyphs[i].x - glyph->origin_x + off_x;
+	glyph_box.y1 = glyphs[i].y - glyph->origin_y + off_y;
+	glyph_box.x2 = glyph_box.x1 + glyph->image->bits.width;
+	glyph_box.y2 = glyph_box.y1 + glyph->image->bits.height;
+	
+	if (box32_intersect (&composite_box, &glyph_box, &dest_box))
+	{
+	    int src_x = composite_box.x1 - glyph_box.x1;
+	    int src_y = composite_box.y1 - glyph_box.y1;
+
+	    if (white_src)
+		info.mask_image = glyph_img;
+	    else
+		info.src_image = glyph_img;
+
+	    info.mask_x = info.src_x = src_x;
+	    info.mask_y = info.src_y = src_y;
+	    info.dest_x = composite_box.x1;
+	    info.dest_y = composite_box.y1;
+	    info.width = composite_box.x2 - composite_box.x1;
+	    info.height = composite_box.y2 - composite_box.y1;
+
+	    func (implementation, &info);
+
+	    pixman_list_move_to_front (&cache->mru, &glyph->mru_link);
+	}
+    }
+
+out:
+    if (white_img)
+	pixman_image_unref (white_img);
+}
+
+/* Conceptually, for each glyph, (white IN glyph) is PIXMAN_OP_ADDed to an
+ * infinitely big mask image at the position such that the glyph origin point
+ * is positioned at the (glyphs[i].x, glyphs[i].y) point.
+ *
+ * Then (mask_x, mask_y) in the infinite mask and (src_x, src_y) in the source
+ * image are both aligned with (dest_x, dest_y) in the destination image. Then
+ * these three images are composited within the 
+ *
+ *       (dest_x, dest_y, dst_x + width, dst_y + height)
+ *
+ * rectangle.
+ *
+ * TODO:
+ *   - Trim the mask to the destination clip/image?
+ *   - Trim composite region based on sources, when the op ignores 0s.
+ */
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+PIXMAN_EXPORT void
+pixman_composite_glyphs (pixman_op_t            op,
+			 pixman_image_t        *src,
+			 pixman_image_t        *dest,
+			 pixman_format_code_t   mask_format,
+			 int32_t                src_x,
+			 int32_t                src_y,
+			 int32_t		mask_x,
+			 int32_t		mask_y,
+			 int32_t                dest_x,
+			 int32_t                dest_y,
+			 int32_t                width,
+			 int32_t                height,
+			 pixman_glyph_cache_t  *cache,
+			 int			n_glyphs,
+			 const pixman_glyph_t  *glyphs)
+{
+    pixman_image_t *mask;
+
+    if (!(mask = pixman_image_create_bits (mask_format, width, height, NULL, -1)))
+	return;
+
+    if (PIXMAN_FORMAT_A   (mask_format) != 0 &&
+	PIXMAN_FORMAT_RGB (mask_format) != 0)
+    {
+	pixman_image_set_component_alpha (mask, TRUE);
+    }
+
+    add_glyphs (cache, mask, - mask_x, - mask_y, n_glyphs, glyphs);
+
+    pixman_image_composite32 (op, src, mask, dest,
+			      src_x, src_y,
+			      0, 0,
+			      dest_x, dest_y,
+			      width, height);
+
+    pixman_image_unref (mask);
+}
diff --git a/src/pixman/pixman/pixman-gradient-walker.c b/src/pixman/pixman/pixman-gradient-walker.c
new file mode 100644
index 0000000..5944a55
--- /dev/null
+++ b/src/pixman/pixman/pixman-gradient-walker.c
@@ -0,0 +1,202 @@
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+void
+_pixman_gradient_walker_init (pixman_gradient_walker_t *walker,
+                              gradient_t *              gradient,
+                              pixman_repeat_t		repeat)
+{
+    walker->num_stops = gradient->n_stops;
+    walker->stops     = gradient->stops;
+    walker->left_x    = 0;
+    walker->right_x   = 0x10000;
+    walker->a_s       = 0.0f;
+    walker->a_b       = 0.0f;
+    walker->r_s       = 0.0f;
+    walker->r_b       = 0.0f;
+    walker->g_s       = 0.0f;
+    walker->g_b       = 0.0f;
+    walker->b_s       = 0.0f;
+    walker->b_b       = 0.0f;
+    walker->repeat    = repeat;
+
+    walker->need_reset = TRUE;
+}
+
+static void
+gradient_walker_reset (pixman_gradient_walker_t *walker,
+		       pixman_fixed_48_16_t      pos)
+{
+    int32_t x, left_x, right_x;
+    pixman_color_t *left_c, *right_c;
+    int n, count = walker->num_stops;
+    pixman_gradient_stop_t *stops = walker->stops;
+    float la, lr, lg, lb;
+    float ra, rr, rg, rb;
+    float lx, rx;
+
+    if (walker->repeat == PIXMAN_REPEAT_NORMAL)
+    {
+	x = (int32_t)pos & 0xffff;
+    }
+    else if (walker->repeat == PIXMAN_REPEAT_REFLECT)
+    {
+	x = (int32_t)pos & 0xffff;
+	if ((int32_t)pos & 0x10000)
+	    x = 0x10000 - x;
+    }
+    else
+    {
+	x = pos;
+    }
+    
+    for (n = 0; n < count; n++)
+    {
+	if (x < stops[n].x)
+	    break;
+    }
+    
+    left_x =  stops[n - 1].x;
+    left_c = &stops[n - 1].color;
+    
+    right_x =  stops[n].x;
+    right_c = &stops[n].color;
+
+    if (walker->repeat == PIXMAN_REPEAT_NORMAL)
+    {
+	left_x  += (pos - x);
+	right_x += (pos - x);
+    }
+    else if (walker->repeat == PIXMAN_REPEAT_REFLECT)
+    {
+	if ((int32_t)pos & 0x10000)
+	{
+	    pixman_color_t  *tmp_c;
+	    int32_t tmp_x;
+
+	    tmp_x   = 0x10000 - right_x;
+	    right_x = 0x10000 - left_x;
+	    left_x  = tmp_x;
+
+	    tmp_c   = right_c;
+	    right_c = left_c;
+	    left_c  = tmp_c;
+
+	    x = 0x10000 - x;
+	}
+	left_x  += (pos - x);
+	right_x += (pos - x);
+    }
+    else if (walker->repeat == PIXMAN_REPEAT_NONE)
+    {
+	if (n == 0)
+	    right_c = left_c;
+	else if (n == count)
+	    left_c = right_c;
+    }
+
+    /* The alpha channel is scaled to be in the [0, 255] interval,
+     * and the red/green/blue channels are scaled to be in [0, 1].
+     * This ensures that after premultiplication all channels will
+     * be in the [0, 255] interval.
+     */
+    la = (left_c->alpha * (1.0f/257.0f));
+    lr = (left_c->red * (1.0f/257.0f));
+    lg = (left_c->green * (1.0f/257.0f));
+    lb = (left_c->blue * (1.0f/257.0f));
+
+    ra = (right_c->alpha * (1.0f/257.0f));
+    rr = (right_c->red * (1.0f/257.0f));
+    rg = (right_c->green * (1.0f/257.0f));
+    rb = (right_c->blue * (1.0f/257.0f));
+    
+    lx = left_x * (1.0f/65536.0f);
+    rx = right_x * (1.0f/65536.0f);
+    
+    if (FLOAT_IS_ZERO (rx - lx) || left_x == INT32_MIN || right_x == INT32_MAX)
+    {
+	walker->a_s = walker->r_s = walker->g_s = walker->b_s = 0.0f;
+	walker->a_b = (la + ra) / 2.0f;
+	walker->r_b = (lr + rr) / 510.0f;
+	walker->g_b = (lg + rg) / 510.0f;
+	walker->b_b = (lb + rb) / 510.0f;
+    }
+    else
+    {
+	float w_rec = 1.0f / (rx - lx);
+
+	walker->a_b = (la * rx - ra * lx) * w_rec;
+	walker->r_b = (lr * rx - rr * lx) * w_rec * (1.0f/255.0f);
+	walker->g_b = (lg * rx - rg * lx) * w_rec * (1.0f/255.0f);
+	walker->b_b = (lb * rx - rb * lx) * w_rec * (1.0f/255.0f);
+
+	walker->a_s = (ra - la) * w_rec;
+	walker->r_s = (rr - lr) * w_rec * (1.0f/255.0f);
+	walker->g_s = (rg - lg) * w_rec * (1.0f/255.0f);
+	walker->b_s = (rb - lb) * w_rec * (1.0f/255.0f);
+    }
+   
+    walker->left_x = left_x;
+    walker->right_x = right_x;
+
+    walker->need_reset = FALSE;
+}
+
+uint32_t
+_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
+                               pixman_fixed_48_16_t      x)
+{
+    float a, r, g, b;
+    uint8_t a8, r8, g8, b8;
+    uint32_t v;
+    float y;
+
+    if (walker->need_reset || x < walker->left_x || x >= walker->right_x)
+        gradient_walker_reset (walker, x);
+
+    y = x * (1.0f / 65536.0f);
+
+    a = walker->a_s * y + walker->a_b;
+    r = a * (walker->r_s * y + walker->r_b);
+    g = a * (walker->g_s * y + walker->g_b);
+    b = a * (walker->b_s * y + walker->b_b);
+
+    a8 = a + 0.5f;
+    r8 = r + 0.5f;
+    g8 = g + 0.5f;
+    b8 = b + 0.5f;
+
+    v = ((a8 << 24) & 0xff000000) |
+        ((r8 << 16) & 0x00ff0000) |
+        ((g8 <<  8) & 0x0000ff00) |
+        ((b8 >>  0) & 0x000000ff);
+
+    return v;
+}
diff --git a/src/pixman/pixman/pixman-image.c b/src/pixman/pixman/pixman-image.c
new file mode 100644
index 0000000..1ff1a49
--- /dev/null
+++ b/src/pixman/pixman/pixman-image.c
@@ -0,0 +1,945 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "pixman-private.h"
+
+static const pixman_color_t transparent_black = { 0, 0, 0, 0 };
+
+static void
+gradient_property_changed (pixman_image_t *image)
+{
+    gradient_t *gradient = &image->gradient;
+    int n = gradient->n_stops;
+    pixman_gradient_stop_t *stops = gradient->stops;
+    pixman_gradient_stop_t *begin = &(gradient->stops[-1]);
+    pixman_gradient_stop_t *end = &(gradient->stops[n]);
+
+    switch (gradient->common.repeat)
+    {
+    default:
+    case PIXMAN_REPEAT_NONE:
+	begin->x = INT32_MIN;
+	begin->color = transparent_black;
+	end->x = INT32_MAX;
+	end->color = transparent_black;
+	break;
+
+    case PIXMAN_REPEAT_NORMAL:
+	begin->x = stops[n - 1].x - pixman_fixed_1;
+	begin->color = stops[n - 1].color;
+	end->x = stops[0].x + pixman_fixed_1;
+	end->color = stops[0].color;
+	break;
+
+    case PIXMAN_REPEAT_REFLECT:
+	begin->x = - stops[0].x;
+	begin->color = stops[0].color;
+	end->x = pixman_int_to_fixed (2) - stops[n - 1].x;
+	end->color = stops[n - 1].color;
+	break;
+
+    case PIXMAN_REPEAT_PAD:
+	begin->x = INT32_MIN;
+	begin->color = stops[0].color;
+	end->x = INT32_MAX;
+	end->color = stops[n - 1].color;
+	break;
+    }
+}
+
+pixman_bool_t
+_pixman_init_gradient (gradient_t *                  gradient,
+                       const pixman_gradient_stop_t *stops,
+                       int                           n_stops)
+{
+    return_val_if_fail (n_stops > 0, FALSE);
+
+    /* We allocate two extra stops, one before the beginning of the stop list,
+     * and one after the end. These stops are initialized to whatever color
+     * would be used for positions outside the range of the stop list.
+     *
+     * This saves a bit of computation in the gradient walker.
+     *
+     * The pointer we store in the gradient_t struct still points to the
+     * first user-supplied struct, so when freeing, we will have to
+     * subtract one.
+     */
+    gradient->stops =
+	pixman_malloc_ab (n_stops + 2, sizeof (pixman_gradient_stop_t));
+    if (!gradient->stops)
+	return FALSE;
+
+    gradient->stops += 1;
+    memcpy (gradient->stops, stops, n_stops * sizeof (pixman_gradient_stop_t));
+    gradient->n_stops = n_stops;
+
+    gradient->common.property_changed = gradient_property_changed;
+
+    return TRUE;
+}
+
+void
+_pixman_image_init (pixman_image_t *image)
+{
+    image_common_t *common = &image->common;
+
+    pixman_region32_init (&common->clip_region);
+
+    common->alpha_count = 0;
+    common->have_clip_region = FALSE;
+    common->clip_sources = FALSE;
+    common->transform = NULL;
+    common->repeat = PIXMAN_REPEAT_NONE;
+    common->filter = PIXMAN_FILTER_NEAREST;
+    common->filter_params = NULL;
+    common->n_filter_params = 0;
+    common->alpha_map = NULL;
+    common->component_alpha = FALSE;
+    common->ref_count = 1;
+    common->property_changed = NULL;
+    common->client_clip = FALSE;
+    common->destroy_func = NULL;
+    common->destroy_data = NULL;
+    common->dirty = TRUE;
+}
+
+pixman_bool_t
+_pixman_image_fini (pixman_image_t *image)
+{
+    image_common_t *common = (image_common_t *)image;
+
+    common->ref_count--;
+
+    if (common->ref_count == 0)
+    {
+	if (image->common.destroy_func)
+	    image->common.destroy_func (image, image->common.destroy_data);
+
+	pixman_region32_fini (&common->clip_region);
+
+	free (common->transform);
+	free (common->filter_params);
+
+	if (common->alpha_map)
+	    pixman_image_unref ((pixman_image_t *)common->alpha_map);
+
+	if (image->type == LINEAR ||
+	    image->type == RADIAL ||
+	    image->type == CONICAL)
+	{
+	    if (image->gradient.stops)
+	    {
+		/* See _pixman_init_gradient() for an explanation of the - 1 */
+		free (image->gradient.stops - 1);
+	    }
+
+	    /* This will trigger if someone adds a property_changed
+	     * method to the linear/radial/conical gradient overwriting
+	     * the general one.
+	     */
+	    assert (
+		image->common.property_changed == gradient_property_changed);
+	}
+
+	if (image->type == BITS && image->bits.free_me)
+	    free (image->bits.free_me);
+
+	return TRUE;
+    }
+
+    return FALSE;
+}
+
+pixman_image_t *
+_pixman_image_allocate (void)
+{
+    pixman_image_t *image = malloc (sizeof (pixman_image_t));
+
+    if (image)
+	_pixman_image_init (image);
+
+    return image;
+}
+
+static void
+image_property_changed (pixman_image_t *image)
+{
+    image->common.dirty = TRUE;
+}
+
+/* Ref Counting */
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_ref (pixman_image_t *image)
+{
+    image->common.ref_count++;
+
+    return image;
+}
+
+/* returns TRUE when the image is freed */
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_unref (pixman_image_t *image)
+{
+    if (_pixman_image_fini (image))
+    {
+	free (image);
+	return TRUE;
+    }
+
+    return FALSE;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_destroy_function (pixman_image_t *            image,
+                                   pixman_image_destroy_func_t func,
+                                   void *                      data)
+{
+    image->common.destroy_func = func;
+    image->common.destroy_data = data;
+}
+
+PIXMAN_EXPORT void *
+pixman_image_get_destroy_data (pixman_image_t *image)
+{
+  return image->common.destroy_data;
+}
+
+void
+_pixman_image_reset_clip_region (pixman_image_t *image)
+{
+    image->common.have_clip_region = FALSE;
+}
+
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So when the pixman bug was fixed, a workaround was
+ * added to allow certain out-of-bound accesses. This function disabled
+ * those workarounds.
+ *
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now
+ * this function is a no-op.
+ */
+PIXMAN_EXPORT void
+pixman_disable_out_of_bounds_workaround (void)
+{
+}
+
+static void
+compute_image_info (pixman_image_t *image)
+{
+    pixman_format_code_t code;
+    uint32_t flags = 0;
+
+    /* Transform */
+    if (!image->common.transform)
+    {
+	flags |= (FAST_PATH_ID_TRANSFORM	|
+		  FAST_PATH_X_UNIT_POSITIVE	|
+		  FAST_PATH_Y_UNIT_ZERO		|
+		  FAST_PATH_AFFINE_TRANSFORM);
+    }
+    else
+    {
+	flags |= FAST_PATH_HAS_TRANSFORM;
+
+	if (image->common.transform->matrix[2][0] == 0			&&
+	    image->common.transform->matrix[2][1] == 0			&&
+	    image->common.transform->matrix[2][2] == pixman_fixed_1)
+	{
+	    flags |= FAST_PATH_AFFINE_TRANSFORM;
+
+	    if (image->common.transform->matrix[0][1] == 0 &&
+		image->common.transform->matrix[1][0] == 0)
+	    {
+		if (image->common.transform->matrix[0][0] == -pixman_fixed_1 &&
+		    image->common.transform->matrix[1][1] == -pixman_fixed_1)
+		{
+		    flags |= FAST_PATH_ROTATE_180_TRANSFORM;
+		}
+		flags |= FAST_PATH_SCALE_TRANSFORM;
+	    }
+	    else if (image->common.transform->matrix[0][0] == 0 &&
+	             image->common.transform->matrix[1][1] == 0)
+	    {
+		pixman_fixed_t m01 = image->common.transform->matrix[0][1];
+		pixman_fixed_t m10 = image->common.transform->matrix[1][0];
+
+		if (m01 == -pixman_fixed_1 && m10 == pixman_fixed_1)
+		    flags |= FAST_PATH_ROTATE_90_TRANSFORM;
+		else if (m01 == pixman_fixed_1 && m10 == -pixman_fixed_1)
+		    flags |= FAST_PATH_ROTATE_270_TRANSFORM;
+	    }
+	}
+
+	if (image->common.transform->matrix[0][0] > 0)
+	    flags |= FAST_PATH_X_UNIT_POSITIVE;
+
+	if (image->common.transform->matrix[1][0] == 0)
+	    flags |= FAST_PATH_Y_UNIT_ZERO;
+    }
+
+    /* Filter */
+    switch (image->common.filter)
+    {
+    case PIXMAN_FILTER_NEAREST:
+    case PIXMAN_FILTER_FAST:
+	flags |= (FAST_PATH_NEAREST_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+	break;
+
+    case PIXMAN_FILTER_BILINEAR:
+    case PIXMAN_FILTER_GOOD:
+    case PIXMAN_FILTER_BEST:
+	flags |= (FAST_PATH_BILINEAR_FILTER | FAST_PATH_NO_CONVOLUTION_FILTER);
+
+	/* Here we have a chance to optimize BILINEAR filter to NEAREST if
+	 * they are equivalent for the currently used transformation matrix.
+	 */
+	if (flags & FAST_PATH_ID_TRANSFORM)
+	{
+	    flags |= FAST_PATH_NEAREST_FILTER;
+	}
+	else if (
+	    /* affine and integer translation components in matrix ... */
+	    ((flags & FAST_PATH_AFFINE_TRANSFORM) &&
+	     !pixman_fixed_frac (image->common.transform->matrix[0][2] |
+				 image->common.transform->matrix[1][2])) &&
+	    (
+		/* ... combined with a simple rotation */
+		(flags & (FAST_PATH_ROTATE_90_TRANSFORM |
+			  FAST_PATH_ROTATE_180_TRANSFORM |
+			  FAST_PATH_ROTATE_270_TRANSFORM)) ||
+		/* ... or combined with a simple non-rotated translation */
+		(image->common.transform->matrix[0][0] == pixman_fixed_1 &&
+		 image->common.transform->matrix[1][1] == pixman_fixed_1 &&
+		 image->common.transform->matrix[0][1] == 0 &&
+		 image->common.transform->matrix[1][0] == 0)
+		)
+	    )
+	{
+	    /* FIXME: there are some affine-test failures, showing that
+	     * handling of BILINEAR and NEAREST filter is not quite
+	     * equivalent when getting close to 32K for the translation
+	     * components of the matrix. That's likely some bug, but for
+	     * now just skip BILINEAR->NEAREST optimization in this case.
+	     */
+	    pixman_fixed_t magic_limit = pixman_int_to_fixed (30000);
+	    if (image->common.transform->matrix[0][2] <= magic_limit  &&
+	        image->common.transform->matrix[1][2] <= magic_limit  &&
+	        image->common.transform->matrix[0][2] >= -magic_limit &&
+	        image->common.transform->matrix[1][2] >= -magic_limit)
+	    {
+		flags |= FAST_PATH_NEAREST_FILTER;
+	    }
+	}
+	break;
+
+    case PIXMAN_FILTER_CONVOLUTION:
+	break;
+
+    case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
+	flags |= FAST_PATH_SEPARABLE_CONVOLUTION_FILTER;
+	break;
+
+    default:
+	flags |= FAST_PATH_NO_CONVOLUTION_FILTER;
+	break;
+    }
+
+    /* Repeat mode */
+    switch (image->common.repeat)
+    {
+    case PIXMAN_REPEAT_NONE:
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    case PIXMAN_REPEAT_REFLECT:
+	flags |=
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    case PIXMAN_REPEAT_PAD:
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT		|
+	    FAST_PATH_NO_NORMAL_REPEAT;
+	break;
+
+    default:
+	flags |=
+	    FAST_PATH_NO_REFLECT_REPEAT		|
+	    FAST_PATH_NO_PAD_REPEAT		|
+	    FAST_PATH_NO_NONE_REPEAT;
+	break;
+    }
+
+    /* Component alpha */
+    if (image->common.component_alpha)
+	flags |= FAST_PATH_COMPONENT_ALPHA;
+    else
+	flags |= FAST_PATH_UNIFIED_ALPHA;
+
+    flags |= (FAST_PATH_NO_ACCESSORS | FAST_PATH_NARROW_FORMAT);
+
+    /* Type specific checks */
+    switch (image->type)
+    {
+    case SOLID:
+	code = PIXMAN_solid;
+
+	if (image->solid.color.alpha == 0xffff)
+	    flags |= FAST_PATH_IS_OPAQUE;
+	break;
+
+    case BITS:
+	if (image->bits.width == 1	&&
+	    image->bits.height == 1	&&
+	    image->common.repeat != PIXMAN_REPEAT_NONE)
+	{
+	    code = PIXMAN_solid;
+	}
+	else
+	{
+	    code = image->bits.format;
+	    flags |= FAST_PATH_BITS_IMAGE;
+	}
+
+	if (!PIXMAN_FORMAT_A (image->bits.format)				&&
+	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_GRAY		&&
+	    PIXMAN_FORMAT_TYPE (image->bits.format) != PIXMAN_TYPE_COLOR)
+	{
+	    flags |= FAST_PATH_SAMPLES_OPAQUE;
+
+	    if (image->common.repeat != PIXMAN_REPEAT_NONE)
+		flags |= FAST_PATH_IS_OPAQUE;
+	}
+
+	if (image->bits.read_func || image->bits.write_func)
+	    flags &= ~FAST_PATH_NO_ACCESSORS;
+
+	if (PIXMAN_FORMAT_IS_WIDE (image->bits.format))
+	    flags &= ~FAST_PATH_NARROW_FORMAT;
+	break;
+
+    case RADIAL:
+	code = PIXMAN_unknown;
+
+	/*
+	 * As explained in pixman-radial-gradient.c, every point of
+	 * the plane has a valid associated radius (and thus will be
+	 * colored) if and only if a is negative (i.e. one of the two
+	 * circles contains the other one).
+	 */
+
+        if (image->radial.a >= 0)
+	    break;
+
+	/* Fall through */
+
+    case CONICAL:
+    case LINEAR:
+	code = PIXMAN_unknown;
+
+	if (image->common.repeat != PIXMAN_REPEAT_NONE)
+	{
+	    int i;
+
+	    flags |= FAST_PATH_IS_OPAQUE;
+	    for (i = 0; i < image->gradient.n_stops; ++i)
+	    {
+		if (image->gradient.stops[i].color.alpha != 0xffff)
+		{
+		    flags &= ~FAST_PATH_IS_OPAQUE;
+		    break;
+		}
+	    }
+	}
+	break;
+
+    default:
+	code = PIXMAN_unknown;
+	break;
+    }
+
+    /* Alpha maps are only supported for BITS images, so it's always
+     * safe to ignore their presense for non-BITS images
+     */
+    if (!image->common.alpha_map || image->type != BITS)
+    {
+	flags |= FAST_PATH_NO_ALPHA_MAP;
+    }
+    else
+    {
+	if (PIXMAN_FORMAT_IS_WIDE (image->common.alpha_map->format))
+	    flags &= ~FAST_PATH_NARROW_FORMAT;
+    }
+
+    /* Both alpha maps and convolution filters can introduce
+     * non-opaqueness in otherwise opaque images. Also
+     * an image with component alpha turned on is only opaque
+     * if all channels are opaque, so we simply turn it off
+     * unconditionally for those images.
+     */
+    if (image->common.alpha_map						||
+	image->common.filter == PIXMAN_FILTER_CONVOLUTION		||
+        image->common.filter == PIXMAN_FILTER_SEPARABLE_CONVOLUTION     ||
+	image->common.component_alpha)
+    {
+	flags &= ~(FAST_PATH_IS_OPAQUE | FAST_PATH_SAMPLES_OPAQUE);
+    }
+
+    image->common.flags = flags;
+    image->common.extended_format_code = code;
+}
+
+void
+_pixman_image_validate (pixman_image_t *image)
+{
+    if (image->common.dirty)
+    {
+	compute_image_info (image);
+
+	/* It is important that property_changed is
+	 * called *after* compute_image_info() because
+	 * property_changed() can make use of the flags
+	 * to set up accessors etc.
+	 */
+	if (image->common.property_changed)
+	    image->common.property_changed (image);
+
+	image->common.dirty = FALSE;
+    }
+
+    if (image->common.alpha_map)
+	_pixman_image_validate ((pixman_image_t *)image->common.alpha_map);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_clip_region32 (pixman_image_t *   image,
+                                pixman_region32_t *region)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (region)
+    {
+	if ((result = pixman_region32_copy (&common->clip_region, region)))
+	    image->common.have_clip_region = TRUE;
+    }
+    else
+    {
+	_pixman_image_reset_clip_region (image);
+
+	result = TRUE;
+    }
+
+    image_property_changed (image);
+
+    return result;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_clip_region (pixman_image_t *   image,
+                              pixman_region16_t *region)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (region)
+    {
+	if ((result = pixman_region32_copy_from_region16 (&common->clip_region, region)))
+	    image->common.have_clip_region = TRUE;
+    }
+    else
+    {
+	_pixman_image_reset_clip_region (image);
+
+	result = TRUE;
+    }
+
+    image_property_changed (image);
+
+    return result;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_has_client_clip (pixman_image_t *image,
+                                  pixman_bool_t   client_clip)
+{
+    image->common.client_clip = client_clip;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_transform (pixman_image_t *          image,
+                            const pixman_transform_t *transform)
+{
+    static const pixman_transform_t id =
+    {
+	{ { pixman_fixed_1, 0, 0 },
+	  { 0, pixman_fixed_1, 0 },
+	  { 0, 0, pixman_fixed_1 } }
+    };
+
+    image_common_t *common = (image_common_t *)image;
+    pixman_bool_t result;
+
+    if (common->transform == transform)
+	return TRUE;
+
+    if (!transform || memcmp (&id, transform, sizeof (pixman_transform_t)) == 0)
+    {
+	free (common->transform);
+	common->transform = NULL;
+	result = TRUE;
+
+	goto out;
+    }
+
+    if (common->transform &&
+	memcmp (common->transform, transform, sizeof (pixman_transform_t)) == 0)
+    {
+	return TRUE;
+    }
+
+    if (common->transform == NULL)
+	common->transform = malloc (sizeof (pixman_transform_t));
+
+    if (common->transform == NULL)
+    {
+	result = FALSE;
+
+	goto out;
+    }
+
+    memcpy (common->transform, transform, sizeof(pixman_transform_t));
+
+    result = TRUE;
+
+out:
+    image_property_changed (image);
+
+    return result;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_repeat (pixman_image_t *image,
+                         pixman_repeat_t repeat)
+{
+    if (image->common.repeat == repeat)
+	return;
+
+    image->common.repeat = repeat;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_set_filter (pixman_image_t *      image,
+                         pixman_filter_t       filter,
+                         const pixman_fixed_t *params,
+                         int                   n_params)
+{
+    image_common_t *common = (image_common_t *)image;
+    pixman_fixed_t *new_params;
+
+    if (params == common->filter_params && filter == common->filter)
+	return TRUE;
+
+    if (filter == PIXMAN_FILTER_SEPARABLE_CONVOLUTION)
+    {
+	int width = pixman_fixed_to_int (params[0]);
+	int height = pixman_fixed_to_int (params[1]);
+	int x_phase_bits = pixman_fixed_to_int (params[2]);
+	int y_phase_bits = pixman_fixed_to_int (params[3]);
+	int n_x_phases = (1 << x_phase_bits);
+	int n_y_phases = (1 << y_phase_bits);
+
+	return_val_if_fail (
+	    n_params == 4 + n_x_phases * width + n_y_phases * height, FALSE);
+    }
+    
+    new_params = NULL;
+    if (params)
+    {
+	new_params = pixman_malloc_ab (n_params, sizeof (pixman_fixed_t));
+	if (!new_params)
+	    return FALSE;
+
+	memcpy (new_params,
+	        params, n_params * sizeof (pixman_fixed_t));
+    }
+
+    common->filter = filter;
+
+    if (common->filter_params)
+	free (common->filter_params);
+
+    common->filter_params = new_params;
+    common->n_filter_params = n_params;
+
+    image_property_changed (image);
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_source_clipping (pixman_image_t *image,
+                                  pixman_bool_t   clip_sources)
+{
+    if (image->common.clip_sources == clip_sources)
+	return;
+
+    image->common.clip_sources = clip_sources;
+
+    image_property_changed (image);
+}
+
+/* Unlike all the other property setters, this function does not
+ * copy the content of indexed. Doing this copying is simply
+ * way, way too expensive.
+ */
+PIXMAN_EXPORT void
+pixman_image_set_indexed (pixman_image_t *        image,
+                          const pixman_indexed_t *indexed)
+{
+    bits_image_t *bits = (bits_image_t *)image;
+
+    if (bits->indexed == indexed)
+	return;
+
+    bits->indexed = indexed;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_alpha_map (pixman_image_t *image,
+                            pixman_image_t *alpha_map,
+                            int16_t         x,
+                            int16_t         y)
+{
+    image_common_t *common = (image_common_t *)image;
+
+    return_if_fail (!alpha_map || alpha_map->type == BITS);
+
+    if (alpha_map && common->alpha_count > 0)
+    {
+	/* If this image is being used as an alpha map itself,
+	 * then you can't give it an alpha map of its own.
+	 */
+	return;
+    }
+
+    if (alpha_map && alpha_map->common.alpha_map)
+    {
+	/* If the image has an alpha map of its own,
+	 * then it can't be used as an alpha map itself
+	 */
+	return;
+    }
+
+    if (common->alpha_map != (bits_image_t *)alpha_map)
+    {
+	if (common->alpha_map)
+	{
+	    common->alpha_map->common.alpha_count--;
+
+	    pixman_image_unref ((pixman_image_t *)common->alpha_map);
+	}
+
+	if (alpha_map)
+	{
+	    common->alpha_map = (bits_image_t *)pixman_image_ref (alpha_map);
+
+	    common->alpha_map->common.alpha_count++;
+	}
+	else
+	{
+	    common->alpha_map = NULL;
+	}
+    }
+
+    common->alpha_origin_x = x;
+    common->alpha_origin_y = y;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_component_alpha   (pixman_image_t *image,
+                                    pixman_bool_t   component_alpha)
+{
+    if (image->common.component_alpha == component_alpha)
+	return;
+
+    image->common.component_alpha = component_alpha;
+
+    image_property_changed (image);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_get_component_alpha   (pixman_image_t       *image)
+{
+    return image->common.component_alpha;
+}
+
+PIXMAN_EXPORT void
+pixman_image_set_accessors (pixman_image_t *           image,
+                            pixman_read_memory_func_t  read_func,
+                            pixman_write_memory_func_t write_func)
+{
+    return_if_fail (image != NULL);
+
+    if (image->type == BITS)
+    {
+	image->bits.read_func = read_func;
+	image->bits.write_func = write_func;
+
+	image_property_changed (image);
+    }
+}
+
+PIXMAN_EXPORT uint32_t *
+pixman_image_get_data (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.bits;
+
+    return NULL;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_width (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.width;
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_height (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.height;
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_stride (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.rowstride * (int) sizeof (uint32_t);
+
+    return 0;
+}
+
+PIXMAN_EXPORT int
+pixman_image_get_depth (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return PIXMAN_FORMAT_DEPTH (image->bits.format);
+
+    return 0;
+}
+
+PIXMAN_EXPORT pixman_format_code_t
+pixman_image_get_format (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return image->bits.format;
+
+    return PIXMAN_null;
+}
+
+uint32_t
+_pixman_image_get_solid (pixman_implementation_t *imp,
+			 pixman_image_t *         image,
+                         pixman_format_code_t     format)
+{
+    uint32_t result;
+
+    if (image->type == SOLID)
+    {
+	result = image->solid.color_32;
+    }
+    else if (image->type == BITS)
+    {
+	if (image->bits.format == PIXMAN_a8r8g8b8)
+	    result = image->bits.bits[0];
+	else if (image->bits.format == PIXMAN_x8r8g8b8)
+	    result = image->bits.bits[0] | 0xff000000;
+	else if (image->bits.format == PIXMAN_a8)
+	    result = (*(uint8_t *)image->bits.bits) << 24;
+	else
+	    goto otherwise;
+    }
+    else
+    {
+	pixman_iter_t iter;
+
+    otherwise:
+	_pixman_implementation_iter_init (
+	    imp, &iter, image, 0, 0, 1, 1,
+	    (uint8_t *)&result,
+	    ITER_NARROW | ITER_SRC, image->common.flags);
+	
+	result = *iter.get_scanline (&iter, NULL);
+
+	if (iter.fini)
+	    iter.fini (&iter);
+    }
+
+    /* If necessary, convert RGB <--> BGR. */
+    if (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB
+	&& PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_ARGB_SRGB)
+    {
+	result = (((result & 0xff000000) >>  0) |
+	          ((result & 0x00ff0000) >> 16) |
+	          ((result & 0x0000ff00) >>  0) |
+	          ((result & 0x000000ff) << 16));
+    }
+
+    return result;
+}
diff --git a/src/pixman/pixman/pixman-implementation.c b/src/pixman/pixman/pixman-implementation.c
new file mode 100644
index 0000000..5884054
--- /dev/null
+++ b/src/pixman/pixman/pixman-implementation.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright © 2009 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *fallback,
+			       const pixman_fast_path_t *fast_paths)
+{
+    pixman_implementation_t *imp;
+
+    assert (fast_paths);
+
+    if ((imp = malloc (sizeof (pixman_implementation_t))))
+    {
+	pixman_implementation_t *d;
+
+	memset (imp, 0, sizeof *imp);
+
+	imp->fallback = fallback;
+	imp->fast_paths = fast_paths;
+	
+	/* Make sure the whole fallback chain has the right toplevel */
+	for (d = imp; d != NULL; d = d->fallback)
+	    d->toplevel = imp;
+    }
+
+    return imp;
+}
+
+#define N_CACHED_FAST_PATHS 8
+
+typedef struct
+{
+    struct
+    {
+	pixman_implementation_t *	imp;
+	pixman_fast_path_t		fast_path;
+    } cache [N_CACHED_FAST_PATHS];
+} cache_t;
+
+PIXMAN_DEFINE_THREAD_LOCAL (cache_t, fast_path_cache);
+
+static void
+dummy_composite_rect (pixman_implementation_t *imp,
+		      pixman_composite_info_t *info)
+{
+}
+
+void
+_pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel,
+					 pixman_op_t               op,
+					 pixman_format_code_t      src_format,
+					 uint32_t                  src_flags,
+					 pixman_format_code_t      mask_format,
+					 uint32_t                  mask_flags,
+					 pixman_format_code_t      dest_format,
+					 uint32_t                  dest_flags,
+					 pixman_implementation_t **out_imp,
+					 pixman_composite_func_t  *out_func)
+{
+    pixman_implementation_t *imp;
+    cache_t *cache;
+    int i;
+
+    /* Check cache for fast paths */
+    cache = PIXMAN_GET_THREAD_LOCAL (fast_path_cache);
+
+    for (i = 0; i < N_CACHED_FAST_PATHS; ++i)
+    {
+	const pixman_fast_path_t *info = &(cache->cache[i].fast_path);
+
+	/* Note that we check for equality here, not whether
+	 * the cached fast path matches. This is to prevent
+	 * us from selecting an overly general fast path
+	 * when a more specific one would work.
+	 */
+	if (info->op == op			&&
+	    info->src_format == src_format	&&
+	    info->mask_format == mask_format	&&
+	    info->dest_format == dest_format	&&
+	    info->src_flags == src_flags	&&
+	    info->mask_flags == mask_flags	&&
+	    info->dest_flags == dest_flags	&&
+	    info->func)
+	{
+	    *out_imp = cache->cache[i].imp;
+	    *out_func = cache->cache[i].fast_path.func;
+
+	    goto update_cache;
+	}
+    }
+
+    for (imp = toplevel; imp != NULL; imp = imp->fallback)
+    {
+	const pixman_fast_path_t *info = imp->fast_paths;
+
+	while (info->op != PIXMAN_OP_NONE)
+	{
+	    if ((info->op == op || info->op == PIXMAN_OP_any)		&&
+		/* Formats */
+		((info->src_format == src_format) ||
+		 (info->src_format == PIXMAN_any))			&&
+		((info->mask_format == mask_format) ||
+		 (info->mask_format == PIXMAN_any))			&&
+		((info->dest_format == dest_format) ||
+		 (info->dest_format == PIXMAN_any))			&&
+		/* Flags */
+		(info->src_flags & src_flags) == info->src_flags	&&
+		(info->mask_flags & mask_flags) == info->mask_flags	&&
+		(info->dest_flags & dest_flags) == info->dest_flags)
+	    {
+		*out_imp = imp;
+		*out_func = info->func;
+
+		/* Set i to the last spot in the cache so that the
+		 * move-to-front code below will work
+		 */
+		i = N_CACHED_FAST_PATHS - 1;
+
+		goto update_cache;
+	    }
+
+	    ++info;
+	}
+    }
+
+    /* We should never reach this point */
+    _pixman_log_error (
+        FUNC,
+        "No composite function found\n"
+        "\n"
+        "The most likely cause of this is that this system has issues with\n"
+        "thread local storage\n");
+
+    *out_imp = NULL;
+    *out_func = dummy_composite_rect;
+    return;
+
+update_cache:
+    if (i)
+    {
+	while (i--)
+	    cache->cache[i + 1] = cache->cache[i];
+
+	cache->cache[0].imp = *out_imp;
+	cache->cache[0].fast_path.op = op;
+	cache->cache[0].fast_path.src_format = src_format;
+	cache->cache[0].fast_path.src_flags = src_flags;
+	cache->cache[0].fast_path.mask_format = mask_format;
+	cache->cache[0].fast_path.mask_flags = mask_flags;
+	cache->cache[0].fast_path.dest_format = dest_format;
+	cache->cache[0].fast_path.dest_flags = dest_flags;
+	cache->cache[0].fast_path.func = *out_func;
+    }
+}
+
+static void
+dummy_combine (pixman_implementation_t *imp,
+	       pixman_op_t              op,
+	       uint32_t *               pd,
+	       const uint32_t *         ps,
+	       const uint32_t *         pm,
+	       int                      w)
+{
+}
+
+pixman_combine_32_func_t
+_pixman_implementation_lookup_combiner (pixman_implementation_t *imp,
+					pixman_op_t		 op,
+					pixman_bool_t		 component_alpha,
+					pixman_bool_t		 narrow)
+{
+    while (imp)
+    {
+	pixman_combine_32_func_t f = NULL;
+
+	switch ((narrow << 1) | component_alpha)
+	{
+	case 0: /* not narrow, not component alpha */
+	    f = (pixman_combine_32_func_t)imp->combine_float[op];
+	    break;
+	    
+	case 1: /* not narrow, component_alpha */
+	    f = (pixman_combine_32_func_t)imp->combine_float_ca[op];
+	    break;
+
+	case 2: /* narrow, not component alpha */
+	    f = imp->combine_32[op];
+	    break;
+
+	case 3: /* narrow, component_alpha */
+	    f = imp->combine_32_ca[op];
+	    break;
+	}
+
+	if (f)
+	    return f;
+
+	imp = imp->fallback;
+    }
+
+    /* We should never reach this point */
+    _pixman_log_error (FUNC, "No known combine function\n");
+    return dummy_combine;
+}
+
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t * imp,
+                            uint32_t *                src_bits,
+                            uint32_t *                dst_bits,
+                            int                       src_stride,
+                            int                       dst_stride,
+                            int                       src_bpp,
+                            int                       dst_bpp,
+                            int                       src_x,
+                            int                       src_y,
+                            int                       dest_x,
+                            int                       dest_y,
+                            int                       width,
+                            int                       height)
+{
+    while (imp)
+    {
+	if (imp->blt &&
+	    (*imp->blt) (imp, src_bits, dst_bits, src_stride, dst_stride,
+			 src_bpp, dst_bpp, src_x, src_y, dest_x, dest_y,
+			 width, height))
+	{
+	    return TRUE;
+	}
+
+	imp = imp->fallback;
+    }
+
+    return FALSE;
+}
+
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+                             uint32_t *               bits,
+                             int                      stride,
+                             int                      bpp,
+                             int                      x,
+                             int                      y,
+                             int                      width,
+                             int                      height,
+                             uint32_t                 filler)
+{
+    while (imp)
+    {
+	if (imp->fill &&
+	    ((*imp->fill) (imp, bits, stride, bpp, x, y, width, height, filler)))
+	{
+	    return TRUE;
+	}
+
+	imp = imp->fallback;
+    }
+
+    return FALSE;
+}
+
+static uint32_t *
+get_scanline_null (pixman_iter_t *iter, const uint32_t *mask)
+{
+    return NULL;
+}
+
+void
+_pixman_implementation_iter_init (pixman_implementation_t *imp,
+                                  pixman_iter_t           *iter,
+                                  pixman_image_t          *image,
+                                  int                      x,
+                                  int                      y,
+                                  int                      width,
+                                  int                      height,
+                                  uint8_t                 *buffer,
+                                  iter_flags_t             iter_flags,
+                                  uint32_t                 image_flags)
+{
+    pixman_format_code_t format;
+
+    iter->image = image;
+    iter->buffer = (uint32_t *)buffer;
+    iter->x = x;
+    iter->y = y;
+    iter->width = width;
+    iter->height = height;
+    iter->iter_flags = iter_flags;
+    iter->image_flags = image_flags;
+    iter->fini = NULL;
+
+    if (!iter->image)
+    {
+	iter->get_scanline = get_scanline_null;
+	return;
+    }
+
+    format = iter->image->common.extended_format_code;
+
+    while (imp)
+    {
+        if (imp->iter_info)
+        {
+            const pixman_iter_info_t *info;
+
+            for (info = imp->iter_info; info->format != PIXMAN_null; ++info)
+            {
+                if ((info->format == PIXMAN_any || info->format == format) &&
+                    (info->image_flags & image_flags) == info->image_flags &&
+                    (info->iter_flags & iter_flags) == info->iter_flags)
+                {
+                    iter->get_scanline = info->get_scanline;
+                    iter->write_back = info->write_back;
+
+                    if (info->initializer)
+                        info->initializer (iter, info);
+                    return;
+                }
+            }
+        }
+
+        imp = imp->fallback;
+    }
+}
+
+pixman_bool_t
+_pixman_disabled (const char *name)
+{
+    const char *env;
+
+    if ((env = getenv ("PIXMAN_DISABLE")))
+    {
+	do
+	{
+	    const char *end;
+	    int len;
+
+	    if ((end = strchr (env, ' ')))
+		len = end - env;
+	    else
+		len = strlen (env);
+
+	    if (strlen (name) == len && strncmp (name, env, len) == 0)
+	    {
+		printf ("pixman: Disabled %s implementation\n", name);
+		return TRUE;
+	    }
+
+	    env += len;
+	}
+	while (*env++);
+    }
+
+    return FALSE;
+}
+
+pixman_implementation_t *
+_pixman_choose_implementation (void)
+{
+    pixman_implementation_t *imp;
+
+    imp = _pixman_implementation_create_general();
+
+    if (!_pixman_disabled ("fast"))
+	imp = _pixman_implementation_create_fast_path (imp);
+
+    imp = _pixman_x86_get_implementations (imp);
+    imp = _pixman_arm_get_implementations (imp);
+    imp = _pixman_ppc_get_implementations (imp);
+    imp = _pixman_mips_get_implementations (imp);
+
+    imp = _pixman_implementation_create_noop (imp);
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-inlines.h b/src/pixman/pixman/pixman-inlines.h
new file mode 100644
index 0000000..dd1c2f1
--- /dev/null
+++ b/src/pixman/pixman/pixman-inlines.h
@@ -0,0 +1,1339 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifndef PIXMAN_FAST_PATH_H__
+#define PIXMAN_FAST_PATH_H__
+
+#include "pixman-private.h"
+
+#define PIXMAN_REPEAT_COVER -1
+
+/* Flags describing input parameters to fast path macro template.
+ * Turning on some flag values may indicate that
+ * "some property X is available so template can use this" or
+ * "some property X should be handled by template".
+ *
+ * FLAG_HAVE_SOLID_MASK
+ *  Input mask is solid so template should handle this.
+ *
+ * FLAG_HAVE_NON_SOLID_MASK
+ *  Input mask is bits mask so template should handle this.
+ *
+ * FLAG_HAVE_SOLID_MASK and FLAG_HAVE_NON_SOLID_MASK are mutually
+ * exclusive. (It's not allowed to turn both flags on)
+ */
+#define FLAG_NONE				(0)
+#define FLAG_HAVE_SOLID_MASK			(1 <<   1)
+#define FLAG_HAVE_NON_SOLID_MASK		(1 <<   2)
+
+/* To avoid too short repeated scanline function calls, extend source
+ * scanlines having width less than below constant value.
+ */
+#define REPEAT_NORMAL_MIN_WIDTH			64
+
+static force_inline pixman_bool_t
+repeat (pixman_repeat_t repeat, int *c, int size)
+{
+    if (repeat == PIXMAN_REPEAT_NONE)
+    {
+	if (*c < 0 || *c >= size)
+	    return FALSE;
+    }
+    else if (repeat == PIXMAN_REPEAT_NORMAL)
+    {
+	while (*c >= size)
+	    *c -= size;
+	while (*c < 0)
+	    *c += size;
+    }
+    else if (repeat == PIXMAN_REPEAT_PAD)
+    {
+	*c = CLIP (*c, 0, size - 1);
+    }
+    else /* REFLECT */
+    {
+	*c = MOD (*c, size * 2);
+	if (*c >= size)
+	    *c = size * 2 - *c - 1;
+    }
+    return TRUE;
+}
+
+static force_inline int
+pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
+{
+    return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
+	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
+}
+
+#if BILINEAR_INTERPOLATION_BITS <= 4
+/* Inspired by Filter_32_opaque from Skia */
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t lo, hi;
+
+    distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
+    disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
+
+    distxy = distx * disty;
+    distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
+    distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
+    distixiy =
+	16 * 16 - (disty << 4) -
+	(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
+
+    lo = (tl & 0xff00ff) * distixiy;
+    hi = ((tl >> 8) & 0xff00ff) * distixiy;
+
+    lo += (tr & 0xff00ff) * distxiy;
+    hi += ((tr >> 8) & 0xff00ff) * distxiy;
+
+    lo += (bl & 0xff00ff) * distixy;
+    hi += ((bl >> 8) & 0xff00ff) * distixy;
+
+    lo += (br & 0xff00ff) * distxy;
+    hi += ((br >> 8) & 0xff00ff) * distxy;
+
+    return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
+}
+
+#else
+#if SIZEOF_LONG > 4
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    uint64_t distxy, distxiy, distixy, distixiy;
+    uint64_t tl64, tr64, bl64, br64;
+    uint64_t f, r;
+
+    distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
+    disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
+    distxy = distx * disty;
+    distxiy = distx * (256 - disty);
+    distixy = (256 - distx) * disty;
+    distixiy = (256 - distx) * (256 - disty);
+
+    /* Alpha and Blue */
+    tl64 = tl & 0xff0000ff;
+    tr64 = tr & 0xff0000ff;
+    bl64 = bl & 0xff0000ff;
+    br64 = br & 0xff0000ff;
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r = f & 0x0000ff0000ff0000ull;
+
+    /* Red and Green */
+    tl64 = tl;
+    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
+
+    tr64 = tr;
+    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
+
+    bl64 = bl;
+    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
+
+    br64 = br;
+    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
+
+    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
+
+    return (uint32_t)(r >> 16);
+}
+
+#else
+
+static force_inline uint32_t
+bilinear_interpolation (uint32_t tl, uint32_t tr,
+			uint32_t bl, uint32_t br,
+			int distx, int disty)
+{
+    int distxy, distxiy, distixy, distixiy;
+    uint32_t f, r;
+
+    distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
+    disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
+
+    distxy = distx * disty;
+    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
+    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
+    distixiy =
+	256 * 256 - (disty << 8) -
+	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
+
+    /* Blue */
+    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+
+    /* Green */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    tl >>= 16;
+    tr >>= 16;
+    bl >>= 16;
+    br >>= 16;
+    r >>= 16;
+
+    /* Red */
+    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
+      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
+    r |= f & 0x00ff0000;
+
+    /* Alpha */
+    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
+      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
+    r |= f & 0xff000000;
+
+    return r;
+}
+
+#endif
+#endif // BILINEAR_INTERPOLATION_BITS <= 4
+
+/*
+ * For each scanline fetched from source image with PAD repeat:
+ * - calculate how many pixels need to be padded on the left side
+ * - calculate how many pixels need to be padded on the right side
+ * - update width to only count pixels which are fetched from the image
+ * All this information is returned via 'width', 'left_pad', 'right_pad'
+ * arguments. The code is assuming that 'unit_x' is positive.
+ *
+ * Note: 64-bit math is used in order to avoid potential overflows, which
+ *       is probably excessive in many cases. This particular function
+ *       may need its own correctness test and performance tuning.
+ */
+static force_inline void
+pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+				pixman_fixed_t  vx,
+				pixman_fixed_t  unit_x,
+				int32_t *       width,
+				int32_t *       left_pad,
+				int32_t *       right_pad)
+{
+    int64_t max_vx = (int64_t) source_image_width << 16;
+    int64_t tmp;
+    if (vx < 0)
+    {
+	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
+	if (tmp > *width)
+	{
+	    *left_pad = *width;
+	    *width = 0;
+	}
+	else
+	{
+	    *left_pad = (int32_t) tmp;
+	    *width -= (int32_t) tmp;
+	}
+    }
+    else
+    {
+	*left_pad = 0;
+    }
+    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
+    if (tmp < 0)
+    {
+	*right_pad = *width;
+	*width = 0;
+    }
+    else if (tmp >= *width)
+    {
+	*right_pad = 0;
+    }
+    else
+    {
+	*right_pad = *width - (int32_t) tmp;
+	*width = (int32_t) tmp;
+    }
+}
+
+/* A macroified version of specialized nearest scalers for some
+ * common 8888 and 565 formats. It supports SRC and OVER ops.
+ *
+ * There are two repeat versions, one that handles repeat normal,
+ * and one without repeat handling that only works if the src region
+ * used is completely covered by the pre-repeated source samples.
+ *
+ * The loops are unrolled to process two pixels per iteration for better
+ * performance on most CPU architectures (superscalar processors
+ * can issue several operations simultaneously, other processors can hide
+ * instructions latencies by pipelining operations). Unrolling more
+ * does not make much sense because the compiler will start running out
+ * of spare registers soon.
+ */
+
+#define GET_8888_ALPHA(s) ((s) >> 24)
+ /* This is not actually used since we don't have an OVER with
+    565 source, but it is needed to build. */
+#define GET_0565_ALPHA(s) 0xff
+#define GET_x888_ALPHA(s) 0xff
+
+#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
+			      src_type_t, dst_type_t, OP, repeat_mode)				\
+static force_inline void									\
+scanline_func_name (dst_type_t       *dst,							\
+		    const src_type_t *src,							\
+		    int32_t           w,							\
+		    pixman_fixed_t    vx,							\
+		    pixman_fixed_t    unit_x,							\
+		    pixman_fixed_t    src_width_fixed,						\
+		    pixman_bool_t     fully_transparent_src)					\
+{												\
+	uint32_t   d;										\
+	src_type_t s1, s2;									\
+	uint8_t    a1, a2;									\
+	int        x1, x2;									\
+												\
+	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
+	    return;										\
+												\
+	if (PIXMAN_OP_ ## OP != PIXMAN_OP_SRC && PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)		\
+	    abort();										\
+												\
+	while ((w -= 2) >= 0)									\
+	{											\
+	    x1 = pixman_fixed_to_int (vx);							\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= 0)									\
+		    vx -= src_width_fixed;							\
+	    }											\
+	    s1 = *(src + x1);									\
+												\
+	    x2 = pixman_fixed_to_int (vx);							\
+	    vx += unit_x;									\
+	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    {											\
+		/* This works because we know that unit_x is positive */			\
+		while (vx >= 0)									\
+		    vx -= src_width_fixed;							\
+	    }											\
+	    s2 = *(src + x2);									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+		a2 = GET_ ## SRC_FORMAT ## _ALPHA(s2);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = convert_ ## DST_FORMAT ## _to_8888 (*dst);				\
+		    s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+												\
+		if (a2 == 0xff)									\
+		{										\
+		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2);			\
+		}										\
+		else if (s2)									\
+		{										\
+		    d = convert_## DST_FORMAT ## _to_8888 (*dst);				\
+		    s2 = convert_## SRC_FORMAT ## _to_8888 (s2);				\
+		    a2 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a2, s2);					\
+		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
+		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s2);			\
+	    }											\
+	}											\
+												\
+	if (w & 1)										\
+	{											\
+	    x1 = pixman_fixed_to_int (vx);							\
+	    s1 = *(src + x1);									\
+												\
+	    if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER)						\
+	    {											\
+		a1 = GET_ ## SRC_FORMAT ## _ALPHA(s1);						\
+												\
+		if (a1 == 0xff)									\
+		{										\
+		    *dst = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
+		}										\
+		else if (s1)									\
+		{										\
+		    d = convert_## DST_FORMAT ## _to_8888 (*dst);				\
+		    s1 = convert_ ## SRC_FORMAT ## _to_8888 (s1);				\
+		    a1 ^= 0xff;									\
+		    UN8x4_MUL_UN8_ADD_UN8x4 (d, a1, s1);					\
+		    *dst = convert_8888_to_ ## DST_FORMAT (d);					\
+		}										\
+		dst++;										\
+	    }											\
+	    else /* PIXMAN_OP_SRC */								\
+	    {											\
+		*dst++ = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (s1);			\
+	    }											\
+	}											\
+}
+
+#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+static void											\
+fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_composite_info_t *info)               \
+{												\
+    PIXMAN_COMPOSITE_ARGS (info);					                        \
+    dst_type_t *dst_line;						                        \
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y;										\
+    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);		\
+    pixman_fixed_t max_vy;									\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, right_pad;								\
+												\
+    src_type_t *src;										\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask;							\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (have_mask)										\
+    {												\
+	if (mask_is_solid)									\
+	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
+	else											\
+	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
+				   mask_stride, mask_line, 1);					\
+    }												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
+    v.vector[0] -= pixman_fixed_e;								\
+    v.vector[1] -= pixman_fixed_e;								\
+												\
+    vx = v.vector[0];										\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	max_vy = pixman_int_to_fixed (src_image->bits.height);					\
+												\
+	/* Clamp repeating positions inside the actual samples */				\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);					\
+	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+    }												\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
+					&width, &left_pad, &right_pad);				\
+	vx += left_pad * unit_x;								\
+    }												\
+												\
+    while (--height >= 0)									\
+    {												\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	if (have_mask && !mask_is_solid)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+												\
+	y = pixman_fixed_to_int (vy);								\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (mask, dst,							\
+			       src + src_image->bits.width - src_image->bits.width + 1,		\
+			       left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
+			       dst + left_pad, src + src_image->bits.width, width,		\
+			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
+			       dst + left_pad + width, src + src_image->bits.width,		\
+			       right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    static const src_type_t zero[1] = { 0 };						\
+	    if (y < 0 || y >= src_image->bits.height)						\
+	    {											\
+		scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,		\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
+		continue;									\
+	    }											\
+	    src = src_first_line + src_stride * y;						\
+	    if (left_pad > 0)									\
+	    {											\
+		scanline_func (mask, dst, zero + 1, left_pad,					\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
+			       dst + left_pad, src + src_image->bits.width, width,		\
+			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
+			       dst + left_pad + width, zero + 1, right_pad,			\
+			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    src = src_first_line + src_stride * y;						\
+	    scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed,	\
+			   unit_x, src_width_fixed, FALSE);					\
+	}											\
+    }												\
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
+	FAST_NEAREST_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, have_mask, mask_is_solid)
+
+#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func, src_type_t, dst_type_t,	\
+			      repeat_mode)							\
+    static force_inline void									\
+    scanline_func##scale_func_name##_wrapper (							\
+		    const uint8_t    *mask,							\
+		    dst_type_t       *dst,							\
+		    const src_type_t *src,							\
+		    int32_t          w,								\
+		    pixman_fixed_t   vx,							\
+		    pixman_fixed_t   unit_x,							\
+		    pixman_fixed_t   max_vx,							\
+		    pixman_bool_t    fully_transparent_src)					\
+    {												\
+	scanline_func (dst, src, w, vx, unit_x, max_vx, fully_transparent_src);			\
+    }												\
+    FAST_NEAREST_MAINLOOP_INT (scale_func_name, scanline_func##scale_func_name##_wrapper,	\
+			       src_type_t, uint8_t, dst_type_t, repeat_mode, FALSE, FALSE)
+
+#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func, src_type_t, dst_type_t,		\
+			      repeat_mode)							\
+	FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name, scanline_func, src_type_t,		\
+			      dst_type_t, repeat_mode)
+
+#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,				\
+		     src_type_t, dst_type_t, OP, repeat_mode)				\
+    FAST_NEAREST_SCANLINE(scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t,		\
+			  OP, repeat_mode)						\
+    FAST_NEAREST_MAINLOOP_NOMASK(_ ## scale_func_name ## _ ## OP,			\
+			  scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,	\
+			  src_type_t, dst_type_t, repeat_mode)
+
+
+#define SCALED_NEAREST_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_NEAREST_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,    \
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,	\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_NEAREST_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,	\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,	\
+    }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)				\
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),				\
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
+
+/*****************************************************************************/
+
+/*
+ * Identify 5 zones in each scanline for bilinear scaling. Depending on
+ * whether 2 pixels to be interpolated are fetched from the image itself,
+ * from the padding area around it or from both image and padding area.
+ */
+static force_inline void
+bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
+					 pixman_fixed_t  vx,
+					 pixman_fixed_t  unit_x,
+					 int32_t *       left_pad,
+					 int32_t *       left_tz,
+					 int32_t *       width,
+					 int32_t *       right_tz,
+					 int32_t *       right_pad)
+{
+	int width1 = *width, left_pad1, right_pad1;
+	int width2 = *width, left_pad2, right_pad2;
+
+	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
+					&width1, &left_pad1, &right_pad1);
+	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
+					unit_x, &width2, &left_pad2, &right_pad2);
+
+	*left_pad = left_pad2;
+	*left_tz = left_pad1 - left_pad2;
+	*right_tz = right_pad2 - right_pad1;
+	*right_pad = right_pad1;
+	*width -= *left_pad + *left_tz + *right_tz + *right_pad;
+}
+
+/*
+ * Main loop template for single pass bilinear scaling. It needs to be
+ * provided with 'scanline_func' which should do the compositing operation.
+ * The needed function has the following prototype:
+ *
+ *	scanline_func (dst_type_t *       dst,
+ *		       const mask_type_ * mask,
+ *		       const src_type_t * src_top,
+ *		       const src_type_t * src_bottom,
+ *		       int32_t            width,
+ *		       int                weight_top,
+ *		       int                weight_bottom,
+ *		       pixman_fixed_t     vx,
+ *		       pixman_fixed_t     unit_x,
+ *		       pixman_fixed_t     max_vx,
+ *		       pixman_bool_t      zero_src)
+ *
+ * Where:
+ *  dst                 - destination scanline buffer for storing results
+ *  mask                - mask buffer (or single value for solid mask)
+ *  src_top, src_bottom - two source scanlines
+ *  width               - number of pixels to process
+ *  weight_top          - weight of the top row for interpolation
+ *  weight_bottom       - weight of the bottom row for interpolation
+ *  vx                  - initial position for fetching the first pair of
+ *                        pixels from the source buffer
+ *  unit_x              - position increment needed to move to the next pair
+ *                        of pixels
+ *  max_vx              - image size as a fixed point value, can be used for
+ *                        implementing NORMAL repeat (when it is supported)
+ *  zero_src            - boolean hint variable, which is set to TRUE when
+ *                        all source pixels are fetched from zero padding
+ *                        zone for NONE repeat
+ *
+ * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to
+ *       BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that
+ *       for NONE repeat when handling fuzzy antialiased top or bottom image
+ *       edges. Also both top and bottom weight variables are guaranteed to
+ *       have value, which is less than BILINEAR_INTERPOLATION_RANGE.
+ *       For example, the weights can fit into unsigned byte or be used
+ *       with 8-bit SIMD multiplication instructions for 8-bit interpolation
+ *       precision.
+ */
+#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, flags)				\
+static void											\
+fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
+						   pixman_composite_info_t *info)		\
+{												\
+    PIXMAN_COMPOSITE_ARGS (info);								\
+    dst_type_t *dst_line;									\
+    mask_type_t *mask_line;									\
+    src_type_t *src_first_line;									\
+    int       y1, y2;										\
+    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
+    pixman_vector_t v;										\
+    pixman_fixed_t vx, vy;									\
+    pixman_fixed_t unit_x, unit_y;								\
+    int32_t left_pad, left_tz, right_tz, right_pad;						\
+												\
+    dst_type_t *dst;										\
+    mask_type_t solid_mask;									\
+    const mask_type_t *mask = &solid_mask;							\
+    int src_stride, mask_stride, dst_stride;							\
+												\
+    int src_width;										\
+    pixman_fixed_t src_width_fixed;								\
+    int max_x;											\
+    pixman_bool_t need_src_extension;								\
+												\
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
+    if (flags & FLAG_HAVE_SOLID_MASK)								\
+    {												\
+	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
+	mask_stride = 0;									\
+    }												\
+    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
+    {												\
+	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
+			       mask_stride, mask_line, 1);					\
+    }												\
+												\
+    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
+     * transformed from destination space to source space */					\
+    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
+												\
+    /* reference point is the center of the pixel */						\
+    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
+    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
+    v.vector[2] = pixman_fixed_1;								\
+												\
+    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
+	return;											\
+												\
+    unit_x = src_image->common.transform->matrix[0][0];						\
+    unit_y = src_image->common.transform->matrix[1][1];						\
+												\
+    v.vector[0] -= pixman_fixed_1 / 2;								\
+    v.vector[1] -= pixman_fixed_1 / 2;								\
+												\
+    vy = v.vector[1];										\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
+	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
+    {												\
+	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
+					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    /* PAD repeat does not need special handling for 'transition zones' and */		\
+	    /* they can be combined with 'padding zones' safely */				\
+	    left_pad += left_tz;								\
+	    right_pad += right_tz;								\
+	    left_tz = right_tz = 0;								\
+	}											\
+	v.vector[0] += left_pad * unit_x;							\
+    }												\
+												\
+    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
+    {												\
+	vx = v.vector[0];									\
+	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
+	max_x = pixman_fixed_to_int (vx + (width - 1) * (int64_t)unit_x) + 1;			\
+												\
+	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
+	{											\
+	    src_width = 0;									\
+												\
+	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\
+		src_width += src_image->bits.width;						\
+												\
+	    need_src_extension = TRUE;								\
+	}											\
+	else											\
+	{											\
+	    src_width = src_image->bits.width;							\
+	    need_src_extension = FALSE;								\
+	}											\
+												\
+	src_width_fixed = pixman_int_to_fixed (src_width);					\
+    }												\
+												\
+    while (--height >= 0)									\
+    {												\
+	int weight1, weight2;									\
+	dst = dst_line;										\
+	dst_line += dst_stride;									\
+	vx = v.vector[0];									\
+	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
+	{											\
+	    mask = mask_line;									\
+	    mask_line += mask_stride;								\
+	}											\
+												\
+	y1 = pixman_fixed_to_int (vy);								\
+	weight2 = pixman_fixed_to_bilinear_weight (vy);						\
+	if (weight2)										\
+	{											\
+	    /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */	\
+	    y2 = y1 + 1;									\
+	    weight1 = BILINEAR_INTERPOLATION_RANGE - weight2;					\
+	}											\
+	else											\
+	{											\
+	    /* set both top and bottom row to the same scanline and tweak weights */		\
+	    y2 = y1;										\
+	    weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2;				\
+	}											\
+	vy += unit_y;										\
+	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+												\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[0];							\
+		buf2[0] = buf2[1] = src2[0];							\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
+		dst += left_pad;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += width;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
+		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
+	{											\
+	    src_type_t *src1, *src2;								\
+	    src_type_t buf1[2];									\
+	    src_type_t buf2[2];									\
+	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
+	    if (y1 < 0)										\
+	    {											\
+		weight1 = 0;									\
+		y1 = 0;										\
+	    }											\
+	    if (y1 >= src_image->bits.height)							\
+	    {											\
+		weight1 = 0;									\
+		y1 = src_image->bits.height - 1;						\
+	    }											\
+	    if (y2 < 0)										\
+	    {											\
+		weight2 = 0;									\
+		y2 = 0;										\
+	    }											\
+	    if (y2 >= src_image->bits.height)							\
+	    {											\
+		weight2 = 0;									\
+		y2 = src_image->bits.height - 1;						\
+	    }											\
+	    src1 = src_first_line + src_stride * y1;						\
+	    src2 = src_first_line + src_stride * y2;						\
+												\
+	    if (left_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+		dst += left_pad;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_pad;								\
+	    }											\
+	    if (left_tz > 0)									\
+	    {											\
+		buf1[0] = 0;									\
+		buf1[1] = src1[0];								\
+		buf2[0] = 0;									\
+		buf2[1] = src2[0];								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, left_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += left_tz;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += left_tz;								\
+		vx += left_tz * unit_x;								\
+	    }											\
+	    if (width > 0)									\
+	    {											\
+		scanline_func (dst, mask,							\
+			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
+		dst += width;									\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += width;								\
+		vx += width * unit_x;								\
+	    }											\
+	    if (right_tz > 0)									\
+	    {											\
+		buf1[0] = src1[src_image->bits.width - 1];					\
+		buf1[1] = 0;									\
+		buf2[0] = src2[src_image->bits.width - 1];					\
+		buf2[1] = 0;									\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_tz, weight1, weight2,				\
+			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
+		dst += right_tz;								\
+		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
+		    mask += right_tz;								\
+	    }											\
+	    if (right_pad > 0)									\
+	    {											\
+		buf1[0] = buf1[1] = 0;								\
+		buf2[0] = buf2[1] = 0;								\
+		scanline_func (dst, mask,							\
+			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
+	    }											\
+	}											\
+	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
+	{											\
+	    int32_t	    num_pixels;								\
+	    int32_t	    width_remain;							\
+	    src_type_t *    src_line_top;							\
+	    src_type_t *    src_line_bottom;							\
+	    src_type_t	    buf1[2];								\
+	    src_type_t	    buf2[2];								\
+	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
+	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
+	    int		    i, j;								\
+												\
+	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
+	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
+	    src_line_top = src_first_line + src_stride * y1;					\
+	    src_line_bottom = src_first_line + src_stride * y2;					\
+												\
+	    if (need_src_extension)								\
+	    {											\
+		for (i=0; i<src_width;)								\
+		{										\
+		    for (j=0; j<src_image->bits.width; j++, i++)				\
+		    {										\
+			extended_src_line0[i] = src_line_top[j];				\
+			extended_src_line1[i] = src_line_bottom[j];				\
+		    }										\
+		}										\
+												\
+		src_line_top = &extended_src_line0[0];						\
+		src_line_bottom = &extended_src_line1[0];					\
+	    }											\
+												\
+	    /* Top & Bottom wrap around buffer */						\
+	    buf1[0] = src_line_top[src_width - 1];						\
+	    buf1[1] = src_line_top[0];								\
+	    buf2[0] = src_line_bottom[src_width - 1];						\
+	    buf2[1] = src_line_bottom[0];							\
+												\
+	    width_remain = width;								\
+												\
+	    while (width_remain > 0)								\
+	    {											\
+		/* We use src_width_fixed because it can make vx in original source range */	\
+		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
+												\
+		/* Wrap around part */								\
+		if (pixman_fixed_to_int (vx) == src_width - 1)					\
+		{										\
+		    /* for positive unit_x							\
+		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
+		     *										\
+		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
+		     * So we are safe from overflow.						\
+		     */										\
+		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
+												\
+		    if (num_pixels > width_remain)						\
+			num_pixels = width_remain;						\
+												\
+		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
+				   weight1, weight2, pixman_fixed_frac(vx),			\
+				   unit_x, src_width_fixed, FALSE);				\
+												\
+		    width_remain -= num_pixels;							\
+		    vx += num_pixels * unit_x;							\
+		    dst += num_pixels;								\
+												\
+		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
+			mask += num_pixels;							\
+												\
+		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
+		}										\
+												\
+		/* Normal scanline composite */							\
+		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
+		{										\
+		    /* for positive unit_x							\
+		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
+		     *										\
+		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
+		     * So we are safe from overflow here.					\
+		     */										\
+		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
+				  / unit_x) + 1;						\
+												\
+		    if (num_pixels > width_remain)						\
+			num_pixels = width_remain;						\
+												\
+		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
+				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
+												\
+		    width_remain -= num_pixels;							\
+		    vx += num_pixels * unit_x;							\
+		    dst += num_pixels;								\
+												\
+		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
+		        mask += num_pixels;							\
+		}										\
+	    }											\
+	}											\
+	else											\
+	{											\
+	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
+			   src_first_line + src_stride * y2, width,				\
+			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
+	}											\
+    }												\
+}
+
+/* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
+#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
+				  dst_type_t, repeat_mode, flags)				\
+	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
+				  dst_type_t, repeat_mode, flags)
+
+#define SCALED_BILINEAR_FLAGS						\
+    (FAST_PATH_SCALE_TRANSFORM	|					\
+     FAST_PATH_NO_ALPHA_MAP	|					\
+     FAST_PATH_BILINEAR_FILTER	|					\
+     FAST_PATH_NO_ACCESSORS	|					\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)			\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_null, 0,							\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_PAD_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _pad ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NONE_REPEAT		|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _none ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	SCALED_BILINEAR_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,	\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _cover ## _ ## op,	\
+    }
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)	\
+    {   PIXMAN_OP_ ## op,						\
+	PIXMAN_ ## s,							\
+	(SCALED_BILINEAR_FLAGS		|				\
+	 FAST_PATH_NORMAL_REPEAT	|				\
+	 FAST_PATH_X_UNIT_POSITIVE),					\
+	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
+	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
+	fast_composite_scaled_bilinear_ ## func ## _normal ## _ ## op,	\
+    }
+
+/* Prefer the use of 'cover' variant, because it is faster */
+#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
+    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
+
+#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
+
+#endif
diff --git a/src/pixman/pixman/pixman-linear-gradient.c b/src/pixman/pixman/pixman-linear-gradient.c
new file mode 100644
index 0000000..40c8c9f
--- /dev/null
+++ b/src/pixman/pixman/pixman-linear-gradient.c
@@ -0,0 +1,287 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include "pixman-private.h"
+
+static pixman_bool_t
+linear_gradient_is_horizontal (pixman_image_t *image,
+			       int             x,
+			       int             y,
+			       int             width,
+			       int             height)
+{
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    pixman_vector_t v;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    double inc;
+
+    if (image->common.transform)
+    {
+	/* projective transformation */
+	if (image->common.transform->matrix[2][0] != 0 ||
+	    image->common.transform->matrix[2][1] != 0 ||
+	    image->common.transform->matrix[2][2] == 0)
+	{
+	    return FALSE;
+	}
+
+	v.vector[0] = image->common.transform->matrix[0][1];
+	v.vector[1] = image->common.transform->matrix[1][1];
+	v.vector[2] = image->common.transform->matrix[2][2];
+    }
+    else
+    {
+	v.vector[0] = 0;
+	v.vector[1] = pixman_fixed_1;
+	v.vector[2] = pixman_fixed_1;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l == 0)
+	return FALSE;
+
+    /*
+     * compute how much the input of the gradient walked changes
+     * when moving vertically through the whole image
+     */
+    inc = height * (double) pixman_fixed_1 * pixman_fixed_1 *
+	(dx * v.vector[0] + dy * v.vector[1]) /
+	(v.vector[2] * (double) l);
+
+    /* check that casting to integer would result in 0 */
+    if (-1 < inc && inc < 1)
+	return TRUE;
+
+    return FALSE;
+}
+
+static uint32_t *
+linear_get_scanline_narrow (pixman_iter_t  *iter,
+			    const uint32_t *mask)
+{
+    pixman_image_t *image  = iter->image;
+    int             x      = iter->x;
+    int             y      = iter->y;
+    int             width  = iter->width;
+    uint32_t *      buffer = iter->buffer;
+
+    pixman_vector_t v, unit;
+    pixman_fixed_32_32_t l;
+    pixman_fixed_48_16_t dx, dy;
+    gradient_t *gradient = (gradient_t *)image;
+    linear_gradient_t *linear = (linear_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    pixman_gradient_walker_t walker;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer;
+
+	unit.vector[0] = image->common.transform->matrix[0][0];
+	unit.vector[1] = image->common.transform->matrix[1][0];
+	unit.vector[2] = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	unit.vector[0] = pixman_fixed_1;
+	unit.vector[1] = 0;
+	unit.vector[2] = 0;
+    }
+
+    dx = linear->p2.x - linear->p1.x;
+    dy = linear->p2.y - linear->p1.y;
+
+    l = dx * dx + dy * dy;
+
+    if (l == 0 || unit.vector[2] == 0)
+    {
+	/* affine transformation only */
+        pixman_fixed_32_32_t t, next_inc;
+	double inc;
+
+	if (l == 0 || v.vector[2] == 0)
+	{
+	    t = 0;
+	    inc = 0;
+	}
+	else
+	{
+	    double invden, v2;
+
+	    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+		(l * (double) v.vector[2]);
+	    v2 = v.vector[2] * (1. / pixman_fixed_1);
+	    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+		 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+	    inc = (dx * unit.vector[0] + dy * unit.vector[1]) * invden;
+	}
+	next_inc = 0;
+
+	if (((pixman_fixed_32_32_t )(inc * width)) == 0)
+	{
+	    register uint32_t color;
+
+	    color = _pixman_gradient_walker_pixel (&walker, t);
+	    while (buffer < end)
+		*buffer++ = color;
+	}
+	else
+	{
+	    int i;
+
+	    i = 0;
+	    while (buffer < end)
+	    {
+		if (!mask || *mask++)
+		{
+		    *buffer = _pixman_gradient_walker_pixel (&walker,
+							     t + next_inc);
+		}
+		i++;
+		next_inc = inc * i;
+		buffer++;
+	    }
+	}
+    }
+    else
+    {
+	/* projective transformation */
+        double t;
+
+	t = 0;
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+	        if (v.vector[2] != 0)
+		{
+		    double invden, v2;
+
+		    invden = pixman_fixed_1 * (double) pixman_fixed_1 /
+			(l * (double) v.vector[2]);
+		    v2 = v.vector[2] * (1. / pixman_fixed_1);
+		    t = ((dx * v.vector[0] + dy * v.vector[1]) - 
+			 (dx * linear->p1.x + dy * linear->p1.y) * v2) * invden;
+		}
+
+		*buffer = _pixman_gradient_walker_pixel (&walker, t);
+	    }
+
+	    ++buffer;
+
+	    v.vector[0] += unit.vector[0];
+	    v.vector[1] += unit.vector[1];
+	    v.vector[2] += unit.vector[2];
+	}
+    }
+
+    iter->y++;
+
+    return iter->buffer;
+}
+
+static uint32_t *
+linear_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *buffer = linear_get_scanline_narrow (iter, NULL);
+
+    pixman_expand_to_float (
+	(argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer;
+}
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter)
+{
+    if (linear_gradient_is_horizontal (
+	    iter->image, iter->x, iter->y, iter->width, iter->height))
+    {
+	if (iter->iter_flags & ITER_NARROW)
+	    linear_get_scanline_narrow (iter, NULL);
+	else
+	    linear_get_scanline_wide (iter, NULL);
+
+	iter->get_scanline = _pixman_iter_get_scanline_noop;
+    }
+    else
+    {
+	if (iter->iter_flags & ITER_NARROW)
+	    iter->get_scanline = linear_get_scanline_narrow;
+	else
+	    iter->get_scanline = linear_get_scanline_wide;
+    }
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_linear_gradient (const pixman_point_fixed_t *  p1,
+                                     const pixman_point_fixed_t *  p2,
+                                     const pixman_gradient_stop_t *stops,
+                                     int                           n_stops)
+{
+    pixman_image_t *image;
+    linear_gradient_t *linear;
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+	return NULL;
+
+    linear = &image->linear;
+
+    if (!_pixman_init_gradient (&linear->common, stops, n_stops))
+    {
+	free (image);
+	return NULL;
+    }
+
+    linear->p1 = *p1;
+    linear->p2 = *p2;
+
+    image->type = LINEAR;
+
+    return image;
+}
+
diff --git a/src/pixman/pixman/pixman-matrix.c b/src/pixman/pixman/pixman-matrix.c
new file mode 100644
index 0000000..4032c13
--- /dev/null
+++ b/src/pixman/pixman/pixman-matrix.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+/*
+ * Matrix interfaces
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <math.h>
+#include <string.h>
+#include "pixman-private.h"
+
+#define F(x)    pixman_int_to_fixed (x)
+
+static force_inline int
+count_leading_zeros (uint32_t x)
+{
+#ifdef HAVE_BUILTIN_CLZ
+    return __builtin_clz (x);
+#else
+    int n = 0;
+    while (x)
+    {
+        n++;
+        x >>= 1;
+    }
+    return 32 - n;
+#endif
+}
+
+/*
+ * Large signed/unsigned integer division with rounding for the platforms with
+ * only 64-bit integer data type supported (no 128-bit data type).
+ *
+ * Arguments:
+ *     hi, lo - high and low 64-bit parts of the dividend
+ *     div    - 48-bit divisor
+ *
+ * Returns: lowest 64 bits of the result as a return value and highest 64
+ *          bits of the result to "result_hi" pointer
+ */
+
+/* grade-school unsigned division (128-bit by 48-bit) with rounding to nearest */
+static force_inline uint64_t
+rounded_udiv_128_by_48 (uint64_t  hi,
+                        uint64_t  lo,
+                        uint64_t  div,
+                        uint64_t *result_hi)
+{
+    uint64_t tmp, remainder, result_lo;
+    assert(div < ((uint64_t)1 << 48));
+
+    remainder = hi % div;
+    *result_hi = hi / div;
+
+    tmp = (remainder << 16) + (lo >> 48);
+    result_lo = tmp / div;
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + ((lo >> 32) & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + ((lo >> 16) & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + (lo & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    /* round to nearest */
+    if (remainder * 2 >= div && ++result_lo == 0)
+        *result_hi += 1;
+
+    return result_lo;
+}
+
+/* signed division (128-bit by 49-bit) with rounding to nearest */
+static inline int64_t
+rounded_sdiv_128_by_49 (int64_t   hi,
+                        uint64_t  lo,
+                        int64_t   div,
+                        int64_t  *signed_result_hi)
+{
+    uint64_t result_lo, result_hi;
+    int sign = 0;
+    if (div < 0)
+    {
+        div = -div;
+        sign ^= 1;
+    }
+    if (hi < 0)
+    {
+        if (lo != 0)
+            hi++;
+        hi = -hi;
+        lo = -lo;
+        sign ^= 1;
+    }
+    result_lo = rounded_udiv_128_by_48 (hi, lo, div, &result_hi);
+    if (sign)
+    {
+        if (result_lo != 0)
+            result_hi++;
+        result_hi = -result_hi;
+        result_lo = -result_lo;
+    }
+    if (signed_result_hi)
+    {
+        *signed_result_hi = result_hi;
+    }
+    return result_lo;
+}
+
+/*
+ * Multiply 64.16 fixed point value by (2^scalebits) and convert
+ * to 128-bit integer.
+ */
+static force_inline void
+fixed_64_16_to_int128 (int64_t  hi,
+                       int64_t  lo,
+                       int64_t *rhi,
+                       int64_t *rlo,
+                       int      scalebits)
+{
+    /* separate integer and fractional parts */
+    hi += lo >> 16;
+    lo &= 0xFFFF;
+
+    if (scalebits <= 0)
+    {
+        *rlo = hi >> (-scalebits);
+        *rhi = *rlo >> 63;
+    }
+    else
+    {
+        *rhi = hi >> (64 - scalebits);
+        *rlo = (uint64_t)hi << scalebits;
+        if (scalebits < 16)
+            *rlo += lo >> (16 - scalebits);
+        else
+            *rlo += lo << (scalebits - 16);
+    }
+}
+
+/*
+ * Convert 112.16 fixed point value to 48.16 with clamping for the out
+ * of range values.
+ */
+static force_inline pixman_fixed_48_16_t
+fixed_112_16_to_fixed_48_16 (int64_t hi, int64_t lo, pixman_bool_t *clampflag)
+{
+    if ((lo >> 63) != hi)
+    {
+        *clampflag = TRUE;
+        return hi >= 0 ? INT64_MAX : INT64_MIN;
+    }
+    else
+    {
+        return lo;
+    }
+}
+
+/*
+ * Transform a point with 31.16 fixed point coordinates from the destination
+ * space to a point with 48.16 fixed point coordinates in the source space.
+ * No overflows are possible for affine transformations and the results are
+ * accurate including the least significant bit. Projective transformations
+ * may overflow, in this case the results are just clamped to return maximum
+ * or minimum 48.16 values (so that the caller can at least handle the NONE
+ * and PAD repeats correctly) and the return value is FALSE to indicate that
+ * such clamping has happened.
+ */
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t    *t,
+                              const pixman_vector_48_16_t *v,
+                              pixman_vector_48_16_t       *result)
+{
+    pixman_bool_t clampflag = FALSE;
+    int i;
+    int64_t tmp[3][2], divint;
+    uint16_t divfrac;
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++)
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    /*
+     * separate 64-bit integer and 16-bit fractional parts for the divisor,
+     * which is also scaled by 65536 after fixed point multiplication.
+     */
+    divint  = tmp[2][0] + (tmp[2][1] >> 16);
+    divfrac = tmp[2][1] & 0xFFFF;
+
+    if (divint == pixman_fixed_1 && divfrac == 0)
+    {
+        /*
+         * this is a simple affine transformation
+         */
+        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+        result->v[2] = pixman_fixed_1;
+    }
+    else if (divint == 0 && divfrac == 0)
+    {
+        /*
+         * handle zero divisor (if the values are non-zero, set the
+         * results to maximum positive or minimum negative)
+         */
+        clampflag = TRUE;
+
+        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+
+        if (result->v[0] > 0)
+            result->v[0] = INT64_MAX;
+        else if (result->v[0] < 0)
+            result->v[0] = INT64_MIN;
+
+        if (result->v[1] > 0)
+            result->v[1] = INT64_MAX;
+        else if (result->v[1] < 0)
+            result->v[1] = INT64_MIN;
+    }
+    else
+    {
+        /*
+         * projective transformation, analyze the top 32 bits of the divisor
+         */
+        int32_t hi32divbits = divint >> 32;
+        if (hi32divbits < 0)
+            hi32divbits = ~hi32divbits;
+
+        if (hi32divbits == 0)
+        {
+            /* the divisor is small, we can actually keep all the bits */
+            int64_t hi, rhi, lo, rlo;
+            int64_t div = (divint << 16) + divfrac;
+
+            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+
+            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+        }
+        else
+        {
+            /* the divisor needs to be reduced to 48 bits */
+            int64_t hi, rhi, lo, rlo, div;
+            int shift = 32 - count_leading_zeros (hi32divbits);
+            fixed_64_16_to_int128 (divint, divfrac, &hi, &div, 16 - shift);
+
+            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32 - shift);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+
+            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32 - shift);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+        }
+    }
+    result->v[2] = pixman_fixed_1;
+    return !clampflag;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_point_31_16_affine (const pixman_transform_t    *t,
+                                     const pixman_vector_48_16_t *v,
+                                     pixman_vector_48_16_t       *result)
+{
+    int64_t hi0, lo0, hi1, lo1;
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    hi0  = (int64_t)t->matrix[0][0] * (v->v[0] >> 16);
+    lo0  = (int64_t)t->matrix[0][0] * (v->v[0] & 0xFFFF);
+    hi0 += (int64_t)t->matrix[0][1] * (v->v[1] >> 16);
+    lo0 += (int64_t)t->matrix[0][1] * (v->v[1] & 0xFFFF);
+    hi0 += (int64_t)t->matrix[0][2];
+
+    hi1  = (int64_t)t->matrix[1][0] * (v->v[0] >> 16);
+    lo1  = (int64_t)t->matrix[1][0] * (v->v[0] & 0xFFFF);
+    hi1 += (int64_t)t->matrix[1][1] * (v->v[1] >> 16);
+    lo1 += (int64_t)t->matrix[1][1] * (v->v[1] & 0xFFFF);
+    hi1 += (int64_t)t->matrix[1][2];
+
+    result->v[0] = hi0 + ((lo0 + 0x8000) >> 16);
+    result->v[1] = hi1 + ((lo1 + 0x8000) >> 16);
+    result->v[2] = pixman_fixed_1;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result)
+{
+    int i;
+    int64_t tmp[3][2];
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++)
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+    result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+    result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16);
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_identity (struct pixman_transform *matrix)
+{
+    int i;
+
+    memset (matrix, '\0', sizeof (struct pixman_transform));
+    for (i = 0; i < 3; i++)
+	matrix->matrix[i][i] = F (1);
+}
+
+typedef pixman_fixed_32_32_t pixman_fixed_34_30_t;
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point_3d (const struct pixman_transform *transform,
+                           struct pixman_vector *         vector)
+{
+    pixman_vector_48_16_t tmp;
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
+
+    pixman_transform_point_31_16_3d (transform, &tmp, &tmp);
+
+    vector->vector[0] = tmp.v[0];
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
+
+    return vector->vector[0] == tmp.v[0] &&
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point (const struct pixman_transform *transform,
+                        struct pixman_vector *         vector)
+{
+    pixman_vector_48_16_t tmp;
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
+
+    if (!pixman_transform_point_31_16 (transform, &tmp, &tmp))
+        return FALSE;
+
+    vector->vector[0] = tmp.v[0];
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
+
+    return vector->vector[0] == tmp.v[0] &&
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_multiply (struct pixman_transform *      dst,
+                           const struct pixman_transform *l,
+                           const struct pixman_transform *r)
+{
+    struct pixman_transform d;
+    int dx, dy;
+    int o;
+
+    for (dy = 0; dy < 3; dy++)
+    {
+	for (dx = 0; dx < 3; dx++)
+	{
+	    pixman_fixed_48_16_t v;
+	    pixman_fixed_32_32_t partial;
+	    
+	    v = 0;
+	    for (o = 0; o < 3; o++)
+	    {
+		partial =
+		    (pixman_fixed_32_32_t) l->matrix[dy][o] *
+		    (pixman_fixed_32_32_t) r->matrix[o][dx];
+
+		v += (partial + 0x8000) >> 16;
+	    }
+
+	    if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
+		return FALSE;
+	    
+	    d.matrix[dy][dx] = (pixman_fixed_t) v;
+	}
+    }
+
+    *dst = d;
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_scale (struct pixman_transform *t,
+                             pixman_fixed_t           sx,
+                             pixman_fixed_t           sy)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = sx;
+    t->matrix[1][1] = sy;
+    t->matrix[2][2] = F (1);
+}
+
+static pixman_fixed_t
+fixed_inverse (pixman_fixed_t x)
+{
+    return (pixman_fixed_t) ((((pixman_fixed_48_16_t) F (1)) * F (1)) / x);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_scale (struct pixman_transform *forward,
+                        struct pixman_transform *reverse,
+                        pixman_fixed_t           sx,
+                        pixman_fixed_t           sy)
+{
+    struct pixman_transform t;
+
+    if (sx == 0 || sy == 0)
+	return FALSE;
+
+    if (forward)
+    {
+	pixman_transform_init_scale (&t, sx, sy);
+	if (!pixman_transform_multiply (forward, &t, forward))
+	    return FALSE;
+    }
+    
+    if (reverse)
+    {
+	pixman_transform_init_scale (&t, fixed_inverse (sx),
+	                             fixed_inverse (sy));
+	if (!pixman_transform_multiply (reverse, reverse, &t))
+	    return FALSE;
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_rotate (struct pixman_transform *t,
+                              pixman_fixed_t           c,
+                              pixman_fixed_t           s)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = c;
+    t->matrix[0][1] = -s;
+    t->matrix[1][0] = s;
+    t->matrix[1][1] = c;
+    t->matrix[2][2] = F (1);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_rotate (struct pixman_transform *forward,
+                         struct pixman_transform *reverse,
+                         pixman_fixed_t           c,
+                         pixman_fixed_t           s)
+{
+    struct pixman_transform t;
+
+    if (forward)
+    {
+	pixman_transform_init_rotate (&t, c, s);
+	if (!pixman_transform_multiply (forward, &t, forward))
+	    return FALSE;
+    }
+
+    if (reverse)
+    {
+	pixman_transform_init_rotate (&t, c, -s);
+	if (!pixman_transform_multiply (reverse, reverse, &t))
+	    return FALSE;
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_init_translate (struct pixman_transform *t,
+                                 pixman_fixed_t           tx,
+                                 pixman_fixed_t           ty)
+{
+    memset (t, '\0', sizeof (struct pixman_transform));
+
+    t->matrix[0][0] = F (1);
+    t->matrix[0][2] = tx;
+    t->matrix[1][1] = F (1);
+    t->matrix[1][2] = ty;
+    t->matrix[2][2] = F (1);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_translate (struct pixman_transform *forward,
+                            struct pixman_transform *reverse,
+                            pixman_fixed_t           tx,
+                            pixman_fixed_t           ty)
+{
+    struct pixman_transform t;
+
+    if (forward)
+    {
+	pixman_transform_init_translate (&t, tx, ty);
+
+	if (!pixman_transform_multiply (forward, &t, forward))
+	    return FALSE;
+    }
+
+    if (reverse)
+    {
+	pixman_transform_init_translate (&t, -tx, -ty);
+
+	if (!pixman_transform_multiply (reverse, reverse, &t))
+	    return FALSE;
+    }
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_bounds (const struct pixman_transform *matrix,
+                         struct pixman_box16 *          b)
+
+{
+    struct pixman_vector v[4];
+    int i;
+    int x1, y1, x2, y2;
+
+    v[0].vector[0] = F (b->x1);
+    v[0].vector[1] = F (b->y1);
+    v[0].vector[2] = F (1);
+
+    v[1].vector[0] = F (b->x2);
+    v[1].vector[1] = F (b->y1);
+    v[1].vector[2] = F (1);
+
+    v[2].vector[0] = F (b->x2);
+    v[2].vector[1] = F (b->y2);
+    v[2].vector[2] = F (1);
+
+    v[3].vector[0] = F (b->x1);
+    v[3].vector[1] = F (b->y2);
+    v[3].vector[2] = F (1);
+
+    for (i = 0; i < 4; i++)
+    {
+	if (!pixman_transform_point (matrix, &v[i]))
+	    return FALSE;
+
+	x1 = pixman_fixed_to_int (v[i].vector[0]);
+	y1 = pixman_fixed_to_int (v[i].vector[1]);
+	x2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[0]));
+	y2 = pixman_fixed_to_int (pixman_fixed_ceil (v[i].vector[1]));
+
+	if (i == 0)
+	{
+	    b->x1 = x1;
+	    b->y1 = y1;
+	    b->x2 = x2;
+	    b->y2 = y2;
+	}
+	else
+	{
+	    if (x1 < b->x1) b->x1 = x1;
+	    if (y1 < b->y1) b->y1 = y1;
+	    if (x2 > b->x2) b->x2 = x2;
+	    if (y2 > b->y2) b->y2 = y2;
+	}
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_invert (struct pixman_transform *      dst,
+                         const struct pixman_transform *src)
+{
+    struct pixman_f_transform m;
+
+    pixman_f_transform_from_pixman_transform (&m, src);
+
+    if (!pixman_f_transform_invert (&m, &m))
+	return FALSE;
+
+    if (!pixman_transform_from_pixman_f_transform (dst, &m))
+	return FALSE;
+
+    return TRUE;
+}
+
+static pixman_bool_t
+within_epsilon (pixman_fixed_t a,
+                pixman_fixed_t b,
+                pixman_fixed_t epsilon)
+{
+    pixman_fixed_t t = a - b;
+
+    if (t < 0)
+	t = -t;
+
+    return t <= epsilon;
+}
+
+#define EPSILON (pixman_fixed_t) (2)
+
+#define IS_SAME(a, b) (within_epsilon (a, b, EPSILON))
+#define IS_ZERO(a)    (within_epsilon (a, 0, EPSILON))
+#define IS_ONE(a)     (within_epsilon (a, F (1), EPSILON))
+#define IS_UNIT(a)			    \
+    (within_epsilon (a, F (1), EPSILON) ||  \
+     within_epsilon (a, F (-1), EPSILON) || \
+     IS_ZERO (a))
+#define IS_INT(a)    (IS_ZERO (pixman_fixed_frac (a)))
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_identity (const struct pixman_transform *t)
+{
+    return (IS_SAME (t->matrix[0][0], t->matrix[1][1]) &&
+	    IS_SAME (t->matrix[0][0], t->matrix[2][2]) &&
+	    !IS_ZERO (t->matrix[0][0]) &&
+	    IS_ZERO (t->matrix[0][1]) &&
+	    IS_ZERO (t->matrix[0][2]) &&
+	    IS_ZERO (t->matrix[1][0]) &&
+	    IS_ZERO (t->matrix[1][2]) &&
+	    IS_ZERO (t->matrix[2][0]) &&
+	    IS_ZERO (t->matrix[2][1]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_scale (const struct pixman_transform *t)
+{
+    return (!IS_ZERO (t->matrix[0][0]) &&
+            IS_ZERO (t->matrix[0][1]) &&
+            IS_ZERO (t->matrix[0][2]) &&
+
+            IS_ZERO (t->matrix[1][0]) &&
+            !IS_ZERO (t->matrix[1][1]) &&
+            IS_ZERO (t->matrix[1][2]) &&
+
+            IS_ZERO (t->matrix[2][0]) &&
+            IS_ZERO (t->matrix[2][1]) &&
+            !IS_ZERO (t->matrix[2][2]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_int_translate (const struct pixman_transform *t)
+{
+    return (IS_ONE (t->matrix[0][0]) &&
+            IS_ZERO (t->matrix[0][1]) &&
+            IS_INT (t->matrix[0][2]) &&
+
+            IS_ZERO (t->matrix[1][0]) &&
+            IS_ONE (t->matrix[1][1]) &&
+            IS_INT (t->matrix[1][2]) &&
+
+            IS_ZERO (t->matrix[2][0]) &&
+            IS_ZERO (t->matrix[2][1]) &&
+            IS_ONE (t->matrix[2][2]));
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_is_inverse (const struct pixman_transform *a,
+                             const struct pixman_transform *b)
+{
+    struct pixman_transform t;
+
+    if (!pixman_transform_multiply (&t, a, b))
+	return FALSE;
+
+    return pixman_transform_is_identity (&t);
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_from_pixman_transform (struct pixman_f_transform *    ft,
+                                          const struct pixman_transform *t)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	    ft->m[j][i] = pixman_fixed_to_double (t->matrix[j][i]);
+    }
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_from_pixman_f_transform (struct pixman_transform *        t,
+                                          const struct pixman_f_transform *ft)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	{
+	    double d = ft->m[j][i];
+	    if (d < -32767.0 || d > 32767.0)
+		return FALSE;
+	    d = d * 65536.0 + 0.5;
+	    t->matrix[j][i] = (pixman_fixed_t) floor (d);
+	}
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_invert (struct pixman_f_transform *      dst,
+                           const struct pixman_f_transform *src)
+{
+    static const int a[3] = { 2, 2, 1 };
+    static const int b[3] = { 1, 0, 0 };
+    pixman_f_transform_t d;
+    double det;
+    int i, j;
+
+    det = 0;
+    for (i = 0; i < 3; i++)
+    {
+	double p;
+	int ai = a[i];
+	int bi = b[i];
+	p = src->m[i][0] * (src->m[ai][2] * src->m[bi][1] -
+	                    src->m[ai][1] * src->m[bi][2]);
+	if (i == 1)
+	    p = -p;
+	det += p;
+    }
+    
+    if (det == 0)
+	return FALSE;
+    
+    det = 1 / det;
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	{
+	    double p;
+	    int ai = a[i];
+	    int aj = a[j];
+	    int bi = b[i];
+	    int bj = b[j];
+
+	    p = (src->m[ai][aj] * src->m[bi][bj] -
+	         src->m[ai][bj] * src->m[bi][aj]);
+	    
+	    if (((i + j) & 1) != 0)
+		p = -p;
+	    
+	    d.m[j][i] = det * p;
+	}
+    }
+
+    *dst = d;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_point (const struct pixman_f_transform *t,
+                          struct pixman_f_vector *         v)
+{
+    struct pixman_f_vector result;
+    int i, j;
+    double a;
+
+    for (j = 0; j < 3; j++)
+    {
+	a = 0;
+	for (i = 0; i < 3; i++)
+	    a += t->m[j][i] * v->v[i];
+	result.v[j] = a;
+    }
+    
+    if (!result.v[2])
+	return FALSE;
+
+    for (j = 0; j < 2; j++)
+	v->v[j] = result.v[j] / result.v[2];
+
+    v->v[2] = 1;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_point_3d (const struct pixman_f_transform *t,
+                             struct pixman_f_vector *         v)
+{
+    struct pixman_f_vector result;
+    int i, j;
+    double a;
+
+    for (j = 0; j < 3; j++)
+    {
+	a = 0;
+	for (i = 0; i < 3; i++)
+	    a += t->m[j][i] * v->v[i];
+	result.v[j] = a;
+    }
+    
+    *v = result;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_multiply (struct pixman_f_transform *      dst,
+                             const struct pixman_f_transform *l,
+                             const struct pixman_f_transform *r)
+{
+    struct pixman_f_transform d;
+    int dx, dy;
+    int o;
+
+    for (dy = 0; dy < 3; dy++)
+    {
+	for (dx = 0; dx < 3; dx++)
+	{
+	    double v = 0;
+	    for (o = 0; o < 3; o++)
+		v += l->m[dy][o] * r->m[o][dx];
+	    d.m[dy][dx] = v;
+	}
+    }
+    
+    *dst = d;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_scale (struct pixman_f_transform *t,
+                               double                     sx,
+                               double                     sy)
+{
+    t->m[0][0] = sx;
+    t->m[0][1] = 0;
+    t->m[0][2] = 0;
+    t->m[1][0] = 0;
+    t->m[1][1] = sy;
+    t->m[1][2] = 0;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_scale (struct pixman_f_transform *forward,
+                          struct pixman_f_transform *reverse,
+                          double                     sx,
+                          double                     sy)
+{
+    struct pixman_f_transform t;
+
+    if (sx == 0 || sy == 0)
+	return FALSE;
+
+    if (forward)
+    {
+	pixman_f_transform_init_scale (&t, sx, sy);
+	pixman_f_transform_multiply (forward, &t, forward);
+    }
+    
+    if (reverse)
+    {
+	pixman_f_transform_init_scale (&t, 1 / sx, 1 / sy);
+	pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+    
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_rotate (struct pixman_f_transform *t,
+                                double                     c,
+                                double                     s)
+{
+    t->m[0][0] = c;
+    t->m[0][1] = -s;
+    t->m[0][2] = 0;
+    t->m[1][0] = s;
+    t->m[1][1] = c;
+    t->m[1][2] = 0;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_rotate (struct pixman_f_transform *forward,
+                           struct pixman_f_transform *reverse,
+                           double                     c,
+                           double                     s)
+{
+    struct pixman_f_transform t;
+
+    if (forward)
+    {
+	pixman_f_transform_init_rotate (&t, c, s);
+	pixman_f_transform_multiply (forward, &t, forward);
+    }
+    
+    if (reverse)
+    {
+	pixman_f_transform_init_rotate (&t, c, -s);
+	pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_translate (struct pixman_f_transform *t,
+                                   double                     tx,
+                                   double                     ty)
+{
+    t->m[0][0] = 1;
+    t->m[0][1] = 0;
+    t->m[0][2] = tx;
+    t->m[1][0] = 0;
+    t->m[1][1] = 1;
+    t->m[1][2] = ty;
+    t->m[2][0] = 0;
+    t->m[2][1] = 0;
+    t->m[2][2] = 1;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_translate (struct pixman_f_transform *forward,
+                              struct pixman_f_transform *reverse,
+                              double                     tx,
+                              double                     ty)
+{
+    struct pixman_f_transform t;
+
+    if (forward)
+    {
+	pixman_f_transform_init_translate (&t, tx, ty);
+	pixman_f_transform_multiply (forward, &t, forward);
+    }
+
+    if (reverse)
+    {
+	pixman_f_transform_init_translate (&t, -tx, -ty);
+	pixman_f_transform_multiply (reverse, reverse, &t);
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_f_transform_bounds (const struct pixman_f_transform *t,
+                           struct pixman_box16 *            b)
+{
+    struct pixman_f_vector v[4];
+    int i;
+    int x1, y1, x2, y2;
+
+    v[0].v[0] = b->x1;
+    v[0].v[1] = b->y1;
+    v[0].v[2] = 1;
+    v[1].v[0] = b->x2;
+    v[1].v[1] = b->y1;
+    v[1].v[2] = 1;
+    v[2].v[0] = b->x2;
+    v[2].v[1] = b->y2;
+    v[2].v[2] = 1;
+    v[3].v[0] = b->x1;
+    v[3].v[1] = b->y2;
+    v[3].v[2] = 1;
+
+    for (i = 0; i < 4; i++)
+    {
+	if (!pixman_f_transform_point (t, &v[i]))
+	    return FALSE;
+
+	x1 = floor (v[i].v[0]);
+	y1 = floor (v[i].v[1]);
+	x2 = ceil (v[i].v[0]);
+	y2 = ceil (v[i].v[1]);
+
+	if (i == 0)
+	{
+	    b->x1 = x1;
+	    b->y1 = y1;
+	    b->x2 = x2;
+	    b->y2 = y2;
+	}
+	else
+	{
+	    if (x1 < b->x1) b->x1 = x1;
+	    if (y1 < b->y1) b->y1 = y1;
+	    if (x2 > b->x2) b->x2 = x2;
+	    if (y2 > b->y2) b->y2 = y2;
+	}
+    }
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT void
+pixman_f_transform_init_identity (struct pixman_f_transform *t)
+{
+    int i, j;
+
+    for (j = 0; j < 3; j++)
+    {
+	for (i = 0; i < 3; i++)
+	    t->m[j][i] = i == j ? 1 : 0;
+    }
+}
diff --git a/src/pixman/pixman/pixman-mips-dspr2-asm.S b/src/pixman/pixman/pixman-mips-dspr2-asm.S
new file mode 100644
index 0000000..866e93e
--- /dev/null
+++ b/src/pixman/pixman/pixman-mips-dspr2-asm.S
@@ -0,0 +1,4283 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nemanja Lukic (nlukic@mips.com)
+ */
+
+#include "pixman-private.h"
+#include "pixman-mips-dspr2-asm.h"
+
+LEAF_MIPS_DSPR2(pixman_fill_buff16_mips)
+/*
+ * a0 - *dest
+ * a1 - count (bytes)
+ * a2 - value to fill buffer with
+ */
+
+    beqz     a1, 3f
+     andi    t1, a0, 0x0002
+    beqz     t1, 0f          /* check if address is 4-byte aligned */
+     nop
+    sh       a2, 0(a0)
+    addiu    a0, a0, 2
+    addiu    a1, a1, -2
+0:
+    srl      t1, a1, 5       /* t1 how many multiples of 32 bytes */
+    replv.ph a2, a2          /* replicate fill value (16bit) in a2 */
+    beqz     t1, 2f
+     nop
+1:
+    addiu    t1, t1, -1
+    beqz     t1, 11f
+     addiu   a1, a1, -32
+    pref     30, 32(a0)
+    sw       a2, 0(a0)
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    b        1b
+     addiu   a0, a0, 32
+11:
+    sw       a2, 0(a0)
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    addiu    a0, a0, 32
+2:
+    blez     a1, 3f
+     addiu   a1, a1, -2
+    sh       a2, 0(a0)
+    b        2b
+     addiu   a0, a0, 2
+3:
+    jr       ra
+     nop
+
+END(pixman_fill_buff16_mips)
+
+LEAF_MIPS32R2(pixman_fill_buff32_mips)
+/*
+ * a0 - *dest
+ * a1 - count (bytes)
+ * a2 - value to fill buffer with
+ */
+
+    beqz     a1, 3f
+     nop
+    srl      t1, a1, 5 /* t1 how many multiples of 32 bytes */
+    beqz     t1, 2f
+     nop
+1:
+    addiu    t1, t1, -1
+    beqz     t1, 11f
+     addiu   a1, a1, -32
+    pref     30, 32(a0)
+    sw       a2, 0(a0)
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    b        1b
+     addiu   a0, a0, 32
+11:
+    sw       a2, 0(a0)
+    sw       a2, 4(a0)
+    sw       a2, 8(a0)
+    sw       a2, 12(a0)
+    sw       a2, 16(a0)
+    sw       a2, 20(a0)
+    sw       a2, 24(a0)
+    sw       a2, 28(a0)
+    addiu    a0, a0, 32
+2:
+    blez     a1, 3f
+     addiu   a1, a1, -4
+    sw       a2, 0(a0)
+    b        2b
+     addiu   a0, a0, 4
+3:
+    jr       ra
+     nop
+
+END(pixman_fill_buff32_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_8888_0565_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (a8r8g8b8)
+ * a2 - w
+ */
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+    li       t4, 0xf800f800
+    li       t5, 0x07e007e0
+    li       t6, 0x001f001f
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+
+    CONVERT_2x8888_TO_2x0565 t0, t1, t2, t3, t4, t5, t6, t7, t8
+
+    sh       t2, 0(a0)
+    sh       t3, 2(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+
+    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
+
+    sh       t1, 0(a0)
+3:
+    j        ra
+     nop
+
+END(pixman_composite_src_8888_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_0565_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (r5g6b5)
+ * a2 - w
+ */
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+    li       t4, 0x07e007e0
+    li       t5, 0x001F001F
+1:
+    lhu      t0, 0(a1)
+    lhu      t1, 2(a1)
+    addiu    a1, a1, 4
+    addiu    a2, a2, -2
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lhu      t0, 0(a1)
+
+    CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3
+
+    sw       t1, 0(a0)
+3:
+    j        ra
+     nop
+
+END(pixman_composite_src_0565_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (x8r8g8b8)
+ * a2 - w
+ */
+
+    beqz     a2, 4f
+     nop
+    li       t9, 0xff000000
+    srl      t8, a2, 3    /* t1 = how many multiples of 8 src pixels */
+    beqz     t8, 3f       /* branch if less than 8 src pixels */
+     nop
+1:
+    addiu    t8, t8, -1
+    beqz     t8, 2f
+     addiu   a2, a2, -8
+    pref     0, 32(a1)
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    lw       t2, 8(a1)
+    lw       t3, 12(a1)
+    lw       t4, 16(a1)
+    lw       t5, 20(a1)
+    lw       t6, 24(a1)
+    lw       t7, 28(a1)
+    addiu    a1, a1, 32
+    or       t0, t0, t9
+    or       t1, t1, t9
+    or       t2, t2, t9
+    or       t3, t3, t9
+    or       t4, t4, t9
+    or       t5, t5, t9
+    or       t6, t6, t9
+    or       t7, t7, t9
+    pref     30, 32(a0)
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+    sw       t2, 8(a0)
+    sw       t3, 12(a0)
+    sw       t4, 16(a0)
+    sw       t5, 20(a0)
+    sw       t6, 24(a0)
+    sw       t7, 28(a0)
+    b        1b
+     addiu   a0, a0, 32
+2:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    lw       t2, 8(a1)
+    lw       t3, 12(a1)
+    lw       t4, 16(a1)
+    lw       t5, 20(a1)
+    lw       t6, 24(a1)
+    lw       t7, 28(a1)
+    addiu    a1, a1, 32
+    or       t0, t0, t9
+    or       t1, t1, t9
+    or       t2, t2, t9
+    or       t3, t3, t9
+    or       t4, t4, t9
+    or       t5, t5, t9
+    or       t6, t6, t9
+    or       t7, t7, t9
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+    sw       t2, 8(a0)
+    sw       t3, 12(a0)
+    sw       t4, 16(a0)
+    sw       t5, 20(a0)
+    sw       t6, 24(a0)
+    sw       t7, 28(a0)
+    beqz     a2, 4f
+     addiu   a0, a0, 32
+3:
+    lw       t0, 0(a1)
+    addiu    a1, a1, 4
+    addiu    a2, a2, -1
+    or       t1, t0, t9
+    sw       t1, 0(a0)
+    bnez     a2, 3b
+     addiu   a0, a0, 4
+4:
+    jr       ra
+     nop
+
+END(pixman_composite_src_x888_8888_asm_mips)
+
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+LEAF_MIPS_DSPR2(pixman_composite_src_0888_8888_rev_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (b8g8r8)
+ * a2 - w
+ */
+
+    beqz              a2, 6f
+     nop
+
+    lui               t8, 0xff00;
+    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */
+    beqz              t9, 4f      /* branch if less than 4 src pixels */
+     nop
+
+    li                t0, 0x1
+    li                t1, 0x2
+    li                t2, 0x3
+    andi              t3, a1, 0x3
+    beq               t3, t0, 1f
+     nop
+    beq               t3, t1, 2f
+     nop
+    beq               t3, t2, 3f
+     nop
+
+0:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 0(a1)            /* t0 = R2 | B1 | G1 | R1 */
+    lw                t1, 4(a1)            /* t1 = G3 | R3 | B2 | G2 */
+    lw                t2, 8(a1)            /* t2 = B4 | G4 | R4 | B3 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = B1 | R2 | R1 | G1 */
+    wsbh              t1, t1               /* t1 = R3 | G3 | G2 | B2 */
+    wsbh              t2, t2               /* t2 = G4 | B4 | B3 | R4 */
+
+    packrl.ph         t3, t1, t0           /* t3 = G2 | B2 | B1 | R2 */
+    packrl.ph         t4, t0, t0           /* t4 = R1 | G1 | B1 | R2 */
+    rotr              t3, t3, 16           /* t3 = B1 | R2 | G2 | B2 */
+    or                t3, t3, t8           /* t3 = FF | R2 | G2 | B2 */
+    srl               t4, t4, 8            /* t4 =  0 | R1 | G1 | B1 */
+    or                t4, t4, t8           /* t4 = FF | R1 | G1 | B1 */
+    packrl.ph         t5, t2, t1           /* t5 = B3 | R4 | R3 | G3 */
+    rotr              t5, t5, 24           /* t5 = R4 | R3 | G3 | B3 */
+    or                t5, t5, t8           /* t5 = FF | R3 | G3 | B3 */
+    rotr              t2, t2, 16           /* t2 = B3 | R4 | G4 | B4 */
+    or                t2, t2, t8           /* t5 = FF | R3 | G3 | B3 */
+
+    sw                t4, 0(a0)
+    sw                t3, 4(a0)
+    sw                t5, 8(a0)
+    sw                t2, 12(a0)
+    b                 0b
+     addiu            a0, a0, 16
+
+1:
+    lbu               t6, 0(a1)            /* t6 =  0 |  0 |  0 | R1 */
+    lhu               t7, 1(a1)            /* t7 =  0 |  0 | B1 | G1 */
+    sll               t6, t6, 16           /* t6 =  0 | R1 |  0 | 0  */
+    wsbh              t7, t7               /* t7 =  0 |  0 | G1 | B1 */
+    or                t7, t6, t7           /* t7 =  0 | R1 | G1 | B1 */
+11:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 3(a1)            /* t0 = R3 | B2 | G2 | R2 */
+    lw                t1, 7(a1)            /* t1 = G4 | R4 | B3 | G3 */
+    lw                t2, 11(a1)           /* t2 = B5 | G5 | R5 | B4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = B2 | R3 | R2 | G2 */
+    wsbh              t1, t1               /* t1 = R4 | G4 | G3 | B3 */
+    wsbh              t2, t2               /* t2 = G5 | B5 | B4 | R5 */
+
+    packrl.ph         t3, t1, t0           /* t3 = G3 | B3 | B2 | R3 */
+    packrl.ph         t4, t2, t1           /* t4 = B4 | R5 | R4 | G4 */
+    rotr              t0, t0, 24           /* t0 = R3 | R2 | G2 | B2 */
+    rotr              t3, t3, 16           /* t3 = B2 | R3 | G3 | B3 */
+    rotr              t4, t4, 24           /* t4 = R5 | R4 | G4 | B4 */
+    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */
+    or                t0, t0, t8           /* t0 = FF | R2 | G2 | B2 */
+    or                t3, t3, t8           /* t1 = FF | R3 | G3 | B3 */
+    or                t4, t4, t8           /* t3 = FF | R4 | G4 | B4 */
+
+    sw                t7, 0(a0)
+    sw                t0, 4(a0)
+    sw                t3, 8(a0)
+    sw                t4, 12(a0)
+    rotr              t7, t2, 16           /* t7 = xx | R5 | G5 | B5 */
+    b                 11b
+     addiu            a0, a0, 16
+
+2:
+    lhu               t7, 0(a1)            /* t7 =  0 |  0 | G1 | R1 */
+    wsbh              t7, t7               /* t7 =  0 |  0 | R1 | G1 */
+21:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 2(a1)            /* t0 = B2 | G2 | R2 | B1 */
+    lw                t1, 6(a1)            /* t1 = R4 | B3 | G3 | R3 */
+    lw                t2, 10(a1)           /* t2 = G5 | R5 | B4 | G4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = G2 | B2 | B1 | R2 */
+    wsbh              t1, t1               /* t1 = B3 | R4 | R3 | G3 */
+    wsbh              t2, t2               /* t2 = R5 | G5 | G4 | B4 */
+
+    precr_sra.ph.w    t7, t0, 0            /* t7 = R1 | G1 | B1 | R2 */
+    rotr              t0, t0, 16           /* t0 = B1 | R2 | G2 | B2 */
+    packrl.ph         t3, t2, t1           /* t3 = G4 | B4 | B3 | R4 */
+    rotr              t1, t1, 24           /* t1 = R4 | R3 | G3 | B3 */
+    srl               t7, t7, 8            /* t7 =  0 | R1 | G1 | B1 */
+    rotr              t3, t3, 16           /* t3 = B3 | R4 | G4 | B4 */
+    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */
+    or                t0, t0, t8           /* t0 = FF | R2 | G2 | B2 */
+    or                t1, t1, t8           /* t1 = FF | R3 | G3 | B3 */
+    or                t3, t3, t8           /* t3 = FF | R4 | G4 | B4 */
+
+    sw                t7, 0(a0)
+    sw                t0, 4(a0)
+    sw                t1, 8(a0)
+    sw                t3, 12(a0)
+    srl               t7, t2, 16           /* t7 =  0 |  0 | R5 | G5 */
+    b                 21b
+     addiu            a0, a0, 16
+
+3:
+    lbu               t7, 0(a1)            /* t7 =  0 |  0 |  0 | R1 */
+31:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 1(a1)            /* t0 = G2 | R2 | B1 | G1 */
+    lw                t1, 5(a1)            /* t1 = B3 | G3 | R3 | B2 */
+    lw                t2, 9(a1)            /* t2 = R5 | B4 | G4 | R4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = R2 | G2 | G1 | B1 */
+    wsbh              t1, t1               /* t1 = G3 | B3 | B2 | R3 */
+    wsbh              t2, t2               /* t2 = B4 | R5 | R4 | G4 */
+
+    precr_sra.ph.w    t7, t0, 0            /* t7 = xx | R1 | G1 | B1 */
+    packrl.ph         t3, t1, t0           /* t3 = B2 | R3 | R2 | G2 */
+    rotr              t1, t1, 16           /* t1 = B2 | R3 | G3 | B3 */
+    rotr              t4, t2, 24           /* t4 = R5 | R4 | G4 | B4 */
+    rotr              t3, t3, 24           /* t3 = R3 | R2 | G2 | B2 */
+    or                t7, t7, t8           /* t7 = FF | R1 | G1 | B1 */
+    or                t3, t3, t8           /* t3 = FF | R2 | G2 | B2 */
+    or                t1, t1, t8           /* t1 = FF | R3 | G3 | B3 */
+    or                t4, t4, t8           /* t4 = FF | R4 | G4 | B4 */
+
+    sw                t7, 0(a0)
+    sw                t3, 4(a0)
+    sw                t1, 8(a0)
+    sw                t4, 12(a0)
+    srl               t7, t2, 16           /* t7 =  0 |  0 | xx | R5 */
+    b                 31b
+     addiu            a0, a0, 16
+
+4:
+    beqz              a2, 6f
+     nop
+5:
+    lbu               t0, 0(a1)            /* t0 =  0 | 0 | 0 | R */
+    lbu               t1, 1(a1)            /* t1 =  0 | 0 | 0 | G */
+    lbu               t2, 2(a1)            /* t2 =  0 | 0 | 0 | B */
+    addiu             a1, a1, 3
+
+    sll               t0, t0, 16           /* t2 =  0 | R | 0 | 0 */
+    sll               t1, t1, 8            /* t1 =  0 | 0 | G | 0 */
+
+    or                t2, t2, t1           /* t2 =  0 | 0 | G | B */
+    or                t2, t2, t0           /* t2 =  0 | R | G | B */
+    or                t2, t2, t8           /* t2 = FF | R | G | B */
+
+    sw                t2, 0(a0)
+    addiu             a2, a2, -1
+    bnez              a2, 5b
+     addiu            a0, a0, 4
+6:
+    j                 ra
+     nop
+
+END(pixman_composite_src_0888_8888_rev_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_0888_0565_rev_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (b8g8r8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0, v1
+    beqz              a2, 6f
+     nop
+
+    li                t6, 0xf800f800
+    li                t7, 0x07e007e0
+    li                t8, 0x001F001F
+    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */
+    beqz              t9, 4f      /* branch if less than 4 src pixels */
+     nop
+
+    li                t0, 0x1
+    li                t1, 0x2
+    li                t2, 0x3
+    andi              t3, a1, 0x3
+    beq               t3, t0, 1f
+     nop
+    beq               t3, t1, 2f
+     nop
+    beq               t3, t2, 3f
+     nop
+
+0:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 0(a1)            /* t0 = R2 | B1 | G1 | R1 */
+    lw                t1, 4(a1)            /* t1 = G3 | R3 | B2 | G2 */
+    lw                t2, 8(a1)            /* t2 = B4 | G4 | R4 | B3 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = B1 | R2 | R1 | G1 */
+    wsbh              t1, t1               /* t1 = R3 | G3 | G2 | B2 */
+    wsbh              t2, t2               /* t2 = G4 | B4 | B3 | R4 */
+
+    packrl.ph         t3, t1, t0           /* t3 = G2 | B2 | B1 | R2 */
+    packrl.ph         t4, t0, t0           /* t4 = R1 | G1 | B1 | R2 */
+    rotr              t3, t3, 16           /* t3 = B1 | R2 | G2 | B2 */
+    srl               t4, t4, 8            /* t4 =  0 | R1 | G1 | B1 */
+    packrl.ph         t5, t2, t1           /* t5 = B3 | R4 | R3 | G3 */
+    rotr              t5, t5, 24           /* t5 = R4 | R3 | G3 | B3 */
+    rotr              t2, t2, 16           /* t2 = B3 | R4 | G4 | B4 */
+
+    CONVERT_2x8888_TO_2x0565 t4, t3, t4, t3, t6, t7, t8, v0, v1
+    CONVERT_2x8888_TO_2x0565 t5, t2, t5, t2, t6, t7, t8, v0, v1
+
+    sh                t4, 0(a0)
+    sh                t3, 2(a0)
+    sh                t5, 4(a0)
+    sh                t2, 6(a0)
+    b                 0b
+     addiu            a0, a0, 8
+
+1:
+    lbu               t4, 0(a1)            /* t4 =  0 |  0 |  0 | R1 */
+    lhu               t5, 1(a1)            /* t5 =  0 |  0 | B1 | G1 */
+    sll               t4, t4, 16           /* t4 =  0 | R1 |  0 | 0  */
+    wsbh              t5, t5               /* t5 =  0 |  0 | G1 | B1 */
+    or                t5, t4, t5           /* t5 =  0 | R1 | G1 | B1 */
+11:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 3(a1)            /* t0 = R3 | B2 | G2 | R2 */
+    lw                t1, 7(a1)            /* t1 = G4 | R4 | B3 | G3 */
+    lw                t2, 11(a1)           /* t2 = B5 | G5 | R5 | B4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = B2 | R3 | R2 | G2 */
+    wsbh              t1, t1               /* t1 = R4 | G4 | G3 | B3 */
+    wsbh              t2, t2               /* t2 = G5 | B5 | B4 | R5 */
+
+    packrl.ph         t3, t1, t0           /* t3 = G3 | B3 | B2 | R3 */
+    packrl.ph         t4, t2, t1           /* t4 = B4 | R5 | R4 | G4 */
+    rotr              t0, t0, 24           /* t0 = R3 | R2 | G2 | B2 */
+    rotr              t3, t3, 16           /* t3 = B2 | R3 | G3 | B3 */
+    rotr              t4, t4, 24           /* t4 = R5 | R4 | G4 | B4 */
+
+    CONVERT_2x8888_TO_2x0565 t5, t0, t5, t0, t6, t7, t8, v0, v1
+    CONVERT_2x8888_TO_2x0565 t3, t4, t3, t4, t6, t7, t8, v0, v1
+
+    sh                t5, 0(a0)
+    sh                t0, 2(a0)
+    sh                t3, 4(a0)
+    sh                t4, 6(a0)
+    rotr              t5, t2, 16           /* t5 = xx | R5 | G5 | B5 */
+    b                 11b
+     addiu            a0, a0, 8
+
+2:
+    lhu               t5, 0(a1)            /* t5 =  0 |  0 | G1 | R1 */
+    wsbh              t5, t5               /* t5 =  0 |  0 | R1 | G1 */
+21:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 2(a1)            /* t0 = B2 | G2 | R2 | B1 */
+    lw                t1, 6(a1)            /* t1 = R4 | B3 | G3 | R3 */
+    lw                t2, 10(a1)           /* t2 = G5 | R5 | B4 | G4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = G2 | B2 | B1 | R2 */
+    wsbh              t1, t1               /* t1 = B3 | R4 | R3 | G3 */
+    wsbh              t2, t2               /* t2 = R5 | G5 | G4 | B4 */
+
+    precr_sra.ph.w    t5, t0, 0            /* t5 = R1 | G1 | B1 | R2 */
+    rotr              t0, t0, 16           /* t0 = B1 | R2 | G2 | B2 */
+    packrl.ph         t3, t2, t1           /* t3 = G4 | B4 | B3 | R4 */
+    rotr              t1, t1, 24           /* t1 = R4 | R3 | G3 | B3 */
+    srl               t5, t5, 8            /* t5 =  0 | R1 | G1 | B1 */
+    rotr              t3, t3, 16           /* t3 = B3 | R4 | G4 | B4 */
+
+    CONVERT_2x8888_TO_2x0565 t5, t0, t5, t0, t6, t7, t8, v0, v1
+    CONVERT_2x8888_TO_2x0565 t1, t3, t1, t3, t6, t7, t8, v0, v1
+
+    sh                t5, 0(a0)
+    sh                t0, 2(a0)
+    sh                t1, 4(a0)
+    sh                t3, 6(a0)
+    srl               t5, t2, 16           /* t5 =  0 |  0 | R5 | G5 */
+    b                 21b
+     addiu            a0, a0, 8
+
+3:
+    lbu               t5, 0(a1)            /* t5 =  0 |  0 |  0 | R1 */
+31:
+    beqz              t9, 4f
+     addiu            t9, t9, -1
+    lw                t0, 1(a1)            /* t0 = G2 | R2 | B1 | G1 */
+    lw                t1, 5(a1)            /* t1 = B3 | G3 | R3 | B2 */
+    lw                t2, 9(a1)            /* t2 = R5 | B4 | G4 | R4 */
+
+    addiu             a1, a1, 12
+    addiu             a2, a2, -4
+
+    wsbh              t0, t0               /* t0 = R2 | G2 | G1 | B1 */
+    wsbh              t1, t1               /* t1 = G3 | B3 | B2 | R3 */
+    wsbh              t2, t2               /* t2 = B4 | R5 | R4 | G4 */
+
+    precr_sra.ph.w    t5, t0, 0            /* t5 = xx | R1 | G1 | B1 */
+    packrl.ph         t3, t1, t0           /* t3 = B2 | R3 | R2 | G2 */
+    rotr              t1, t1, 16           /* t1 = B2 | R3 | G3 | B3 */
+    rotr              t4, t2, 24           /* t4 = R5 | R4 | G4 | B4 */
+    rotr              t3, t3, 24           /* t3 = R3 | R2 | G2 | B2 */
+
+    CONVERT_2x8888_TO_2x0565 t5, t3, t5, t3, t6, t7, t8, v0, v1
+    CONVERT_2x8888_TO_2x0565 t1, t4, t1, t4, t6, t7, t8, v0, v1
+
+    sh                t5, 0(a0)
+    sh                t3, 2(a0)
+    sh                t1, 4(a0)
+    sh                t4, 6(a0)
+    srl               t5, t2, 16           /* t5 =  0 |  0 | xx | R5 */
+    b                 31b
+     addiu            a0, a0, 8
+
+4:
+    beqz              a2, 6f
+     nop
+5:
+    lbu               t0, 0(a1)            /* t0 =  0 | 0 | 0 | R */
+    lbu               t1, 1(a1)            /* t1 =  0 | 0 | 0 | G */
+    lbu               t2, 2(a1)            /* t2 =  0 | 0 | 0 | B */
+    addiu             a1, a1, 3
+
+    sll               t0, t0, 16           /* t2 =  0 | R | 0 | 0 */
+    sll               t1, t1, 8            /* t1 =  0 | 0 | G | 0 */
+
+    or                t2, t2, t1           /* t2 =  0 | 0 | G | B */
+    or                t2, t2, t0           /* t2 =  0 | R | G | B */
+
+    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5
+
+    sh                t3, 0(a0)
+    addiu             a2, a2, -1
+    bnez              a2, 5b
+     addiu            a0, a0, 2
+6:
+    RESTORE_REGS_FROM_STACK 0, v0, v1
+    j                 ra
+     nop
+
+END(pixman_composite_src_0888_0565_rev_asm_mips)
+#endif
+
+LEAF_MIPS_DSPR2(pixman_composite_src_pixbuf_8888_asm_mips)
+/*
+ * a0 - dst  (a8b8g8r8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+    srl      t2, t0, 24
+    srl      t3, t1, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9
+
+    sll      t0, t0, 8
+    sll      t1, t1, 8
+    andi     t2, t2, 0xff
+    andi     t3, t3, 0xff
+    or       t0, t0, t2
+    or       t1, t1, t3
+    wsbh     t0, t0
+    wsbh     t1, t1
+    rotr     t0, t0, 16
+    rotr     t1, t1, 16
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+    srl      t1, t0, 24
+
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5
+
+    sll      t0, t0, 8
+    andi     t1, t1, 0xff
+    or       t0, t0, t1
+    wsbh     t0, t0
+    rotr     t0, t0, 16
+    sw       t0, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_pixbuf_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_rpixbuf_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1)
+    lw       t1, 4(a1)
+    addiu    a1, a1, 8
+    addiu    a2, a2, -2
+    srl      t2, t0, 24
+    srl      t3, t1, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 t0, t1, t2, t3, t0, t1, v0, t4, t5, t6, t7, t8, t9
+
+    sll      t0, t0, 8
+    sll      t1, t1, 8
+    andi     t2, t2, 0xff
+    andi     t3, t3, 0xff
+    or       t0, t0, t2
+    or       t1, t1, t3
+    rotr     t0, t0, 8
+    rotr     t1, t1, 8
+    sw       t0, 0(a0)
+    sw       t1, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lw       t0, 0(a1)
+    srl      t1, t0, 24
+
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t3, t4, t5
+
+    sll      t0, t0, 8
+    andi     t1, t1, 0xff
+    or       t0, t0, t1
+    rotr     t0, t0, 8
+    sw       t0, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_rpixbuf_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+
+    SAVE_REGS_ON_STACK 0, v0
+    li       v0, 0x00ff00ff
+
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+
+1:
+                       /* a1 = source      (32bit constant) */
+    lbu      t0, 0(a2) /* t2 = mask        (a8) */
+    lbu      t1, 1(a2) /* t3 = mask        (a8) */
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, t2, t3, v0, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t2, a3, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+
+    beqz     a3, 3f
+     nop
+
+2:
+    lbu      t0, 0(a2)
+    addiu    a2, a2, 1
+
+    MIPS_UN8x4_MUL_UN8 a1, t0, t1, v0, t3, t4, t5
+
+    sw       t1, 0(a0)
+    addiu    a3, a3, -1
+    addiu    a0, a0, 4
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_composite_src_n_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    li                t9, 0x00ff00ff
+    beqz              a3, 3f
+     nop
+    srl               t7, a3, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* branch if less than 4 src pixels */
+     nop
+
+    srl               t8, a1, 24
+    replv.ph          t8, t8
+
+0:
+    beqz              t7, 1f
+     addiu            t7, t7, -1
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr.qb.ph       t0, t3, t1
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, t8
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t2, t2, t3
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a3, 3f
+     nop
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a2)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1
+
+3:
+    j                 ra
+     nop
+
+END(pixman_composite_src_n_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+    beqz         a3, 8f
+     nop
+    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
+
+    li           t6, 0xff
+    addiu        t7, zero, -1 /* t7 = 0xffffffff */
+    srl          t8, a1, 24   /* t8 = srca */
+    li           t9, 0x00ff00ff
+
+    addiu        t1, a3, -1
+    beqz         t1, 4f       /* last pixel */
+     nop
+
+0:
+    lw           t0, 0(a2)    /* t0 = mask */
+    lw           t1, 4(a2)    /* t1 = mask */
+    addiu        a3, a3, -2   /* w = w - 2 */
+    or           t2, t0, t1
+    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */
+     addiu       a2, a2, 8
+    and          t2, t0, t1
+    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     nop
+
+//if(ma)
+    lw           t2, 0(a0)    /* t2 = dst */
+    lw           t3, 4(a0)    /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
+    not          t0, t0
+    not          t1, t1
+    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, t4, t2
+    addu_s.qb    t3, t5, t3
+    sw           t2, 0(a0)
+    sw           t3, 4(a0)
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 8
+    b            4f
+     nop
+1:
+//if (t0 == 0xffffffff) && (t1 == 0xffffffff):
+    beq          t8, t6, 2f   /* if (srca == 0xff) */
+     nop
+    lw           t2, 0(a0)    /* t2 = dst */
+    lw           t3, 4(a0)    /* t3 = dst */
+    not          t0, a1
+    not          t1, a1
+    srl          t0, t0, 24
+    srl          t1, t1, 24
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, a1, t2
+    addu_s.qb    t3, a1, t3
+    sw           t2, 0(a0)
+    sw           t3, 4(a0)
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 8
+    b            4f
+     nop
+2:
+    sw           a1, 0(a0)
+    sw           a1, 4(a0)
+3:
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 8
+
+4:
+    beqz         a3, 7f
+     nop
+                              /* a1 = src */
+    lw           t0, 0(a2)    /* t0 = mask */
+    beqz         t0, 7f       /* if (t0 == 0) */
+     nop
+    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */
+     nop
+//if(ma)
+    lw           t1, 0(a0)    /* t1 = dst */
+    MIPS_UN8x4_MUL_UN8x4  a1, t0, t2, t9, t3, t4, t5, s0
+    MIPS_UN8x4_MUL_UN8    t0, t8, t0, t9, t3, t4, t5
+    not          t0, t0
+    MIPS_UN8x4_MUL_UN8x4  t1, t0, t1, t9, t3, t4, t5, s0
+    addu_s.qb    t1, t2, t1
+    sw           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+    j            ra
+     nop
+5:
+//if (t0 == 0xffffffff)
+    beq          t8, t6, 6f   /* if (srca == 0xff) */
+     nop
+    lw           t1, 0(a0)    /* t1 = dst */
+    not          t0, a1
+    srl          t0, t0, 24
+    MIPS_UN8x4_MUL_UN8 t1, t0, t1, t9, t2, t3, t4
+    addu_s.qb    t1, a1, t1
+    sw           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+    j            ra
+     nop
+6:
+    sw           a1, 0(a0)
+7:
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+8:
+    j            ra
+     nop
+
+END(pixman_composite_over_n_8888_8888_ca_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+    beqz         a3, 8f
+     nop
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    li           t6, 0xff
+    addiu        t7, zero, -1 /* t7 = 0xffffffff */
+    srl          t8, a1, 24   /* t8 = srca */
+    li           t9, 0x00ff00ff
+    li           s6, 0xf800f800
+    li           s7, 0x07e007e0
+    li           s8, 0x001F001F
+
+    addiu        t1, a3, -1
+    beqz         t1, 4f       /* last pixel */
+     nop
+
+0:
+    lw           t0, 0(a2)    /* t0 = mask */
+    lw           t1, 4(a2)    /* t1 = mask */
+    addiu        a3, a3, -2   /* w = w - 2 */
+    or           t2, t0, t1
+    beqz         t2, 3f      /* if (t0 == 0) && (t1 == 0) */
+     addiu       a2, a2, 8
+    and          t2, t0, t1
+    beq          t2, t7, 1f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
+     nop
+
+//if(ma)
+    lhu          t2, 0(a0)    /* t2 = dst */
+    lhu          t3, 2(a0)    /* t3 = dst */
+    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
+    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
+    not          t0, t0
+    not          t1, t1
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, t4, t2
+    addu_s.qb    t3, t5, t3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1
+    sh           t2, 0(a0)
+    sh           t3, 2(a0)
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 4
+    b            4f
+     nop
+1:
+//if (t0 == 0xffffffff) && (t1 == 0xffffffff):
+    beq          t8, t6, 2f   /* if (srca == 0xff) */
+     nop
+    lhu          t2, 0(a0)    /* t2 = dst */
+    lhu          t3, 2(a0)    /* t3 = dst */
+    not          t0, a1
+    not          t1, a1
+    srl          t0, t0, 24
+    srl          t1, t1, 24
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, s7, s8, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8   t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
+    addu_s.qb    t2, a1, t2
+    addu_s.qb    t3, a1, t3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t2, t3, s6, s7, s8, s0, s1
+    sh           t2, 0(a0)
+    sh           t3, 2(a0)
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 4
+    b            4f
+     nop
+2:
+    CONVERT_1x8888_TO_1x0565 a1, t2, s0, s1
+    sh           t2, 0(a0)
+    sh           t2, 2(a0)
+3:
+    addiu        t1, a3, -1
+    bgtz         t1, 0b
+     addiu       a0, a0, 4
+
+4:
+    beqz         a3, 7f
+     nop
+                              /* a1 = src */
+    lw           t0, 0(a2)    /* t0 = mask */
+    beqz         t0, 7f       /* if (t0 == 0) */
+     nop
+    beq          t0, t7, 5f  /* if (t0 == 0xffffffff) */
+     nop
+//if(ma)
+    lhu          t1, 0(a0)    /* t1 = dst */
+    MIPS_UN8x4_MUL_UN8x4     a1, t0, t2, t9, t3, t4, t5, s0
+    MIPS_UN8x4_MUL_UN8       t0, t8, t0, t9, t3, t4, t5
+    not          t0, t0
+    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8x4     s1, t0, s1, t9, t3, t4, t5, s0
+    addu_s.qb    s1, t2, s1
+    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2
+    sh           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j            ra
+     nop
+5:
+//if (t0 == 0xffffffff)
+    beq          t8, t6, 6f   /* if (srca == 0xff) */
+     nop
+    lhu          t1, 0(a0)    /* t1 = dst */
+    not          t0, a1
+    srl          t0, t0, 24
+    CONVERT_1x0565_TO_1x8888 t1, s1, s2, s3
+    MIPS_UN8x4_MUL_UN8       s1, t0, s1, t9, t2, t3, t4
+    addu_s.qb    s1, a1, s1
+    CONVERT_1x8888_TO_1x0565 s1, t1, s0, s2
+    sh           t1, 0(a0)
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j            ra
+     nop
+6:
+    CONVERT_1x8888_TO_1x0565 a1, t1, s0, s2
+    sh           t1, 0(a0)
+7:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
+8:
+    j            ra
+     nop
+
+END(pixman_composite_over_n_8888_0565_ca_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li                t9, 0x00ff00ff
+    beqz              a3, 3f
+     nop
+    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */
+    beqz              v0, 1f      /* branch if less than 4 src pixels */
+     nop
+
+    srl               t8, a1, 24
+    replv.ph          t8, t8
+
+0:
+    beqz              v0, 1f
+     addiu            v0, v0, -1
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+    lbu               t4, 0(a0)
+    lbu               t5, 1(a0)
+    lbu               t6, 2(a0)
+    lbu               t7, 3(a0)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr_sra.ph.w    t5, t4, 0
+    precr_sra.ph.w    t7, t6, 0
+
+    precr.qb.ph       t0, t3, t1
+    precr.qb.ph       t1, t7, t5
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, t8
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t0, t2, t3
+    not               t6, t0
+
+    preceu.ph.qbl     t7, t6
+    preceu.ph.qbr     t6, t6
+
+    muleu_s.ph.qbl    t2, t1, t7
+    muleu_s.ph.qbr    t3, t1, t6
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t1, t2, t3
+
+    addu_s.qb         t2, t0, t1
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a3, 3f
+     nop
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a2)
+    lbu               t1, 0(a0)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+    not               t3, t2
+    andi              t3, t3, 0x00ff
+
+
+    mul               t4, t1, t3
+    shra_r.ph         t5, t4, 8
+    andi              t5, t5, 0x00ff
+    addq.ph           t4, t4, t5
+    shra_r.ph         t4, t4, 8
+    andi              t4, t4, 0x00ff
+
+    addu_s.qb         t2, t2, t4
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j                 ra
+     nop
+
+END(pixman_composite_over_n_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 4, s0, s1, s2, s3, s4
+    beqz      a3, 4f
+     nop
+    li        t4, 0x00ff00ff
+    li        t5, 0xff
+    addiu     t0, a3, -1
+    beqz      t0, 3f         /* last pixel */
+     srl      t6, a1, 24     /* t6 = srca */
+    not       s4, a1
+    beq       t5, t6, 2f     /* if (srca == 0xff) */
+     srl      s4, s4, 24
+1:
+                             /* a1 = src */
+    lbu       t0, 0(a2)      /* t0 = mask */
+    lbu       t1, 1(a2)      /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 111f       /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 2
+    and       t3, t0, t1
+
+    lw        t2, 0(a0)      /* t2 = dst */
+    beq       t3, t5, 11f    /* if (t0 == 0xff) && (t1 == 0xff) */
+     lw       t3, 4(a0)      /* t3 = dst */
+
+    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s0, s1, t4, t6, t7, t8, t9, s2, s3
+    not       s2, s0
+    not       s3, s1
+    srl       s2, s2, 24
+    srl       s3, s3, 24
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s2, s3, t2, t3, t4, t0, t1, t6, t7, t8, t9
+    addu_s.qb s2, t2, s0
+    addu_s.qb s3, t3, s1
+    sw        s2, 0(a0)
+    b         111f
+     sw       s3, 4(a0)
+11:
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s4, s4, t2, t3, t4, t0, t1, t6, t7, t8, t9
+    addu_s.qb s2, t2, a1
+    addu_s.qb s3, t3, a1
+    sw        s2, 0(a0)
+    sw        s3, 4(a0)
+
+111:
+    addiu     a3, a3, -2
+    addiu     t0, a3, -1
+    bgtz      t0, 1b
+     addiu    a0, a0, 8
+    b         3f
+     nop
+2:
+                             /* a1 = src */
+    lbu       t0, 0(a2)      /* t0 = mask */
+    lbu       t1, 1(a2)      /* t1 = mask */
+    or        t2, t0, t1
+    beqz      t2, 222f       /* if (t0 == 0) && (t1 == 0) */
+     addiu    a2, a2, 2
+    and       t3, t0, t1
+    beq       t3, t5, 22f    /* if (t0 == 0xff) && (t1 == 0xff) */
+     nop
+    lw        t2, 0(a0)      /* t2 = dst */
+    lw        t3, 4(a0)      /* t3 = dst */
+
+    OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, t2, t3, \
+                           t6, t7, t4, t8, t9, s0, s1, s2, s3
+    sw        t6, 0(a0)
+    b         222f
+     sw        t7, 4(a0)
+22:
+    sw        a1, 0(a0)
+    sw        a1, 4(a0)
+222:
+    addiu     a3, a3, -2
+    addiu     t0, a3, -1
+    bgtz      t0, 2b
+     addiu    a0, a0, 8
+3:
+    blez      a3, 4f
+     nop
+                             /* a1 = src */
+    lbu       t0, 0(a2)      /* t0 = mask */
+    beqz      t0, 4f         /* if (t0 == 0) */
+     addiu    a2, a2, 1
+    move      t3, a1
+    beq       t0, t5, 31f    /* if (t0 == 0xff) */
+     lw       t1, 0(a0)      /* t1 = dst */
+
+    MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t6, t7, t8
+31:
+    not       t2, t3
+    srl       t2, t2, 24
+    MIPS_UN8x4_MUL_UN8 t1, t2, t1, t4, t6, t7, t8
+    addu_s.qb t2, t1, t3
+    sw        t2, 0(a0)
+4:
+    RESTORE_REGS_FROM_STACK 4, s0, s1, s2, s3, s4
+    j         ra
+     nop
+
+END(pixman_composite_over_n_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+    SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    beqz     a3, 4f
+     nop
+    li       t4, 0x00ff00ff
+    li       t5, 0xff
+    li       t6, 0xf800f800
+    li       t7, 0x07e007e0
+    li       t8, 0x001F001F
+    addiu    t1, a3, -1
+    beqz     t1, 3f         /* last pixel */
+     srl     t0, a1, 24     /* t0 = srca */
+    not      v0, a1
+    beq      t0, t5, 2f     /* if (srca == 0xff) */
+     srl     v0, v0, 24
+1:
+                            /* a1 = src */
+    lbu      t0, 0(a2)      /* t0 = mask */
+    lbu      t1, 1(a2)      /* t1 = mask */
+    or       t2, t0, t1
+    beqz     t2, 111f       /* if (t0 == 0) && (t1 == 0) */
+     addiu   a2, a2, 2
+    lhu      t2, 0(a0)      /* t2 = dst */
+    lhu      t3, 2(a0)      /* t3 = dst */
+    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t7, t8, t9, s2, s3, s4
+    and      t9, t0, t1
+    beq      t9, t5, 11f    /* if (t0 == 0xff) && (t1 == 0xff) */
+     nop
+
+    MIPS_2xUN8x4_MUL_2xUN8   a1, a1, t0, t1, s2, s3, t4, t9, s4, s5, s6, s7, s8
+    not      s4, s2
+    not      s5, s3
+    srl      s4, s4, 24
+    srl      s5, s5, 24
+    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, s4, s5, s0, s1, t4, t9, t0, t1, s6, s7, s8
+    addu_s.qb                s4, s2, s0
+    addu_s.qb                s5, s3, s1
+    CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
+    sh       t2, 0(a0)
+    b        111f
+     sh      t3, 2(a0)
+11:
+    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, v0, v0, s0, s1, t4, t9, t0, t1, s6, s7, s8
+    addu_s.qb                s4, a1, s0
+    addu_s.qb                s5, a1, s1
+    CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
+    sh       t2, 0(a0)
+    sh       t3, 2(a0)
+111:
+    addiu    a3, a3, -2
+    addiu    t0, a3, -1
+    bgtz     t0, 1b
+     addiu   a0, a0, 4
+    b        3f
+     nop
+2:
+    CONVERT_1x8888_TO_1x0565 a1, s0, s1, s2
+21:
+                            /* a1 = src */
+    lbu      t0, 0(a2)      /* t0 = mask */
+    lbu      t1, 1(a2)      /* t1 = mask */
+    or       t2, t0, t1
+    beqz     t2, 222f       /* if (t0 == 0) && (t1 == 0) */
+     addiu   a2, a2, 2
+    and      t9, t0, t1
+    move     s2, s0
+    beq      t9, t5, 22f    /* if (t0 == 0xff) && (t2 == 0xff) */
+     move    s3, s0
+    lhu      t2, 0(a0)      /* t2 = dst */
+    lhu      t3, 2(a0)      /* t3 = dst */
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, s2, s3, t7, t8, s4, s5, s6, s7
+    OVER_2x8888_2x8_2x8888   a1, a1, t0, t1, s2, s3, \
+                             t2, t3, t4, t9, s4, s5, s6, s7, s8
+    CONVERT_2x8888_TO_2x0565 t2, t3, s2, s3, t6, t7, t8, s4, s5
+22:
+    sh       s2, 0(a0)
+    sh       s3, 2(a0)
+222:
+    addiu    a3, a3, -2
+    addiu    t0, a3, -1
+    bgtz     t0, 21b
+     addiu   a0, a0, 4
+3:
+    blez      a3, 4f
+     nop
+                            /* a1 = src */
+    lbu      t0, 0(a2)      /* t0 = mask */
+    beqz     t0, 4f         /* if (t0 == 0) */
+     nop
+    lhu      t1, 0(a0)      /* t1 = dst */
+    CONVERT_1x0565_TO_1x8888 t1, t2, t3, t7
+    beq      t0, t5, 31f    /* if (t0 == 0xff) */
+     move    t3, a1
+
+    MIPS_UN8x4_MUL_UN8       a1, t0, t3, t4, t7, t8, t9
+31:
+    not      t6, t3
+    srl      t6, t6, 24
+    MIPS_UN8x4_MUL_UN8       t2, t6, t2, t4, t7, t8, t9
+    addu_s.qb                t1, t2, t3
+    CONVERT_1x8888_TO_1x0565 t1, t2, t3, t7
+    sh       t2, 0(a0)
+4:
+    RESTORE_REGS_FROM_STACK  24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j        ra
+     nop
+
+END(pixman_composite_over_n_8_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (32bit constant)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0
+    li       t4, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    srl      a2, a2, 24
+    beqz     t1, 2f
+     nop
+
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+                       /* a2 = mask        (32bit constant) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+
+    OVER_2x8888_2x8_2x8888 t0, t1, a2, a2, t2, t3, \
+                           t5, t6, t4, t7, t8, t9, t0, t1, s0
+
+    sw       t5, 0(a0)
+    sw       t6, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+                       /* a2 = mask        (32bit constant) */
+    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+
+    OVER_8888_8_8888 t0, a2, t1, t3, t4, t5, t6, t7, t8
+
+    sw       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0
+    j        ra
+     nop
+
+END(pixman_composite_over_8888_n_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (32bit constant)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
+    li       t6, 0x00ff00ff
+    li       t7, 0xf800f800
+    li       t8, 0x07e007e0
+    li       t9, 0x001F001F
+    beqz     a3, 3f
+     nop
+    srl      a2, a2, 24
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+                       /* a2 = mask        (32bit constant) */
+    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
+    lhu      t3, 2(a0) /* t2 = destination (r5g6b5) */
+    addiu    a1, a1, 8
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, t4, t5, t8, t9, s0, s1, t2, t3
+    OVER_2x8888_2x8_2x8888   t0, t1, a2, a2, t4, t5, \
+                             t2, t3, t6, t0, t1, s0, s1, s2, s3
+    CONVERT_2x8888_TO_2x0565 t2, t3, t4, t5, t7, t8, t9, s0, s1
+
+    sh       t4, 0(a0)
+    sh       t5, 2(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+                       /* a2 = mask        (32bit constant) */
+    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t1, t2, t4, t5
+    OVER_8888_8_8888         t0, a2, t2, t1, t6, t3, t4, t5, t7
+    CONVERT_1x8888_TO_1x0565 t1, t3, t4, t5
+
+    sh       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
+    j                 ra
+     nop
+
+END(pixman_composite_over_8888_n_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_0565_n_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (r5g6b5)
+ * a2 - mask (32bit constant)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5
+    li       t6, 0x00ff00ff
+    li       t7, 0xf800f800
+    li       t8, 0x07e007e0
+    li       t9, 0x001F001F
+    beqz     a3, 3f
+     nop
+    srl      a2, a2, 24
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
+                       /* a2 = mask        (32bit constant) */
+    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
+    lhu      t3, 2(a0) /* t3 = destination (r5g6b5) */
+    addiu    a1, a1, 4
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t4, t5, t8, t9, s0, s1, s2, s3
+    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t8, t9, s2, s3, s4, s5
+    OVER_2x8888_2x8_2x8888   t4, t5, a2, a2, s0, s1, \
+                             t0, t1, t6, s2, s3, s4, s5, t4, t5
+    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t7, t8, t9, s2, s3
+
+    sh       s0, 0(a0)
+    sh       s1, 2(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a3, 3f
+     nop
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+                       /* a2 = mask        (32bit constant) */
+    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t2, t4, t5
+    CONVERT_1x0565_TO_1x8888 t1, t3, t4, t5
+    OVER_8888_8_8888         t2, a2, t3, t0, t6, t1, t4, t5, t7
+    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5
+
+    sh       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5
+    j        ra
+     nop
+
+END(pixman_composite_over_0565_n_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1
+    li       t4, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lbu      t2, 0(a2) /* t2 = mask        (a8) */
+    lbu      t3, 1(a2) /* t3 = mask        (a8) */
+    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+    addiu    a2, a2, 2
+
+    OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, t5, t6, \
+                           t7, t8, t4, t9, s0, s1, t0, t1, t2
+
+    sw       t7, 0(a0)
+    sw       t8, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lbu      t1, 0(a2) /* t1 = mask        (a8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+
+    OVER_8888_8_8888 t0, t1, t2, t3, t4, t5, t6, t7, t8
+
+    sw       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1
+    j        ra
+     nop
+
+END(pixman_composite_over_8888_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5
+    li       t6, 0x00ff00ff
+    li       t7, 0xf800f800
+    li       t8, 0x07e007e0
+    li       t9, 0x001F001F
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lbu      t2, 0(a2) /* t2 = mask        (a8) */
+    lbu      t3, 1(a2) /* t3 = mask        (a8) */
+    lhu      t4, 0(a0) /* t4 = destination (r5g6b5) */
+    lhu      t5, 2(a0) /* t5 = destination (r5g6b5) */
+    addiu    a1, a1, 8
+    addiu    a2, a2, 2
+
+    CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5
+    OVER_2x8888_2x8_2x8888   t0, t1, t2, t3, s0, s1, \
+                             t4, t5, t6, s2, s3, s4, s5, t0, t1
+    CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3
+
+    sh       s0, 0(a0)
+    sh       s1, 2(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lbu      t1, 0(a2) /* t1 = mask        (a8) */
+    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5
+    OVER_8888_8_8888         t0, t1, t3, t2, t6, t4, t5, t7, t8
+    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5
+
+    sh       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5
+    j        ra
+     nop
+
+END(pixman_composite_over_8888_8_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_0565_8_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (r5g6b5)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5
+    li       t4, 0xf800f800
+    li       t5, 0x07e007e0
+    li       t6, 0x001F001F
+    li       t7, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
+    lbu      t2, 0(a2) /* t2 = mask        (a8) */
+    lbu      t3, 1(a2) /* t3 = mask        (a8) */
+    lhu      t8, 0(a0) /* t8 = destination (r5g6b5) */
+    lhu      t9, 2(a0) /* t9 = destination (r5g6b5) */
+    addiu    a1, a1, 4
+    addiu    a2, a2, 2
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5
+    CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1
+    OVER_2x8888_2x8_2x8888   s0, s1, t2, t3, s2, s3, \
+                             t0, t1, t7, s4, s5, t8, t9, s0, s1
+    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3
+
+    sh       s0, 0(a0)
+    sh       s1, 2(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a3, 3f
+     nop
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lbu      t1, 0(a2) /* t1 = mask        (a8) */
+    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5
+    CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6
+    OVER_8888_8_8888         t3, t1, t4, t0, t7, t2, t5, t6, t8
+    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5
+
+    sh       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5
+    j        ra
+     nop
+
+END(pixman_composite_over_0565_8_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li       t4, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lw       t2, 0(a2) /* t2 = mask        (a8r8g8b8) */
+    lw       t3, 4(a2) /* t3 = mask        (a8r8g8b8) */
+    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+    addiu    a2, a2, 8
+    srl      t2, t2, 24
+    srl      t3, t3, 24
+
+    OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t0, t1
+
+    sw       t7, 0(a0)
+    sw       t8, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 0(a2) /* t1 = mask        (a8r8g8b8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    srl      t1, t1, 24
+
+    OVER_8888_8_8888 t0, t1, t2, t3, t4, t5, t6, t7, t8
+
+    sw       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_over_8888_8888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li           t4, 0x00ff00ff
+    beqz         a2, 3f
+     nop
+    addiu        t1, a2, -1
+    beqz         t1, 2f
+     nop
+1:
+    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw           t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lw           t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw           t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+    addiu        a1, a1, 8
+
+    not          t5, t0
+    srl          t5, t5, 24
+    not          t6, t1
+    srl          t6, t6, 24
+
+    or           t7, t5, t6
+    beqz         t7, 11f
+     or          t8, t0, t1
+    beqz         t8, 12f
+
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t2, t3
+
+    addu_s.qb    t0, t7, t0
+    addu_s.qb    t1, t8, t1
+11:
+    sw           t0, 0(a0)
+    sw           t1, 4(a0)
+12:
+    addiu        a2, a2, -2
+    addiu        t1, a2, -1
+    bgtz         t1, 1b
+     addiu       a0, a0, 8
+2:
+    beqz         a2, 3f
+     nop
+
+    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw           t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+    addiu        a1, a1, 4
+
+    not          t2, t0
+    srl          t2, t2, 24
+
+    beqz         t2, 21f
+     nop
+    beqz         t0, 3f
+
+    MIPS_UN8x4_MUL_UN8 t1, t2, t3, t4, t5, t6, t7
+
+    addu_s.qb    t0, t3, t0
+21:
+    sw           t0, 0(a0)
+
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j            ra
+     nop
+
+END(pixman_composite_over_8888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_8888_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (a8r8g8b8)
+ * a2 - w
+ */
+
+    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
+    li           t4, 0x00ff00ff
+    li           s3, 0xf800f800
+    li           s4, 0x07e007e0
+    li           s5, 0x001F001F
+    beqz         a2, 3f
+     nop
+    addiu        t1, a2, -1
+    beqz         t1, 2f
+     nop
+1:
+    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw           t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lhu          t2, 0(a0) /* t2 = destination (r5g6b5) */
+    lhu          t3, 2(a0) /* t3 = destination (r5g6b5) */
+    addiu        a1, a1, 8
+
+    not          t5, t0
+    srl          t5, t5, 24
+    not          t6, t1
+    srl          t6, t6, 24
+
+    or           t7, t5, t6
+    beqz         t7, 11f
+     or          t8, t0, t1
+    beqz         t8, 12f
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, s4, s5, t7, t8, t9, s2
+    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, t5, t6, t7, t8, t4, t9, t2, t3, s2, s0, s1
+
+    addu_s.qb    t0, t7, t0
+    addu_s.qb    t1, t8, t1
+11:
+    CONVERT_2x8888_TO_2x0565 t0, t1, t7, t8, s3, s4, s5, t2, t3
+    sh           t7, 0(a0)
+    sh           t8, 2(a0)
+12:
+    addiu        a2, a2, -2
+    addiu        t1, a2, -1
+    bgtz         t1, 1b
+     addiu       a0, a0, 4
+2:
+    beqz         a2, 3f
+     nop
+
+    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */
+    addiu        a1, a1, 4
+
+    not          t2, t0
+    srl          t2, t2, 24
+
+    beqz         t2, 21f
+     nop
+    beqz         t0, 3f
+
+    CONVERT_1x0565_TO_1x8888 t1, s0, t8, t9
+    MIPS_UN8x4_MUL_UN8       s0, t2, t3, t4, t5, t6, t7
+
+    addu_s.qb    t0, t3, t0
+21:
+    CONVERT_1x8888_TO_1x0565 t0, s0, t8, t9
+    sh           s0, 0(a0)
+
+3:
+    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
+    j            ra
+     nop
+
+END(pixman_composite_over_8888_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (32bit constant)
+ * a2 - w
+ */
+
+    beqz         a2, 5f
+     nop
+
+    not          t0, a1
+    srl          t0, t0, 24
+    bgtz         t0, 1f
+     nop
+    CONVERT_1x8888_TO_1x0565 a1, t1, t2, t3
+0:
+    sh           t1, 0(a0)
+    addiu        a2, a2, -1
+    bgtz         a2, 0b
+     addiu       a0, a0, 2
+    j            ra
+     nop
+
+1:
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li           t4, 0x00ff00ff
+    li           t5, 0xf800f800
+    li           t6, 0x07e007e0
+    li           t7, 0x001F001F
+    addiu        t1, a2, -1
+    beqz         t1, 3f
+     nop
+2:
+    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */
+    lhu          t2, 2(a0) /* t2 = destination (r5g6b5) */
+
+    CONVERT_2x0565_TO_2x8888 t1, t2, t3, t8, t6, t7, t9, s0, s1, s2
+    MIPS_2xUN8x4_MUL_2xUN8   t3, t8, t0, t0, t1, t2, t4, t9, s0, s1, s2, t3, t8
+    addu_s.qb                t1, t1, a1
+    addu_s.qb                t2, t2, a1
+    CONVERT_2x8888_TO_2x0565 t1, t2, t3, t8, t5, t6, t7, s0, s1
+
+    sh           t3, 0(a0)
+    sh           t8, 2(a0)
+
+    addiu        a2, a2, -2
+    addiu        t1, a2, -1
+    bgtz         t1, 2b
+     addiu       a0, a0, 4
+3:
+    beqz         a2, 4f
+     nop
+
+    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t1, t2, s0, s1
+    MIPS_UN8x4_MUL_UN8       t2, t0, t1, t4, s0, s1, s2
+    addu_s.qb                t1, t1, a1
+    CONVERT_1x8888_TO_1x0565 t1, t2, s0, s1
+
+    sh           t2, 0(a0)
+
+4:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+5:
+    j            ra
+     nop
+
+END(pixman_composite_over_n_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - w
+ */
+
+    beqz         a2, 5f
+     nop
+
+    not          t0, a1
+    srl          t0, t0, 24
+    bgtz         t0, 1f
+     nop
+0:
+    sw           a1, 0(a0)
+    addiu        a2, a2, -1
+    bgtz         a2, 0b
+     addiu       a0, a0, 4
+    j            ra
+     nop
+
+1:
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li           t4, 0x00ff00ff
+    addiu        t1, a2, -1
+    beqz         t1, 3f
+     nop
+2:
+    lw           t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw           t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t0, t7, t8, t4, t9, s0, s1, s2, t2, t3
+
+    addu_s.qb    t7, t7, a1
+    addu_s.qb    t8, t8, a1
+
+    sw           t7, 0(a0)
+    sw           t8, 4(a0)
+
+    addiu        a2, a2, -2
+    addiu        t1, a2, -1
+    bgtz         t1, 2b
+     addiu       a0, a0, 8
+3:
+    beqz         a2, 4f
+     nop
+
+    lw           t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+
+    MIPS_UN8x4_MUL_UN8 t1, t0, t3, t4, t5, t6, t7
+
+    addu_s.qb    t3, t3, a1
+
+    sw           t3, 0(a0)
+
+4:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+5:
+    j            ra
+     nop
+
+END(pixman_composite_over_n_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8_8_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (a8)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0, v1
+    li                t9, 0x00ff00ff
+    beqz              a3, 3f
+     nop
+
+    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */
+    beqz              v0, 1f      /* branch if less than 4 src pixels */
+     nop
+
+0:
+    beqz              v0, 1f
+     addiu            v0, v0, -1
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+    lbu               t4, 0(a0)
+    lbu               t5, 1(a0)
+    lbu               t6, 2(a0)
+    lbu               t7, 3(a0)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr_sra.ph.w    t5, t4, 0
+    precr_sra.ph.w    t7, t6, 0
+
+    precr.qb.ph       t0, t3, t1
+    precr.qb.ph       t1, t7, t5
+
+    lbu               t4, 0(a1)
+    lbu               v1, 1(a1)
+    lbu               t7, 2(a1)
+    lbu               t8, 3(a1)
+
+    addiu             a1, a1, 4
+
+    precr_sra.ph.w    v1, t4, 0
+    precr_sra.ph.w    t8, t7, 0
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, v1
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t0, t2, t3
+
+    addu_s.qb         t2, t0, t1
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a3, 3f
+     nop
+2:
+    lbu               t8, 0(a1)
+    lbu               t0, 0(a2)
+    lbu               t1, 0(a0)
+    addiu             a1, a1, 1
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0xff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+    andi              t2, t2, 0xff
+
+    addu_s.qb         t2, t2, t1
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0, v1
+    j                 ra
+     nop
+
+END(pixman_composite_add_8_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    li                t9, 0x00ff00ff
+    beqz              a3, 3f
+     nop
+
+    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */
+    beqz              v0, 1f      /* branch if less than 4 src pixels */
+     nop
+
+    srl               t8, a1, 24
+    replv.ph          t8, t8
+
+0:
+    beqz              v0, 1f
+     addiu            v0, v0, -1
+    lbu               t0, 0(a2)
+    lbu               t1, 1(a2)
+    lbu               t2, 2(a2)
+    lbu               t3, 3(a2)
+    lbu               t4, 0(a0)
+    lbu               t5, 1(a0)
+    lbu               t6, 2(a0)
+    lbu               t7, 3(a0)
+
+    addiu             a2, a2, 4
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr_sra.ph.w    t5, t4, 0
+    precr_sra.ph.w    t7, t6, 0
+
+    precr.qb.ph       t0, t3, t1
+    precr.qb.ph       t1, t7, t5
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, t8
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t0, t2, t3
+
+    addu_s.qb         t2, t0, t1
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a3, a3, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a3, 3f
+     nop
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a2)
+    lbu               t1, 0(a0)
+    addiu             a2, a2, 1
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0xff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+    andi              t2, t2, 0xff
+
+    addu_s.qb         t2, t2, t1
+    sb                t2, 0(a0)
+    addiu             a3, a3, -1
+    bnez              a3, 2b
+     addiu            a0, a0, 1
+
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j                 ra
+     nop
+
+END(pixman_composite_add_n_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li       t4, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+                       /* a1 = source      (32bit constant) */
+    lbu      t0, 0(a2) /* t0 = mask        (a8) */
+    lbu      t1, 1(a2) /* t1 = mask        (a8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 a1, a1, \
+                                       t0, t1, \
+                                       t2, t3, \
+                                       t5, t6, \
+                                       t4, t7, t8, t9, s0, s1, s2
+
+    sw       t5, 0(a0)
+    sw       t6, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a3, 3f
+     nop
+                       /* a1 = source      (32bit constant) */
+    lbu      t0, 0(a2) /* t0 = mask        (a8) */
+    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 a1, t0, t1, t2, t4, t3, t5, t6
+
+    sw       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_add_n_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_0565_8_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (r5g6b5)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+    li       t4, 0xf800f800
+    li       t5, 0x07e007e0
+    li       t6, 0x001F001F
+    li       t7, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
+    lbu      t2, 0(a2) /* t2 = mask        (a8) */
+    lbu      t3, 1(a2) /* t3 = mask        (a8) */
+    lhu      t8, 0(a0) /* t8 = destination (r5g6b5) */
+    lhu      t9, 2(a0) /* t9 = destination (r5g6b5) */
+    addiu    a1, a1, 4
+    addiu    a2, a2, 2
+
+    CONVERT_2x0565_TO_2x8888  t0, t1, s0, s1, t5, t6, s2, s3, s4, s5
+    CONVERT_2x0565_TO_2x8888  t8, t9, s2, s3, t5, t6, s4, s5, s6, s7
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4  s0, s1, \
+                                        t2, t3, \
+                                        s2, s3, \
+                                        t0, t1, \
+                                        t7, s4, s5, s6, s7, t8, t9
+    CONVERT_2x8888_TO_2x0565  t0, t1, s0, s1, t4, t5, t6, s2, s3
+
+    sh       s0, 0(a0)
+    sh       s1, 2(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a3, 3f
+     nop
+    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
+    lbu      t1, 0(a2) /* t1 = mask        (a8) */
+    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888  t0, t3, t4, t5
+    CONVERT_1x0565_TO_1x8888  t2, t4, t5, t6
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4  t3, t1, t4, t0, t7, t2, t5, t6
+    CONVERT_1x8888_TO_1x0565  t0, t3, t4, t5
+
+    sh       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+    j        ra
+     nop
+
+END(pixman_composite_add_0565_8_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_8_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li       t4, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lbu      t2, 0(a2) /* t2 = mask        (a8) */
+    lbu      t3, 1(a2) /* t3 = mask        (a8) */
+    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+    addiu    a2, a2, 2
+
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
+                                       t2, t3, \
+                                       t5, t6, \
+                                       t7, t8, \
+                                       t4, t9, s0, s1, s2, t0, t1
+
+    sw       t7, 0(a0)
+    sw       t8, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lbu      t1, 0(a2) /* t1 = mask        (a8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7
+
+    sw       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_add_8888_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_n_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (32bit constant)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li       t4, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    srl      a2, a2, 24
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+                       /* a2 = mask        (32bit constant) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
+                                       a2, a2, \
+                                       t2, t3, \
+                                       t5, t6, \
+                                       t4, t7, t8, t9, s0, s1, s2
+
+    sw       t5, 0(a0)
+    sw       t6, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+                       /* a2 = mask        (32bit constant) */
+    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, a2, t1, t3, t4, t5, t6, t7
+
+    sw       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_add_8888_n_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8r8g8b8)
+ * a2 - mask (a8r8g8b8)
+ * a3 - w
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2
+    li       t4, 0x00ff00ff
+    beqz     a3, 3f
+     nop
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
+    lw       t2, 0(a2) /* t2 = mask        (a8r8g8b8) */
+    lw       t3, 4(a2) /* t3 = mask        (a8r8g8b8) */
+    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
+    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
+    addiu    a1, a1, 8
+    addiu    a2, a2, 8
+    srl      t2, t2, 24
+    srl      t3, t3, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
+                                       t2, t3, \
+                                       t5, t6, \
+                                       t7, t8, \
+                                       t4, t9, s0, s1, s2, t0, t1
+
+    sw       t7, 0(a0)
+    sw       t8, 4(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a3, 3f
+     nop
+    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
+    lw       t1, 0(a2) /* t1 = mask        (a8r8g8b8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    srl      t1, t1, 24
+
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7
+
+    sw       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
+    j        ra
+     nop
+
+END(pixman_composite_add_8888_8888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (a8)
+ * a2 - w
+ */
+
+    beqz              a2, 3f
+     nop
+    srl               t9, a2, 2   /* t9 = how many multiples of 4 dst pixels */
+    beqz              t9, 1f      /* branch if less than 4 src pixels */
+     nop
+
+0:
+    beqz              t9, 1f
+     addiu            t9, t9, -1
+    lbu               t0, 0(a1)
+    lbu               t1, 1(a1)
+    lbu               t2, 2(a1)
+    lbu               t3, 3(a1)
+    lbu               t4, 0(a0)
+    lbu               t5, 1(a0)
+    lbu               t6, 2(a0)
+    lbu               t7, 3(a0)
+
+    addiu             a1, a1, 4
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr_sra.ph.w    t5, t4, 0
+    precr_sra.ph.w    t7, t6, 0
+
+    precr.qb.ph       t0, t3, t1
+    precr.qb.ph       t1, t7, t5
+
+    addu_s.qb         t2, t0, t1
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a2, a2, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a2, 3f
+     nop
+2:
+    lbu               t0, 0(a1)
+    lbu               t1, 0(a0)
+    addiu             a1, a1, 1
+
+    addu_s.qb         t2, t0, t1
+    sb                t2, 0(a0)
+    addiu             a2, a2, -1
+    bnez              a2, 2b
+     addiu            a0, a0, 1
+
+3:
+    j                 ra
+     nop
+
+END(pixman_composite_add_8_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (a8r8g8b8)
+ * a2 - w
+ */
+
+    beqz         a2, 4f
+     nop
+
+    srl          t9, a2, 2      /* t1 = how many multiples of 4 src pixels */
+    beqz         t9, 3f         /* branch if less than 4 src pixels */
+     nop
+1:
+    addiu        t9, t9, -1
+    beqz         t9, 2f
+     addiu       a2, a2, -4
+
+    lw           t0, 0(a1)
+    lw           t1, 4(a1)
+    lw           t2, 8(a1)
+    lw           t3, 12(a1)
+    lw           t4, 0(a0)
+    lw           t5, 4(a0)
+    lw           t6, 8(a0)
+    lw           t7, 12(a0)
+    addiu        a1, a1, 16
+
+    addu_s.qb    t4, t4, t0
+    addu_s.qb    t5, t5, t1
+    addu_s.qb    t6, t6, t2
+    addu_s.qb    t7, t7, t3
+
+    sw           t4, 0(a0)
+    sw           t5, 4(a0)
+    sw           t6, 8(a0)
+    sw           t7, 12(a0)
+    b            1b
+     addiu       a0, a0, 16
+2:
+    lw           t0, 0(a1)
+    lw           t1, 4(a1)
+    lw           t2, 8(a1)
+    lw           t3, 12(a1)
+    lw           t4, 0(a0)
+    lw           t5, 4(a0)
+    lw           t6, 8(a0)
+    lw           t7, 12(a0)
+    addiu        a1, a1, 16
+
+    addu_s.qb    t4, t4, t0
+    addu_s.qb    t5, t5, t1
+    addu_s.qb    t6, t6, t2
+    addu_s.qb    t7, t7, t3
+
+    sw           t4, 0(a0)
+    sw           t5, 4(a0)
+    sw           t6, 8(a0)
+    sw           t7, 12(a0)
+
+    beqz         a2, 4f
+     addiu       a0, a0, 16
+3:
+    lw           t0, 0(a1)
+    lw           t1, 0(a0)
+    addiu        a1, a1, 4
+    addiu        a2, a2, -1
+    addu_s.qb    t1, t1, t0
+    sw           t1, 0(a0)
+    bnez         a2, 3b
+     addiu       a0, a0, 4
+4:
+    jr           ra
+     nop
+
+END(pixman_composite_add_8888_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_0565_asm_mips)
+/*
+ * a0 - dst  (r5g6b5)
+ * a1 - src  (a8)
+ * a2 - w
+ */
+
+    beqz     a2, 4f
+     nop
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
+    li       t2, 0xf800f800
+    li       t3, 0x07e007e0
+    li       t4, 0x001F001F
+    li       t5, 0x00ff00ff
+
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lbu      t0, 0(a1) /* t0 = source      (a8) */
+    lbu      t1, 1(a1) /* t1 = source      (a8) */
+    lhu      t6, 0(a0) /* t6 = destination (r5g6b5) */
+    lhu      t7, 2(a0) /* t7 = destination (r5g6b5) */
+    addiu    a1, a1, 2
+
+    not      t0, t0
+    not      t1, t1
+    andi     t0, 0xff  /* t0 = neg source1 */
+    andi     t1, 0xff  /* t1 = neg source2 */
+    CONVERT_2x0565_TO_2x8888 t6, t7, t8, t9, t3, t4, s0, s1, s2, s3
+    MIPS_2xUN8x4_MUL_2xUN8   t8, t9, t0, t1, t6, t7, t5, s0, s1, s2, s3, t8, t9
+    CONVERT_2x8888_TO_2x0565 t6, t7, t8, t9, t2, t3, t4, s0, s1
+
+    sh       t8, 0(a0)
+    sh       t9, 2(a0)
+    addiu    a2, a2, -2
+    addiu    t1, a2, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a2, 3f
+     nop
+    lbu      t0, 0(a1) /* t0 = source      (a8) */
+    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */
+
+    not      t0, t0
+    andi     t0, 0xff  /* t0 = neg source */
+    CONVERT_1x0565_TO_1x8888 t1, t2, t3, t4
+    MIPS_UN8x4_MUL_UN8        t2, t0, t1, t5, t3, t4, t6
+    CONVERT_1x8888_TO_1x0565 t1, t2, t3, t4
+
+    sh       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
+4:
+    j        ra
+     nop
+
+END(pixman_composite_out_reverse_8_0565_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (a8)
+ * a2 - w
+ */
+
+    beqz     a2, 3f
+     nop
+    li       t4, 0x00ff00ff
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    lbu      t0, 0(a1) /* t0 = source      (a8) */
+    lbu      t1, 1(a1) /* t1 = source      (a8) */
+    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
+    addiu    a1, a1, 2
+    not      t0, t0
+    not      t1, t1
+    andi     t0, 0xff  /* t0 = neg source */
+    andi     t1, 0xff  /* t1 = neg source */
+
+    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t5, t6, t4, t7, t8, t9, t2, t3, t0
+
+    sw       t5, 0(a0)
+    sw       t6, 4(a0)
+    addiu    a2, a2, -2
+    addiu    t1, a2, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    lbu      t0, 0(a1) /* t0 = source      (a8) */
+    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
+    not      t0, t0
+    andi     t0, 0xff  /* t0 = neg source */
+
+    MIPS_UN8x4_MUL_UN8 t1, t0, t2, t4, t3, t5, t6
+
+    sw       t2, 0(a0)
+3:
+    j        ra
+     nop
+
+END(pixman_composite_out_reverse_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_reverse_n_8888_asm_mips)
+/*
+ * a0 - dst  (a8r8g8b8)
+ * a1 - src  (32bit constant)
+ * a2 - w
+ */
+
+    beqz              a2, 5f
+     nop
+
+    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+    li                t0, 0x00ff00ff
+    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */
+    beqz              t9, 2f      /* branch if less than 4 src pixels */
+     nop
+1:
+    beqz              t9, 2f
+     addiu            t9, t9, -1
+
+    lw                t1, 0(a0)
+    lw                t2, 4(a0)
+    lw                t3, 8(a0)
+    lw                t4, 12(a0)
+
+    addiu             a2, a2, -4
+
+    not               t5, t1
+    not               t6, t2
+    not               t7, t3
+    not               t8, t4
+    srl               t5, t5, 24
+    srl               t6, t6, 24
+    srl               t7, t7, 24
+    srl               t8, t8, 24
+    replv.ph          t5, t5
+    replv.ph          t6, t6
+    replv.ph          t7, t7
+    replv.ph          t8, t8
+    muleu_s.ph.qbl    s0, a1, t5
+    muleu_s.ph.qbr    s1, a1, t5
+    muleu_s.ph.qbl    s2, a1, t6
+    muleu_s.ph.qbr    s3, a1, t6
+    muleu_s.ph.qbl    s4, a1, t7
+    muleu_s.ph.qbr    s5, a1, t7
+    muleu_s.ph.qbl    s6, a1, t8
+    muleu_s.ph.qbr    s7, a1, t8
+
+    shra_r.ph         t5, s0, 8
+    shra_r.ph         t6, s1, 8
+    shra_r.ph         t7, s2, 8
+    shra_r.ph         t8, s3, 8
+    and               t5, t5, t0
+    and               t6, t6, t0
+    and               t7, t7, t0
+    and               t8, t8, t0
+    addq.ph           s0, s0, t5
+    addq.ph           s1, s1, t6
+    addq.ph           s2, s2, t7
+    addq.ph           s3, s3, t8
+    shra_r.ph         s0, s0, 8
+    shra_r.ph         s1, s1, 8
+    shra_r.ph         s2, s2, 8
+    shra_r.ph         s3, s3, 8
+    shra_r.ph         t5, s4, 8
+    shra_r.ph         t6, s5, 8
+    shra_r.ph         t7, s6, 8
+    shra_r.ph         t8, s7, 8
+    and               t5, t5, t0
+    and               t6, t6, t0
+    and               t7, t7, t0
+    and               t8, t8, t0
+    addq.ph           s4, s4, t5
+    addq.ph           s5, s5, t6
+    addq.ph           s6, s6, t7
+    addq.ph           s7, s7, t8
+    shra_r.ph         s4, s4, 8
+    shra_r.ph         s5, s5, 8
+    shra_r.ph         s6, s6, 8
+    shra_r.ph         s7, s7, 8
+
+    precr.qb.ph       t5, s0, s1
+    precr.qb.ph       t6, s2, s3
+    precr.qb.ph       t7, s4, s5
+    precr.qb.ph       t8, s6, s7
+    addu_s.qb         t5, t1, t5
+    addu_s.qb         t6, t2, t6
+    addu_s.qb         t7, t3, t7
+    addu_s.qb         t8, t4, t8
+
+    sw                t5, 0(a0)
+    sw                t6, 4(a0)
+    sw                t7, 8(a0)
+    sw                t8, 12(a0)
+    b                 1b
+     addiu            a0, a0, 16
+
+2:
+    beqz              a2, 4f
+     nop
+3:
+    lw                t1, 0(a0)
+
+    not               t2, t1
+    srl               t2, t2, 24
+    replv.ph          t2, t2
+
+    muleu_s.ph.qbl    t4, a1, t2
+    muleu_s.ph.qbr    t5, a1, t2
+    shra_r.ph         t6, t4, 8
+    shra_r.ph         t7, t5, 8
+
+    and               t6,t6,t0
+    and               t7,t7,t0
+
+    addq.ph           t8, t4, t6
+    addq.ph           t9, t5, t7
+
+    shra_r.ph         t8, t8, 8
+    shra_r.ph         t9, t9, 8
+
+    precr.qb.ph       t9, t8, t9
+
+    addu_s.qb         t9, t1, t9
+    sw                t9, 0(a0)
+
+    addiu             a2, a2, -1
+    bnez              a2, 3b
+     addiu            a0, a0, 4
+4:
+    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
+5:
+    j                 ra
+     nop
+
+END(pixman_composite_over_reverse_n_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
+/*
+ * a0 - dst  (a8)
+ * a1 - src  (32bit constant)
+ * a2 - w
+ */
+
+    li                t9, 0x00ff00ff
+    beqz              a2, 3f
+     nop
+    srl               t7, a2, 2   /* t7 = how many multiples of 4 dst pixels */
+    beqz              t7, 1f      /* branch if less than 4 src pixels */
+     nop
+
+    srl               t8, a1, 24
+    replv.ph          t8, t8
+
+0:
+    beqz              t7, 1f
+     addiu            t7, t7, -1
+    lbu               t0, 0(a0)
+    lbu               t1, 1(a0)
+    lbu               t2, 2(a0)
+    lbu               t3, 3(a0)
+
+    precr_sra.ph.w    t1, t0, 0
+    precr_sra.ph.w    t3, t2, 0
+    precr.qb.ph       t0, t3, t1
+
+    muleu_s.ph.qbl    t2, t0, t8
+    muleu_s.ph.qbr    t3, t0, t8
+    shra_r.ph         t4, t2, 8
+    shra_r.ph         t5, t3, 8
+    and               t4, t4, t9
+    and               t5, t5, t9
+    addq.ph           t2, t2, t4
+    addq.ph           t3, t3, t5
+    shra_r.ph         t2, t2, 8
+    shra_r.ph         t3, t3, 8
+    precr.qb.ph       t2, t2, t3
+
+    sb                t2, 0(a0)
+    srl               t2, t2, 8
+    sb                t2, 1(a0)
+    srl               t2, t2, 8
+    sb                t2, 2(a0)
+    srl               t2, t2, 8
+    sb                t2, 3(a0)
+    addiu             a2, a2, -4
+    b                 0b
+     addiu            a0, a0, 4
+
+1:
+    beqz              a2, 3f
+     nop
+    srl               t8, a1, 24
+2:
+    lbu               t0, 0(a0)
+
+    mul               t2, t0, t8
+    shra_r.ph         t3, t2, 8
+    andi              t3, t3, 0x00ff
+    addq.ph           t2, t2, t3
+    shra_r.ph         t2, t2, 8
+
+    sb                t2, 0(a0)
+    addiu             a2, a2, -1
+    bnez              a2, 2b
+     addiu            a0, a0, 1
+
+3:
+    j                 ra
+     nop
+
+END(pixman_composite_in_n_8_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+/*
+ * a0     - dst  (a8r8g8b8)
+ * a1     - src  (a8r8g8b8)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
+    lw       t8, 16(sp) /* t8 = unit_x */
+    li       t6, 0x00ff00ff
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    sra      t1, a3, 16 /* t0 = vx >> 16 */
+    sll      t1, t1, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    lw       t2, 0(a0)  /* t2 = destination (a8r8g8b8) */
+    lw       t3, 4(a0)  /* t3 = destination (a8r8g8b8) */
+
+    OVER_2x8888_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t9, s0, s1, s2, s3
+
+    sw       t4, 0(a0)
+    sw       t5, 4(a0)
+    addiu    a2, a2, -2
+    addiu    t1, a2, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lw       t1, 0(a0)  /* t1 = destination (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    OVER_8888_8888 t0, t1, t2, t6, t4, t5, t3, t7
+
+    sw       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_8888_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+/*
+ * a0     - dst  (r5g6b5)
+ * a1     - src  (a8r8g8b8)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, v0, v1
+    lw       t8, 40(sp) /* t8 = unit_x */
+    li       t4, 0x00ff00ff
+    li       t5, 0xf800f800
+    li       t6, 0x07e007e0
+    li       t7, 0x001F001F
+    beqz     a2, 3f
+     nop
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+    sra      t1, a3, 16 /* t0 = vx >> 16 */
+    sll      t1, t1, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */
+    lhu      t3, 2(a0)  /* t3 = destination (r5g6b5) */
+
+    CONVERT_2x0565_TO_2x8888 t2, t3, v0, v1, t6, t7, s0, s1, s2, s3
+    OVER_2x8888_2x8888       t0, t1, v0, v1, t2, t3, t4, t9, s0, s1, s2, s3, s4
+    CONVERT_2x8888_TO_2x0565 t2, t3, v0, v1, t5, t6, t7, t9, s2
+
+    sh       v0, 0(a0)
+    sh       v1, 2(a0)
+    addiu    a2, a2, -2
+    addiu    t1, a2, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4 (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lhu      t1, 0(a0)  /* t1 = destination (r5g6b5) */
+    addu     a3, a3, t8 /* a3 = vx + unit_x */
+
+    CONVERT_1x0565_TO_1x8888 t1, t2, t5, t6
+    OVER_8888_8888           t0, t2, t1, t4, t3, t5, t6, t7
+    CONVERT_1x8888_TO_1x0565 t1, t2, t5, t6
+
+    sh       t2, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, v0, v1
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_0565_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+/*
+ * a0     - dst (a8r8g8b8)
+ * a1     - src (r5g6b5)
+ * a2     - w
+ * a3     - vx
+ * 16(sp) - unit_x
+ */
+
+    SAVE_REGS_ON_STACK 0, v0
+    beqz     a2, 3f
+     nop
+
+    lw       v0, 16(sp) /* v0 = unit_x */
+    addiu    t1, a2, -1
+    beqz     t1, 2f
+     nop
+
+    li       t4, 0x07e007e0
+    li       t5, 0x001F001F
+1:
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2 ((r5g6b5)) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source ((r5g6b5)) */
+    addu     a3, a3, v0 /* a3 = vx + unit_x */
+    sra      t1, a3, 16 /* t1 = vx >> 16 */
+    sll      t1, t1, 1  /* t1 = t1 * 2 ((r5g6b5)) */
+    addu     t1, a1, t1
+    lhu      t1, 0(t1)  /* t1 = source ((r5g6b5)) */
+    addu     a3, a3, v0 /* a3 = vx + unit_x */
+    addiu    a2, a2, -2
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
+
+    sw       t2, 0(a0)
+    sw       t3, 4(a0)
+
+    addiu    t2, a2, -1
+    bgtz     t2, 1b
+     addiu   a0, a0, 8
+2:
+    beqz     a2, 3f
+     nop
+    sra      t0, a3, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2 ((r5g6b5)) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source ((r5g6b5)) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3
+
+    sw       t1, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 0, v0
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_0565_8888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
+/*
+ * a0     - dst  (r5g6b5)
+ * a1     - src  (a8r8g8b8)
+ * a2     - mask (a8)
+ * a3     - w
+ * 16(sp) - vx
+ * 20(sp) - unit_x
+ */
+    beqz     a3, 4f
+     nop
+
+    SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
+    lw       v0, 36(sp) /* v0 = vx */
+    lw       v1, 40(sp) /* v1 = unit_x */
+    li       t6, 0x00ff00ff
+    li       t7, 0xf800f800
+    li       t8, 0x07e007e0
+    li       t9, 0x001F001F
+
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    sra      t0, v0, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4      (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    addu     v0, v0, v1 /* v0 = vx + unit_x */
+    sra      t1, v0, 16 /* t1 = vx >> 16 */
+    sll      t1, t1, 2  /* t1 = t1 * 4      (a8r8g8b8) */
+    addu     t1, a1, t1
+    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
+    addu     v0, v0, v1 /* v0 = vx + unit_x */
+    lbu      t2, 0(a2)  /* t2 = mask        (a8) */
+    lbu      t3, 1(a2)  /* t3 = mask        (a8) */
+    lhu      t4, 0(a0)  /* t4 = destination (r5g6b5) */
+    lhu      t5, 2(a0)  /* t5 = destination (r5g6b5) */
+    addiu    a2, a2, 2
+
+    CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5
+    OVER_2x8888_2x8_2x8888   t0, t1, \
+                             t2, t3, \
+                             s0, s1, \
+                             t4, t5, \
+                             t6, s2, s3, s4, s5, t2, t3
+    CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3
+
+    sh       s0, 0(a0)
+    sh       s1, 2(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a3, 3f
+     nop
+    sra      t0, v0, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 2  /* t0 = t0 * 4      (a8r8g8b8) */
+    addu     t0, a1, t0
+    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
+    lbu      t1, 0(a2)  /* t1 = mask        (a8) */
+    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5
+    OVER_8888_8_8888         t0, t1, t3, t2, t6, t4, t5, t7, t8
+    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5
+
+    sh       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
+4:
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips)
+/*
+ * a0     - dst  (r5g6b5)
+ * a1     - src  (r5g6b5)
+ * a2     - mask (a8)
+ * a3     - w
+ * 16(sp) - vx
+ * 20(sp) - unit_x
+ */
+
+    beqz     a3, 4f
+     nop
+    SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
+    lw       v0, 36(sp) /* v0 = vx */
+    lw       v1, 40(sp) /* v1 = unit_x */
+    li       t4, 0xf800f800
+    li       t5, 0x07e007e0
+    li       t6, 0x001F001F
+    li       t7, 0x00ff00ff
+
+    addiu    t1, a3, -1
+    beqz     t1, 2f
+     nop
+1:
+    sra      t0, v0, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2      (r5g6b5) */
+    addu     t0, a1, t0
+    lhu      t0, 0(t0)  /* t0 = source      (r5g6b5) */
+    addu     v0, v0, v1 /* v0 = vx + unit_x */
+    sra      t1, v0, 16 /* t1 = vx >> 16 */
+    sll      t1, t1, 1  /* t1 = t1 * 2      (r5g6b5) */
+    addu     t1, a1, t1
+    lhu      t1, 0(t1)  /* t1 = source      (r5g6b5) */
+    addu     v0, v0, v1 /* v0 = vx + unit_x */
+    lbu      t2, 0(a2)  /* t2 = mask        (a8) */
+    lbu      t3, 1(a2)  /* t3 = mask        (a8) */
+    lhu      t8, 0(a0)  /* t8 = destination (r5g6b5) */
+    lhu      t9, 2(a0)  /* t9 = destination (r5g6b5) */
+    addiu    a2, a2, 2
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5
+    CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1
+    OVER_2x8888_2x8_2x8888   s0, s1, \
+                             t2, t3, \
+                             s2, s3, \
+                             t0, t1, \
+                             t7, t8, t9, s4, s5, s0, s1
+    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3
+
+    sh       s0, 0(a0)
+    sh       s1, 2(a0)
+    addiu    a3, a3, -2
+    addiu    t1, a3, -1
+    bgtz     t1, 1b
+     addiu   a0, a0, 4
+2:
+    beqz     a3, 3f
+     nop
+    sra      t0, v0, 16 /* t0 = vx >> 16 */
+    sll      t0, t0, 1  /* t0 = t0 * 2      (r5g6b5) */
+    addu     t0, a1, t0
+
+    lhu      t0, 0(t0)  /* t0 = source      (r5g6b5) */
+    lbu      t1, 0(a2)  /* t1 = mask        (a8) */
+    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */
+
+    CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5
+    CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6
+    OVER_8888_8_8888         t3, t1, t4, t0, t7, t2, t5, t6, t8
+    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5
+
+    sh       t3, 0(a0)
+3:
+    RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
+4:
+    j        ra
+     nop
+
+END(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *src_top
+ * a2     - *src_bottom
+ * a3     - w
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ */
+
+    beqz     a3, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw       s0, 36(sp)     /* s0 = wt */
+    lw       s1, 40(sp)     /* s1 = wb */
+    lw       s2, 44(sp)     /* s2 = vx */
+    lw       s3, 48(sp)     /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 2
+    addiu    t8, t9, 4
+    lwx      t0, t9(a1)     /* t0 = tl */
+    lwx      t1, t8(a1)     /* t1 = tr */
+    addiu    a3, a3, -1
+    lwx      t2, t9(a2)     /* t2 = bl */
+    lwx      t3, t8(a2)     /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+
+    addu     s2, s2, s3     /* vx += unit_x; */
+    sw       t0, 0(a0)
+    bnez     a3, 0b
+     addiu   a0, a0, 4
+
+    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *src_top
+ * a2     - *src_bottom
+ * a3     - w
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ */
+
+    beqz     a3, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw       s0, 36(sp)     /* s0 = wt */
+    lw       s1, 40(sp)     /* s1 = wb */
+    lw       s2, 44(sp)     /* s2 = vx */
+    lw       s3, 48(sp)     /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 2
+    addiu    t8, t9, 4
+    lwx      t0, t9(a1)     /* t0 = tl */
+    lwx      t1, t8(a1)     /* t1 = tr */
+    addiu    a3, a3, -1
+    lwx      t2, t9(a2)     /* t2 = bl */
+    lwx      t3, t8(a2)     /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
+
+    addu     s2, s2, s3     /* vx += unit_x; */
+    sh       t1, 0(a0)
+    bnez     a3, 0b
+     addiu   a0, a0, 2
+
+    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8888_SRC_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *src_top
+ * a2     - *src_bottom
+ * a3     - w
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ */
+
+    beqz     a3, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 44(sp)     /* s0 = wt */
+    lw       s1, 48(sp)     /* s1 = wb */
+    lw       s2, 52(sp)     /* s2 = vx */
+    lw       s3, 56(sp)     /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+    li       v1, 0x07e007e0
+    li       s8, 0x001f001f
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 1
+    addiu    t8, t9, 2
+    lhx      t0, t9(a1)     /* t0 = tl */
+    lhx      t1, t8(a1)     /* t1 = tr */
+    andi     t1, t1, 0xffff
+    addiu    a3, a3, -1
+    lhx      t2, t9(a2)     /* t2 = bl */
+    lhx      t3, t8(a2)     /* t3 = br */
+    andi     t3, t3, 0xffff
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+
+    addu     s2, s2, s3     /* vx += unit_x; */
+    sw       t0, 0(a0)
+    bnez     a3, 0b
+     addiu   a0, a0, 4
+
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_0565_8888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *src_top
+ * a2     - *src_bottom
+ * a3     - w
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ */
+
+    beqz     a3, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 44(sp)     /* s0 = wt */
+    lw       s1, 48(sp)     /* s1 = wb */
+    lw       s2, 52(sp)     /* s2 = vx */
+    lw       s3, 56(sp)     /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+    li       v1, 0x07e007e0
+    li       s8, 0x001f001f
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 1
+    addiu    t8, t9, 2
+    lhx      t0, t9(a1)     /* t0 = tl */
+    lhx      t1, t8(a1)     /* t1 = tr */
+    andi     t1, t1, 0xffff
+    addiu    a3, a3, -1
+    lhx      t2, t9(a2)     /* t2 = bl */
+    lhx      t3, t8(a2)     /* t3 = br */
+    andi     t3, t3, 0xffff
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
+
+    addu     s2, s2, s3     /* vx += unit_x; */
+    sh       t1, 0(a0)
+    bnez     a3, 0b
+     addiu   a0, a0, 2
+
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *src_top
+ * a2     - *src_bottom
+ * a3     - w
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ */
+
+    beqz     a3, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 40(sp)     /* s0 = wt */
+    lw       s1, 44(sp)     /* s1 = wb */
+    lw       s2, 48(sp)     /* s2 = vx */
+    lw       s3, 52(sp)     /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+    li       s8, 0x00ff00ff
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4     /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4     /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 2
+    addiu    t8, t9, 4
+    lwx      t0, t9(a1)     /* t0 = tl */
+    lwx      t1, t8(a1)     /* t1 = tr */
+    addiu    a3, a3, -1
+    lwx      t2, t9(a2)     /* t2 = bl */
+    lwx      t3, t8(a2)     /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lw       t1, 0(a0)      /* t1 = dest */
+    OVER_8888_8888 t0, t1, t2, s8, t3, t4, t5, t6
+
+    addu     s2, s2, s3     /* vx += unit_x; */
+    sw       t2, 0(a0)
+    bnez     a3, 0b
+     addiu   a0, a0, 4
+
+    RESTORE_REGS_FROM_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *src_top
+ * a2     - *src_bottom
+ * a3     - w
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ */
+
+    beqz         a3, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
+
+    lw           s0, 36(sp)     /* s0 = wt */
+    lw           s1, 40(sp)     /* s1 = wb */
+    lw           s2, 44(sp)     /* s2 = vx */
+    lw           s3, 48(sp)     /* s3 = unit_x */
+    li           v0, BILINEAR_INTERPOLATION_RANGE
+
+    sll          s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll          s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi         t4, s2, 0xffff /* t4 = (short)vx */
+    srl          t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu         t5, v0, t4     /* t5 = ( 256 - (vx>>8)) */
+
+    mul          s4, s0, t5     /* s4 = wt*(256-(vx>>8)) */
+    mul          s5, s0, t4     /* s5 = wt*(vx>>8) */
+    mul          s6, s1, t5     /* s6 = wb*(256-(vx>>8)) */
+    mul          s7, s1, t4     /* s7 = wb*(vx>>8) */
+
+    sra          t9, s2, 16
+    sll          t9, t9, 2
+    addiu        t8, t9, 4
+    lwx          t0, t9(a1)     /* t0 = tl */
+    lwx          t1, t8(a1)     /* t1 = tr */
+    addiu        a3, a3, -1
+    lwx          t2, t9(a2)     /* t2 = bl */
+    lwx          t3, t8(a2)     /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lw           t1, 0(a0)
+    addu_s.qb    t2, t0, t1
+
+    addu         s2, s2, s3     /* vx += unit_x; */
+    sw           t2, 0(a0)
+    bnez         a3, 0b
+     addiu       a0, a0, 4
+
+    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
+1:
+    j            ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *mask
+ * a2     - *src_top
+ * a3     - *src_bottom
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ * 32(sp) - w
+ */
+
+    lw       v1, 32(sp)
+    beqz     v1, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 44(sp)        /* s0 = wt */
+    lw       s1, 48(sp)        /* s1 = wb */
+    lw       s2, 52(sp)        /* s2 = vx */
+    lw       s3, 56(sp)        /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+    li       s8, 0x00ff00ff
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff    /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 2
+    addiu    t8, t9, 4
+    lwx      t0, t9(a2)        /* t0 = tl */
+    lwx      t1, t8(a2)        /* t1 = tr */
+    addiu    v1, v1, -1
+    lwx      t2, t9(a3)        /* t2 = bl */
+    lwx      t3, t8(a3)        /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask */
+    addiu    a1, a1, 1
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4
+
+    addu     s2, s2, s3        /* vx += unit_x; */
+    sw       t0, 0(a0)
+    bnez     v1, 0b
+     addiu   a0, a0, 4
+
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *mask
+ * a2     - *src_top
+ * a3     - *src_bottom
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ * 32(sp) - w
+ */
+
+    lw       v1, 32(sp)
+    beqz     v1, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 44(sp)        /* s0 = wt */
+    lw       s1, 48(sp)        /* s1 = wb */
+    lw       s2, 52(sp)        /* s2 = vx */
+    lw       s3, 56(sp)        /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+    li       s8, 0x00ff00ff
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff    /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 2
+    addiu    t8, t9, 4
+    lwx      t0, t9(a2)        /* t0 = tl */
+    lwx      t1, t8(a2)        /* t1 = tr */
+    addiu    v1, v1, -1
+    lwx      t2, t9(a3)        /* t2 = bl */
+    lwx      t3, t8(a3)        /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask */
+    addiu    a1, a1, 1
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4
+    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
+
+    addu     s2, s2, s3        /* vx += unit_x; */
+    sh       t1, 0(a0)
+    bnez     v1, 0b
+     addiu   a0, a0, 2
+
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *mask
+ * a2     - *src_top
+ * a3     - *src_bottom
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ * 32(sp) - w
+ */
+
+    lw       t0, 32(sp)
+    beqz     t0, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+
+    lw       s0, 48(sp)        /* s0 = wt */
+    lw       s1, 52(sp)        /* s1 = wb */
+    lw       s2, 56(sp)        /* s2 = vx */
+    lw       s3, 60(sp)        /* s3 = unit_x */
+    lw       ra, 64(sp)        /* ra = w */
+    li       v0, 0x00ff00ff
+    li       v1, 0x07e007e0
+    li       s8, 0x001f001f
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff    /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    li       t5, BILINEAR_INTERPOLATION_RANGE
+    subu     t5, t5, t4        /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 1
+    addiu    t8, t9, 2
+    lhx      t0, t9(a2)        /* t0 = tl */
+    lhx      t1, t8(a2)        /* t1 = tr */
+    andi     t1, t1, 0xffff
+    addiu    ra, ra, -1
+    lhx      t2, t9(a3)        /* t2 = bl */
+    lhx      t3, t8(a3)        /* t3 = br */
+    andi     t3, t3, 0xffff
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask */
+    addiu    a1, a1, 1
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4
+
+    addu     s2, s2, s3        /* vx += unit_x; */
+    sw       t0, 0(a0)
+    bnez     ra, 0b
+     addiu   a0, a0, 4
+
+    RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *mask
+ * a2     - *src_top
+ * a3     - *src_bottom
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ * 32(sp) - w
+ */
+
+    lw       t0, 32(sp)
+    beqz     t0, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+
+    lw       s0, 48(sp)        /* s0 = wt */
+    lw       s1, 52(sp)        /* s1 = wb */
+    lw       s2, 56(sp)        /* s2 = vx */
+    lw       s3, 60(sp)        /* s3 = unit_x */
+    lw       ra, 64(sp)        /* ra = w */
+    li       v0, 0x00ff00ff
+    li       v1, 0x07e007e0
+    li       s8, 0x001f001f
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff    /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    li       t5, BILINEAR_INTERPOLATION_RANGE
+    subu     t5, t5, t4        /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 1
+    addiu    t8, t9, 2
+    lhx      t0, t9(a2)        /* t0 = tl */
+    lhx      t1, t8(a2)        /* t1 = tr */
+    andi     t1, t1, 0xffff
+    addiu    ra, ra, -1
+    lhx      t2, t9(a3)        /* t2 = bl */
+    lhx      t3, t8(a3)        /* t3 = br */
+    andi     t3, t3, 0xffff
+
+    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
+    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask */
+    addiu    a1, a1, 1
+    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4
+    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
+
+    addu     s2, s2, s3        /* vx += unit_x; */
+    sh       t1, 0(a0)
+    bnez     ra, 0b
+     addiu   a0, a0, 2
+
+    RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
+/*
+ * a0     - dst        (a8r8g8b8)
+ * a1     - mask       (a8)
+ * a2     - src_top    (a8r8g8b8)
+ * a3     - src_bottom (a8r8g8b8)
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ * 32(sp) - w
+ */
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       v1, 60(sp)        /* v1 = w(sp + 32 + 28 save regs stack offset)*/
+    beqz     v1, 1f
+     nop
+
+    lw       s0, 44(sp)        /* s0 = wt */
+    lw       s1, 48(sp)        /* s1 = wb */
+    lw       s2, 52(sp)        /* s2 = vx */
+    lw       s3, 56(sp)        /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+    li       s8, 0x00ff00ff
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+
+0:
+    andi     t4, s2, 0xffff    /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 2
+    addiu    t8, t9, 4
+    lwx      t0, t9(a2)        /* t0 = tl */
+    lwx      t1, t8(a2)        /* t1 = tr */
+    addiu    v1, v1, -1
+    lwx      t2, t9(a3)        /* t2 = bl */
+    lwx      t3, t8(a3)        /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, \
+                                      t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask */
+    lw       t2, 0(a0)         /* t2 = dst */
+    addiu    a1, a1, 1
+    OVER_8888_8_8888 t0, t1, t2, t0, s8, t3, t4, t5, t6
+
+    addu     s2, s2, s3        /* vx += unit_x; */
+    sw       t0, 0(a0)
+    bnez     v1, 0b
+     addiu   a0, a0, 4
+
+1:
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips)
+/*
+ * a0     - *dst
+ * a1     - *mask
+ * a2     - *src_top
+ * a3     - *src_bottom
+ * 16(sp) - wt
+ * 20(sp) - wb
+ * 24(sp) - vx
+ * 28(sp) - unit_x
+ * 32(sp) - w
+ */
+
+    lw       v1, 32(sp)
+    beqz     v1, 1f
+     nop
+
+    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+
+    lw       s0, 44(sp)        /* s0 = wt */
+    lw       s1, 48(sp)        /* s1 = wb */
+    lw       s2, 52(sp)        /* s2 = vx */
+    lw       s3, 56(sp)        /* s3 = unit_x */
+    li       v0, BILINEAR_INTERPOLATION_RANGE
+    li       s8, 0x00ff00ff
+
+    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
+0:
+    andi     t4, s2, 0xffff    /* t4 = (short)vx */
+    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = vx >> 8 */
+    subu     t5, v0, t4        /* t5 = ( 256 - (vx>>8)) */
+
+    mul      s4, s0, t5        /* s4 = wt*(256-(vx>>8)) */
+    mul      s5, s0, t4        /* s5 = wt*(vx>>8) */
+    mul      s6, s1, t5        /* s6 = wb*(256-(vx>>8)) */
+    mul      s7, s1, t4        /* s7 = wb*(vx>>8) */
+
+    sra      t9, s2, 16
+    sll      t9, t9, 2
+    addiu    t8, t9, 4
+    lwx      t0, t9(a2)        /* t0 = tl */
+    lwx      t1, t8(a2)        /* t1 = tr */
+    addiu    v1, v1, -1
+    lwx      t2, t9(a3)        /* t2 = bl */
+    lwx      t3, t8(a3)        /* t3 = br */
+
+    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
+    lbu      t1, 0(a1)         /* t1 = mask */
+    lw       t2, 0(a0)         /* t2 = dst */
+    addiu    a1, a1, 1
+    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t0, s8, t3, t4, t5
+
+    addu     s2, s2, s3        /* vx += unit_x; */
+    sw       t0, 0(a0)
+    bnez     v1, 0b
+     addiu   a0, a0, 4
+
+    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
+1:
+    j        ra
+     nop
+
+END(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips)
diff --git a/src/pixman/pixman/pixman-mips-dspr2-asm.h b/src/pixman/pixman/pixman-mips-dspr2-asm.h
new file mode 100644
index 0000000..11849bd
--- /dev/null
+++ b/src/pixman/pixman/pixman-mips-dspr2-asm.h
@@ -0,0 +1,713 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nemanja Lukic (nlukic@mips.com)
+ */
+
+#ifndef PIXMAN_MIPS_DSPR2_ASM_H
+#define PIXMAN_MIPS_DSPR2_ASM_H
+
+#define zero $0
+#define AT   $1
+#define v0   $2
+#define v1   $3
+#define a0   $4
+#define a1   $5
+#define a2   $6
+#define a3   $7
+#define t0   $8
+#define t1   $9
+#define t2   $10
+#define t3   $11
+#define t4   $12
+#define t5   $13
+#define t6   $14
+#define t7   $15
+#define s0   $16
+#define s1   $17
+#define s2   $18
+#define s3   $19
+#define s4   $20
+#define s5   $21
+#define s6   $22
+#define s7   $23
+#define t8   $24
+#define t9   $25
+#define k0   $26
+#define k1   $27
+#define gp   $28
+#define sp   $29
+#define fp   $30
+#define s8   $30
+#define ra   $31
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS32r2
+ */
+#define LEAF_MIPS32R2(symbol)                           \
+                .globl  symbol;                         \
+                .align  2;                              \
+#ifdef __ELF__
+                .hidden symbol;                         \
+                .type   symbol, @function;              \
+#endif
+                .ent    symbol, 0;                      \
+symbol:         .frame  sp, 0, ra;                      \
+                .set    push;                           \
+                .set    arch=mips32r2;                  \
+                .set    noreorder;                      \
+                .set    noat;
+
+/*
+ * LEAF_MIPS32R2 - declare leaf routine for MIPS DSPr2
+ */
+#define LEAF_MIPS_DSPR2(symbol)                         \
+LEAF_MIPS32R2(symbol)                                   \
+                .set    dspr2;
+
+/*
+ * END - mark end of function
+ */
+#define END(function)                                   \
+                .set    pop;                            \
+                .end    function;                       \
+                .size   function,.-function
+
+/*
+ * Checks if stack offset is big enough for storing/restoring regs_num
+ * number of register to/from stack. Stack offset must be greater than
+ * or equal to the number of bytes needed for storing registers (regs_num*4).
+ * Since MIPS ABI allows usage of first 16 bytes of stack frame (this is
+ * preserved for input arguments of the functions, already stored in a0-a3),
+ * stack size can be further optimized by utilizing this space.
+ */
+.macro CHECK_STACK_OFFSET regs_num, stack_offset
+.if \stack_offset < \regs_num * 4 - 16
+.error "Stack offset too small."
+.endif
+.endm
+
+/*
+ * Saves set of registers on stack. Maximum number of registers that
+ * can be saved on stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * before registers are pushed in order to provide enough space on stack
+ * (offset must be multiple of 4, and must be big enough, as described by
+ * CHECK_STACK_OFFSET macro). This macro is intended to be used in
+ * combination with RESTORE_REGS_FROM_STACK macro. Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro SAVE_REGS_ON_STACK stack_offset = 0, r1, \
+                          r2  = 0, r3  = 0, r4  = 0, \
+                          r5  = 0, r6  = 0, r7  = 0, \
+                          r8  = 0, r9  = 0, r10 = 0, \
+                          r11 = 0, r12 = 0, r13 = 0, \
+                          r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset / 4) * 4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, -\stack_offset
+    .endif
+    sw              \r1, 0(sp)
+    .if \r2 != 0
+    sw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    sw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    sw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    sw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    sw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    sw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    sw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    sw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    sw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    sw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    sw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    sw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    sw              \r14, 52(sp)
+    .endif
+.endm
+
+/*
+ * Restores set of registers from stack. Maximum number of registers that
+ * can be restored from stack is limitted to 14 (a0-a3, v0-v1 and s0-s7).
+ * Stack offset is number of bytes that are added to stack pointer (sp)
+ * after registers are restored (offset must be multiple of 4, and must
+ * be big enough, as described by CHECK_STACK_OFFSET macro). This macro is
+ * intended to be used in combination with RESTORE_REGS_FROM_STACK macro.
+ * Example:
+ *  SAVE_REGS_ON_STACK      4, v0, v1, s0, s1
+ *  RESTORE_REGS_FROM_STACK 4, v0, v1, s0, s1
+ */
+.macro RESTORE_REGS_FROM_STACK stack_offset = 0, r1, \
+                               r2  = 0, r3  = 0, r4  = 0, \
+                               r5  = 0, r6  = 0, r7  = 0, \
+                               r8  = 0, r9  = 0, r10 = 0, \
+                               r11 = 0, r12 = 0, r13 = 0, \
+                               r14 = 0
+    .if (\stack_offset < 0) || (\stack_offset - (\stack_offset/4)*4)
+    .error "Stack offset must be pozitive and multiple of 4."
+    .endif
+    lw              \r1, 0(sp)
+    .if \r2 != 0
+    lw              \r2, 4(sp)
+    .endif
+    .if \r3 != 0
+    lw              \r3, 8(sp)
+    .endif
+    .if \r4 != 0
+    lw              \r4, 12(sp)
+    .endif
+    .if \r5 != 0
+    CHECK_STACK_OFFSET 5, \stack_offset
+    lw              \r5, 16(sp)
+    .endif
+    .if \r6 != 0
+    CHECK_STACK_OFFSET 6, \stack_offset
+    lw              \r6, 20(sp)
+    .endif
+    .if \r7 != 0
+    CHECK_STACK_OFFSET 7, \stack_offset
+    lw              \r7, 24(sp)
+    .endif
+    .if \r8 != 0
+    CHECK_STACK_OFFSET 8, \stack_offset
+    lw              \r8, 28(sp)
+    .endif
+    .if \r9 != 0
+    CHECK_STACK_OFFSET 9, \stack_offset
+    lw              \r9, 32(sp)
+    .endif
+    .if \r10 != 0
+    CHECK_STACK_OFFSET 10, \stack_offset
+    lw              \r10, 36(sp)
+    .endif
+    .if \r11 != 0
+    CHECK_STACK_OFFSET 11, \stack_offset
+    lw              \r11, 40(sp)
+    .endif
+    .if \r12 != 0
+    CHECK_STACK_OFFSET 12, \stack_offset
+    lw              \r12, 44(sp)
+    .endif
+    .if \r13 != 0
+    CHECK_STACK_OFFSET 13, \stack_offset
+    lw              \r13, 48(sp)
+    .endif
+    .if \r14 != 0
+    CHECK_STACK_OFFSET 14, \stack_offset
+    lw              \r14, 52(sp)
+    .endif
+    .if \stack_offset != 0
+    addiu           sp, sp, \stack_offset
+    .endif
+.endm
+
+/*
+ * Conversion of single r5g6b5 pixel (in_565) to single a8r8g8b8 pixel
+ * returned in (out_8888) register. Requires two temporary registers
+ * (scratch1 and scratch2).
+ */
+.macro CONVERT_1x0565_TO_1x8888 in_565,   \
+                                out_8888, \
+                                scratch1, scratch2
+    lui     \out_8888, 0xff00
+    sll     \scratch1, \in_565,   0x3
+    andi    \scratch2, \scratch1, 0xff
+    ext     \scratch1, \in_565,   0x2, 0x3
+    or      \scratch1, \scratch2, \scratch1
+    or      \out_8888, \out_8888, \scratch1
+
+    sll     \scratch1, \in_565,   0x5
+    andi    \scratch1, \scratch1, 0xfc00
+    srl     \scratch2, \in_565,   0x1
+    andi    \scratch2, \scratch2, 0x300
+    or      \scratch2, \scratch1, \scratch2
+    or      \out_8888, \out_8888, \scratch2
+
+    andi    \scratch1, \in_565,   0xf800
+    srl     \scratch2, \scratch1, 0x5
+    andi    \scratch2, \scratch2, 0xff00
+    or      \scratch1, \scratch1, \scratch2
+    sll     \scratch1, \scratch1, 0x8
+    or      \out_8888, \out_8888, \scratch1
+.endm
+
+/*
+ * Conversion of two r5g6b5 pixels (in1_565 and in2_565) to two a8r8g8b8 pixels
+ * returned in (out1_8888 and out2_8888) registers. Requires four scratch
+ * registers (scratch1 ... scratch4). It also requires maskG and maskB for
+ * color component extractions. These masks must have following values:
+ *   li       maskG, 0x07e007e0
+ *   li       maskB, 0x001F001F
+ */
+.macro CONVERT_2x0565_TO_2x8888 in1_565, in2_565,     \
+                                out1_8888, out2_8888, \
+                                maskG, maskB,         \
+                                scratch1, scratch2, scratch3, scratch4
+    sll               \scratch1,  \in1_565,   16
+    or                \scratch1,  \scratch1,  \in2_565
+    lui               \out2_8888, 0xff00
+    ori               \out2_8888, \out2_8888, 0xff00
+    shrl.ph           \scratch2,  \scratch1,  11
+    and               \scratch3,  \scratch1,  \maskG
+    shra.ph           \scratch4,  \scratch2,  2
+    shll.ph           \scratch2,  \scratch2,  3
+    shll.ph           \scratch3,  \scratch3,  5
+    or                \scratch2,  \scratch2,  \scratch4
+    shrl.qb           \scratch4,  \scratch3,  6
+    or                \out2_8888, \out2_8888, \scratch2
+    or                \scratch3,  \scratch3,  \scratch4
+    and               \scratch1,  \scratch1,  \maskB
+    shll.ph           \scratch2,  \scratch1,  3
+    shra.ph           \scratch4,  \scratch1,  2
+    or                \scratch2,  \scratch2,  \scratch4
+    or                \scratch3,  \scratch2,  \scratch3
+    precrq.ph.w       \out1_8888, \out2_8888, \scratch3
+    precr_sra.ph.w    \out2_8888, \scratch3,  0
+.endm
+
+/*
+ * Conversion of single a8r8g8b8 pixel (in_8888) to single r5g6b5 pixel
+ * returned in (out_565) register. Requires two temporary registers
+ * (scratch1 and scratch2).
+ */
+.macro CONVERT_1x8888_TO_1x0565 in_8888, \
+                                out_565, \
+                                scratch1, scratch2
+    ext     \out_565,  \in_8888,  0x3, 0x5
+    srl     \scratch1, \in_8888,  0x5
+    andi    \scratch1, \scratch1, 0x07e0
+    srl     \scratch2, \in_8888,  0x8
+    andi    \scratch2, \scratch2, 0xf800
+    or      \out_565,  \out_565,  \scratch1
+    or      \out_565,  \out_565,  \scratch2
+.endm
+
+/*
+ * Conversion of two a8r8g8b8 pixels (in1_8888 and in2_8888) to two r5g6b5
+ * pixels returned in (out1_565 and out2_565) registers. Requires two temporary
+ * registers (scratch1 and scratch2). It also requires maskR, maskG and maskB
+ * for color component extractions. These masks must have following values:
+ *   li       maskR, 0xf800f800
+ *   li       maskG, 0x07e007e0
+ *   li       maskB, 0x001F001F
+ * Value of input register in2_8888 is lost.
+ */
+.macro CONVERT_2x8888_TO_2x0565 in1_8888, in2_8888,  \
+                                out1_565, out2_565,  \
+                                maskR, maskG, maskB, \
+                                scratch1, scratch2
+    precr.qb.ph    \scratch1, \in2_8888, \in1_8888
+    precrq.qb.ph   \in2_8888, \in2_8888, \in1_8888
+    and            \out1_565, \scratch1, \maskR
+    shrl.ph        \scratch1, \scratch1, 3
+    shll.ph        \in2_8888, \in2_8888, 3
+    and            \scratch1, \scratch1, \maskB
+    or             \out1_565, \out1_565, \scratch1
+    and            \in2_8888, \in2_8888, \maskG
+    or             \out1_565, \out1_565, \in2_8888
+    srl            \out2_565, \out1_565, 16
+.endm
+
+/*
+ * Multiply pixel (a8) with single pixel (a8r8g8b8). It requires maskLSR needed
+ * for rounding process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro MIPS_UN8x4_MUL_UN8 s_8888,  \
+                          m_8,     \
+                          d_8888,  \
+                          maskLSR, \
+                          scratch1, scratch2, scratch3
+    replv.ph          \m_8,      \m_8                 /*   0 | M | 0 | M */
+    muleu_s.ph.qbl    \scratch1, \s_8888,   \m_8      /*    A*M  |  R*M */
+    muleu_s.ph.qbr    \scratch2, \s_8888,   \m_8      /*    G*M  |  B*M */
+    shra_r.ph         \scratch3, \scratch1, 8
+    shra_r.ph         \d_8888,   \scratch2, 8
+    and               \scratch3, \scratch3, \maskLSR  /*   0 |A*M| 0 |R*M */
+    and               \d_8888,   \d_8888,   \maskLSR  /*   0 |G*M| 0 |B*M */
+    addq.ph           \scratch1, \scratch1, \scratch3 /* A*M+A*M | R*M+R*M */
+    addq.ph           \scratch2, \scratch2, \d_8888   /* G*M+G*M | B*M+B*M */
+    shra_r.ph         \scratch1, \scratch1, 8
+    shra_r.ph         \scratch2, \scratch2, 8
+    precr.qb.ph       \d_8888,   \scratch1, \scratch2
+.endm
+
+/*
+ * Multiply two pixels (a8) with two pixels (a8r8g8b8). It requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro MIPS_2xUN8x4_MUL_2xUN8 s1_8888, \
+                              s2_8888, \
+                              m1_8,    \
+                              m2_8,    \
+                              d1_8888, \
+                              d2_8888, \
+                              maskLSR, \
+                              scratch1, scratch2, scratch3, \
+                              scratch4, scratch5, scratch6
+    replv.ph          \m1_8,     \m1_8                /*  0 | M1 | 0 | M1 */
+    replv.ph          \m2_8,     \m2_8                /*  0 | M2 | 0 | M2 */
+    muleu_s.ph.qbl    \scratch1, \s1_8888,  \m1_8     /*  A1*M1  |  R1*M1 */
+    muleu_s.ph.qbr    \scratch2, \s1_8888,  \m1_8     /*  G1*M1  |  B1*M1 */
+    muleu_s.ph.qbl    \scratch3, \s2_8888,  \m2_8     /*  A2*M2  |  R2*M2 */
+    muleu_s.ph.qbr    \scratch4, \s2_8888,  \m2_8     /*  G2*M2  |  B2*M2 */
+    shra_r.ph         \scratch5, \scratch1, 8
+    shra_r.ph         \d1_8888,  \scratch2, 8
+    shra_r.ph         \scratch6, \scratch3, 8
+    shra_r.ph         \d2_8888,  \scratch4, 8
+    and               \scratch5, \scratch5, \maskLSR  /* 0 |A1*M1| 0 |R1*M1 */
+    and               \d1_8888,  \d1_8888,  \maskLSR  /* 0 |G1*M1| 0 |B1*M1 */
+    and               \scratch6, \scratch6, \maskLSR  /* 0 |A2*M2| 0 |R2*M2 */
+    and               \d2_8888,  \d2_8888,  \maskLSR  /* 0 |G2*M2| 0 |B2*M2 */
+    addq.ph           \scratch1, \scratch1, \scratch5
+    addq.ph           \scratch2, \scratch2, \d1_8888
+    addq.ph           \scratch3, \scratch3, \scratch6
+    addq.ph           \scratch4, \scratch4, \d2_8888
+    shra_r.ph         \scratch1, \scratch1, 8
+    shra_r.ph         \scratch2, \scratch2, 8
+    shra_r.ph         \scratch3, \scratch3, 8
+    shra_r.ph         \scratch4, \scratch4, 8
+    precr.qb.ph       \d1_8888,  \scratch1, \scratch2
+    precr.qb.ph       \d2_8888,  \scratch3, \scratch4
+.endm
+
+/*
+ * Multiply pixel (a8r8g8b8) with single pixel (a8r8g8b8). It requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro MIPS_UN8x4_MUL_UN8x4 s_8888,  \
+                            m_8888,  \
+                            d_8888,  \
+                            maskLSR, \
+                            scratch1, scratch2, scratch3, scratch4
+    preceu.ph.qbl     \scratch1, \m_8888              /*   0 | A | 0 | R */
+    preceu.ph.qbr     \scratch2, \m_8888              /*   0 | G | 0 | B */
+    muleu_s.ph.qbl    \scratch3, \s_8888,   \scratch1 /*    A*A  |  R*R */
+    muleu_s.ph.qbr    \scratch4, \s_8888,   \scratch2 /*    G*G  |  B*B */
+    shra_r.ph         \scratch1, \scratch3, 8
+    shra_r.ph         \scratch2, \scratch4, 8
+    and               \scratch1, \scratch1, \maskLSR  /*   0 |A*A| 0 |R*R */
+    and               \scratch2, \scratch2, \maskLSR  /*   0 |G*G| 0 |B*B */
+    addq.ph           \scratch1, \scratch1, \scratch3
+    addq.ph           \scratch2, \scratch2, \scratch4
+    shra_r.ph         \scratch1, \scratch1, 8
+    shra_r.ph         \scratch2, \scratch2, 8
+    precr.qb.ph       \d_8888,   \scratch1, \scratch2
+.endm
+
+/*
+ * Multiply two pixels (a8r8g8b8) with two pixels (a8r8g8b8). It requires
+ * maskLSR needed for rounding process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+
+.macro MIPS_2xUN8x4_MUL_2xUN8x4 s1_8888,  \
+                                s2_8888,  \
+                                m1_8888,  \
+                                m2_8888,  \
+                                d1_8888,  \
+                                d2_8888,  \
+                                maskLSR,  \
+                                scratch1, scratch2, scratch3, \
+                                scratch4, scratch5, scratch6
+    preceu.ph.qbl     \scratch1, \m1_8888             /*   0 | A | 0 | R */
+    preceu.ph.qbr     \scratch2, \m1_8888             /*   0 | G | 0 | B */
+    preceu.ph.qbl     \scratch3, \m2_8888             /*   0 | A | 0 | R */
+    preceu.ph.qbr     \scratch4, \m2_8888             /*   0 | G | 0 | B */
+    muleu_s.ph.qbl    \scratch5, \s1_8888,  \scratch1 /*    A*A  |  R*R */
+    muleu_s.ph.qbr    \scratch6, \s1_8888,  \scratch2 /*    G*G  |  B*B */
+    muleu_s.ph.qbl    \scratch1, \s2_8888,  \scratch3 /*    A*A  |  R*R */
+    muleu_s.ph.qbr    \scratch2, \s2_8888,  \scratch4 /*    G*G  |  B*B */
+    shra_r.ph         \scratch3, \scratch5, 8
+    shra_r.ph         \scratch4, \scratch6, 8
+    shra_r.ph         \d1_8888,  \scratch1, 8
+    shra_r.ph         \d2_8888,  \scratch2, 8
+    and               \scratch3, \scratch3, \maskLSR  /*   0 |A*A| 0 |R*R */
+    and               \scratch4, \scratch4, \maskLSR  /*   0 |G*G| 0 |B*B */
+    and               \d1_8888,  \d1_8888,  \maskLSR  /*   0 |A*A| 0 |R*R */
+    and               \d2_8888,  \d2_8888,  \maskLSR  /*   0 |G*G| 0 |B*B */
+    addq.ph           \scratch3, \scratch3, \scratch5
+    addq.ph           \scratch4, \scratch4, \scratch6
+    addq.ph           \d1_8888,  \d1_8888,  \scratch1
+    addq.ph           \d2_8888,  \d2_8888,  \scratch2
+    shra_r.ph         \scratch3, \scratch3, 8
+    shra_r.ph         \scratch4, \scratch4, 8
+    shra_r.ph         \scratch5, \d1_8888,  8
+    shra_r.ph         \scratch6, \d2_8888,  8
+    precr.qb.ph       \d1_8888,  \scratch3, \scratch4
+    precr.qb.ph       \d2_8888,  \scratch5, \scratch6
+.endm
+
+/*
+ * OVER operation on single a8r8g8b8 source pixel (s_8888) and single a8r8g8b8
+ * destination pixel (d_8888) using a8 mask (m_8). It also requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro OVER_8888_8_8888 s_8888,   \
+                        m_8,      \
+                        d_8888,   \
+                        out_8888, \
+                        maskLSR,  \
+                        scratch1, scratch2, scratch3, scratch4
+    MIPS_UN8x4_MUL_UN8 \s_8888,   \m_8, \
+                       \scratch1, \maskLSR, \
+                       \scratch2, \scratch3, \scratch4
+
+    not                \scratch2, \scratch1
+    srl                \scratch2, \scratch2, 24
+
+    MIPS_UN8x4_MUL_UN8 \d_8888,   \scratch2, \
+                       \d_8888,   \maskLSR,  \
+                       \scratch3, \scratch4, \out_8888
+
+    addu_s.qb          \out_8888, \d_8888,   \scratch1
+.endm
+
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888) using a8 masks (m1_8 and
+ * m2_8). It also requires maskLSR needed for rounding process. maskLSR must
+ * have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro OVER_2x8888_2x8_2x8888 s1_8888,   \
+                              s2_8888,   \
+                              m1_8,      \
+                              m2_8,      \
+                              d1_8888,   \
+                              d2_8888,   \
+                              out1_8888, \
+                              out2_8888, \
+                              maskLSR,   \
+                              scratch1, scratch2, scratch3, \
+                              scratch4, scratch5, scratch6
+    MIPS_2xUN8x4_MUL_2xUN8 \s1_8888,   \s2_8888, \
+                           \m1_8,      \m2_8, \
+                           \scratch1,  \scratch2, \
+                           \maskLSR, \
+                           \scratch3,  \scratch4, \out1_8888, \
+                           \out2_8888, \scratch5, \scratch6
+
+    not                    \scratch3,  \scratch1
+    srl                    \scratch3,  \scratch3, 24
+    not                    \scratch4,  \scratch2
+    srl                    \scratch4,  \scratch4, 24
+
+    MIPS_2xUN8x4_MUL_2xUN8 \d1_8888,   \d2_8888, \
+                           \scratch3,  \scratch4, \
+                           \d1_8888,   \d2_8888, \
+                           \maskLSR, \
+                           \scratch5,  \scratch6, \out1_8888, \
+                           \out2_8888, \scratch3, \scratch4
+
+    addu_s.qb              \out1_8888, \d1_8888,  \scratch1
+    addu_s.qb              \out2_8888, \d2_8888,  \scratch2
+.endm
+
+/*
+ * OVER operation on single a8r8g8b8 source pixel (s_8888) and single a8r8g8b8
+ * destination pixel (d_8888). It also requires maskLSR needed for rounding
+ * process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro OVER_8888_8888 s_8888,   \
+                      d_8888,   \
+                      out_8888, \
+                      maskLSR,  \
+                      scratch1, scratch2, scratch3, scratch4
+    not                \scratch1, \s_8888
+    srl                \scratch1, \scratch1, 24
+
+    MIPS_UN8x4_MUL_UN8 \d_8888,   \scratch1, \
+                       \out_8888, \maskLSR, \
+                       \scratch2, \scratch3, \scratch4
+
+    addu_s.qb          \out_8888, \out_8888, \s_8888
+.endm
+
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888). It also requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ *   li       maskLSR, 0x00ff00ff
+ */
+.macro OVER_2x8888_2x8888 s1_8888,   \
+                          s2_8888,   \
+                          d1_8888,   \
+                          d2_8888,   \
+                          out1_8888, \
+                          out2_8888, \
+                          maskLSR,   \
+                          scratch1, scratch2, scratch3, \
+                          scratch4, scratch5, scratch6
+    not                    \scratch1,  \s1_8888
+    srl                    \scratch1,  \scratch1,  24
+    not                    \scratch2,  \s2_8888
+    srl                    \scratch2,  \scratch2,  24
+    MIPS_2xUN8x4_MUL_2xUN8 \d1_8888,   \d2_8888, \
+                           \scratch1,  \scratch2,  \
+                           \out1_8888, \out2_8888, \
+                           \maskLSR, \
+                           \scratch3,  \scratch4, \scratch5, \
+                           \scratch6,  \d1_8888,  \d2_8888
+
+    addu_s.qb              \out1_8888, \out1_8888, \s1_8888
+    addu_s.qb              \out2_8888, \out2_8888, \s2_8888
+.endm
+
+.macro MIPS_UN8x4_MUL_UN8_ADD_UN8x4 s_8888,   \
+                                    m_8,      \
+                                    d_8888,   \
+                                    out_8888, \
+                                    maskLSR,  \
+                                    scratch1, scratch2, scratch3
+    MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \
+                       \out_8888, \maskLSR, \
+                       \scratch1, \scratch2, \scratch3
+
+    addu_s.qb          \out_8888, \out_8888, \d_8888
+.endm
+
+.macro MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 s1_8888,   \
+                             s2_8888,   \
+                             m1_8,      \
+                             m2_8,      \
+                             d1_8888,   \
+                             d2_8888,   \
+                             out1_8888, \
+                             out2_8888, \
+                             maskLSR,   \
+                             scratch1,  scratch2, scratch3, \
+                             scratch4, scratch5, scratch6
+    MIPS_2xUN8x4_MUL_2xUN8 \s1_8888,   \s2_8888, \
+                           \m1_8,      \m2_8, \
+                           \out1_8888, \out2_8888, \
+                           \maskLSR, \
+                           \scratch1,  \scratch2, \scratch3, \
+                           \scratch4,  \scratch5, \scratch6
+
+    addu_s.qb             \out1_8888, \out1_8888, \d1_8888
+    addu_s.qb             \out2_8888, \out2_8888, \d2_8888
+.endm
+
+.macro BILINEAR_INTERPOLATE_SINGLE_PIXEL tl, tr, bl, br,         \
+                                         scratch1, scratch2,     \
+                                         alpha, red, green, blue \
+                                         wt1, wt2, wb1, wb2
+    andi            \scratch1, \tl,  0xff
+    andi            \scratch2, \tr,  0xff
+    andi            \alpha,    \bl,  0xff
+    andi            \red,      \br,  0xff
+
+    multu           $ac0,      \wt1, \scratch1
+    maddu           $ac0,      \wt2, \scratch2
+    maddu           $ac0,      \wb1, \alpha
+    maddu           $ac0,      \wb2, \red
+
+    ext             \scratch1, \tl,  8, 8
+    ext             \scratch2, \tr,  8, 8
+    ext             \alpha,    \bl,  8, 8
+    ext             \red,      \br,  8, 8
+
+    multu           $ac1,      \wt1, \scratch1
+    maddu           $ac1,      \wt2, \scratch2
+    maddu           $ac1,      \wb1, \alpha
+    maddu           $ac1,      \wb2, \red
+
+    ext             \scratch1, \tl,  16, 8
+    ext             \scratch2, \tr,  16, 8
+    ext             \alpha,    \bl,  16, 8
+    ext             \red,      \br,  16, 8
+
+    mflo            \blue,     $ac0
+
+    multu           $ac2,      \wt1, \scratch1
+    maddu           $ac2,      \wt2, \scratch2
+    maddu           $ac2,      \wb1, \alpha
+    maddu           $ac2,      \wb2, \red
+
+    ext             \scratch1, \tl,  24, 8
+    ext             \scratch2, \tr,  24, 8
+    ext             \alpha,    \bl,  24, 8
+    ext             \red,      \br,  24, 8
+
+    mflo            \green,    $ac1
+
+    multu           $ac3,      \wt1, \scratch1
+    maddu           $ac3,      \wt2, \scratch2
+    maddu           $ac3,      \wb1, \alpha
+    maddu           $ac3,      \wb2, \red
+
+    mflo            \red,      $ac2
+    mflo            \alpha,    $ac3
+
+    precr.qb.ph     \alpha,    \alpha, \red
+    precr.qb.ph     \scratch1, \green, \blue
+    precrq.qb.ph    \tl,       \alpha, \scratch1
+.endm
+
+#endif //PIXMAN_MIPS_DSPR2_ASM_H
diff --git a/src/pixman/pixman/pixman-mips-dspr2.c b/src/pixman/pixman/pixman-mips-dspr2.c
new file mode 100644
index 0000000..e10c9df
--- /dev/null
+++ b/src/pixman/pixman/pixman-mips-dspr2.c
@@ -0,0 +1,459 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nemanja Lukic (nlukic@mips.com)
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+#include "pixman-mips-dspr2.h"
+
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_x888_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_8888_0565,
+                                    uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0565_8888,
+                                    uint16_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0565_0565,
+                                    uint16_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_8888_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (DO_FAST_MEMCPY, src_0888_0888,
+                                    uint8_t, 3, uint8_t, 3)
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_8888_rev,
+                                    uint8_t, 3, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_0888_0565_rev,
+                                    uint8_t, 3, uint16_t, 1)
+#endif
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_pixbuf_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, src_rpixbuf_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, over_8888_0565,
+                                    uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8_8,
+                                    uint8_t, 1, uint8_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, add_8888_8888,
+                                    uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_0565,
+                                    uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST (0, out_reverse_8_8888,
+                                    uint8_t,  1, uint32_t, 1)
+
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8888,
+                                       uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (0, src_n_8_8,
+                                       uint8_t, 1, uint8_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
+                                       uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
+                                       uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8,
+                                       uint8_t, 1, uint8_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
+                                       uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
+                                       uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, add_n_8_8,
+                                       uint8_t, 1, uint8_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, add_n_8_8888,
+                                       uint8_t, 1, uint32_t, 1)
+
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_8888,
+                                      uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_8888_n_0565,
+                                      uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, over_0565_n_0565,
+                                      uint16_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, add_8888_n_8888,
+                                      uint32_t, 1, uint32_t, 1)
+
+PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_0565,
+                                  uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_n_8888,
+                                  uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_DST (SKIP_ZERO_SRC, over_reverse_n_8888,
+                                  uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_DST (0, in_n_8,
+                                  uint8_t, 1)
+
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8_8_8, uint8_t,  1,
+                                         uint8_t,  1, uint8_t,  1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8_8888, uint32_t, 1,
+                                         uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_8888_8888_8888, uint32_t, 1,
+                                         uint32_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (add_0565_8_0565, uint16_t, 1,
+                                         uint8_t,  1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_8888, uint32_t, 1,
+                                         uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8_0565, uint32_t, 1,
+                                         uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_0565_8_0565, uint16_t, 1,
+                                         uint8_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST (over_8888_8888_8888, uint32_t, 1,
+                                         uint32_t, 1, uint32_t, 1)
+
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_8888, OVER,
+                                         uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (8888_0565, OVER,
+                                         uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST (0565_8888, SRC,
+                                         uint16_t, uint32_t)
+
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_8888, SRC,
+                                          uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 8888_0565, SRC,
+                                          uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 0565_8888, SRC,
+                                          uint16_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (0, 0565_0565, SRC,
+                                          uint16_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, 8888_8888, OVER,
+                                          uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, 8888_8888, ADD,
+                                          uint32_t, uint32_t)
+
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_0565,
+                                            OVER, uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST (SKIP_ZERO_SRC, 0565_8_0565,
+                                            OVER, uint16_t, uint16_t)
+
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_8888, SRC,
+                                             uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 8888_8_0565, SRC,
+                                             uint32_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_x888, SRC,
+                                             uint16_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (0, 0565_8_0565, SRC,
+                                             uint16_t, uint16_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, OVER,
+                                             uint32_t, uint32_t)
+PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, 8888_8_8888, ADD,
+                                             uint32_t, uint32_t)
+
+static pixman_bool_t
+mips_dspr2_fill (pixman_implementation_t *imp,
+                 uint32_t *               bits,
+                 int                      stride,
+                 int                      bpp,
+                 int                      x,
+                 int                      y,
+                 int                      width,
+                 int                      height,
+                 uint32_t                 _xor)
+{
+    uint8_t *byte_line;
+    uint32_t byte_width;
+    switch (bpp)
+    {
+    case 16:
+        stride = stride * (int) sizeof (uint32_t) / 2;
+        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+        byte_width = width * 2;
+        stride *= 2;
+
+        while (height--)
+        {
+            uint8_t *dst = byte_line;
+            byte_line += stride;
+            pixman_fill_buff16_mips (dst, byte_width, _xor & 0xffff);
+        }
+        return TRUE;
+    case 32:
+        stride = stride * (int) sizeof (uint32_t) / 4;
+        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+        byte_width = width * 4;
+        stride *= 4;
+
+        while (height--)
+        {
+            uint8_t *dst = byte_line;
+            byte_line += stride;
+            pixman_fill_buff32_mips (dst, byte_width, _xor);
+        }
+        return TRUE;
+    default:
+        return FALSE;
+    }
+}
+
+static pixman_bool_t
+mips_dspr2_blt (pixman_implementation_t *imp,
+                uint32_t *               src_bits,
+                uint32_t *               dst_bits,
+                int                      src_stride,
+                int                      dst_stride,
+                int                      src_bpp,
+                int                      dst_bpp,
+                int                      src_x,
+                int                      src_y,
+                int                      dest_x,
+                int                      dest_y,
+                int                      width,
+                int                      height)
+{
+    if (src_bpp != dst_bpp)
+        return FALSE;
+
+    uint8_t *src_bytes;
+    uint8_t *dst_bytes;
+    uint32_t byte_width;
+
+    switch (src_bpp)
+    {
+    case 16:
+        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+        src_bytes =(uint8_t *)(((uint16_t *)src_bits)
+                                          + src_stride * (src_y) + (src_x));
+        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits)
+                                           + dst_stride * (dest_y) + (dest_x));
+        byte_width = width * 2;
+        src_stride *= 2;
+        dst_stride *= 2;
+
+        while (height--)
+        {
+            uint8_t *src = src_bytes;
+            uint8_t *dst = dst_bytes;
+            src_bytes += src_stride;
+            dst_bytes += dst_stride;
+            pixman_mips_fast_memcpy (dst, src, byte_width);
+        }
+        return TRUE;
+    case 32:
+        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+        src_bytes = (uint8_t *)(((uint32_t *)src_bits)
+                                           + src_stride * (src_y) + (src_x));
+        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits)
+                                           + dst_stride * (dest_y) + (dest_x));
+        byte_width = width * 4;
+        src_stride *= 4;
+        dst_stride *= 4;
+
+        while (height--)
+        {
+            uint8_t *src = src_bytes;
+            uint8_t *dst = dst_bytes;
+            src_bytes += src_stride;
+            dst_bytes += dst_stride;
+            pixman_mips_fast_memcpy (dst, src, byte_width);
+        }
+        return TRUE;
+    default:
+        return FALSE;
+    }
+}
+
+static const pixman_fast_path_t mips_dspr2_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5,   null, r5g6b5,   mips_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5,   null, b5g6r5,   mips_composite_src_0565_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5,   mips_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5,   mips_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5,   mips_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5,   mips_composite_src_8888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5,   null, a8r8g8b8, mips_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5,   null, x8r8g8b8, mips_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5,   null, a8b8g8r8, mips_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5,   null, x8b8g8r8, mips_composite_src_0565_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mips_composite_src_8888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, mips_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, mips_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, r8g8b8,   null, r8g8b8,   mips_composite_src_0888_0888),
+#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, x8r8g8b8, mips_composite_src_0888_8888_rev),
+    PIXMAN_STD_FAST_PATH (SRC, b8g8r8,   null, r5g6b5,   mips_composite_src_0888_0565_rev),
+#endif
+    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8r8g8b8, mips_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, pixbuf,   pixbuf,  a8b8g8r8, mips_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8r8g8b8, mips_composite_src_rpixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, rpixbuf,  rpixbuf, a8b8g8r8, mips_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8r8g8b8, mips_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8r8g8b8, mips_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8b8g8r8, mips_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   x8b8g8r8, mips_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid,    a8,   a8,       mips_composite_src_n_8_8),
+
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mips_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mips_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mips_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5,   mips_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5,   mips_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       mips_composite_over_n_8_8),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8r8g8b8, mips_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8r8g8b8, mips_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8b8g8r8, mips_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       x8b8g8r8, mips_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   mips_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   mips_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     r5g6b5,   mips_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     a8r8g8b8, mips_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null,     x8r8g8b8, mips_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    a8r8g8b8, mips_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    x8r8g8b8, mips_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid,    r5g6b5,   mips_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid,    b5g6r5,   mips_composite_over_8888_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   solid,    r5g6b5,   mips_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   solid,    b5g6r5,   mips_composite_over_0565_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       a8r8g8b8, mips_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       x8r8g8b8, mips_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       a8b8g8r8, mips_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       x8b8g8r8, mips_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8,       r5g6b5,   mips_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8,       b5g6r5,   mips_composite_over_8888_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, r5g6b5,   a8,       r5g6b5,   mips_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, b5g6r5,   a8,       b5g6r5,   mips_composite_over_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, mips_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     a8r8g8b8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     x8r8g8b8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     a8b8g8r8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     x8b8g8r8, mips_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null,     r5g6b5,   mips_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null,     b5g6r5,   mips_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8,       mips_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8r8g8b8, mips_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  solid,    a8,       a8b8g8r8, mips_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       a8,       a8,       mips_composite_add_8_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  r5g6b5,   a8,       r5g6b5,   mips_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  b5g6r5,   a8,       b5g6r5,   mips_composite_add_0565_8_0565),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8,       a8r8g8b8, mips_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, a8,       a8b8g8r8, mips_composite_add_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, a8r8g8b8, a8r8g8b8, mips_composite_add_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, solid,    a8r8g8b8, mips_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, solid,    a8b8g8r8, mips_composite_add_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8,       null,     a8,       mips_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD,  a8r8g8b8, null,     a8r8g8b8, mips_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD,  a8b8g8r8, null,     a8b8g8r8, mips_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, r5g6b5,   mips_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, b5g6r5,   mips_composite_out_reverse_8_0565),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, a8r8g8b8, mips_composite_out_reverse_8_8888),
+    PIXMAN_STD_FAST_PATH (OUT_REVERSE, a8,    null, a8b8g8r8, mips_composite_out_reverse_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mips_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mips_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (IN,           solid, null, a8,       mips_composite_in_n_8),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mips_8888_8888),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_0565),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_0565),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, b5g6r5, x8b8g8r8, mips_0565_8888),
+    PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8888),
+    /* Note: NONE repeat is not supported yet */
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, r5g6b5, a8r8g8b8, mips_0565_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (SRC, b5g6r5, a8b8g8r8, mips_0565_8888),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8r8g8b8, r5g6b5, mips_8888_8_0565),
+    PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, a8b8g8r8, b5g6r5, mips_8888_8_0565),
+
+    PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, r5g6b5, r5g6b5, mips_0565_8_0565),
+    PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH (OVER, b5g6r5, b5g6r5, mips_0565_8_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, r5g6b5, mips_8888_0565),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, r5g6b5, mips_8888_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, mips_0565_0565),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, mips_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, mips_8888_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, mips_8888_8_0565),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, mips_8888_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, mips_0565_8_x888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, mips_0565_8_0565),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, mips_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, mips_8888_8_8888),
+    { PIXMAN_OP_NONE },
+};
+
+static void
+mips_dspr2_combine_over_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    if (mask)
+        pixman_composite_over_8888_8888_8888_asm_mips (
+            dest, (uint32_t *)src, (uint32_t *)mask, width);
+    else
+        pixman_composite_over_8888_8888_asm_mips (
+		    dest, (uint32_t *)src, width);
+}
+
+pixman_implementation_t *
+_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+        _pixman_implementation_create (fallback, mips_dspr2_fast_paths);
+
+    imp->combine_32[PIXMAN_OP_OVER] = mips_dspr2_combine_over_u;
+
+    imp->blt = mips_dspr2_blt;
+    imp->fill = mips_dspr2_fill;
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-mips-dspr2.h b/src/pixman/pixman/pixman-mips-dspr2.h
new file mode 100644
index 0000000..955ed70
--- /dev/null
+++ b/src/pixman/pixman/pixman-mips-dspr2.h
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Author:  Nemanja Lukic (nlukic@mips.com)
+ */
+
+#ifndef PIXMAN_MIPS_DSPR2_H
+#define PIXMAN_MIPS_DSPR2_H
+
+#include "pixman-private.h"
+#include "pixman-inlines.h"
+
+#define SKIP_ZERO_SRC  1
+#define SKIP_ZERO_MASK 2
+#define DO_FAST_MEMCPY 3
+
+void
+pixman_mips_fast_memcpy (void *dst, void *src, uint32_t n_bytes);
+void
+pixman_fill_buff16_mips (void *dst, uint32_t n_bytes, uint16_t value);
+void
+pixman_fill_buff32_mips (void *dst, uint32_t n_bytes, uint32_t value);
+
+/****************************************************************/
+
+#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_DST(flags, name,          \
+                                           src_type, src_cnt,    \
+                                           dst_type, dst_cnt)    \
+void                                                             \
+pixman_composite_##name##_asm_mips (dst_type *dst,               \
+                                    src_type *src,               \
+                                    int32_t   w);                \
+                                                                 \
+static void                                                      \
+mips_composite_##name (pixman_implementation_t *imp,             \
+                       pixman_composite_info_t *info)            \
+{                                                                \
+    PIXMAN_COMPOSITE_ARGS (info);                                \
+    dst_type *dst_line, *dst;                                    \
+    src_type *src_line, *src;                                    \
+    int32_t dst_stride, src_stride;                              \
+    int bpp = PIXMAN_FORMAT_BPP (dest_image->bits.format) / 8;   \
+                                                                 \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,    \
+                           src_stride, src_line, src_cnt);       \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \
+                           dst_stride, dst_line, dst_cnt);       \
+                                                                 \
+    while (height--)                                             \
+    {                                                            \
+      dst = dst_line;                                            \
+      dst_line += dst_stride;                                    \
+      src = src_line;                                            \
+      src_line += src_stride;                                    \
+                                                                 \
+      if (flags == DO_FAST_MEMCPY)                               \
+        pixman_mips_fast_memcpy (dst, src, width * bpp);         \
+      else                                                       \
+        pixman_composite_##name##_asm_mips (dst, src, width);    \
+    }                                                            \
+}
+
+/****************************************************************/
+
+#define PIXMAN_MIPS_BIND_FAST_PATH_N_DST(flags, name,            \
+                                         dst_type, dst_cnt)      \
+void                                                             \
+pixman_composite_##name##_asm_mips (dst_type *dst,               \
+                                    uint32_t  src,               \
+                                    int32_t   w);                \
+                                                                 \
+static void                                                      \
+mips_composite_##name (pixman_implementation_t *imp,             \
+                       pixman_composite_info_t *info)            \
+{                                                                \
+    PIXMAN_COMPOSITE_ARGS (info);                                \
+    dst_type  *dst_line, *dst;                                   \
+    int32_t    dst_stride;                                       \
+    uint32_t   src;                                              \
+                                                                 \
+    src = _pixman_image_get_solid (                              \
+    imp, src_image, dest_image->bits.format);                    \
+                                                                 \
+    if ((flags & SKIP_ZERO_SRC) && src == 0)                     \
+        return;                                                  \
+                                                                 \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type, \
+                           dst_stride, dst_line, dst_cnt);       \
+                                                                 \
+    while (height--)                                             \
+    {                                                            \
+        dst = dst_line;                                          \
+        dst_line += dst_stride;                                  \
+                                                                 \
+        pixman_composite_##name##_asm_mips (dst, src, width);    \
+    }                                                            \
+}
+
+/*******************************************************************/
+
+#define PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST(flags, name,          \
+                                              mask_type, mask_cnt,  \
+                                              dst_type, dst_cnt)    \
+void                                                                \
+pixman_composite_##name##_asm_mips (dst_type  *dst,                 \
+                                    uint32_t  src,                  \
+                                    mask_type *mask,                \
+                                    int32_t   w);                   \
+                                                                    \
+static void                                                         \
+mips_composite_##name (pixman_implementation_t *imp,                \
+                       pixman_composite_info_t *info)               \
+{                                                                   \
+    PIXMAN_COMPOSITE_ARGS (info);                                   \
+    dst_type  *dst_line, *dst;                                      \
+    mask_type *mask_line, *mask;                                    \
+    int32_t    dst_stride, mask_stride;                             \
+    uint32_t   src;                                                 \
+                                                                    \
+    src = _pixman_image_get_solid (                                 \
+        imp, src_image, dest_image->bits.format);                   \
+                                                                    \
+    if ((flags & SKIP_ZERO_SRC) && src == 0)                        \
+        return;                                                     \
+                                                                    \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,    \
+                           dst_stride, dst_line, dst_cnt);          \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,   \
+                           mask_stride, mask_line, mask_cnt);       \
+                                                                    \
+    while (height--)                                                \
+    {                                                               \
+        dst = dst_line;                                             \
+        dst_line += dst_stride;                                     \
+        mask = mask_line;                                           \
+        mask_line += mask_stride;                                   \
+        pixman_composite_##name##_asm_mips (dst, src, mask, width); \
+    }                                                               \
+}
+
+/*******************************************************************/
+
+#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_N_DST(flags, name,           \
+                                            src_type, src_cnt,      \
+                                            dst_type, dst_cnt)      \
+void                                                                \
+pixman_composite_##name##_asm_mips (dst_type  *dst,                 \
+                                    src_type  *src,                 \
+                                    uint32_t   mask,                \
+                                    int32_t    w);                  \
+                                                                    \
+static void                                                         \
+mips_composite_##name (pixman_implementation_t *imp,                \
+                       pixman_composite_info_t *info)               \
+{                                                                   \
+    PIXMAN_COMPOSITE_ARGS (info);                                   \
+    dst_type  *dst_line, *dst;                                      \
+    src_type  *src_line, *src;                                      \
+    int32_t    dst_stride, src_stride;                              \
+    uint32_t   mask;                                                \
+                                                                    \
+    mask = _pixman_image_get_solid (                                \
+        imp, mask_image, dest_image->bits.format);                  \
+                                                                    \
+    if ((flags & SKIP_ZERO_MASK) && mask == 0)                      \
+        return;                                                     \
+                                                                    \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,    \
+                           dst_stride, dst_line, dst_cnt);          \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,       \
+                           src_stride, src_line, src_cnt);          \
+                                                                    \
+    while (height--)                                                \
+    {                                                               \
+        dst = dst_line;                                             \
+        dst_line += dst_stride;                                     \
+        src = src_line;                                             \
+        src_line += src_stride;                                     \
+                                                                    \
+        pixman_composite_##name##_asm_mips (dst, src, mask, width); \
+    }                                                               \
+}
+
+/************************************************************************/
+
+#define PIXMAN_MIPS_BIND_FAST_PATH_SRC_MASK_DST(name, src_type, src_cnt, \
+                                                mask_type, mask_cnt,     \
+                                                dst_type, dst_cnt)       \
+void                                                                     \
+pixman_composite_##name##_asm_mips (dst_type  *dst,                      \
+                                    src_type  *src,                      \
+                                    mask_type *mask,                     \
+                                    int32_t   w);                        \
+                                                                         \
+static void                                                              \
+mips_composite_##name (pixman_implementation_t *imp,                     \
+                       pixman_composite_info_t *info)                    \
+{                                                                        \
+    PIXMAN_COMPOSITE_ARGS (info);                                        \
+    dst_type  *dst_line, *dst;                                           \
+    src_type  *src_line, *src;                                           \
+    mask_type *mask_line, *mask;                                         \
+    int32_t    dst_stride, src_stride, mask_stride;                      \
+                                                                         \
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type,         \
+                           dst_stride, dst_line, dst_cnt);               \
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, src_type,            \
+                           src_stride, src_line, src_cnt);               \
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type,        \
+                           mask_stride, mask_line, mask_cnt);            \
+                                                                         \
+    while (height--)                                                     \
+    {                                                                    \
+        dst = dst_line;                                                  \
+        dst_line += dst_stride;                                          \
+        mask = mask_line;                                                \
+        mask_line += mask_stride;                                        \
+        src = src_line;                                                  \
+        src_line += src_stride;                                          \
+        pixman_composite_##name##_asm_mips (dst, src, mask, width);      \
+    }                                                                    \
+}
+
+/****************************************************************************/
+
+#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_DST(name, op,                    \
+                                                src_type, dst_type)          \
+void                                                                         \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (                    \
+                                                   dst_type *       dst,     \
+                                                   const src_type * src,     \
+                                                   int32_t          w,       \
+                                                   pixman_fixed_t   vx,      \
+                                                   pixman_fixed_t   unit_x); \
+                                                                             \
+static force_inline void                                                     \
+scaled_nearest_scanline_mips_##name##_##op (dst_type *       pd,             \
+                                            const src_type * ps,             \
+                                            int32_t          w,              \
+                                            pixman_fixed_t   vx,             \
+                                            pixman_fixed_t   unit_x,         \
+                                            pixman_fixed_t   max_vx,         \
+                                            pixman_bool_t    zero_src)       \
+{                                                                            \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps, w,      \
+                                                             vx, unit_x);    \
+}                                                                            \
+                                                                             \
+FAST_NEAREST_MAINLOOP (mips_##name##_cover_##op,                             \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, COVER)                            \
+FAST_NEAREST_MAINLOOP (mips_##name##_none_##op,                              \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, NONE)                             \
+FAST_NEAREST_MAINLOOP (mips_##name##_pad_##op,                               \
+                       scaled_nearest_scanline_mips_##name##_##op,           \
+                       src_type, dst_type, PAD)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_MIPS_SIMPLE_NEAREST_FAST_PATH(op,s,d,func)                    \
+    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),                            \
+    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),                             \
+    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func)
+
+
+/*****************************************************************************/
+
+#define PIXMAN_MIPS_BIND_SCALED_NEAREST_SRC_A8_DST(flags, name, op,           \
+                                                  src_type, dst_type)         \
+void                                                                          \
+pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (                     \
+                                                   dst_type *       dst,      \
+                                                   const src_type * src,      \
+                                                   const uint8_t *  mask,     \
+                                                   int32_t          w,        \
+                                                   pixman_fixed_t   vx,       \
+                                                   pixman_fixed_t   unit_x);  \
+                                                                              \
+static force_inline void                                                      \
+scaled_nearest_scanline_mips_##name##_##op (const uint8_t *  mask,            \
+                                            dst_type *       pd,              \
+                                            const src_type * ps,              \
+                                            int32_t          w,               \
+                                            pixman_fixed_t   vx,              \
+                                            pixman_fixed_t   unit_x,          \
+                                            pixman_fixed_t   max_vx,          \
+                                            pixman_bool_t    zero_src)        \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+        return;                                                               \
+    pixman_scaled_nearest_scanline_##name##_##op##_asm_mips (pd, ps,          \
+                                                             mask, w,         \
+                                                             vx, unit_x);     \
+}                                                                             \
+                                                                              \
+FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_cover_##op,                       \
+                              scaled_nearest_scanline_mips_##name##_##op,     \
+                              src_type, uint8_t, dst_type, COVER, TRUE, FALSE)\
+FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_none_##op,                        \
+                              scaled_nearest_scanline_mips_##name##_##op,     \
+                              src_type, uint8_t, dst_type, NONE, TRUE, FALSE) \
+FAST_NEAREST_MAINLOOP_COMMON (mips_##name##_pad_##op,                         \
+                              scaled_nearest_scanline_mips_##name##_##op,     \
+                              src_type, uint8_t, dst_type, PAD, TRUE, FALSE)
+
+/* Provide entries for the fast path table */
+#define PIXMAN_MIPS_SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)             \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),                     \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),                      \
+    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
+
+/****************************************************************************/
+
+#define PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_DST(flags, name, op,            \
+                                                 src_type, dst_type)         \
+void                                                                         \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips(                    \
+                                             dst_type *       dst,           \
+                                             const src_type * src_top,       \
+                                             const src_type * src_bottom,    \
+                                             int32_t          w,             \
+                                             int              wt,            \
+                                             int              wb,            \
+                                             pixman_fixed_t   vx,            \
+                                             pixman_fixed_t   unit_x);       \
+static force_inline void                                                     \
+scaled_bilinear_scanline_mips_##name##_##op (dst_type *       dst,           \
+                                             const uint32_t * mask,          \
+                                             const src_type * src_top,       \
+                                             const src_type * src_bottom,    \
+                                             int32_t          w,             \
+                                             int              wt,            \
+                                             int              wb,            \
+                                             pixman_fixed_t   vx,            \
+                                             pixman_fixed_t   unit_x,        \
+                                             pixman_fixed_t   max_vx,        \
+                                             pixman_bool_t    zero_src)      \
+{                                                                            \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                 \
+        return;                                                              \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips (dst, src_top,  \
+                                                              src_bottom, w, \
+                                                              wt, wb,        \
+                                                              vx, unit_x);   \
+}                                                                            \
+                                                                             \
+FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_cover_##op,                     \
+                       scaled_bilinear_scanline_mips_##name##_##op,          \
+                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)       \
+FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_none_##op,                      \
+                       scaled_bilinear_scanline_mips_##name##_##op,          \
+                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)        \
+FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_pad_##op,                       \
+                       scaled_bilinear_scanline_mips_##name##_##op,          \
+                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)         \
+FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_normal_##op,                    \
+                       scaled_bilinear_scanline_mips_##name##_##op,          \
+                       src_type, uint32_t, dst_type, NORMAL,                 \
+                       FLAG_NONE)
+
+/*****************************************************************************/
+
+#define PIXMAN_MIPS_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, name, op,          \
+                                                src_type, dst_type)           \
+void                                                                          \
+pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips (                    \
+                                             dst_type *       dst,            \
+                                             const uint8_t *  mask,           \
+                                             const src_type * top,            \
+                                             const src_type * bottom,         \
+                                             int              wt,             \
+                                             int              wb,             \
+                                             pixman_fixed_t   x,              \
+                                             pixman_fixed_t   ux,             \
+                                             int              width);         \
+                                                                              \
+static force_inline void                                                      \
+scaled_bilinear_scanline_mips_##name##_##op (dst_type *       dst,            \
+                                             const uint8_t *  mask,           \
+                                             const src_type * src_top,        \
+                                             const src_type * src_bottom,     \
+                                             int32_t          w,              \
+                                             int              wt,             \
+                                             int              wb,             \
+                                             pixman_fixed_t   vx,             \
+                                             pixman_fixed_t   unit_x,         \
+                                             pixman_fixed_t   max_vx,         \
+                                             pixman_bool_t    zero_src)       \
+{                                                                             \
+    if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
+        return;                                                               \
+    pixman_scaled_bilinear_scanline_##name##_##op##_asm_mips (                \
+                      dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
+}                                                                             \
+                                                                              \
+FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_cover_##op,                      \
+                       scaled_bilinear_scanline_mips_##name##_##op,           \
+                       src_type, uint8_t, dst_type, COVER,                    \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_none_##op,                       \
+                       scaled_bilinear_scanline_mips_##name##_##op,           \
+                       src_type, uint8_t, dst_type, NONE,                     \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_pad_##op,                        \
+                       scaled_bilinear_scanline_mips_##name##_##op,           \
+                       src_type, uint8_t, dst_type, PAD,                      \
+                       FLAG_HAVE_NON_SOLID_MASK)                              \
+FAST_BILINEAR_MAINLOOP_COMMON (mips_##name##_normal_##op,                     \
+                       scaled_bilinear_scanline_mips_##name##_##op,           \
+                       src_type, uint8_t, dst_type, NORMAL,                   \
+                       FLAG_HAVE_NON_SOLID_MASK)
+
+#endif //PIXMAN_MIPS_DSPR2_H
diff --git a/src/pixman/pixman/pixman-mips-memcpy-asm.S b/src/pixman/pixman/pixman-mips-memcpy-asm.S
new file mode 100644
index 0000000..9ad6da5
--- /dev/null
+++ b/src/pixman/pixman/pixman-mips-memcpy-asm.S
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2012
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "pixman-mips-dspr2-asm.h"
+
+/*
+ * This routine could be optimized for MIPS64. The current code only
+ * uses MIPS32 instructions.
+ */
+
+#ifdef EB
+#  define LWHI	lwl		/* high part is left in big-endian */
+#  define SWHI	swl		/* high part is left in big-endian */
+#  define LWLO	lwr		/* low part is right in big-endian */
+#  define SWLO	swr		/* low part is right in big-endian */
+#else
+#  define LWHI	lwr		/* high part is right in little-endian */
+#  define SWHI	swr		/* high part is right in little-endian */
+#  define LWLO	lwl		/* low part is left in big-endian */
+#  define SWLO	swl		/* low part is left in big-endian */
+#endif
+
+LEAF_MIPS32R2(pixman_mips_fast_memcpy)
+
+	slti	AT, a2, 8
+	bne	AT, zero, $last8
+	move	v0, a0	/* memcpy returns the dst pointer */
+
+/* Test if the src and dst are word-aligned, or can be made word-aligned */
+	xor	t8, a1, a0
+	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */
+
+	bne	t8, zero, $unaligned
+	negu	a3, a0
+
+	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
+	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
+	subu	a2, a2, a3	/* now a2 is the remining bytes count */
+
+	LWHI	t8, 0(a1)
+	addu	a1, a1, a3
+	SWHI	t8, 0(a0)
+	addu	a0, a0, a3
+
+/* Now the dst/src are mutually word-aligned with word-aligned addresses */
+$chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
+				/* t8 is the byte count after 64-byte chunks */
+
+	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
+				/* There will be at most 1 32-byte chunk after it */
+	subu	a3, a2, t8	/* subtract from a2 the reminder */
+                                /* Here a3 counts bytes in 16w chunks */
+	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
+
+	addu	t0, a0, a2	/* t0 is the "past the end" address */
+
+/*
+ * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
+ * the "t0-32" address
+ * This means: for x=128 the last "safe" a0 address is "t0-160"
+ * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
+ * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
+ */
+	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
+
+	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
+	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
+	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
+	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
+/* In case the a0 > t9 don't use "pref 30" at all */
+	sgtu	v1, a0, t9
+	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
+	nop
+/* otherwise, start with using pref30 */
+	pref	30, 64(a0)
+$loop16w:
+	pref	0, 96(a1)
+	lw	t0, 0(a1)
+	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
+	lw	t1, 4(a1)
+	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
+$skip_pref30_96:
+	lw	t2, 8(a1)
+	lw	t3, 12(a1)
+	lw	t4, 16(a1)
+	lw	t5, 20(a1)
+	lw	t6, 24(a1)
+	lw	t7, 28(a1)
+        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
+
+	sw	t0, 0(a0)
+	sw	t1, 4(a0)
+	sw	t2, 8(a0)
+	sw	t3, 12(a0)
+	sw	t4, 16(a0)
+	sw	t5, 20(a0)
+	sw	t6, 24(a0)
+	sw	t7, 28(a0)
+
+	lw	t0, 32(a1)
+	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
+	lw	t1, 36(a1)
+	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
+$skip_pref30_128:
+	lw	t2, 40(a1)
+	lw	t3, 44(a1)
+	lw	t4, 48(a1)
+	lw	t5, 52(a1)
+	lw	t6, 56(a1)
+	lw	t7, 60(a1)
+        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
+
+	sw	t0, 32(a0)
+	sw	t1, 36(a0)
+	sw	t2, 40(a0)
+	sw	t3, 44(a0)
+	sw	t4, 48(a0)
+	sw	t5, 52(a0)
+	sw	t6, 56(a0)
+	sw	t7, 60(a0)
+
+	addiu	a0, a0, 64	/* adding 64 to dest */
+	sgtu	v1, a0, t9
+	bne	a0, a3, $loop16w
+	addiu	a1, a1, 64	/* adding 64 to src */
+	move	a2, t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes to go */
+
+$chk8w:
+	pref 0, 0x0(a1)
+	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
+				/* the t8 is the reminder count past 32-bytes */
+	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
+	 nop
+
+	lw	t0, 0(a1)
+	lw	t1, 4(a1)
+	lw	t2, 8(a1)
+	lw	t3, 12(a1)
+	lw	t4, 16(a1)
+	lw	t5, 20(a1)
+	lw	t6, 24(a1)
+	lw	t7, 28(a1)
+	addiu	a1, a1, 32
+
+	sw	t0, 0(a0)
+	sw	t1, 4(a0)
+	sw	t2, 8(a0)
+	sw	t3, 12(a0)
+	sw	t4, 16(a0)
+	sw	t5, 20(a0)
+	sw	t6, 24(a0)
+	sw	t7, 28(a0)
+	addiu	a0, a0, 32
+
+$chk1w:
+	andi	a2, t8, 0x3	/* now a2 is the reminder past 1w chunks */
+	beq	a2, t8, $last8
+	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
+	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
+
+/* copying in words (4-byte chunks) */
+$wordCopy_loop:
+	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
+	addiu	a1, a1, 4
+	addiu	a0, a0, 4
+	bne	a0, a3, $wordCopy_loop
+	sw	t3, -4(a0)
+
+/* For the last (<8) bytes */
+$last8:
+	blez	a2, leave
+	addu	a3, a0, a2	/* a3 is the last dst address */
+$last8loop:
+	lb	v1, 0(a1)
+	addiu	a1, a1, 1
+	addiu	a0, a0, 1
+	bne	a0, a3, $last8loop
+	sb	v1, -1(a0)
+
+leave:	j	ra
+	nop
+
+/*
+ * UNALIGNED case
+ */
+
+$unaligned:
+	/* got here with a3="negu a0" */
+	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
+	beqz	a3, $ua_chk16w
+	subu	a2, a2, a3	/* bytes left after initial a3 bytes */
+
+	LWHI	v1, 0(a1)
+	LWLO	v1, 3(a1)
+	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
+	SWHI	v1, 0(a0)
+	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */
+
+$ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
+				/* t8 is the byte count after 64-byte chunks */
+	beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */
+				/* There will be at most 1 32-byte chunk after it */
+	subu	a3, a2, t8	/* subtract from a2 the reminder */
+                                /* Here a3 counts bytes in 16w chunks */
+	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
+
+	addu	t0, a0, a2	/* t0 is the "past the end" address */
+
+	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
+
+	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
+	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
+	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
+	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
+/* In case the a0 > t9 don't use "pref 30" at all */
+	sgtu	v1, a0, t9
+	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
+	nop
+/* otherwise,  start with using pref30 */
+	pref	30, 64(a0)
+$ua_loop16w:
+	pref	0, 96(a1)
+	LWHI	t0, 0(a1)
+	LWLO	t0, 3(a1)
+	LWHI	t1, 4(a1)
+	bgtz	v1, $ua_skip_pref30_96
+	LWLO	t1, 7(a1)
+	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
+$ua_skip_pref30_96:
+	LWHI	t2, 8(a1)
+	LWLO	t2, 11(a1)
+	LWHI	t3, 12(a1)
+	LWLO	t3, 15(a1)
+	LWHI	t4, 16(a1)
+	LWLO	t4, 19(a1)
+	LWHI	t5, 20(a1)
+	LWLO	t5, 23(a1)
+	LWHI	t6, 24(a1)
+	LWLO	t6, 27(a1)
+	LWHI	t7, 28(a1)
+	LWLO	t7, 31(a1)
+        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
+
+	sw	t0, 0(a0)
+	sw	t1, 4(a0)
+	sw	t2, 8(a0)
+	sw	t3, 12(a0)
+	sw	t4, 16(a0)
+	sw	t5, 20(a0)
+	sw	t6, 24(a0)
+	sw	t7, 28(a0)
+
+	LWHI	t0, 32(a1)
+	LWLO	t0, 35(a1)
+	LWHI	t1, 36(a1)
+	bgtz	v1, $ua_skip_pref30_128
+	LWLO	t1, 39(a1)
+	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
+$ua_skip_pref30_128:
+	LWHI	t2, 40(a1)
+	LWLO	t2, 43(a1)
+	LWHI	t3, 44(a1)
+	LWLO	t3, 47(a1)
+	LWHI	t4, 48(a1)
+	LWLO	t4, 51(a1)
+	LWHI	t5, 52(a1)
+	LWLO	t5, 55(a1)
+	LWHI	t6, 56(a1)
+	LWLO	t6, 59(a1)
+	LWHI	t7, 60(a1)
+	LWLO	t7, 63(a1)
+        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
+
+	sw	t0, 32(a0)
+	sw	t1, 36(a0)
+	sw	t2, 40(a0)
+	sw	t3, 44(a0)
+	sw	t4, 48(a0)
+	sw	t5, 52(a0)
+	sw	t6, 56(a0)
+	sw	t7, 60(a0)
+
+	addiu	a0, a0, 64	/* adding 64 to dest */
+	sgtu	v1, a0, t9
+	bne	a0, a3, $ua_loop16w
+	addiu	a1, a1, 64	/* adding 64 to src */
+	move	a2, t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes to go */
+
+$ua_chk8w:
+	pref 0, 0x0(a1)
+	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
+				/* the t8 is the reminder count */
+	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */
+
+	LWHI	t0, 0(a1)
+	LWLO	t0, 3(a1)
+	LWHI	t1, 4(a1)
+	LWLO	t1, 7(a1)
+	LWHI	t2, 8(a1)
+	LWLO	t2, 11(a1)
+	LWHI	t3, 12(a1)
+	LWLO	t3, 15(a1)
+	LWHI	t4, 16(a1)
+	LWLO	t4, 19(a1)
+	LWHI	t5, 20(a1)
+	LWLO	t5, 23(a1)
+	LWHI	t6, 24(a1)
+	LWLO	t6, 27(a1)
+	LWHI	t7, 28(a1)
+	LWLO	t7, 31(a1)
+	addiu	a1, a1, 32
+
+	sw	t0, 0(a0)
+	sw	t1, 4(a0)
+	sw	t2, 8(a0)
+	sw	t3, 12(a0)
+	sw	t4, 16(a0)
+	sw	t5, 20(a0)
+	sw	t6, 24(a0)
+	sw	t7, 28(a0)
+	addiu	a0, a0, 32
+
+$ua_chk1w:
+	andi	a2, t8, 0x3	/* now a2 is the reminder past 1w chunks */
+	beq	a2, t8, $ua_smallCopy
+	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
+	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
+
+/* copying in words (4-byte chunks) */
+$ua_wordCopy_loop:
+	LWHI	v1, 0(a1)
+	LWLO	v1, 3(a1)
+	addiu	a1, a1, 4
+	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
+	bne	a0, a3, $ua_wordCopy_loop
+	sw	v1, -4(a0)
+
+/* Now less than 4 bytes (value in a2) left to copy */
+$ua_smallCopy:
+	beqz	a2, leave
+	addu	a3, a0, a2	/* a3 is the last dst address */
+$ua_smallCopy_loop:
+	lb	v1, 0(a1)
+	addiu	a1, a1, 1
+	addiu	a0, a0, 1
+	bne	a0, a3, $ua_smallCopy_loop
+	sb	v1, -1(a0)
+
+	j	ra
+	nop
+
+END(pixman_mips_fast_memcpy)
diff --git a/src/pixman/pixman/pixman-mips.c b/src/pixman/pixman/pixman-mips.c
new file mode 100644
index 0000000..3048813
--- /dev/null
+++ b/src/pixman/pixman/pixman-mips.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+#if defined(USE_MIPS_DSPR2) || defined(USE_LOONGSON_MMI)
+
+#include <string.h>
+#include <stdlib.h>
+
+static pixman_bool_t
+have_feature (const char *search_string)
+{
+#if defined (__linux__) /* linux ELF */
+    /* Simple detection of MIPS features at runtime for Linux.
+     * It is based on /proc/cpuinfo, which reveals hardware configuration
+     * to user-space applications.  According to MIPS (early 2010), no similar
+     * facility is universally available on the MIPS architectures, so it's up
+     * to individual OSes to provide such.
+     */
+    const char *file_name = "/proc/cpuinfo";
+    char cpuinfo_line[256];
+    FILE *f = NULL;
+
+    if ((f = fopen (file_name, "r")) == NULL)
+        return FALSE;
+
+    while (fgets (cpuinfo_line, sizeof (cpuinfo_line), f) != NULL)
+    {
+        if (strstr (cpuinfo_line, search_string) != NULL)
+        {
+            fclose (f);
+            return TRUE;
+        }
+    }
+
+    fclose (f);
+#endif
+
+    /* Did not find string in the proc file, or not Linux ELF. */
+    return FALSE;
+}
+
+#endif
+
+pixman_implementation_t *
+_pixman_mips_get_implementations (pixman_implementation_t *imp)
+{
+#ifdef USE_LOONGSON_MMI
+    /* I really don't know if some Loongson CPUs don't have MMI. */
+    if (!_pixman_disabled ("loongson-mmi") && have_feature ("Loongson"))
+	imp = _pixman_implementation_create_mmx (imp);
+#endif
+
+#ifdef USE_MIPS_DSPR2
+    if (!_pixman_disabled ("mips-dspr2"))
+    {
+	int already_compiling_everything_for_dspr2 = 0;
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+	already_compiling_everything_for_dspr2 = 1;
+#endif
+	if (already_compiling_everything_for_dspr2 ||
+	    /* Only currently available MIPS core that supports DSPr2 is 74K. */
+	    have_feature ("MIPS 74K"))
+	{
+	    imp = _pixman_implementation_create_mips_dspr2 (imp);
+	}
+    }
+#endif
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-mmx.c b/src/pixman/pixman/pixman-mmx.c
new file mode 100644
index 0000000..f9a92ce
--- /dev/null
+++ b/src/pixman/pixman/pixman-mmx.c
@@ -0,0 +1,4055 @@
+/*
+ * Copyright © 2004, 2005 Red Hat, Inc.
+ * Copyright © 2004 Nicholas Miell
+ * Copyright © 2005 Trolltech AS
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Søren Sandmann (sandmann@redhat.com)
+ * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
+ * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
+ *
+ * Based on work by Owen Taylor
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
+
+#ifdef USE_LOONGSON_MMI
+#include <loongson-mmintrin.h>
+#else
+#include <mmintrin.h>
+#endif
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+#ifdef VERBOSE
+#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
+#else
+#define CHECKPOINT()
+#endif
+
+#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
+/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+#endif
+
+#ifdef USE_X86_MMX
+# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
+#  include <xmmintrin.h>
+# else
+/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
+ * instructions to be generated that we don't want. Just duplicate the
+ * functions we want to use.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+    int ret;
+
+    asm ("pmovmskb %1, %0\n\t"
+	: "=r" (ret)
+	: "y" (__A)
+    );
+
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __A, __m64 __B)
+{
+    asm ("pmulhuw %1, %0\n\t"
+	: "+y" (__A)
+	: "y" (__B)
+    );
+    return __A;
+}
+
+#  ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
+{
+    __m64 ret;
+
+    asm ("pshufw %2, %1, %0\n\t"
+	: "=y" (ret)
+	: "y" (__A), "K" (__N)
+    );
+
+    return ret;
+}
+#  else
+#   define _mm_shuffle_pi16(A, N)					\
+    ({									\
+	__m64 ret;							\
+									\
+	asm ("pshufw %2, %1, %0\n\t"					\
+	     : "=y" (ret)						\
+	     : "y" (A), "K" ((const int8_t)N)				\
+	);								\
+									\
+	ret;								\
+    })
+#  endif
+# endif
+#endif
+
+#ifndef _MSC_VER
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+#endif
+
+/* Notes about writing mmx code
+ *
+ * give memory operands as the second operand. If you give it as the
+ * first, gcc will first load it into a register, then use that
+ * register
+ *
+ *   ie. use
+ *
+ *         _mm_mullo_pi16 (x, mmx_constant);
+ *
+ *   not
+ *
+ *         _mm_mullo_pi16 (mmx_constant, x);
+ *
+ * Also try to minimize dependencies. i.e. when you need a value, try
+ * to calculate it from a value that was calculated as early as
+ * possible.
+ */
+
+/* --------------- MMX primitives ------------------------------------- */
+
+/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
+ * the name of the member used to access the data.
+ * If __m64 requires using mm_cvt* intrinsics functions to convert between
+ * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
+ * If __m64 and uint64_t values can just be cast to each other directly,
+ * then define USE_M64_CASTS.
+ * If __m64 is a double datatype, then define USE_M64_DOUBLE.
+ */
+#ifdef _MSC_VER
+# define M64_MEMBER m64_u64
+#elif defined(__ICC)
+# define USE_CVT_INTRINSICS
+#elif defined(USE_LOONGSON_MMI)
+# define USE_M64_DOUBLE
+#elif defined(__GNUC__)
+# define USE_M64_CASTS
+#elif defined(__SUNPRO_C)
+# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
+/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
+ * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
+ * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
+ */
+#  define USE_CVT_INTRINSICS
+# else
+/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
+ * disabled, __m64 is defined as a struct containing "unsigned long long l_".
+ */
+#  define M64_MEMBER l_
+# endif
+#endif
+
+#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
+typedef uint64_t mmxdatafield;
+#else
+typedef __m64 mmxdatafield;
+#endif
+
+typedef struct
+{
+    mmxdatafield mmx_4x00ff;
+    mmxdatafield mmx_4x0080;
+    mmxdatafield mmx_565_rgb;
+    mmxdatafield mmx_565_unpack_multiplier;
+    mmxdatafield mmx_565_pack_multiplier;
+    mmxdatafield mmx_565_r;
+    mmxdatafield mmx_565_g;
+    mmxdatafield mmx_565_b;
+    mmxdatafield mmx_packed_565_rb;
+    mmxdatafield mmx_packed_565_g;
+    mmxdatafield mmx_expand_565_g;
+    mmxdatafield mmx_expand_565_b;
+    mmxdatafield mmx_expand_565_r;
+#ifndef USE_LOONGSON_MMI
+    mmxdatafield mmx_mask_0;
+    mmxdatafield mmx_mask_1;
+    mmxdatafield mmx_mask_2;
+    mmxdatafield mmx_mask_3;
+#endif
+    mmxdatafield mmx_full_alpha;
+    mmxdatafield mmx_4x0101;
+    mmxdatafield mmx_ff000000;
+} mmx_data_t;
+
+#if defined(_MSC_VER)
+# define MMXDATA_INIT(field, val) { val ## UI64 }
+#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
+# define MMXDATA_INIT(field, val) field =   { val ## ULL }
+#else                           /* mmxdatafield is an integral type */
+# define MMXDATA_INIT(field, val) field =   val ## ULL
+#endif
+
+static const mmx_data_t c =
+{
+    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
+    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
+    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
+    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
+    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
+    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
+    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
+    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
+    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
+    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
+    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
+    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
+    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
+#ifndef USE_LOONGSON_MMI
+    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
+    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
+    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
+    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
+#endif
+    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
+    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
+    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
+};
+
+#ifdef USE_CVT_INTRINSICS
+#    define MC(x) to_m64 (c.mmx_ ## x)
+#elif defined(USE_M64_CASTS)
+#    define MC(x) ((__m64)c.mmx_ ## x)
+#elif defined(USE_M64_DOUBLE)
+#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
+#else
+#    define MC(x) c.mmx_ ## x
+#endif
+
+static force_inline __m64
+to_m64 (uint64_t x)
+{
+#ifdef USE_CVT_INTRINSICS
+    return _mm_cvtsi64_m64 (x);
+#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
+    __m64 res;
+
+    res.M64_MEMBER = x;
+    return res;
+#elif defined USE_M64_DOUBLE
+    return *(__m64 *)&x;
+#else /* USE_M64_CASTS */
+    return (__m64)x;
+#endif
+}
+
+static force_inline uint64_t
+to_uint64 (__m64 x)
+{
+#ifdef USE_CVT_INTRINSICS
+    return _mm_cvtm64_si64 (x);
+#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
+    uint64_t res = x.M64_MEMBER;
+    return res;
+#elif defined USE_M64_DOUBLE
+    return *(uint64_t *)&x;
+#else /* USE_M64_CASTS */
+    return (uint64_t)x;
+#endif
+}
+
+static force_inline __m64
+shift (__m64 v,
+       int   s)
+{
+    if (s > 0)
+	return _mm_slli_si64 (v, s);
+    else if (s < 0)
+	return _mm_srli_si64 (v, -s);
+    else
+	return v;
+}
+
+static force_inline __m64
+negate (__m64 mask)
+{
+    return _mm_xor_si64 (mask, MC (4x00ff));
+}
+
+/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
+ * and maps its result to the same range.
+ *
+ * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
+ * Notation, Notation, Notation", the first of which is
+ *
+ *   prod(a, b) = (a * b + 128) / 255.
+ *
+ * By approximating the division by 255 as 257/65536 it can be replaced by a
+ * multiply and a right shift. This is the implementation that we use in
+ * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
+ * 3DNow!, and unavailable at the time of the book's publication) to perform
+ * the multiplication and right shift in a single operation.
+ *
+ *   prod(a, b) = ((a * b + 128) * 257) >> 16.
+ *
+ * A third way (how pix_multiply() was implemented prior to 14208344) exists
+ * also that performs the multiplication by 257 with adds and shifts.
+ *
+ * Where temp = a * b + 128
+ *
+ *   prod(a, b) = (temp + (temp >> 8)) >> 8.
+ */
+static force_inline __m64
+pix_multiply (__m64 a, __m64 b)
+{
+    __m64 res;
+
+    res = _mm_mullo_pi16 (a, b);
+    res = _mm_adds_pu16 (res, MC (4x0080));
+    res = _mm_mulhi_pu16 (res, MC (4x0101));
+
+    return res;
+}
+
+static force_inline __m64
+pix_add (__m64 a, __m64 b)
+{
+    return _mm_adds_pu8 (a, b);
+}
+
+static force_inline __m64
+expand_alpha (__m64 pixel)
+{
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline __m64
+expand_alpha_rev (__m64 pixel)
+{
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m64
+invert_colors (__m64 pixel)
+{
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline __m64
+over (__m64 src,
+      __m64 srca,
+      __m64 dest)
+{
+    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
+}
+
+static force_inline __m64
+over_rev_non_pre (__m64 src, __m64 dest)
+{
+    __m64 srca = expand_alpha (src);
+    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
+
+    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
+}
+
+static force_inline __m64
+in (__m64 src, __m64 mask)
+{
+    return pix_multiply (src, mask);
+}
+
+#ifndef _MSC_VER
+static force_inline __m64
+in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
+{
+    return over (in (src, mask), pix_multiply (srca, mask), dest);
+}
+
+#else
+
+#define in_over(src, srca, mask, dest)					\
+    over (in (src, mask), pix_multiply (srca, mask), dest)
+
+#endif
+
+/* Elemental unaligned loads */
+
+static force_inline __m64 ldq_u(__m64 *p)
+{
+#ifdef USE_X86_MMX
+    /* x86's alignment restrictions are very relaxed. */
+    return *(__m64 *)p;
+#elif defined USE_ARM_IWMMXT
+    int align = (uintptr_t)p & 7;
+    __m64 *aligned_p;
+    if (align == 0)
+	return *p;
+    aligned_p = (__m64 *)((uintptr_t)p & ~7);
+    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
+#else
+    struct __una_u64 { __m64 x __attribute__((packed)); };
+    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
+    return (__m64) ptr->x;
+#endif
+}
+
+static force_inline uint32_t ldl_u(const uint32_t *p)
+{
+#ifdef USE_X86_MMX
+    /* x86's alignment restrictions are very relaxed. */
+    return *p;
+#else
+    struct __una_u32 { uint32_t x __attribute__((packed)); };
+    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
+    return ptr->x;
+#endif
+}
+
+static force_inline __m64
+load (const uint32_t *v)
+{
+#ifdef USE_LOONGSON_MMI
+    __m64 ret;
+    asm ("lwc1 %0, %1\n\t"
+	: "=f" (ret)
+	: "m" (*v)
+    );
+    return ret;
+#else
+    return _mm_cvtsi32_si64 (*v);
+#endif
+}
+
+static force_inline __m64
+load8888 (const uint32_t *v)
+{
+#ifdef USE_LOONGSON_MMI
+    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
+#else
+    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
+#endif
+}
+
+static force_inline __m64
+load8888u (const uint32_t *v)
+{
+    uint32_t l = ldl_u (v);
+    return load8888 (&l);
+}
+
+static force_inline __m64
+pack8888 (__m64 lo, __m64 hi)
+{
+    return _mm_packs_pu16 (lo, hi);
+}
+
+static force_inline void
+store (uint32_t *dest, __m64 v)
+{
+#ifdef USE_LOONGSON_MMI
+    asm ("swc1 %1, %0\n\t"
+	: "=m" (*dest)
+	: "f" (v)
+	: "memory"
+    );
+#else
+    *dest = _mm_cvtsi64_si32 (v);
+#endif
+}
+
+static force_inline void
+store8888 (uint32_t *dest, __m64 v)
+{
+    v = pack8888 (v, _mm_setzero_si64 ());
+    store (dest, v);
+}
+
+static force_inline pixman_bool_t
+is_equal (__m64 a, __m64 b)
+{
+#ifdef USE_LOONGSON_MMI
+    /* __m64 is double, we can compare directly. */
+    return a == b;
+#else
+    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
+#endif
+}
+
+static force_inline pixman_bool_t
+is_opaque (__m64 v)
+{
+#ifdef USE_LOONGSON_MMI
+    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
+#else
+    __m64 ffs = _mm_cmpeq_pi8 (v, v);
+    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
+#endif
+}
+
+static force_inline pixman_bool_t
+is_zero (__m64 v)
+{
+    return is_equal (v, _mm_setzero_si64 ());
+}
+
+/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
+ *
+ *    00RR00GG00BB
+ *
+ * --- Expanding 565 in the low word ---
+ *
+ * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
+ * m = m & (01f0003f001f);
+ * m = m * (008404100840);
+ * m = m >> 8;
+ *
+ * Note the trick here - the top word is shifted by another nibble to
+ * avoid it bumping into the middle word
+ */
+static force_inline __m64
+expand565 (__m64 pixel, int pos)
+{
+    __m64 p = pixel;
+    __m64 t1, t2;
+
+    /* move pixel to low 16 bit and zero the rest */
+#ifdef USE_LOONGSON_MMI
+    p = loongson_extract_pi16 (p, pos);
+#else
+    p = shift (shift (p, (3 - pos) * 16), -48);
+#endif
+
+    t1 = shift (p, 36 - 11);
+    t2 = shift (p, 16 - 5);
+
+    p = _mm_or_si64 (t1, p);
+    p = _mm_or_si64 (t2, p);
+    p = _mm_and_si64 (p, MC (565_rgb));
+
+    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
+    return _mm_srli_pi16 (pixel, 8);
+}
+
+/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
+ *
+ *    AARRGGBBRRGGBB
+ */
+static force_inline void
+expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
+{
+    __m64 t0, t1, alpha = _mm_setzero_si64 ();
+    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
+    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
+    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
+    if (full_alpha)
+	alpha = _mm_cmpeq_pi32 (alpha, alpha);
+
+    /* Replicate high bits into empty low bits. */
+    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
+    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
+    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
+
+    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
+    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
+    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
+
+    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
+    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
+
+    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
+    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
+}
+
+static force_inline __m64
+expand8888 (__m64 in, int pos)
+{
+    if (pos == 0)
+	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
+    else
+	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
+}
+
+static force_inline __m64
+expandx888 (__m64 in, int pos)
+{
+    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
+}
+
+static force_inline void
+expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
+{
+    __m64 v0, v1;
+    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
+    *vout0 = expand8888 (v0, 0);
+    *vout1 = expand8888 (v0, 1);
+    *vout2 = expand8888 (v1, 0);
+    *vout3 = expand8888 (v1, 1);
+}
+
+static force_inline __m64
+pack_565 (__m64 pixel, __m64 target, int pos)
+{
+    __m64 p = pixel;
+    __m64 t = target;
+    __m64 r, g, b;
+
+    r = _mm_and_si64 (p, MC (565_r));
+    g = _mm_and_si64 (p, MC (565_g));
+    b = _mm_and_si64 (p, MC (565_b));
+
+#ifdef USE_LOONGSON_MMI
+    r = shift (r, -(32 - 8));
+    g = shift (g, -(16 - 3));
+    b = shift (b, -(0  + 3));
+
+    p = _mm_or_si64 (r, g);
+    p = _mm_or_si64 (p, b);
+    return loongson_insert_pi16 (t, p, pos);
+#else
+    r = shift (r, -(32 - 8) + pos * 16);
+    g = shift (g, -(16 - 3) + pos * 16);
+    b = shift (b, -(0  + 3) + pos * 16);
+
+    if (pos == 0)
+	t = _mm_and_si64 (t, MC (mask_0));
+    else if (pos == 1)
+	t = _mm_and_si64 (t, MC (mask_1));
+    else if (pos == 2)
+	t = _mm_and_si64 (t, MC (mask_2));
+    else if (pos == 3)
+	t = _mm_and_si64 (t, MC (mask_3));
+
+    p = _mm_or_si64 (r, t);
+    p = _mm_or_si64 (g, p);
+
+    return _mm_or_si64 (b, p);
+#endif
+}
+
+static force_inline __m64
+pack_4xpacked565 (__m64 a, __m64 b)
+{
+    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
+    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
+
+    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
+    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
+
+    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
+    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
+
+    t0 = _mm_or_si64 (t0, g0);
+    t1 = _mm_or_si64 (t1, g1);
+
+    t0 = shift(t0, -5);
+#ifdef USE_ARM_IWMMXT
+    t1 = shift(t1, -5);
+    return _mm_packs_pu32 (t0, t1);
+#else
+    t1 = shift(t1, -5 + 16);
+    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
+#endif
+}
+
+#ifndef _MSC_VER
+
+static force_inline __m64
+pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
+{
+    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
+}
+
+static force_inline __m64
+pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
+{
+    x = pix_multiply (x, a);
+    y = pix_multiply (y, b);
+
+    return pix_add (x, y);
+}
+
+#else
+
+/* MSVC only handles a "pass by register" of up to three SSE intrinsics */
+
+#define pack_4x565(v0, v1, v2, v3) \
+    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
+
+#define pix_add_mul(x, a, y, b)	 \
+    ( x = pix_multiply (x, a),	 \
+      y = pix_multiply (y, b),	 \
+      pix_add (x, y) )
+
+#endif
+
+/* --------------- MMX code patch for fbcompose.c --------------------- */
+
+static force_inline __m64
+combine (const uint32_t *src, const uint32_t *mask)
+{
+    __m64 vsrc = load8888 (src);
+
+    if (mask)
+    {
+	__m64 m = load8888 (mask);
+
+	m = expand_alpha (m);
+	vsrc = pix_multiply (vsrc, m);
+    }
+
+    return vsrc;
+}
+
+static force_inline __m64
+core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
+{
+    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
+
+    if (is_opaque (vsrc))
+    {
+	return vsrc;
+    }
+    else if (!is_zero (vsrc))
+    {
+	return over (vsrc, expand_alpha (vsrc),
+		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
+    }
+
+    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
+}
+
+static void
+mmx_combine_over_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 vsrc = combine (src, mask);
+
+	if (is_opaque (vsrc))
+	{
+	    store8888 (dest, vsrc);
+	}
+	else if (!is_zero (vsrc))
+	{
+	    __m64 sa = expand_alpha (vsrc);
+	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
+	}
+
+	++dest;
+	++src;
+	if (mask)
+	    ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 d, da;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	da = expand_alpha (d);
+	store8888 (dest, over (d, da, s));
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_u (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  uint32_t *               dest,
+                  const uint32_t *         src,
+                  const uint32_t *         mask,
+                  int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a;
+	__m64 x = combine (src, mask);
+
+	a = load8888 (dest);
+	a = expand_alpha (a);
+	x = pix_multiply (x, a);
+
+	store8888 (dest, x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_reverse_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *               dest,
+                          const uint32_t *         src,
+                          const uint32_t *         mask,
+                          int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a = combine (src, mask);
+	__m64 x;
+
+	x = load8888 (dest);
+	a = expand_alpha (a);
+	x = pix_multiply (x, a);
+	store8888 (dest, x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a;
+	__m64 x = combine (src, mask);
+
+	a = load8888 (dest);
+	a = expand_alpha (a);
+	a = negate (a);
+	x = pix_multiply (x, a);
+	store8888 (dest, x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a = combine (src, mask);
+	__m64 x;
+
+	x = load8888 (dest);
+	a = expand_alpha (a);
+	a = negate (a);
+	x = pix_multiply (x, a);
+
+	store8888 (dest, x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 da, d, sia;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	sia = expand_alpha (s);
+	sia = negate (sia);
+	da = expand_alpha (d);
+	s = pix_add_mul (s, da, d, sia);
+	store8888 (dest, s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end;
+
+    end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 dia, d, sa;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	sa = expand_alpha (s);
+	dia = expand_alpha (d);
+	dia = negate (dia);
+	s = pix_add_mul (s, dia, d, sa);
+	store8888 (dest, s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_xor_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 dia, d, sia;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	sia = expand_alpha (s);
+	dia = expand_alpha (d);
+	sia = negate (sia);
+	dia = negate (dia);
+	s = pix_add_mul (s, dia, d, sia);
+	store8888 (dest, s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_add_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 d;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	s = pix_add (s, d);
+	store8888 (dest, s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_saturate_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
+                        int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	uint32_t s, sa, da;
+	uint32_t d = *dest;
+	__m64 ms = combine (src, mask);
+	__m64 md = load8888 (dest);
+
+	store8888(&s, ms);
+	da = ~d >> 24;
+	sa = s >> 24;
+
+	if (sa > da)
+	{
+	    uint32_t quot = DIV_UN8 (da, sa) << 24;
+	    __m64 msa = load8888 (&quot);
+	    msa = expand_alpha (msa);
+	    ms = pix_multiply (ms, msa);
+	}
+
+	md = pix_add (md, ms);
+	store8888 (dest, md);
+
+	++src;
+	++dest;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_src_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+
+	s = pix_multiply (s, a);
+	store8888 (dest, s);
+
+	++src;
+	++mask;
+	++dest;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 sa = expand_alpha (s);
+
+	store8888 (dest, in_over (s, sa, a, d));
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+
+	store8888 (dest, over (d, da, in (s, a)));
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_ca (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+
+	s = pix_multiply (s, a);
+	s = pix_multiply (s, da);
+	store8888 (dest, s);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 sa = expand_alpha (s);
+
+	a = pix_multiply (a, sa);
+	d = pix_multiply (d, a);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+
+	da = negate (da);
+	s = pix_multiply (s, a);
+	s = pix_multiply (s, da);
+	store8888 (dest, s);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 sa = expand_alpha (s);
+
+	a = pix_multiply (a, sa);
+	a = negate (a);
+	d = pix_multiply (d, a);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);
+	a = pix_multiply (a, sa);
+	a = negate (a);
+	d = pix_add_mul (d, a, s, da);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);
+	a = pix_multiply (a, sa);
+	da = negate (da);
+	d = pix_add_mul (d, a, s, da);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_xor_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);
+	a = pix_multiply (a, sa);
+	da = negate (da);
+	a = negate (a);
+	d = pix_add_mul (d, a, s, da);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_add_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+
+	s = pix_multiply (s, a);
+	d = pix_add (s, d);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+/* ------------- MMX code paths called from fbpict.c -------------------- */
+
+static void
+mmx_composite_over_n_8888 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 2)
+	{
+	    __m64 vdest;
+	    __m64 dest0, dest1;
+
+	    vdest = *(__m64 *)dst;
+
+	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
+	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
+
+	    *(__m64 *)dst = pack8888 (dest0, dest1);
+
+	    dst += 2;
+	    w -= 2;
+	}
+
+	CHECKPOINT ();
+
+	if (w)
+	{
+	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_0565 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 4)
+	{
+	    __m64 vdest = *(__m64 *)dst;
+	    __m64 v0, v1, v2, v3;
+
+	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+	    v0 = over (vsrc, vsrca, v0);
+	    v1 = over (vsrc, vsrca, v1);
+	    v2 = over (vsrc, vsrca, v2);
+	    v3 = over (vsrc, vsrca, v3);
+
+	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
+
+	    dst += 4;
+	    w -= 4;
+	}
+
+	CHECKPOINT ();
+
+	while (w)
+	{
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line;
+    uint32_t    *mask_line;
+    int dst_stride, mask_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	int twidth = width;
+	uint32_t *p = (uint32_t *)mask_line;
+	uint32_t *q = (uint32_t *)dst_line;
+
+	while (twidth && (uintptr_t)q & 7)
+	{
+	    uint32_t m = *(uint32_t *)p;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (q);
+		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
+		store8888 (q, vdest);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	while (twidth >= 2)
+	{
+	    uint32_t m0, m1;
+	    m0 = *p;
+	    m1 = *(p + 1);
+
+	    if (m0 | m1)
+	    {
+		__m64 dest0, dest1;
+		__m64 vdest = *(__m64 *)q;
+
+		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
+		                 expand8888 (vdest, 0));
+		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
+		                 expand8888 (vdest, 1));
+
+		*(__m64 *)q = pack8888 (dest0, dest1);
+	    }
+
+	    p += 2;
+	    q += 2;
+	    twidth -= 2;
+	}
+
+	if (twidth)
+	{
+	    uint32_t m = *(uint32_t *)p;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (q);
+		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
+		store8888 (q, vdest);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    __m64 vmask;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
+    vmask = expand_alpha (load8888 (&mask));
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    __m64 s = load8888 (src);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	while (w >= 2)
+	{
+	    __m64 vs = ldq_u ((__m64 *)src);
+	    __m64 vd = *(__m64 *)dst;
+	    __m64 vsrc0 = expand8888 (vs, 0);
+	    __m64 vsrc1 = expand8888 (vs, 1);
+
+	    *(__m64 *)dst = pack8888 (
+	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
+	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
+
+	    w -= 2;
+	    dst += 2;
+	    src += 2;
+	}
+
+	if (w)
+	{
+	    __m64 s = load8888 (src);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t mask;
+    __m64 vmask;
+    int dst_stride, src_stride;
+    int32_t w;
+    __m64 srca;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
+
+    vmask = expand_alpha (load8888 (&mask));
+    srca = MC (4x00ff);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint32_t ssrc = *src | 0xff000000;
+	    __m64 s = load8888 (&ssrc);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, in_over (s, srca, vmask, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	while (w >= 16)
+	{
+	    __m64 vd0 = *(__m64 *)(dst + 0);
+	    __m64 vd1 = *(__m64 *)(dst + 2);
+	    __m64 vd2 = *(__m64 *)(dst + 4);
+	    __m64 vd3 = *(__m64 *)(dst + 6);
+	    __m64 vd4 = *(__m64 *)(dst + 8);
+	    __m64 vd5 = *(__m64 *)(dst + 10);
+	    __m64 vd6 = *(__m64 *)(dst + 12);
+	    __m64 vd7 = *(__m64 *)(dst + 14);
+
+	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
+	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
+	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
+	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
+	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
+	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
+	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
+	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
+
+	    vd0 = pack8888 (
+	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
+	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
+
+	    vd1 = pack8888 (
+	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
+	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
+
+	    vd2 = pack8888 (
+	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
+	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
+
+	    vd3 = pack8888 (
+	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
+	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
+
+	    vd4 = pack8888 (
+	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
+	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
+
+	    vd5 = pack8888 (
+	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
+	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
+
+	    vd6 = pack8888 (
+	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
+	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
+
+	    vd7 = pack8888 (
+	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
+	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
+
+	    *(__m64 *)(dst + 0) = vd0;
+	    *(__m64 *)(dst + 2) = vd1;
+	    *(__m64 *)(dst + 4) = vd2;
+	    *(__m64 *)(dst + 6) = vd3;
+	    *(__m64 *)(dst + 8) = vd4;
+	    *(__m64 *)(dst + 10) = vd5;
+	    *(__m64 *)(dst + 12) = vd6;
+	    *(__m64 *)(dst + 14) = vd7;
+
+	    w -= 16;
+	    dst += 16;
+	    src += 16;
+	}
+
+	while (w)
+	{
+	    uint32_t ssrc = *src | 0xff000000;
+	    __m64 s = load8888 (&ssrc);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, in_over (s, srca, vmask, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t s;
+    int dst_stride, src_stride;
+    uint8_t a;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+
+	    if (a == 0xff)
+	    {
+		*dst = s;
+	    }
+	    else if (s)
+	    {
+		__m64 ms, sa;
+		ms = load8888 (&s);
+		sa = expand_alpha (ms);
+		store8888 (dst, over (ms, sa, load8888 (dst)));
+	    }
+
+	    dst++;
+	}
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    __m64 vsrc = load8888 (src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (
+		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	CHECKPOINT ();
+
+	while (w >= 4)
+	{
+	    __m64 vdest = *(__m64 *)dst;
+	    __m64 v0, v1, v2, v3;
+	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
+
+	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+	    vsrc0 = load8888 ((src + 0));
+	    vsrc1 = load8888 ((src + 1));
+	    vsrc2 = load8888 ((src + 2));
+	    vsrc3 = load8888 ((src + 3));
+
+	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
+	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
+	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
+	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);
+
+	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+
+	CHECKPOINT ();
+
+	while (w)
+	{
+	    __m64 vsrc = load8888 (src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca;
+    uint64_t srcsrc;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = in_over (vsrc, vsrca,
+				       expand_alpha_rev (to_m64 (m)),
+				       load8888 (dst));
+
+		store8888 (dst, vdest);
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+
+	CHECKPOINT ();
+
+	while (w >= 2)
+	{
+	    uint64_t m0, m1;
+
+	    m0 = *mask;
+	    m1 = *(mask + 1);
+
+	    if (srca == 0xff && (m0 & m1) == 0xff)
+	    {
+		*(uint64_t *)dst = srcsrc;
+	    }
+	    else if (m0 | m1)
+	    {
+		__m64 vdest;
+		__m64 dest0, dest1;
+
+		vdest = *(__m64 *)dst;
+
+		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
+				 expand8888 (vdest, 0));
+		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
+				 expand8888 (vdest, 1));
+
+		*(__m64 *)dst = pack8888 (dest0, dest1);
+	    }
+
+	    mask += 2;
+	    dst += 2;
+	    w -= 2;
+	}
+
+	CHECKPOINT ();
+
+	if (w)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (dst);
+
+		vdest = in_over (
+		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
+		store8888 (dst, vdest);
+	    }
+	}
+    }
+
+    _mm_empty ();
+}
+
+static pixman_bool_t
+mmx_fill (pixman_implementation_t *imp,
+          uint32_t *               bits,
+          int                      stride,
+          int                      bpp,
+          int                      x,
+          int                      y,
+          int                      width,
+          int                      height,
+          uint32_t		   filler)
+{
+    uint64_t fill;
+    __m64 vfill;
+    uint32_t byte_width;
+    uint8_t     *byte_line;
+
+#if defined __GNUC__ && defined USE_X86_MMX
+    __m64 v1, v2, v3, v4, v5, v6, v7;
+#endif
+
+    if (bpp != 16 && bpp != 32 && bpp != 8)
+	return FALSE;
+
+    if (bpp == 8)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 1;
+	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+	byte_width = width;
+	stride *= 1;
+        filler = (filler & 0xff) * 0x01010101;
+    }
+    else if (bpp == 16)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 2;
+	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+	byte_width = 2 * width;
+	stride *= 2;
+        filler = (filler & 0xffff) * 0x00010001;
+    }
+    else
+    {
+	stride = stride * (int) sizeof (uint32_t) / 4;
+	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+	byte_width = 4 * width;
+	stride *= 4;
+    }
+
+    fill = ((uint64_t)filler << 32) | filler;
+    vfill = to_m64 (fill);
+
+#if defined __GNUC__ && defined USE_X86_MMX
+    __asm__ (
+        "movq		%7,	%0\n"
+        "movq		%7,	%1\n"
+        "movq		%7,	%2\n"
+        "movq		%7,	%3\n"
+        "movq		%7,	%4\n"
+        "movq		%7,	%5\n"
+        "movq		%7,	%6\n"
+	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
+	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
+	: "y" (vfill));
+#endif
+
+    while (height--)
+    {
+	int w;
+	uint8_t *d = byte_line;
+
+	byte_line += stride;
+	w = byte_width;
+
+	if (w >= 1 && ((uintptr_t)d & 1))
+	{
+	    *(uint8_t *)d = (filler & 0xff);
+	    w--;
+	    d++;
+	}
+
+	if (w >= 2 && ((uintptr_t)d & 3))
+	{
+	    *(uint16_t *)d = filler;
+	    w -= 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((uintptr_t)d & 7))
+	{
+	    *(uint32_t *)d = filler;
+
+	    w -= 4;
+	    d += 4;
+	}
+
+	while (w >= 64)
+	{
+#if defined __GNUC__ && defined USE_X86_MMX
+	    __asm__ (
+	        "movq	%1,	  (%0)\n"
+	        "movq	%2,	 8(%0)\n"
+	        "movq	%3,	16(%0)\n"
+	        "movq	%4,	24(%0)\n"
+	        "movq	%5,	32(%0)\n"
+	        "movq	%6,	40(%0)\n"
+	        "movq	%7,	48(%0)\n"
+	        "movq	%8,	56(%0)\n"
+		:
+		: "r" (d),
+		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
+		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
+		: "memory");
+#else
+	    *(__m64*) (d +  0) = vfill;
+	    *(__m64*) (d +  8) = vfill;
+	    *(__m64*) (d + 16) = vfill;
+	    *(__m64*) (d + 24) = vfill;
+	    *(__m64*) (d + 32) = vfill;
+	    *(__m64*) (d + 40) = vfill;
+	    *(__m64*) (d + 48) = vfill;
+	    *(__m64*) (d + 56) = vfill;
+#endif
+	    w -= 64;
+	    d += 64;
+	}
+
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = filler;
+
+	    w -= 4;
+	    d += 4;
+	}
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = filler;
+	    w -= 2;
+	    d += 2;
+	}
+	if (w >= 1)
+	{
+	    *(uint8_t *)d = (filler & 0xff);
+	    w--;
+	    d++;
+	}
+
+    }
+
+    _mm_empty ();
+    return TRUE;
+}
+
+static void
+mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    s = *src++;
+	    *dst = convert_8888_to_0565 (s);
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    __m64 vdest;
+	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
+	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
+
+	    vdest = pack_4xpacked565 (vsrc0, vsrc1);
+
+	    *(__m64 *)dst = vdest;
+
+	    w -= 4;
+	    src += 4;
+	    dst += 4;
+	}
+
+	while (w)
+	{
+	    s = *src++;
+	    *dst = convert_8888_to_0565 (s);
+	    dst++;
+	    w--;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc;
+    uint64_t srcsrc;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+    {
+	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
+		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
+		  dest_x, dest_y, width, height, 0);
+	return;
+    }
+
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (&src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
+
+		store8888 (dst, vdest);
+	    }
+	    else
+	    {
+		*dst = 0;
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+
+	CHECKPOINT ();
+
+	while (w >= 2)
+	{
+	    uint64_t m0, m1;
+	    m0 = *mask;
+	    m1 = *(mask + 1);
+
+	    if (srca == 0xff && (m0 & m1) == 0xff)
+	    {
+		*(uint64_t *)dst = srcsrc;
+	    }
+	    else if (m0 | m1)
+	    {
+		__m64 dest0, dest1;
+
+		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
+		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
+
+		*(__m64 *)dst = pack8888 (dest0, dest1);
+	    }
+	    else
+	    {
+		*(uint64_t *)dst = 0;
+	    }
+
+	    mask += 2;
+	    dst += 2;
+	    w -= 2;
+	}
+
+	CHECKPOINT ();
+
+	if (w)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (dst);
+
+		vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
+		store8888 (dst, vdest);
+	    }
+	    else
+	    {
+		*dst = 0;
+	    }
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint16_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca, tmp;
+    __m64 srcsrcsrcsrc;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
+    srcsrcsrcsrc = expand_alpha_rev (tmp);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		uint64_t d = *dst;
+		__m64 vd = to_m64 (d);
+		__m64 vdest = in_over (
+		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
+
+		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
+		*dst = to_uint64 (vd);
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+
+	CHECKPOINT ();
+
+	while (w >= 4)
+	{
+	    uint64_t m0, m1, m2, m3;
+	    m0 = *mask;
+	    m1 = *(mask + 1);
+	    m2 = *(mask + 2);
+	    m3 = *(mask + 3);
+
+	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
+	    {
+		*(__m64 *)dst = srcsrcsrcsrc;
+	    }
+	    else if (m0 | m1 | m2 | m3)
+	    {
+		__m64 vdest = *(__m64 *)dst;
+		__m64 v0, v1, v2, v3;
+		__m64 vm0, vm1, vm2, vm3;
+
+		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+		vm0 = to_m64 (m0);
+		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
+
+		vm1 = to_m64 (m1);
+		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
+
+		vm2 = to_m64 (m2);
+		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
+
+		vm3 = to_m64 (m3);
+		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
+
+		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);;
+	    }
+
+	    w -= 4;
+	    mask += 4;
+	    dst += 4;
+	}
+
+	CHECKPOINT ();
+
+	while (w)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		uint64_t d = *dst;
+		__m64 vd = to_m64 (d);
+		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
+				       expand565 (vd, 0));
+		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
+		*dst = to_uint64 (vd);
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    __m64 vsrc = load8888 (src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	CHECKPOINT ();
+
+	while (w >= 4)
+	{
+	    uint32_t s0, s1, s2, s3;
+	    unsigned char a0, a1, a2, a3;
+
+	    s0 = *src;
+	    s1 = *(src + 1);
+	    s2 = *(src + 2);
+	    s3 = *(src + 3);
+
+	    a0 = (s0 >> 24);
+	    a1 = (s1 >> 24);
+	    a2 = (s2 >> 24);
+	    a3 = (s3 >> 24);
+
+	    if ((a0 & a1 & a2 & a3) == 0xFF)
+	    {
+		__m64 v0 = invert_colors (load8888 (&s0));
+		__m64 v1 = invert_colors (load8888 (&s1));
+		__m64 v2 = invert_colors (load8888 (&s2));
+		__m64 v3 = invert_colors (load8888 (&s3));
+
+		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
+	    }
+	    else if (s0 | s1 | s2 | s3)
+	    {
+		__m64 vdest = *(__m64 *)dst;
+		__m64 v0, v1, v2, v3;
+
+		__m64 vsrc0 = load8888 (&s0);
+		__m64 vsrc1 = load8888 (&s1);
+		__m64 vsrc2 = load8888 (&s2);
+		__m64 vsrc3 = load8888 (&s3);
+
+		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+		v0 = over_rev_non_pre (vsrc0, v0);
+		v1 = over_rev_non_pre (vsrc1, v1);
+		v2 = over_rev_non_pre (vsrc2, v2);
+		v3 = over_rev_non_pre (vsrc3, v3);
+
+		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+
+	CHECKPOINT ();
+
+	while (w)
+	{
+	    __m64 vsrc = load8888 (src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    __m64 s = load8888 (src);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, over_rev_non_pre (s, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	while (w >= 2)
+	{
+	    uint32_t s0, s1;
+	    unsigned char a0, a1;
+	    __m64 d0, d1;
+
+	    s0 = *src;
+	    s1 = *(src + 1);
+
+	    a0 = (s0 >> 24);
+	    a1 = (s1 >> 24);
+
+	    if ((a0 & a1) == 0xFF)
+	    {
+		d0 = invert_colors (load8888 (&s0));
+		d1 = invert_colors (load8888 (&s1));
+
+		*(__m64 *)dst = pack8888 (d0, d1);
+	    }
+	    else if (s0 | s1)
+	    {
+		__m64 vdest = *(__m64 *)dst;
+
+		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
+		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
+
+		*(__m64 *)dst = pack8888 (d0, d1);
+	    }
+
+	    w -= 2;
+	    dst += 2;
+	    src += 2;
+	}
+
+	if (w)
+	{
+	    __m64 s = load8888 (src);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, over_rev_non_pre (s, d));
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line;
+    uint32_t    *mask_line;
+    int dst_stride, mask_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	int twidth = width;
+	uint32_t *p = (uint32_t *)mask_line;
+	uint16_t *q = (uint16_t *)dst_line;
+
+	while (twidth && ((uintptr_t)q & 7))
+	{
+	    uint32_t m = *(uint32_t *)p;
+
+	    if (m)
+	    {
+		uint64_t d = *q;
+		__m64 vdest = expand565 (to_m64 (d), 0);
+		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
+		*q = to_uint64 (vdest);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	while (twidth >= 4)
+	{
+	    uint32_t m0, m1, m2, m3;
+
+	    m0 = *p;
+	    m1 = *(p + 1);
+	    m2 = *(p + 2);
+	    m3 = *(p + 3);
+
+	    if ((m0 | m1 | m2 | m3))
+	    {
+		__m64 vdest = *(__m64 *)q;
+		__m64 v0, v1, v2, v3;
+
+		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
+		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
+		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
+		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
+
+		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
+	    }
+	    twidth -= 4;
+	    p += 4;
+	    q += 4;
+	}
+
+	while (twidth)
+	{
+	    uint32_t m;
+
+	    m = *(uint32_t *)p;
+	    if (m)
+	    {
+		uint64_t d = *q;
+		__m64 vdest = expand565 (to_m64 (d), 0);
+		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
+		*q = to_uint64 (vdest);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	mask_line += mask_stride;
+	dst_line += dst_stride;
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
+                        pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+    __m64 vsrc, vsrca;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    sa = src >> 24;
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint16_t tmp;
+	    uint8_t a;
+	    uint32_t m, d;
+
+	    a = *mask++;
+	    d = *dst;
+
+	    m = MUL_UN8 (sa, a, tmp);
+	    d = MUL_UN8 (m, d, tmp);
+
+	    *dst++ = d;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    __m64 vmask;
+	    __m64 vdest;
+
+	    vmask = load8888u ((uint32_t *)mask);
+	    vdest = load8888 ((uint32_t *)dst);
+
+	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
+
+	    dst += 4;
+	    mask += 4;
+	    w -= 4;
+	}
+
+	while (w--)
+	{
+	    uint16_t tmp;
+	    uint8_t a;
+	    uint32_t m, d;
+
+	    a = *mask++;
+	    d = *dst;
+
+	    m = MUL_UN8 (sa, a, tmp);
+	    d = MUL_UN8 (m, d, tmp);
+
+	    *dst++ = d;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_in_8_8 (pixman_implementation_t *imp,
+                      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int src_stride, dst_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 3)
+	{
+	    uint8_t s, d;
+	    uint16_t tmp;
+
+	    s = *src;
+	    d = *dst;
+
+	    *dst = MUL_UN8 (s, d, tmp);
+
+	    src++;
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    uint32_t *s = (uint32_t *)src;
+	    uint32_t *d = (uint32_t *)dst;
+
+	    store8888 (d, in (load8888u (s), load8888 (d)));
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+
+	while (w--)
+	{
+	    uint8_t s, d;
+	    uint16_t tmp;
+
+	    s = *src;
+	    d = *dst;
+
+	    *dst = MUL_UN8 (s, d, tmp);
+
+	    src++;
+	    dst++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
+			 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint8_t sa;
+    __m64 vsrc, vsrca;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    sa = src >> 24;
+
+    if (src == 0)
+	return;
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 3)
+	{
+	    uint16_t tmp;
+	    uint16_t a;
+	    uint32_t m, d;
+	    uint32_t r;
+
+	    a = *mask++;
+	    d = *dst;
+
+	    m = MUL_UN8 (sa, a, tmp);
+	    r = ADD_UN8 (m, d, tmp);
+
+	    *dst++ = r;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    __m64 vmask;
+	    __m64 vdest;
+
+	    vmask = load8888u ((uint32_t *)mask);
+	    vdest = load8888 ((uint32_t *)dst);
+
+	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
+
+	    dst += 4;
+	    mask += 4;
+	    w -= 4;
+	}
+
+	while (w--)
+	{
+	    uint16_t tmp;
+	    uint16_t a;
+	    uint32_t m, d;
+	    uint32_t r;
+
+	    a = *mask++;
+	    d = *dst;
+
+	    m = MUL_UN8 (sa, a, tmp);
+	    r = ADD_UN8 (m, d, tmp);
+
+	    *dst++ = r;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_add_8_8 (pixman_implementation_t *imp,
+		       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t *dst_line, *dst;
+    uint8_t *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint8_t s, d;
+    uint16_t t;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    s = *src;
+	    d = *dst;
+	    t = d + s;
+	    s = t | (0 - (t >> 8));
+	    *dst = s;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+
+	while (w >= 8)
+	{
+	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
+	    dst += 8;
+	    src += 8;
+	    w -= 8;
+	}
+
+	while (w)
+	{
+	    s = *src;
+	    d = *dst;
+	    t = d + s;
+	    s = t | (0 - (t >> 8));
+	    *dst = s;
+
+	    dst++;
+	    src++;
+	    w--;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t	d;
+    uint16_t    *src_line, *src;
+    uint32_t	s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    s = *src++;
+	    if (s)
+	    {
+		d = *dst;
+		s = convert_0565_to_8888 (s);
+		if (d)
+		{
+		    d = convert_0565_to_8888 (d);
+		    UN8x4_ADD_UN8x4 (s, d);
+		}
+		*dst = convert_8888_to_0565 (s);
+	    }
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    __m64 vdest = *(__m64 *)dst;
+	    __m64 vsrc = ldq_u ((__m64 *)src);
+	    __m64 vd0, vd1;
+	    __m64 vs0, vs1;
+
+	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
+	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
+
+	    vd0 = _mm_adds_pu8 (vd0, vs0);
+	    vd1 = _mm_adds_pu8 (vd1, vs1);
+
+	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
+
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+	}
+
+	while (w--)
+	{
+	    s = *src++;
+	    if (s)
+	    {
+		d = *dst;
+		s = convert_0565_to_8888 (s);
+		if (d)
+		{
+		    d = convert_0565_to_8888 (d);
+		    UN8x4_ADD_UN8x4 (s, d);
+		}
+		*dst = convert_8888_to_0565 (s);
+	    }
+	    dst++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
+	                              load ((const uint32_t *)dst)));
+	    dst++;
+	    src++;
+	    w--;
+	}
+
+	while (w >= 2)
+	{
+	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
+	    dst += 2;
+	    src += 2;
+	    w -= 2;
+	}
+
+	if (w)
+	{
+	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
+	                              load ((const uint32_t *)dst)));
+
+	}
+    }
+
+    _mm_empty ();
+}
+
+static pixman_bool_t
+mmx_blt (pixman_implementation_t *imp,
+         uint32_t *               src_bits,
+         uint32_t *               dst_bits,
+         int                      src_stride,
+         int                      dst_stride,
+         int                      src_bpp,
+         int                      dst_bpp,
+         int                      src_x,
+         int                      src_y,
+         int                      dest_x,
+         int                      dest_y,
+         int                      width,
+         int                      height)
+{
+    uint8_t *   src_bytes;
+    uint8_t *   dst_bytes;
+    int byte_width;
+
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    if (src_bpp == 16)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+	byte_width = 2 * width;
+	src_stride *= 2;
+	dst_stride *= 2;
+    }
+    else if (src_bpp == 32)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+	byte_width = 4 * width;
+	src_stride *= 4;
+	dst_stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+
+    while (height--)
+    {
+	int w;
+	uint8_t *s = src_bytes;
+	uint8_t *d = dst_bytes;
+	src_bytes += src_stride;
+	dst_bytes += dst_stride;
+	w = byte_width;
+
+	if (w >= 1 && ((uintptr_t)d & 1))
+	{
+	    *(uint8_t *)d = *(uint8_t *)s;
+	    w -= 1;
+	    s += 1;
+	    d += 1;
+	}
+
+	if (w >= 2 && ((uintptr_t)d & 3))
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((uintptr_t)d & 7))
+	{
+	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+
+	while (w >= 64)
+	{
+#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
+	    __asm__ (
+	        "movq	  (%1),	  %%mm0\n"
+	        "movq	 8(%1),	  %%mm1\n"
+	        "movq	16(%1),	  %%mm2\n"
+	        "movq	24(%1),	  %%mm3\n"
+	        "movq	32(%1),	  %%mm4\n"
+	        "movq	40(%1),	  %%mm5\n"
+	        "movq	48(%1),	  %%mm6\n"
+	        "movq	56(%1),	  %%mm7\n"
+
+	        "movq	%%mm0,	  (%0)\n"
+	        "movq	%%mm1,	 8(%0)\n"
+	        "movq	%%mm2,	16(%0)\n"
+	        "movq	%%mm3,	24(%0)\n"
+	        "movq	%%mm4,	32(%0)\n"
+	        "movq	%%mm5,	40(%0)\n"
+	        "movq	%%mm6,	48(%0)\n"
+	        "movq	%%mm7,	56(%0)\n"
+		:
+		: "r" (d), "r" (s)
+		: "memory",
+		  "%mm0", "%mm1", "%mm2", "%mm3",
+		  "%mm4", "%mm5", "%mm6", "%mm7");
+#else
+	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
+	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
+	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
+	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
+	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
+	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
+	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
+	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
+	    *(__m64 *)(d + 0)  = v0;
+	    *(__m64 *)(d + 8)  = v1;
+	    *(__m64 *)(d + 16) = v2;
+	    *(__m64 *)(d + 24) = v3;
+	    *(__m64 *)(d + 32) = v4;
+	    *(__m64 *)(d + 40) = v5;
+	    *(__m64 *)(d + 48) = v6;
+	    *(__m64 *)(d + 56) = v7;
+#endif
+
+	    w -= 64;
+	    s += 64;
+	    d += 64;
+	}
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+    }
+
+    _mm_empty ();
+
+    return TRUE;
+}
+
+static void
+mmx_composite_copy_area (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+
+    mmx_blt (imp, src_image->bits.bits,
+	     dest_image->bits.bits,
+	     src_image->bits.rowstride,
+	     dest_image->bits.rowstride,
+	     PIXMAN_FORMAT_BPP (src_image->bits.format),
+	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
+	     src_x, src_y, dest_x, dest_y, width, height);
+}
+
+static void
+mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t  *src, *src_line;
+    uint32_t  *dst, *dst_line;
+    uint8_t  *mask, *mask_line;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	src = src_line;
+	src_line += src_stride;
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+
+	w = width;
+
+	while (w--)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		uint32_t ssrc = *src | 0xff000000;
+		__m64 s = load8888 (&ssrc);
+
+		if (m == 0xff)
+		{
+		    store8888 (dst, s);
+		}
+		else
+		{
+		    __m64 sa = expand_alpha (s);
+		    __m64 vm = expand_alpha_rev (to_m64 (m));
+		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));
+
+		    store8888 (dst, vdest);
+		}
+	    }
+
+	    mask++;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (&src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    __m64 vdest = load8888 (dst);
+
+	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 2)
+	{
+	    __m64 vdest = *(__m64 *)dst;
+	    __m64 dest0 = expand8888 (vdest, 0);
+	    __m64 dest1 = expand8888 (vdest, 1);
+
+
+	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
+	    dest1 = over (dest1, expand_alpha (dest1), vsrc);
+
+	    *(__m64 *)dst = pack8888 (dest0, dest1);
+
+	    dst += 2;
+	    w -= 2;
+	}
+
+	CHECKPOINT ();
+
+	if (w)
+	{
+	    __m64 vdest = load8888 (dst);
+
+	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
+	}
+    }
+
+    _mm_empty ();
+}
+
+#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
+#define BMSK (BSHIFT - 1)
+
+#define BILINEAR_DECLARE_VARIABLES						\
+    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
+    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
+    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
+    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
+    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
+    const __m64 mm_zero = _mm_setzero_si64 ();					\
+    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
+do {										\
+    /* fetch 2x2 pixel block into 2 mmx registers */				\
+    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
+    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
+    /* vertical interpolation */						\
+    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
+    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
+    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
+    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
+    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
+    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
+    /* calculate horizontal weights */						\
+    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
+			  _mm_srli_pi16 (mm_x,					\
+					 16 - BILINEAR_INTERPOLATION_BITS)));	\
+    /* horizontal interpolation */						\
+    __m64 p = _mm_unpacklo_pi16 (lo, hi);					\
+    __m64 q = _mm_unpackhi_pi16 (lo, hi);					\
+    vx += unit_x;								\
+    lo = _mm_madd_pi16 (p, mm_wh);						\
+    hi = _mm_madd_pi16 (q, mm_wh);						\
+    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
+    /* shift and pack the result */						\
+    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
+    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
+    lo = _mm_packs_pi32 (lo, hi);						\
+    lo = _mm_packs_pu16 (lo, lo);						\
+    pix = lo;									\
+} while (0)
+
+#define BILINEAR_SKIP_ONE_PIXEL()						\
+do {										\
+    vx += unit_x;								\
+    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
+} while(0)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
+					    const uint32_t * mask,
+					    const uint32_t * src_top,
+					    const uint32_t * src_bottom,
+					    int32_t          w,
+					    int              wt,
+					    int              wb,
+					    pixman_fixed_t   vx,
+					    pixman_fixed_t   unit_x,
+					    pixman_fixed_t   max_vx,
+					    pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    __m64 pix;
+
+    while (w--)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
+	store (dst, pix);
+	dst++;
+    }
+
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
+			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    __m64 pix1, pix2;
+
+    while (w)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+	if (!is_zero (pix1))
+	{
+	    pix2 = load (dst);
+	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
+	}
+
+	w--;
+	dst++;
+    }
+
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
+					       const uint8_t  * mask,
+					       const uint32_t * src_top,
+					       const uint32_t * src_bottom,
+					       int32_t          w,
+					       int              wt,
+					       int              wb,
+					       pixman_fixed_t   vx,
+					       pixman_fixed_t   unit_x,
+					       pixman_fixed_t   max_vx,
+					       pixman_bool_t    zero_src)
+{
+    BILINEAR_DECLARE_VARIABLES;
+    __m64 pix1, pix2;
+    uint32_t m;
+
+    while (w)
+    {
+	m = (uint32_t) *mask++;
+
+	if (m)
+	{
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+	    if (m == 0xff && is_opaque (pix1))
+	    {
+		store (dst, pix1);
+	    }
+	    else
+	    {
+		__m64 ms, md, ma, msa;
+
+		pix2 = load (dst);
+		ma = expand_alpha_rev (to_m64 (m));
+		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
+		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
+
+		msa = expand_alpha (ms);
+
+		store8888 (dst, (in_over (ms, msa, ma, md)));
+	    }
+	}
+	else
+	{
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	}
+
+	w--;
+	dst++;
+    }
+
+    _mm_empty ();
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
+			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
+static uint32_t *
+mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint32_t *src = (uint32_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((uintptr_t)dst) & 7)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    while (w >= 8)
+    {
+	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
+	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
+	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
+	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
+
+	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
+	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
+	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
+	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
+
+	dst += 8;
+	src += 8;
+	w -= 8;
+    }
+
+    while (w)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    _mm_empty ();
+    return iter->buffer;
+}
+
+static uint32_t *
+mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint16_t *src = (uint16_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((uintptr_t)dst) & 0x0f)
+    {
+	uint16_t s = *src++;
+
+	*dst++ = convert_0565_to_8888 (s);
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m64 vsrc = ldq_u ((__m64 *)src);
+	__m64 mm0, mm1;
+
+	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
+
+	*(__m64 *)(dst + 0) = mm0;
+	*(__m64 *)(dst + 2) = mm1;
+
+	dst += 4;
+	src += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	uint16_t s = *src++;
+
+	*dst++ = convert_0565_to_8888 (s);
+	w--;
+    }
+
+    _mm_empty ();
+    return iter->buffer;
+}
+
+static uint32_t *
+mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && (((uintptr_t)dst) & 15))
+    {
+        *dst++ = *(src++) << 24;
+        w--;
+    }
+
+    while (w >= 8)
+    {
+	__m64 mm0 = ldq_u ((__m64 *)src);
+
+	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
+	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
+	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
+	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
+	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
+	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
+
+	*(__m64 *)(dst + 0) = mm3;
+	*(__m64 *)(dst + 2) = mm4;
+	*(__m64 *)(dst + 4) = mm5;
+	*(__m64 *)(dst + 6) = mm6;
+
+	dst += 8;
+	src += 8;
+	w -= 8;
+    }
+
+    while (w)
+    {
+	*dst++ = *(src++) << 24;
+	w--;
+    }
+
+    _mm_empty ();
+    return iter->buffer;
+}
+
+#define IMAGE_FLAGS							\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t mmx_iters[] = 
+{
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
+    },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
+    },
+    { PIXMAN_null },
+};
+
+static const pixman_fast_path_t mmx_fast_paths[] =
+{
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
+    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
+    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
+
+    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
+
+    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
+    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
+    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
+
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
+    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
+
+    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
+    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
+
+    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
+
+    imp->blt = mmx_blt;
+    imp->fill = mmx_fill;
+
+    imp->iter_info = mmx_iters;
+
+    return imp;
+}
+
+#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
diff --git a/src/pixman/pixman/pixman-noop.c b/src/pixman/pixman/pixman-noop.c
new file mode 100644
index 0000000..e598904
--- /dev/null
+++ b/src/pixman/pixman/pixman-noop.c
@@ -0,0 +1,161 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2011 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <string.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static void
+noop_composite (pixman_implementation_t *imp,
+		pixman_composite_info_t *info)
+{
+    return;
+}
+
+static uint32_t *
+noop_get_scanline (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *result = iter->buffer;
+
+    iter->buffer += iter->image->bits.rowstride;
+
+    return result;
+}
+
+static void
+noop_init_solid_narrow (pixman_iter_t *iter,
+			const pixman_iter_info_t *info)
+{ 
+    pixman_image_t *image = iter->image;
+    uint32_t *buffer = iter->buffer;
+    uint32_t *end = buffer + iter->width;
+    uint32_t color;
+
+    if (iter->image->type == SOLID)
+	color = image->solid.color_32;
+    else
+	color = image->bits.fetch_pixel_32 (&image->bits, 0, 0);
+
+    while (buffer < end)
+	*(buffer++) = color;
+}
+
+static void
+noop_init_solid_wide (pixman_iter_t *iter,
+		      const pixman_iter_info_t *info)
+{
+    pixman_image_t *image = iter->image;
+    argb_t *buffer = (argb_t *)iter->buffer;
+    argb_t *end = buffer + iter->width;
+    argb_t color;
+
+    if (iter->image->type == SOLID)
+	color = image->solid.color_float;
+    else
+	color = image->bits.fetch_pixel_float (&image->bits, 0, 0);
+
+    while (buffer < end)
+	*(buffer++) = color;
+}
+
+static void
+noop_init_direct_buffer (pixman_iter_t *iter, const pixman_iter_info_t *info)
+{
+    pixman_image_t *image = iter->image;
+
+    iter->buffer =
+	image->bits.bits + iter->y * image->bits.rowstride + iter->x;
+}
+
+static void
+dest_write_back_direct (pixman_iter_t *iter)
+{
+    iter->buffer += iter->image->bits.rowstride;
+}
+
+static const pixman_iter_info_t noop_iters[] =
+{
+    /* Source iters */
+    { PIXMAN_any,
+      0, ITER_IGNORE_ALPHA | ITER_IGNORE_RGB | ITER_SRC,
+      NULL,
+      _pixman_iter_get_scanline_noop,
+      NULL
+    },
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP, ITER_NARROW | ITER_SRC,
+      noop_init_solid_narrow,
+      _pixman_iter_get_scanline_noop,
+      NULL,
+    },
+    { PIXMAN_solid,
+      FAST_PATH_NO_ALPHA_MAP, ITER_WIDE | ITER_SRC,
+      noop_init_solid_wide,
+      _pixman_iter_get_scanline_noop,
+      NULL
+    },
+    { PIXMAN_a8r8g8b8,
+      FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |
+          FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST,
+      ITER_NARROW | ITER_SRC,
+      noop_init_direct_buffer,
+      noop_get_scanline,
+      NULL
+    },
+    /* Dest iters */
+    { PIXMAN_a8r8g8b8,
+      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST,
+      noop_init_direct_buffer,
+      _pixman_iter_get_scanline_noop,
+      dest_write_back_direct
+    },
+    { PIXMAN_x8r8g8b8,
+      FAST_PATH_STD_DEST_FLAGS, ITER_NARROW | ITER_DEST | ITER_LOCALIZED_ALPHA,
+      noop_init_direct_buffer,
+      _pixman_iter_get_scanline_noop,
+      dest_write_back_direct
+    },
+    { PIXMAN_null },
+};
+
+static const pixman_fast_path_t noop_fast_paths[] =
+{
+    { PIXMAN_OP_DST, PIXMAN_any, 0, PIXMAN_any, 0, PIXMAN_any, 0, noop_composite },
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_noop (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (fallback, noop_fast_paths);
+ 
+    imp->iter_info = noop_iters;
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-ppc.c b/src/pixman/pixman/pixman-ppc.c
new file mode 100644
index 0000000..a6e7bb0
--- /dev/null
+++ b/src/pixman/pixman/pixman-ppc.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+#ifdef USE_VMX
+
+/* The CPU detection code needs to be in a file not compiled with
+ * "-maltivec -mabi=altivec", as gcc would try to save vector register
+ * across function calls causing SIGILL on cpus without Altivec/vmx.
+ */
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    int error, have_vmx;
+    size_t length = sizeof(have_vmx);
+
+    error = sysctlbyname ("hw.optional.altivec", &have_vmx, &length, NULL, 0);
+
+    if (error)
+	return FALSE;
+
+    return have_vmx;
+}
+
+#elif defined (__OpenBSD__)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    int error, have_vmx;
+    int mib[2] = { CTL_MACHDEP, CPU_ALTIVEC };
+    size_t length = sizeof(have_vmx);
+
+    error = sysctl (mib, 2, &have_vmx, &length, NULL, 0);
+
+    if (error != 0)
+	return FALSE;
+
+    return have_vmx;
+}
+
+#elif defined (__linux__)
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <linux/auxvec.h>
+#include <asm/cputable.h>
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    int have_vmx = FALSE;
+    int fd;
+    struct
+    {
+	unsigned long type;
+	unsigned long value;
+    } aux;
+
+    fd = open ("/proc/self/auxv", O_RDONLY);
+    if (fd >= 0)
+    {
+	while (read (fd, &aux, sizeof (aux)) == sizeof (aux))
+	{
+	    if (aux.type == AT_HWCAP && (aux.value & PPC_FEATURE_HAS_ALTIVEC))
+	    {
+		have_vmx = TRUE;
+		break;
+	    }
+	}
+
+	close (fd);
+    }
+
+    return have_vmx;
+}
+
+#else /* !__APPLE__ && !__OpenBSD__ && !__linux__ */
+#include <signal.h>
+#include <setjmp.h>
+
+static jmp_buf jump_env;
+
+static void
+vmx_test (int        sig,
+	  siginfo_t *si,
+	  void *     unused)
+{
+    longjmp (jump_env, 1);
+}
+
+static pixman_bool_t
+pixman_have_vmx (void)
+{
+    struct sigaction sa, osa;
+    int jmp_result;
+
+    sa.sa_flags = SA_SIGINFO;
+    sigemptyset (&sa.sa_mask);
+    sa.sa_sigaction = vmx_test;
+    sigaction (SIGILL, &sa, &osa);
+    jmp_result = setjmp (jump_env);
+    if (jmp_result == 0)
+    {
+	asm volatile ( "vor 0, 0, 0" );
+    }
+    sigaction (SIGILL, &osa, NULL);
+    return (jmp_result == 0);
+}
+
+#endif /* __APPLE__ */
+#endif /* USE_VMX */
+
+pixman_implementation_t *
+_pixman_ppc_get_implementations (pixman_implementation_t *imp)
+{
+#ifdef USE_VMX
+    if (!_pixman_disabled ("vmx") && pixman_have_vmx ())
+	imp = _pixman_implementation_create_vmx (imp);
+#endif
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-private.h b/src/pixman/pixman/pixman-private.h
new file mode 100644
index 0000000..6ca13b2
--- /dev/null
+++ b/src/pixman/pixman/pixman-private.h
@@ -0,0 +1,1153 @@
+#include <float.h>
+
+#ifndef PIXMAN_PRIVATE_H
+#define PIXMAN_PRIVATE_H
+
+/*
+ * The defines which are shared between C and assembly code
+ */
+
+/* bilinear interpolation precision (must be <= 8) */
+#define BILINEAR_INTERPOLATION_BITS 7
+#define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
+
+/*
+ * C specific part
+ */
+
+#ifndef __ASSEMBLER__
+
+#ifndef PACKAGE
+#  error config.h must be included before pixman-private.h
+#endif
+
+#define PIXMAN_DISABLE_DEPRECATED
+#define PIXMAN_USE_INTERNAL_API
+
+#include "pixman.h"
+#include <time.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stddef.h>
+
+#include "pixman-compiler.h"
+
+/*
+ * Images
+ */
+typedef struct image_common image_common_t;
+typedef struct solid_fill solid_fill_t;
+typedef struct gradient gradient_t;
+typedef struct linear_gradient linear_gradient_t;
+typedef struct horizontal_gradient horizontal_gradient_t;
+typedef struct vertical_gradient vertical_gradient_t;
+typedef struct conical_gradient conical_gradient_t;
+typedef struct radial_gradient radial_gradient_t;
+typedef struct bits_image bits_image_t;
+typedef struct circle circle_t;
+
+typedef struct argb_t argb_t;
+
+struct argb_t
+{
+    float a;
+    float r;
+    float g;
+    float b;
+};
+
+typedef void (*fetch_scanline_t) (bits_image_t   *image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  uint32_t       *buffer,
+				  const uint32_t *mask);
+
+typedef uint32_t (*fetch_pixel_32_t) (bits_image_t *image,
+				      int           x,
+				      int           y);
+
+typedef argb_t (*fetch_pixel_float_t) (bits_image_t *image,
+				       int           x,
+				       int           y);
+
+typedef void (*store_scanline_t) (bits_image_t *  image,
+				  int             x,
+				  int             y,
+				  int             width,
+				  const uint32_t *values);
+
+typedef enum
+{
+    BITS,
+    LINEAR,
+    CONICAL,
+    RADIAL,
+    SOLID
+} image_type_t;
+
+typedef void (*property_changed_func_t) (pixman_image_t *image);
+
+struct image_common
+{
+    image_type_t                type;
+    int32_t                     ref_count;
+    pixman_region32_t           clip_region;
+    int32_t			alpha_count;	    /* How many times this image is being used as an alpha map */
+    pixman_bool_t               have_clip_region;   /* FALSE if there is no clip */
+    pixman_bool_t               client_clip;        /* Whether the source clip was
+						       set by a client */
+    pixman_bool_t               clip_sources;       /* Whether the clip applies when
+						     * the image is used as a source
+						     */
+    pixman_bool_t		dirty;
+    pixman_transform_t *        transform;
+    pixman_repeat_t             repeat;
+    pixman_filter_t             filter;
+    pixman_fixed_t *            filter_params;
+    int                         n_filter_params;
+    bits_image_t *              alpha_map;
+    int                         alpha_origin_x;
+    int                         alpha_origin_y;
+    pixman_bool_t               component_alpha;
+    property_changed_func_t     property_changed;
+
+    pixman_image_destroy_func_t destroy_func;
+    void *                      destroy_data;
+
+    uint32_t			flags;
+    pixman_format_code_t	extended_format_code;
+};
+
+struct solid_fill
+{
+    image_common_t common;
+    pixman_color_t color;
+
+    uint32_t	   color_32;
+    argb_t	   color_float;
+};
+
+struct gradient
+{
+    image_common_t	    common;
+    int                     n_stops;
+    pixman_gradient_stop_t *stops;
+};
+
+struct linear_gradient
+{
+    gradient_t           common;
+    pixman_point_fixed_t p1;
+    pixman_point_fixed_t p2;
+};
+
+struct circle
+{
+    pixman_fixed_t x;
+    pixman_fixed_t y;
+    pixman_fixed_t radius;
+};
+
+struct radial_gradient
+{
+    gradient_t common;
+
+    circle_t   c1;
+    circle_t   c2;
+
+    circle_t   delta;
+    double     a;
+    double     inva;
+    double     mindr;
+};
+
+struct conical_gradient
+{
+    gradient_t           common;
+    pixman_point_fixed_t center;
+    double		 angle;
+};
+
+struct bits_image
+{
+    image_common_t             common;
+    pixman_format_code_t       format;
+    const pixman_indexed_t *   indexed;
+    int                        width;
+    int                        height;
+    uint32_t *                 bits;
+    uint32_t *                 free_me;
+    int                        rowstride;  /* in number of uint32_t's */
+
+    fetch_scanline_t           fetch_scanline_32;
+    fetch_pixel_32_t	       fetch_pixel_32;
+    store_scanline_t           store_scanline_32;
+
+    fetch_scanline_t	       fetch_scanline_float;
+    fetch_pixel_float_t	       fetch_pixel_float;
+    store_scanline_t           store_scanline_float;
+
+    /* Used for indirect access to the bits */
+    pixman_read_memory_func_t  read_func;
+    pixman_write_memory_func_t write_func;
+};
+
+union pixman_image
+{
+    image_type_t       type;
+    image_common_t     common;
+    bits_image_t       bits;
+    gradient_t         gradient;
+    linear_gradient_t  linear;
+    conical_gradient_t conical;
+    radial_gradient_t  radial;
+    solid_fill_t       solid;
+};
+
+typedef struct pixman_iter_t pixman_iter_t;
+typedef uint32_t *(* pixman_iter_get_scanline_t) (pixman_iter_t *iter, const uint32_t *mask);
+typedef void      (* pixman_iter_write_back_t)   (pixman_iter_t *iter);
+typedef void	  (* pixman_iter_fini_t)	 (pixman_iter_t *iter);
+
+typedef enum
+{
+    ITER_NARROW =               (1 << 0),
+    ITER_WIDE =                 (1 << 1),
+
+    /* "Localized alpha" is when the alpha channel is used only to compute
+     * the alpha value of the destination. This means that the computation
+     * of the RGB values of the result is independent of the alpha value.
+     *
+     * For example, the OVER operator has localized alpha for the
+     * destination, because the RGB values of the result can be computed
+     * without knowing the destination alpha. Similarly, ADD has localized
+     * alpha for both source and destination because the RGB values of the
+     * result can be computed without knowing the alpha value of source or
+     * destination.
+     *
+     * When he destination is xRGB, this is useful knowledge, because then
+     * we can treat it as if it were ARGB, which means in some cases we can
+     * avoid copying it to a temporary buffer.
+     */
+    ITER_LOCALIZED_ALPHA =	(1 << 2),
+    ITER_IGNORE_ALPHA =		(1 << 3),
+    ITER_IGNORE_RGB =		(1 << 4),
+
+    /* These indicate whether the iterator is for a source
+     * or a destination image
+     */
+    ITER_SRC =			(1 << 5),
+    ITER_DEST =			(1 << 6)
+} iter_flags_t;
+
+struct pixman_iter_t
+{
+    /* These are initialized by _pixman_implementation_{src,dest}_init */
+    pixman_image_t *		image;
+    uint32_t *			buffer;
+    int				x, y;
+    int				width;
+    int				height;
+    iter_flags_t		iter_flags;
+    uint32_t			image_flags;
+
+    /* These function pointers are initialized by the implementation */
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+    pixman_iter_fini_t          fini;
+
+    /* These fields are scratch data that implementations can use */
+    void *			data;
+    uint8_t *			bits;
+    int				stride;
+};
+
+typedef struct pixman_iter_info_t pixman_iter_info_t;
+typedef void (* pixman_iter_initializer_t) (pixman_iter_t *iter,
+                                            const pixman_iter_info_t *info);
+struct pixman_iter_info_t
+{
+    pixman_format_code_t	format;
+    uint32_t			image_flags;
+    iter_flags_t		iter_flags;
+    pixman_iter_initializer_t	initializer;
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+};
+
+void
+_pixman_bits_image_setup_accessors (bits_image_t *image);
+
+void
+_pixman_bits_image_src_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_bits_image_dest_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_linear_gradient_iter_init (pixman_image_t *image, pixman_iter_t  *iter);
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_conical_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter);
+
+void
+_pixman_image_init (pixman_image_t *image);
+
+pixman_bool_t
+_pixman_bits_image_init (pixman_image_t *     image,
+                         pixman_format_code_t format,
+                         int                  width,
+                         int                  height,
+                         uint32_t *           bits,
+                         int                  rowstride,
+			 pixman_bool_t	      clear);
+pixman_bool_t
+_pixman_image_fini (pixman_image_t *image);
+
+pixman_image_t *
+_pixman_image_allocate (void);
+
+pixman_bool_t
+_pixman_init_gradient (gradient_t *                  gradient,
+                       const pixman_gradient_stop_t *stops,
+                       int                           n_stops);
+void
+_pixman_image_reset_clip_region (pixman_image_t *image);
+
+void
+_pixman_image_validate (pixman_image_t *image);
+
+#define PIXMAN_IMAGE_GET_LINE(image, x, y, type, out_stride, line, mul)	\
+    do									\
+    {									\
+	uint32_t *__bits__;						\
+	int       __stride__;						\
+        								\
+	__bits__ = image->bits.bits;					\
+	__stride__ = image->bits.rowstride;				\
+	(out_stride) =							\
+	    __stride__ * (int) sizeof (uint32_t) / (int) sizeof (type);	\
+	(line) =							\
+	    ((type *) __bits__) + (out_stride) * (y) + (mul) * (x);	\
+    } while (0)
+
+/*
+ * Gradient walker
+ */
+typedef struct
+{
+    float		    a_s, a_b;
+    float		    r_s, r_b;
+    float		    g_s, g_b;
+    float		    b_s, b_b;
+    pixman_fixed_t	    left_x;
+    pixman_fixed_t          right_x;
+
+    pixman_gradient_stop_t *stops;
+    int                     num_stops;
+    pixman_repeat_t	    repeat;
+
+    pixman_bool_t           need_reset;
+} pixman_gradient_walker_t;
+
+void
+_pixman_gradient_walker_init (pixman_gradient_walker_t *walker,
+                              gradient_t *              gradient,
+			      pixman_repeat_t           repeat);
+
+void
+_pixman_gradient_walker_reset (pixman_gradient_walker_t *walker,
+                               pixman_fixed_48_16_t      pos);
+
+uint32_t
+_pixman_gradient_walker_pixel (pixman_gradient_walker_t *walker,
+                               pixman_fixed_48_16_t      x);
+
+/*
+ * Edges
+ */
+
+#define MAX_ALPHA(n)    ((1 << (n)) - 1)
+#define N_Y_FRAC(n)     ((n) == 1 ? 1 : (1 << ((n) / 2)) - 1)
+#define N_X_FRAC(n)     ((n) == 1 ? 1 : (1 << ((n) / 2)) + 1)
+
+#define STEP_Y_SMALL(n) (pixman_fixed_1 / N_Y_FRAC (n))
+#define STEP_Y_BIG(n)   (pixman_fixed_1 - (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
+
+#define Y_FRAC_FIRST(n) (STEP_Y_BIG (n) / 2)
+#define Y_FRAC_LAST(n)  (Y_FRAC_FIRST (n) + (N_Y_FRAC (n) - 1) * STEP_Y_SMALL (n))
+
+#define STEP_X_SMALL(n) (pixman_fixed_1 / N_X_FRAC (n))
+#define STEP_X_BIG(n)   (pixman_fixed_1 - (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
+
+#define X_FRAC_FIRST(n) (STEP_X_BIG (n) / 2)
+#define X_FRAC_LAST(n)  (X_FRAC_FIRST (n) + (N_X_FRAC (n) - 1) * STEP_X_SMALL (n))
+
+#define RENDER_SAMPLES_X(x, n)						\
+    ((n) == 1? 0 : (pixman_fixed_frac (x) +				\
+		    X_FRAC_FIRST (n)) / STEP_X_SMALL (n))
+
+void
+pixman_rasterize_edges_accessors (pixman_image_t *image,
+                                  pixman_edge_t * l,
+                                  pixman_edge_t * r,
+                                  pixman_fixed_t  t,
+                                  pixman_fixed_t  b);
+
+/*
+ * Implementations
+ */
+typedef struct pixman_implementation_t pixman_implementation_t;
+
+typedef struct
+{
+    pixman_op_t              op;
+    pixman_image_t *         src_image;
+    pixman_image_t *         mask_image;
+    pixman_image_t *         dest_image;
+    int32_t                  src_x;
+    int32_t                  src_y;
+    int32_t                  mask_x;
+    int32_t                  mask_y;
+    int32_t                  dest_x;
+    int32_t                  dest_y;
+    int32_t                  width;
+    int32_t                  height;
+
+    uint32_t                 src_flags;
+    uint32_t                 mask_flags;
+    uint32_t                 dest_flags;
+} pixman_composite_info_t;
+
+#define PIXMAN_COMPOSITE_ARGS(info)					\
+    MAYBE_UNUSED pixman_op_t        op = info->op;			\
+    MAYBE_UNUSED pixman_image_t *   src_image = info->src_image;	\
+    MAYBE_UNUSED pixman_image_t *   mask_image = info->mask_image;	\
+    MAYBE_UNUSED pixman_image_t *   dest_image = info->dest_image;	\
+    MAYBE_UNUSED int32_t            src_x = info->src_x;		\
+    MAYBE_UNUSED int32_t            src_y = info->src_y;		\
+    MAYBE_UNUSED int32_t            mask_x = info->mask_x;		\
+    MAYBE_UNUSED int32_t            mask_y = info->mask_y;		\
+    MAYBE_UNUSED int32_t            dest_x = info->dest_x;		\
+    MAYBE_UNUSED int32_t            dest_y = info->dest_y;		\
+    MAYBE_UNUSED int32_t            width = info->width;		\
+    MAYBE_UNUSED int32_t            height = info->height
+
+typedef void (*pixman_combine_32_func_t) (pixman_implementation_t *imp,
+					  pixman_op_t              op,
+					  uint32_t *               dest,
+					  const uint32_t *         src,
+					  const uint32_t *         mask,
+					  int                      width);
+
+typedef void (*pixman_combine_float_func_t) (pixman_implementation_t *imp,
+					     pixman_op_t	      op,
+					     float *		      dest,
+					     const float *	      src,
+					     const float *	      mask,
+					     int		      n_pixels);
+
+typedef void (*pixman_composite_func_t) (pixman_implementation_t *imp,
+					 pixman_composite_info_t *info);
+typedef pixman_bool_t (*pixman_blt_func_t) (pixman_implementation_t *imp,
+					    uint32_t *               src_bits,
+					    uint32_t *               dst_bits,
+					    int                      src_stride,
+					    int                      dst_stride,
+					    int                      src_bpp,
+					    int                      dst_bpp,
+					    int                      src_x,
+					    int                      src_y,
+					    int                      dest_x,
+					    int                      dest_y,
+					    int                      width,
+					    int                      height);
+typedef pixman_bool_t (*pixman_fill_func_t) (pixman_implementation_t *imp,
+					     uint32_t *               bits,
+					     int                      stride,
+					     int                      bpp,
+					     int                      x,
+					     int                      y,
+					     int                      width,
+					     int                      height,
+					     uint32_t                 filler);
+
+void _pixman_setup_combiner_functions_32 (pixman_implementation_t *imp);
+void _pixman_setup_combiner_functions_float (pixman_implementation_t *imp);
+
+typedef struct
+{
+    pixman_op_t             op;
+    pixman_format_code_t    src_format;
+    uint32_t		    src_flags;
+    pixman_format_code_t    mask_format;
+    uint32_t		    mask_flags;
+    pixman_format_code_t    dest_format;
+    uint32_t		    dest_flags;
+    pixman_composite_func_t func;
+} pixman_fast_path_t;
+
+struct pixman_implementation_t
+{
+    pixman_implementation_t *	toplevel;
+    pixman_implementation_t *	fallback;
+    const pixman_fast_path_t *	fast_paths;
+    const pixman_iter_info_t *  iter_info;
+
+    pixman_blt_func_t		blt;
+    pixman_fill_func_t		fill;
+
+    pixman_combine_32_func_t	combine_32[PIXMAN_N_OPERATORS];
+    pixman_combine_32_func_t	combine_32_ca[PIXMAN_N_OPERATORS];
+    pixman_combine_float_func_t	combine_float[PIXMAN_N_OPERATORS];
+    pixman_combine_float_func_t	combine_float_ca[PIXMAN_N_OPERATORS];
+};
+
+uint32_t
+_pixman_image_get_solid (pixman_implementation_t *imp,
+			 pixman_image_t *         image,
+                         pixman_format_code_t     format);
+
+pixman_implementation_t *
+_pixman_implementation_create (pixman_implementation_t *fallback,
+			       const pixman_fast_path_t *fast_paths);
+
+void
+_pixman_implementation_lookup_composite (pixman_implementation_t  *toplevel,
+					 pixman_op_t               op,
+					 pixman_format_code_t      src_format,
+					 uint32_t                  src_flags,
+					 pixman_format_code_t      mask_format,
+					 uint32_t                  mask_flags,
+					 pixman_format_code_t      dest_format,
+					 uint32_t                  dest_flags,
+					 pixman_implementation_t **out_imp,
+					 pixman_composite_func_t  *out_func);
+
+pixman_combine_32_func_t
+_pixman_implementation_lookup_combiner (pixman_implementation_t *imp,
+					pixman_op_t		 op,
+					pixman_bool_t		 component_alpha,
+					pixman_bool_t		 wide);
+
+pixman_bool_t
+_pixman_implementation_blt (pixman_implementation_t *imp,
+                            uint32_t *               src_bits,
+                            uint32_t *               dst_bits,
+                            int                      src_stride,
+                            int                      dst_stride,
+                            int                      src_bpp,
+                            int                      dst_bpp,
+                            int                      src_x,
+                            int                      src_y,
+                            int                      dest_x,
+                            int                      dest_y,
+                            int                      width,
+                            int                      height);
+
+pixman_bool_t
+_pixman_implementation_fill (pixman_implementation_t *imp,
+                             uint32_t *               bits,
+                             int                      stride,
+                             int                      bpp,
+                             int                      x,
+                             int                      y,
+                             int                      width,
+                             int                      height,
+                             uint32_t                 filler);
+
+void
+_pixman_implementation_iter_init (pixman_implementation_t       *imp,
+                                  pixman_iter_t                 *iter,
+                                  pixman_image_t                *image,
+                                  int                            x,
+                                  int                            y,
+                                  int                            width,
+                                  int                            height,
+                                  uint8_t                       *buffer,
+                                  iter_flags_t                   flags,
+                                  uint32_t                       image_flags);
+
+/* Specific implementations */
+pixman_implementation_t *
+_pixman_implementation_create_general (void);
+
+pixman_implementation_t *
+_pixman_implementation_create_fast_path (pixman_implementation_t *fallback);
+
+pixman_implementation_t *
+_pixman_implementation_create_noop (pixman_implementation_t *fallback);
+
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
+pixman_implementation_t *
+_pixman_implementation_create_mmx (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_SSE2
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_SSSE3
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_ARM_SIMD
+pixman_implementation_t *
+_pixman_implementation_create_arm_simd (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_ARM_NEON
+pixman_implementation_t *
+_pixman_implementation_create_arm_neon (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_MIPS_DSPR2
+pixman_implementation_t *
+_pixman_implementation_create_mips_dspr2 (pixman_implementation_t *fallback);
+#endif
+
+#ifdef USE_VMX
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *fallback);
+#endif
+
+pixman_bool_t
+_pixman_implementation_disabled (const char *name);
+
+pixman_implementation_t *
+_pixman_x86_get_implementations (pixman_implementation_t *imp);
+
+pixman_implementation_t *
+_pixman_arm_get_implementations (pixman_implementation_t *imp);
+
+pixman_implementation_t *
+_pixman_ppc_get_implementations (pixman_implementation_t *imp);
+
+pixman_implementation_t *
+_pixman_mips_get_implementations (pixman_implementation_t *imp);
+
+pixman_implementation_t *
+_pixman_choose_implementation (void);
+
+pixman_bool_t
+_pixman_disabled (const char *name);
+
+
+/*
+ * Utilities
+ */
+pixman_bool_t
+_pixman_compute_composite_region32 (pixman_region32_t * region,
+				    pixman_image_t *    src_image,
+				    pixman_image_t *    mask_image,
+				    pixman_image_t *    dest_image,
+				    int32_t             src_x,
+				    int32_t             src_y,
+				    int32_t             mask_x,
+				    int32_t             mask_y,
+				    int32_t             dest_x,
+				    int32_t             dest_y,
+				    int32_t             width,
+				    int32_t             height);
+uint32_t *
+_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
+
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info);
+
+/* These "formats" all have depth 0, so they
+ * will never clash with any real ones
+ */
+#define PIXMAN_null             PIXMAN_FORMAT (0, 0, 0, 0, 0, 0)
+#define PIXMAN_solid            PIXMAN_FORMAT (0, 1, 0, 0, 0, 0)
+#define PIXMAN_pixbuf		PIXMAN_FORMAT (0, 2, 0, 0, 0, 0)
+#define PIXMAN_rpixbuf		PIXMAN_FORMAT (0, 3, 0, 0, 0, 0)
+#define PIXMAN_unknown		PIXMAN_FORMAT (0, 4, 0, 0, 0, 0)
+#define PIXMAN_any		PIXMAN_FORMAT (0, 5, 0, 0, 0, 0)
+
+#define PIXMAN_OP_any		(PIXMAN_N_OPERATORS + 1)
+
+#define FAST_PATH_ID_TRANSFORM			(1 <<  0)
+#define FAST_PATH_NO_ALPHA_MAP			(1 <<  1)
+#define FAST_PATH_NO_CONVOLUTION_FILTER		(1 <<  2)
+#define FAST_PATH_NO_PAD_REPEAT			(1 <<  3)
+#define FAST_PATH_NO_REFLECT_REPEAT		(1 <<  4)
+#define FAST_PATH_NO_ACCESSORS			(1 <<  5)
+#define FAST_PATH_NARROW_FORMAT			(1 <<  6)
+#define FAST_PATH_COMPONENT_ALPHA		(1 <<  8)
+#define FAST_PATH_SAMPLES_OPAQUE		(1 <<  7)
+#define FAST_PATH_UNIFIED_ALPHA			(1 <<  9)
+#define FAST_PATH_SCALE_TRANSFORM		(1 << 10)
+#define FAST_PATH_NEAREST_FILTER		(1 << 11)
+#define FAST_PATH_HAS_TRANSFORM			(1 << 12)
+#define FAST_PATH_IS_OPAQUE			(1 << 13)
+#define FAST_PATH_NO_NORMAL_REPEAT		(1 << 14)
+#define FAST_PATH_NO_NONE_REPEAT		(1 << 15)
+#define FAST_PATH_X_UNIT_POSITIVE		(1 << 16)
+#define FAST_PATH_AFFINE_TRANSFORM		(1 << 17)
+#define FAST_PATH_Y_UNIT_ZERO			(1 << 18)
+#define FAST_PATH_BILINEAR_FILTER		(1 << 19)
+#define FAST_PATH_ROTATE_90_TRANSFORM		(1 << 20)
+#define FAST_PATH_ROTATE_180_TRANSFORM		(1 << 21)
+#define FAST_PATH_ROTATE_270_TRANSFORM		(1 << 22)
+#define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	(1 << 23)
+#define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR	(1 << 24)
+#define FAST_PATH_BITS_IMAGE			(1 << 25)
+#define FAST_PATH_SEPARABLE_CONVOLUTION_FILTER  (1 << 26)
+
+#define FAST_PATH_PAD_REPEAT						\
+    (FAST_PATH_NO_NONE_REPEAT		|				\
+     FAST_PATH_NO_NORMAL_REPEAT		|				\
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_NORMAL_REPEAT						\
+    (FAST_PATH_NO_NONE_REPEAT		|				\
+     FAST_PATH_NO_PAD_REPEAT		|				\
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_NONE_REPEAT						\
+    (FAST_PATH_NO_NORMAL_REPEAT		|				\
+     FAST_PATH_NO_PAD_REPEAT		|				\
+     FAST_PATH_NO_REFLECT_REPEAT)
+
+#define FAST_PATH_REFLECT_REPEAT					\
+    (FAST_PATH_NO_NONE_REPEAT		|				\
+     FAST_PATH_NO_NORMAL_REPEAT		|				\
+     FAST_PATH_NO_PAD_REPEAT)
+
+#define FAST_PATH_STANDARD_FLAGS					\
+    (FAST_PATH_NO_CONVOLUTION_FILTER	|				\
+     FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NARROW_FORMAT)
+
+#define FAST_PATH_STD_DEST_FLAGS					\
+    (FAST_PATH_NO_ACCESSORS		|				\
+     FAST_PATH_NO_ALPHA_MAP		|				\
+     FAST_PATH_NARROW_FORMAT)
+
+#define SOURCE_FLAGS(format)						\
+    (FAST_PATH_STANDARD_FLAGS |						\
+     ((PIXMAN_ ## format == PIXMAN_solid) ?				\
+      0 : (FAST_PATH_SAMPLES_COVER_CLIP_NEAREST | FAST_PATH_NEAREST_FILTER | FAST_PATH_ID_TRANSFORM)))
+
+#define MASK_FLAGS(format, extra)					\
+    ((PIXMAN_ ## format == PIXMAN_null) ? 0 : (SOURCE_FLAGS (format) | extra))
+
+#define FAST_PATH(op, src, src_flags, mask, mask_flags, dest, dest_flags, func) \
+    PIXMAN_OP_ ## op,							\
+    PIXMAN_ ## src,							\
+    src_flags,							        \
+    PIXMAN_ ## mask,						        \
+    mask_flags,							        \
+    PIXMAN_ ## dest,	                                                \
+    dest_flags,							        \
+    func
+
+#define PIXMAN_STD_FAST_PATH(op, src, mask, dest, func)			\
+    { FAST_PATH (							\
+	    op,								\
+	    src,  SOURCE_FLAGS (src),					\
+	    mask, MASK_FLAGS (mask, FAST_PATH_UNIFIED_ALPHA),		\
+	    dest, FAST_PATH_STD_DEST_FLAGS,				\
+	    func) }
+
+#define PIXMAN_STD_FAST_PATH_CA(op, src, mask, dest, func)		\
+    { FAST_PATH (							\
+	    op,								\
+	    src,  SOURCE_FLAGS (src),					\
+	    mask, MASK_FLAGS (mask, FAST_PATH_COMPONENT_ALPHA),		\
+	    dest, FAST_PATH_STD_DEST_FLAGS,				\
+	    func) }
+
+extern pixman_implementation_t *global_implementation;
+
+static force_inline pixman_implementation_t *
+get_implementation (void)
+{
+#ifndef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR
+    if (!global_implementation)
+	global_implementation = _pixman_choose_implementation ();
+#endif
+    return global_implementation;
+}
+
+/* This function is exported for the sake of the test suite and not part
+ * of the ABI.
+ */
+PIXMAN_EXPORT pixman_implementation_t *
+_pixman_internal_only_get_implementation (void);
+
+/* Memory allocation helpers */
+void *
+pixman_malloc_ab (unsigned int n, unsigned int b);
+
+void *
+pixman_malloc_abc (unsigned int a, unsigned int b, unsigned int c);
+
+void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c);
+
+pixman_bool_t
+_pixman_multiply_overflows_size (size_t a, size_t b);
+
+pixman_bool_t
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b);
+
+pixman_bool_t
+_pixman_addition_overflows_int (unsigned int a, unsigned int b);
+
+/* Compositing utilities */
+void
+pixman_expand_to_float (argb_t               *dst,
+			const uint32_t       *src,
+			pixman_format_code_t  format,
+			int                   width);
+
+void
+pixman_contract_from_float (uint32_t     *dst,
+			    const argb_t *src,
+			    int           width);
+
+/* Region Helpers */
+pixman_bool_t
+pixman_region32_copy_from_region16 (pixman_region32_t *dst,
+                                    pixman_region16_t *src);
+
+pixman_bool_t
+pixman_region16_copy_from_region32 (pixman_region16_t *dst,
+                                    pixman_region32_t *src);
+
+/* Doubly linked lists */
+typedef struct pixman_link_t pixman_link_t;
+struct pixman_link_t
+{
+    pixman_link_t *next;
+    pixman_link_t *prev;
+};
+
+typedef struct pixman_list_t pixman_list_t;
+struct pixman_list_t
+{
+    pixman_link_t *head;
+    pixman_link_t *tail;
+};
+
+static force_inline void
+pixman_list_init (pixman_list_t *list)
+{
+    list->head = (pixman_link_t *)list;
+    list->tail = (pixman_link_t *)list;
+}
+
+static force_inline void
+pixman_list_prepend (pixman_list_t *list, pixman_link_t *link)
+{
+    link->next = list->head;
+    link->prev = (pixman_link_t *)list;
+    list->head->prev = link;
+    list->head = link;
+}
+
+static force_inline void
+pixman_list_unlink (pixman_link_t *link)
+{
+    link->prev->next = link->next;
+    link->next->prev = link->prev;
+}
+
+static force_inline void
+pixman_list_move_to_front (pixman_list_t *list, pixman_link_t *link)
+{
+    pixman_list_unlink (link);
+    pixman_list_prepend (list, link);
+}
+
+/* Misc macros */
+
+#ifndef FALSE
+#   define FALSE 0
+#endif
+
+#ifndef TRUE
+#   define TRUE 1
+#endif
+
+#ifndef MIN
+#  define MIN(a, b) ((a < b) ? a : b)
+#endif
+
+#ifndef MAX
+#  define MAX(a, b) ((a > b) ? a : b)
+#endif
+
+/* Integer division that rounds towards -infinity */
+#define DIV(a, b)					   \
+    ((((a) < 0) == ((b) < 0)) ? (a) / (b) :                \
+     ((a) - (b) + 1 - (((b) < 0) << 1)) / (b))
+
+/* Modulus that produces the remainder wrt. DIV */
+#define MOD(a, b) ((a) < 0 ? ((b) - ((-(a) - 1) % (b))) - 1 : (a) % (b))
+
+#define CLIP(v, low, high) ((v) < (low) ? (low) : ((v) > (high) ? (high) : (v)))
+
+#define FLOAT_IS_ZERO(f)     (-FLT_MIN < (f) && (f) < FLT_MIN)
+
+/* Conversion between 8888 and 0565 */
+
+static force_inline uint16_t
+convert_8888_to_0565 (uint32_t s)
+{
+    /* The following code can be compiled into just 4 instructions on ARM */
+    uint32_t a, b;
+    a = (s >> 3) & 0x1F001F;
+    b = s & 0xFC00;
+    a |= a >> 5;
+    a |= b >> 5;
+    return (uint16_t)a;
+}
+
+static force_inline uint32_t
+convert_0565_to_0888 (uint16_t s)
+{
+    return (((((s) << 3) & 0xf8) | (((s) >> 2) & 0x7)) |
+            ((((s) << 5) & 0xfc00) | (((s) >> 1) & 0x300)) |
+            ((((s) << 8) & 0xf80000) | (((s) << 3) & 0x70000)));
+}
+
+static force_inline uint32_t
+convert_0565_to_8888 (uint16_t s)
+{
+    return convert_0565_to_0888 (s) | 0xff000000;
+}
+
+/* Trivial versions that are useful in macros */
+
+static force_inline uint32_t
+convert_8888_to_8888 (uint32_t s)
+{
+    return s;
+}
+
+static force_inline uint32_t
+convert_x888_to_8888 (uint32_t s)
+{
+    return s | 0xff000000;
+}
+
+static force_inline uint16_t
+convert_0565_to_0565 (uint16_t s)
+{
+    return s;
+}
+
+#define PIXMAN_FORMAT_IS_WIDE(f)					\
+    (PIXMAN_FORMAT_A (f) > 8 ||						\
+     PIXMAN_FORMAT_R (f) > 8 ||						\
+     PIXMAN_FORMAT_G (f) > 8 ||						\
+     PIXMAN_FORMAT_B (f) > 8 ||						\
+     PIXMAN_FORMAT_TYPE (f) == PIXMAN_TYPE_ARGB_SRGB)
+
+#ifdef WORDS_BIGENDIAN
+#   define SCREEN_SHIFT_LEFT(x,n)	((x) << (n))
+#   define SCREEN_SHIFT_RIGHT(x,n)	((x) >> (n))
+#else
+#   define SCREEN_SHIFT_LEFT(x,n)	((x) >> (n))
+#   define SCREEN_SHIFT_RIGHT(x,n)	((x) << (n))
+#endif
+
+static force_inline uint32_t
+unorm_to_unorm (uint32_t val, int from_bits, int to_bits)
+{
+    uint32_t result;
+
+    if (from_bits == 0)
+	return 0;
+
+    /* Delete any extra bits */
+    val &= ((1 << from_bits) - 1);
+
+    if (from_bits >= to_bits)
+	return val >> (from_bits - to_bits);
+
+    /* Start out with the high bit of val in the high bit of result. */
+    result = val << (to_bits - from_bits);
+
+    /* Copy the bits in result, doubling the number of bits each time, until
+     * we fill all to_bits. Unrolled manually because from_bits and to_bits
+     * are usually known statically, so the compiler can turn all of this
+     * into a few shifts.
+     */
+#define REPLICATE()							\
+    do									\
+    {									\
+	if (from_bits < to_bits)					\
+	{								\
+	    result |= result >> from_bits;				\
+									\
+	    from_bits *= 2;						\
+	}								\
+    }									\
+    while (0)
+
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+    REPLICATE();
+
+    return result;
+}
+
+uint16_t pixman_float_to_unorm (float f, int n_bits);
+float pixman_unorm_to_float (uint16_t u, int n_bits);
+
+/*
+ * Various debugging code
+ */
+
+#undef DEBUG
+
+#define COMPILE_TIME_ASSERT(x)						\
+    do { typedef int compile_time_assertion [(x)?1:-1]; } while (0)
+
+/* Turn on debugging depending on what type of release this is
+ */
+#if (((PIXMAN_VERSION_MICRO % 2) == 0) && ((PIXMAN_VERSION_MINOR % 2) == 1))
+
+/* Debugging gets turned on for development releases because these
+ * are the things that end up in bleeding edge distributions such
+ * as Rawhide etc.
+ *
+ * For performance reasons we don't turn it on for stable releases or
+ * random git checkouts. (Random git checkouts are often used for
+ * performance work).
+ */
+
+#    define DEBUG
+
+#endif
+
+void
+_pixman_log_error (const char *function, const char *message);
+
+#define return_if_fail(expr)                                            \
+    do                                                                  \
+    {                                                                   \
+	if (unlikely (!(expr)))                                         \
+	{								\
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+	    return;							\
+	}								\
+    }                                                                   \
+    while (0)
+
+#define return_val_if_fail(expr, retval)                                \
+    do                                                                  \
+    {                                                                   \
+	if (unlikely (!(expr)))                                         \
+	{								\
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+	    return (retval);						\
+	}								\
+    }                                                                   \
+    while (0)
+
+#define critical_if_fail(expr)						\
+    do									\
+    {									\
+	if (unlikely (!(expr)))                                         \
+	    _pixman_log_error (FUNC, "The expression " # expr " was false"); \
+    }									\
+    while (0)
+
+/*
+ * Matrix
+ */
+
+typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;
+
+pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t    *t,
+                              const pixman_vector_48_16_t *v,
+                              pixman_vector_48_16_t       *result);
+
+void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result);
+
+void
+pixman_transform_point_31_16_affine (const pixman_transform_t    *t,
+                                     const pixman_vector_48_16_t *v,
+                                     pixman_vector_48_16_t       *result);
+
+/*
+ * Timers
+ */
+
+#ifdef PIXMAN_TIMERS
+
+static inline uint64_t
+oil_profile_stamp_rdtsc (void)
+{
+    uint32_t hi, lo;
+
+    __asm__ __volatile__ ("rdtsc\n" : "=a" (lo), "=d" (hi));
+
+    return lo | (((uint64_t)hi) << 32);
+}
+
+#define OIL_STAMP oil_profile_stamp_rdtsc
+
+typedef struct pixman_timer_t pixman_timer_t;
+
+struct pixman_timer_t
+{
+    int             initialized;
+    const char *    name;
+    uint64_t        n_times;
+    uint64_t        total;
+    pixman_timer_t *next;
+};
+
+extern int timer_defined;
+
+void pixman_timer_register (pixman_timer_t *timer);
+
+#define TIMER_BEGIN(tname)                                              \
+    {                                                                   \
+	static pixman_timer_t timer ## tname;                           \
+	uint64_t              begin ## tname;                           \
+        								\
+	if (!timer ## tname.initialized)				\
+	{                                                               \
+	    timer ## tname.initialized = 1;				\
+	    timer ## tname.name = # tname;				\
+	    pixman_timer_register (&timer ## tname);			\
+	}                                                               \
+									\
+	timer ## tname.n_times++;					\
+	begin ## tname = OIL_STAMP ();
+
+#define TIMER_END(tname)                                                \
+    timer ## tname.total += OIL_STAMP () - begin ## tname;		\
+    }
+
+#else
+
+#define TIMER_BEGIN(tname)
+#define TIMER_END(tname)
+
+#endif /* PIXMAN_TIMERS */
+
+#endif /* __ASSEMBLER__ */
+
+#endif /* PIXMAN_PRIVATE_H */
diff --git a/src/pixman/pixman/pixman-radial-gradient.c b/src/pixman/pixman/pixman-radial-gradient.c
new file mode 100644
index 0000000..6a21796
--- /dev/null
+++ b/src/pixman/pixman/pixman-radial-gradient.c
@@ -0,0 +1,471 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ *
+ * Copyright © 2000 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2000 SuSE, Inc.
+ *             2005 Lars Knoll & Zack Rusin, Trolltech
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+
+static inline pixman_fixed_32_32_t
+dot (pixman_fixed_48_16_t x1,
+     pixman_fixed_48_16_t y1,
+     pixman_fixed_48_16_t z1,
+     pixman_fixed_48_16_t x2,
+     pixman_fixed_48_16_t y2,
+     pixman_fixed_48_16_t z2)
+{
+    /*
+     * Exact computation, assuming that the input values can
+     * be represented as pixman_fixed_16_16_t
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static inline double
+fdot (double x1,
+      double y1,
+      double z1,
+      double x2,
+      double y2,
+      double z2)
+{
+    /*
+     * Error can be unbound in some special cases.
+     * Using clever dot product algorithms (for example compensated
+     * dot product) would improve this but make the code much less
+     * obvious
+     */
+    return x1 * x2 + y1 * y2 + z1 * z2;
+}
+
+static uint32_t
+radial_compute_color (double                    a,
+		      double                    b,
+		      double                    c,
+		      double                    inva,
+		      double                    dr,
+		      double                    mindr,
+		      pixman_gradient_walker_t *walker,
+		      pixman_repeat_t           repeat)
+{
+    /*
+     * In this function error propagation can lead to bad results:
+     *  - discr can have an unbound error (if b*b-a*c is very small),
+     *    potentially making it the opposite sign of what it should have been
+     *    (thus clearing a pixel that would have been colored or vice-versa)
+     *    or propagating the error to sqrtdiscr;
+     *    if discr has the wrong sign or b is very small, this can lead to bad
+     *    results
+     *
+     *  - the algorithm used to compute the solutions of the quadratic
+     *    equation is not numerically stable (but saves one division compared
+     *    to the numerically stable one);
+     *    this can be a problem if a*c is much smaller than b*b
+     *
+     *  - the above problems are worse if a is small (as inva becomes bigger)
+     */
+    double discr;
+
+    if (a == 0)
+    {
+	double t;
+
+	if (b == 0)
+	    return 0;
+
+	t = pixman_fixed_1 / 2 * c / b;
+	if (repeat == PIXMAN_REPEAT_NONE)
+	{
+	    if (0 <= t && t <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t);
+	}
+	else
+	{
+	    if (t * dr >= mindr)
+		return _pixman_gradient_walker_pixel (walker, t);
+	}
+
+	return 0;
+    }
+
+    discr = fdot (b, a, 0, b, -c, 0);
+    if (discr >= 0)
+    {
+	double sqrtdiscr, t0, t1;
+
+	sqrtdiscr = sqrt (discr);
+	t0 = (b + sqrtdiscr) * inva;
+	t1 = (b - sqrtdiscr) * inva;
+
+	/*
+	 * The root that must be used is the biggest one that belongs
+	 * to the valid range ([0,1] for PIXMAN_REPEAT_NONE, any
+	 * solution that results in a positive radius otherwise).
+	 *
+	 * If a > 0, t0 is the biggest solution, so if it is valid, it
+	 * is the correct result.
+	 *
+	 * If a < 0, only one of the solutions can be valid, so the
+	 * order in which they are tested is not important.
+	 */
+	if (repeat == PIXMAN_REPEAT_NONE)
+	{
+	    if (0 <= t0 && t0 <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t0);
+	    else if (0 <= t1 && t1 <= pixman_fixed_1)
+		return _pixman_gradient_walker_pixel (walker, t1);
+	}
+	else
+	{
+	    if (t0 * dr >= mindr)
+		return _pixman_gradient_walker_pixel (walker, t0);
+	    else if (t1 * dr >= mindr)
+		return _pixman_gradient_walker_pixel (walker, t1);
+	}
+    }
+
+    return 0;
+}
+
+static uint32_t *
+radial_get_scanline_narrow (pixman_iter_t *iter, const uint32_t *mask)
+{
+    /*
+     * Implementation of radial gradients following the PDF specification.
+     * See section 8.7.4.5.4 Type 3 (Radial) Shadings of the PDF Reference
+     * Manual (PDF 32000-1:2008 at the time of this writing).
+     *
+     * In the radial gradient problem we are given two circles (c₁,r₁) and
+     * (c₂,r₂) that define the gradient itself.
+     *
+     * Mathematically the gradient can be defined as the family of circles
+     *
+     *     ((1-t)·c₁ + t·(c₂), (1-t)·r₁ + t·r₂)
+     *
+     * excluding those circles whose radius would be < 0. When a point
+     * belongs to more than one circle, the one with a bigger t is the only
+     * one that contributes to its color. When a point does not belong
+     * to any of the circles, it is transparent black, i.e. RGBA (0, 0, 0, 0).
+     * Further limitations on the range of values for t are imposed when
+     * the gradient is not repeated, namely t must belong to [0,1].
+     *
+     * The graphical result is the same as drawing the valid (radius > 0)
+     * circles with increasing t in [-inf, +inf] (or in [0,1] if the gradient
+     * is not repeated) using SOURCE operator composition.
+     *
+     * It looks like a cone pointing towards the viewer if the ending circle
+     * is smaller than the starting one, a cone pointing inside the page if
+     * the starting circle is the smaller one and like a cylinder if they
+     * have the same radius.
+     *
+     * What we actually do is, given the point whose color we are interested
+     * in, compute the t values for that point, solving for t in:
+     *
+     *     length((1-t)·c₁ + t·(c₂) - p) = (1-t)·r₁ + t·r₂
+     *
+     * Let's rewrite it in a simpler way, by defining some auxiliary
+     * variables:
+     *
+     *     cd = c₂ - c₁
+     *     pd = p - c₁
+     *     dr = r₂ - r₁
+     *     length(t·cd - pd) = r₁ + t·dr
+     *
+     * which actually means
+     *
+     *     hypot(t·cdx - pdx, t·cdy - pdy) = r₁ + t·dr
+     *
+     * or
+     *
+     *     ⎷((t·cdx - pdx)² + (t·cdy - pdy)²) = r₁ + t·dr.
+     *
+     * If we impose (as stated earlier) that r₁ + t·dr >= 0, it becomes:
+     *
+     *     (t·cdx - pdx)² + (t·cdy - pdy)² = (r₁ + t·dr)²
+     *
+     * where we can actually expand the squares and solve for t:
+     *
+     *     t²cdx² - 2t·cdx·pdx + pdx² + t²cdy² - 2t·cdy·pdy + pdy² =
+     *       = r₁² + 2·r₁·t·dr + t²·dr²
+     *
+     *     (cdx² + cdy² - dr²)t² - 2(cdx·pdx + cdy·pdy + r₁·dr)t +
+     *         (pdx² + pdy² - r₁²) = 0
+     *
+     *     A = cdx² + cdy² - dr²
+     *     B = pdx·cdx + pdy·cdy + r₁·dr
+     *     C = pdx² + pdy² - r₁²
+     *     At² - 2Bt + C = 0
+     *
+     * The solutions (unless the equation degenerates because of A = 0) are:
+     *
+     *     t = (B ± ⎷(B² - A·C)) / A
+     *
+     * The solution we are going to prefer is the bigger one, unless the
+     * radius associated to it is negative (or it falls outside the valid t
+     * range).
+     *
+     * Additional observations (useful for optimizations):
+     * A does not depend on p
+     *
+     * A < 0 <=> one of the two circles completely contains the other one
+     *   <=> for every p, the radiuses associated with the two t solutions
+     *       have opposite sign
+     */
+    pixman_image_t *image = iter->image;
+    int x = iter->x;
+    int y = iter->y;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+
+    gradient_t *gradient = (gradient_t *)image;
+    radial_gradient_t *radial = (radial_gradient_t *)image;
+    uint32_t *end = buffer + width;
+    pixman_gradient_walker_t walker;
+    pixman_vector_t v, unit;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    _pixman_gradient_walker_init (&walker, gradient, image->common.repeat);
+
+    if (image->common.transform)
+    {
+	if (!pixman_transform_point_3d (image->common.transform, &v))
+	    return iter->buffer;
+
+	unit.vector[0] = image->common.transform->matrix[0][0];
+	unit.vector[1] = image->common.transform->matrix[1][0];
+	unit.vector[2] = image->common.transform->matrix[2][0];
+    }
+    else
+    {
+	unit.vector[0] = pixman_fixed_1;
+	unit.vector[1] = 0;
+	unit.vector[2] = 0;
+    }
+
+    if (unit.vector[2] == 0 && v.vector[2] == pixman_fixed_1)
+    {
+	/*
+	 * Given:
+	 *
+	 * t = (B ± ⎷(B² - A·C)) / A
+	 *
+	 * where
+	 *
+	 * A = cdx² + cdy² - dr²
+	 * B = pdx·cdx + pdy·cdy + r₁·dr
+	 * C = pdx² + pdy² - r₁²
+	 * det = B² - A·C
+	 *
+	 * Since we have an affine transformation, we know that (pdx, pdy)
+	 * increase linearly with each pixel,
+	 *
+	 * pdx = pdx₀ + n·ux,
+	 * pdy = pdy₀ + n·uy,
+	 *
+	 * we can then express B, C and det through multiple differentiation.
+	 */
+	pixman_fixed_32_32_t b, db, c, dc, ddc;
+
+	/* warning: this computation may overflow */
+	v.vector[0] -= radial->c1.x;
+	v.vector[1] -= radial->c1.y;
+
+	/*
+	 * B and C are computed and updated exactly.
+	 * If fdot was used instead of dot, in the worst case it would
+	 * lose 11 bits of precision in each of the multiplication and
+	 * summing up would zero out all the bit that were preserved,
+	 * thus making the result 0 instead of the correct one.
+	 * This would mean a worst case of unbound relative error or
+	 * about 2^10 absolute error
+	 */
+	b = dot (v.vector[0], v.vector[1], radial->c1.radius,
+		 radial->delta.x, radial->delta.y, radial->delta.radius);
+	db = dot (unit.vector[0], unit.vector[1], 0,
+		  radial->delta.x, radial->delta.y, 0);
+
+	c = dot (v.vector[0], v.vector[1],
+		 -((pixman_fixed_48_16_t) radial->c1.radius),
+		 v.vector[0], v.vector[1], radial->c1.radius);
+	dc = dot (2 * (pixman_fixed_48_16_t) v.vector[0] + unit.vector[0],
+		  2 * (pixman_fixed_48_16_t) v.vector[1] + unit.vector[1],
+		  0,
+		  unit.vector[0], unit.vector[1], 0);
+	ddc = 2 * dot (unit.vector[0], unit.vector[1], 0,
+		       unit.vector[0], unit.vector[1], 0);
+
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+		*buffer = radial_compute_color (radial->a, b, c,
+						radial->inva,
+						radial->delta.radius,
+						radial->mindr,
+						&walker,
+						image->common.repeat);
+	    }
+
+	    b += db;
+	    c += dc;
+	    dc += ddc;
+	    ++buffer;
+	}
+    }
+    else
+    {
+	/* projective */
+	/* Warning:
+	 * error propagation guarantees are much looser than in the affine case
+	 */
+	while (buffer < end)
+	{
+	    if (!mask || *mask++)
+	    {
+		if (v.vector[2] != 0)
+		{
+		    double pdx, pdy, invv2, b, c;
+
+		    invv2 = 1. * pixman_fixed_1 / v.vector[2];
+
+		    pdx = v.vector[0] * invv2 - radial->c1.x;
+		    /*    / pixman_fixed_1 */
+
+		    pdy = v.vector[1] * invv2 - radial->c1.y;
+		    /*    / pixman_fixed_1 */
+
+		    b = fdot (pdx, pdy, radial->c1.radius,
+			      radial->delta.x, radial->delta.y,
+			      radial->delta.radius);
+		    /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+		    c = fdot (pdx, pdy, -radial->c1.radius,
+			      pdx, pdy, radial->c1.radius);
+		    /*  / pixman_fixed_1 / pixman_fixed_1 */
+
+		    *buffer = radial_compute_color (radial->a, b, c,
+						    radial->inva,
+						    radial->delta.radius,
+						    radial->mindr,
+						    &walker,
+						    image->common.repeat);
+		}
+		else
+		{
+		    *buffer = 0;
+		}
+	    }
+
+	    ++buffer;
+
+	    v.vector[0] += unit.vector[0];
+	    v.vector[1] += unit.vector[1];
+	    v.vector[2] += unit.vector[2];
+	}
+    }
+
+    iter->y++;
+    return iter->buffer;
+}
+
+static uint32_t *
+radial_get_scanline_wide (pixman_iter_t *iter, const uint32_t *mask)
+{
+    uint32_t *buffer = radial_get_scanline_narrow (iter, NULL);
+
+    pixman_expand_to_float (
+	(argb_t *)buffer, buffer, PIXMAN_a8r8g8b8, iter->width);
+
+    return buffer;
+}
+
+void
+_pixman_radial_gradient_iter_init (pixman_image_t *image, pixman_iter_t *iter)
+{
+    if (iter->iter_flags & ITER_NARROW)
+	iter->get_scanline = radial_get_scanline_narrow;
+    else
+	iter->get_scanline = radial_get_scanline_wide;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_radial_gradient (const pixman_point_fixed_t *  inner,
+                                     const pixman_point_fixed_t *  outer,
+                                     pixman_fixed_t                inner_radius,
+                                     pixman_fixed_t                outer_radius,
+                                     const pixman_gradient_stop_t *stops,
+                                     int                           n_stops)
+{
+    pixman_image_t *image;
+    radial_gradient_t *radial;
+
+    image = _pixman_image_allocate ();
+
+    if (!image)
+	return NULL;
+
+    radial = &image->radial;
+
+    if (!_pixman_init_gradient (&radial->common, stops, n_stops))
+    {
+	free (image);
+	return NULL;
+    }
+
+    image->type = RADIAL;
+
+    radial->c1.x = inner->x;
+    radial->c1.y = inner->y;
+    radial->c1.radius = inner_radius;
+    radial->c2.x = outer->x;
+    radial->c2.y = outer->y;
+    radial->c2.radius = outer_radius;
+
+    /* warning: this computations may overflow */
+    radial->delta.x = radial->c2.x - radial->c1.x;
+    radial->delta.y = radial->c2.y - radial->c1.y;
+    radial->delta.radius = radial->c2.radius - radial->c1.radius;
+
+    /* computed exactly, then cast to double -> every bit of the double
+       representation is correct (53 bits) */
+    radial->a = dot (radial->delta.x, radial->delta.y, -radial->delta.radius,
+		     radial->delta.x, radial->delta.y, radial->delta.radius);
+    if (radial->a != 0)
+	radial->inva = 1. * pixman_fixed_1 / radial->a;
+
+    radial->mindr = -1. * pixman_fixed_1 * radial->c1.radius;
+
+    return image;
+}
diff --git a/src/pixman/pixman/pixman-region.c b/src/pixman/pixman/pixman-region.c
new file mode 100644
index 0000000..59bc9c7
--- /dev/null
+++ b/src/pixman/pixman/pixman-region.c
@@ -0,0 +1,2792 @@
+/*
+ * Copyright 1987, 1988, 1989, 1998  The Open Group
+ * 
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation.
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ * Except as contained in this notice, the name of The Open Group shall not be
+ * used in advertising or otherwise to promote the sale, use or other dealings
+ * in this Software without prior written authorization from The Open Group.
+ * 
+ * Copyright 1987, 1988, 1989 by
+ * Digital Equipment Corporation, Maynard, Massachusetts.
+ * 
+ *                    All Rights Reserved
+ * 
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for any purpose and without fee is hereby granted,
+ * provided that the above copyright notice appear in all copies and that
+ * both that copyright notice and this permission notice appear in
+ * supporting documentation, and that the name of Digital not be
+ * used in advertising or publicity pertaining to distribution of the
+ * software without specific, written prior permission.
+ * 
+ * DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ * ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+ * DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Copyright © 1998 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <stdio.h>
+#include "pixman-private.h"
+
+#define PIXREGION_NIL(reg) ((reg)->data && !(reg)->data->numRects)
+/* not a region */
+#define PIXREGION_NAR(reg)      ((reg)->data == pixman_broken_data)
+#define PIXREGION_NUMRECTS(reg) ((reg)->data ? (reg)->data->numRects : 1)
+#define PIXREGION_SIZE(reg) ((reg)->data ? (reg)->data->size : 0)
+#define PIXREGION_RECTS(reg) \
+    ((reg)->data ? (box_type_t *)((reg)->data + 1) \
+     : &(reg)->extents)
+#define PIXREGION_BOXPTR(reg) ((box_type_t *)((reg)->data + 1))
+#define PIXREGION_BOX(reg, i) (&PIXREGION_BOXPTR (reg)[i])
+#define PIXREGION_TOP(reg) PIXREGION_BOX (reg, (reg)->data->numRects)
+#define PIXREGION_END(reg) PIXREGION_BOX (reg, (reg)->data->numRects - 1)
+
+#define GOOD_RECT(rect) ((rect)->x1 < (rect)->x2 && (rect)->y1 < (rect)->y2)
+#define BAD_RECT(rect) ((rect)->x1 > (rect)->x2 || (rect)->y1 > (rect)->y2)
+
+#ifdef DEBUG
+
+#define GOOD(reg)							\
+    do									\
+    {									\
+	if (!PREFIX (_selfcheck (reg)))					\
+	    _pixman_log_error (FUNC, "Malformed region " # reg);	\
+    } while (0)
+
+#else
+
+#define GOOD(reg)
+
+#endif
+
+static const box_type_t PREFIX (_empty_box_) = { 0, 0, 0, 0 };
+static const region_data_type_t PREFIX (_empty_data_) = { 0, 0 };
+#if defined (__llvm__) && !defined (__clang__)
+static const volatile region_data_type_t PREFIX (_broken_data_) = { 0, 0 };
+#else
+static const region_data_type_t PREFIX (_broken_data_) = { 0, 0 };
+#endif
+
+static box_type_t *pixman_region_empty_box =
+    (box_type_t *)&PREFIX (_empty_box_);
+static region_data_type_t *pixman_region_empty_data =
+    (region_data_type_t *)&PREFIX (_empty_data_);
+static region_data_type_t *pixman_broken_data =
+    (region_data_type_t *)&PREFIX (_broken_data_);
+
+static pixman_bool_t
+pixman_break (region_type_t *region);
+
+/*
+ * The functions in this file implement the Region abstraction used extensively
+ * throughout the X11 sample server. A Region is simply a set of disjoint
+ * (non-overlapping) rectangles, plus an "extent" rectangle which is the
+ * smallest single rectangle that contains all the non-overlapping rectangles.
+ *
+ * A Region is implemented as a "y-x-banded" array of rectangles.  This array
+ * imposes two degrees of order.  First, all rectangles are sorted by top side
+ * y coordinate first (y1), and then by left side x coordinate (x1).
+ *
+ * Furthermore, the rectangles are grouped into "bands".  Each rectangle in a
+ * band has the same top y coordinate (y1), and each has the same bottom y
+ * coordinate (y2).  Thus all rectangles in a band differ only in their left
+ * and right side (x1 and x2).  Bands are implicit in the array of rectangles:
+ * there is no separate list of band start pointers.
+ *
+ * The y-x band representation does not minimize rectangles.  In particular,
+ * if a rectangle vertically crosses a band (the rectangle has scanlines in
+ * the y1 to y2 area spanned by the band), then the rectangle may be broken
+ * down into two or more smaller rectangles stacked one atop the other.
+ *
+ *  -----------				    -----------
+ *  |         |				    |         |		    band 0
+ *  |         |  --------		    -----------  --------
+ *  |         |  |      |  in y-x banded    |         |  |      |   band 1
+ *  |         |  |      |  form is	    |         |  |      |
+ *  -----------  |      |		    -----------  --------
+ *               |      |				 |      |   band 2
+ *               --------				 --------
+ *
+ * An added constraint on the rectangles is that they must cover as much
+ * horizontal area as possible: no two rectangles within a band are allowed
+ * to touch.
+ *
+ * Whenever possible, bands will be merged together to cover a greater vertical
+ * distance (and thus reduce the number of rectangles). Two bands can be merged
+ * only if the bottom of one touches the top of the other and they have
+ * rectangles in the same places (of the same width, of course).
+ *
+ * Adam de Boor wrote most of the original region code.  Joel McCormack
+ * substantially modified or rewrote most of the core arithmetic routines, and
+ * added pixman_region_validate in order to support several speed improvements
+ * to pixman_region_validate_tree.  Bob Scheifler changed the representation
+ * to be more compact when empty or a single rectangle, and did a bunch of
+ * gratuitous reformatting. Carl Worth did further gratuitous reformatting
+ * while re-merging the server and client region code into libpixregion.
+ * Soren Sandmann did even more gratuitous reformatting.
+ */
+
+/*  true iff two Boxes overlap */
+#define EXTENTCHECK(r1, r2)	   \
+    (!( ((r1)->x2 <= (r2)->x1)  || \
+        ((r1)->x1 >= (r2)->x2)  || \
+        ((r1)->y2 <= (r2)->y1)  || \
+        ((r1)->y1 >= (r2)->y2) ) )
+
+/* true iff (x,y) is in Box */
+#define INBOX(r, x, y)	\
+    ( ((r)->x2 >  x) && \
+      ((r)->x1 <= x) && \
+      ((r)->y2 >  y) && \
+      ((r)->y1 <= y) )
+
+/* true iff Box r1 contains Box r2 */
+#define SUBSUMES(r1, r2)	\
+    ( ((r1)->x1 <= (r2)->x1) && \
+      ((r1)->x2 >= (r2)->x2) && \
+      ((r1)->y1 <= (r2)->y1) && \
+      ((r1)->y2 >= (r2)->y2) )
+
+static size_t
+PIXREGION_SZOF (size_t n)
+{
+    size_t size = n * sizeof(box_type_t);
+    
+    if (n > UINT32_MAX / sizeof(box_type_t))
+	return 0;
+
+    if (sizeof(region_data_type_t) > UINT32_MAX - size)
+	return 0;
+
+    return size + sizeof(region_data_type_t);
+}
+
+static region_data_type_t *
+alloc_data (size_t n)
+{
+    size_t sz = PIXREGION_SZOF (n);
+
+    if (!sz)
+	return NULL;
+
+    return malloc (sz);
+}
+
+#define FREE_DATA(reg) if ((reg)->data && (reg)->data->size) free ((reg)->data)
+
+#define RECTALLOC_BAIL(region, n, bail)					\
+    do									\
+    {									\
+	if (!(region)->data ||						\
+	    (((region)->data->numRects + (n)) > (region)->data->size))	\
+	{								\
+	    if (!pixman_rect_alloc (region, n))				\
+		goto bail;						\
+	}								\
+    } while (0)
+
+#define RECTALLOC(region, n)						\
+    do									\
+    {									\
+	if (!(region)->data ||						\
+	    (((region)->data->numRects + (n)) > (region)->data->size))	\
+	{								\
+	    if (!pixman_rect_alloc (region, n)) {			\
+		return FALSE;						\
+	    }								\
+	}								\
+    } while (0)
+
+#define ADDRECT(next_rect, nx1, ny1, nx2, ny2)      \
+    do						    \
+    {						    \
+	next_rect->x1 = nx1;                        \
+	next_rect->y1 = ny1;                        \
+	next_rect->x2 = nx2;                        \
+	next_rect->y2 = ny2;                        \
+	next_rect++;                                \
+    }						    \
+    while (0)
+
+#define NEWRECT(region, next_rect, nx1, ny1, nx2, ny2)			\
+    do									\
+    {									\
+	if (!(region)->data ||						\
+	    ((region)->data->numRects == (region)->data->size))		\
+	{								\
+	    if (!pixman_rect_alloc (region, 1))				\
+		return FALSE;						\
+	    next_rect = PIXREGION_TOP (region);				\
+	}								\
+	ADDRECT (next_rect, nx1, ny1, nx2, ny2);			\
+	region->data->numRects++;					\
+	critical_if_fail (region->data->numRects <= region->data->size);		\
+    } while (0)
+
+#define DOWNSIZE(reg, numRects)						\
+    do									\
+    {									\
+	if (((numRects) < ((reg)->data->size >> 1)) &&			\
+	    ((reg)->data->size > 50))					\
+	{								\
+	    region_data_type_t * new_data;				\
+	    size_t data_size = PIXREGION_SZOF (numRects);		\
+									\
+	    if (!data_size)						\
+	    {								\
+		new_data = NULL;					\
+	    }								\
+	    else							\
+	    {								\
+		new_data = (region_data_type_t *)			\
+		    realloc ((reg)->data, data_size);			\
+	    }								\
+									\
+	    if (new_data)						\
+	    {								\
+		new_data->size = (numRects);				\
+		(reg)->data = new_data;					\
+	    }								\
+	}								\
+    } while (0)
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_equal) (region_type_t *reg1, region_type_t *reg2)
+{
+    int i;
+    box_type_t *rects1;
+    box_type_t *rects2;
+
+    if (reg1->extents.x1 != reg2->extents.x1)
+	return FALSE;
+    
+    if (reg1->extents.x2 != reg2->extents.x2)
+	return FALSE;
+    
+    if (reg1->extents.y1 != reg2->extents.y1)
+	return FALSE;
+    
+    if (reg1->extents.y2 != reg2->extents.y2)
+	return FALSE;
+    
+    if (PIXREGION_NUMRECTS (reg1) != PIXREGION_NUMRECTS (reg2))
+	return FALSE;
+
+    rects1 = PIXREGION_RECTS (reg1);
+    rects2 = PIXREGION_RECTS (reg2);
+    
+    for (i = 0; i != PIXREGION_NUMRECTS (reg1); i++)
+    {
+	if (rects1[i].x1 != rects2[i].x1)
+	    return FALSE;
+	
+	if (rects1[i].x2 != rects2[i].x2)
+	    return FALSE;
+	
+	if (rects1[i].y1 != rects2[i].y1)
+	    return FALSE;
+	
+	if (rects1[i].y2 != rects2[i].y2)
+	    return FALSE;
+    }
+
+    return TRUE;
+}
+
+int
+PREFIX (_print) (region_type_t *rgn)
+{
+    int num, size;
+    int i;
+    box_type_t * rects;
+
+    num = PIXREGION_NUMRECTS (rgn);
+    size = PIXREGION_SIZE (rgn);
+    rects = PIXREGION_RECTS (rgn);
+
+    fprintf (stderr, "num: %d size: %d\n", num, size);
+    fprintf (stderr, "extents: %d %d %d %d\n",
+             rgn->extents.x1,
+	     rgn->extents.y1,
+	     rgn->extents.x2,
+	     rgn->extents.y2);
+    
+    for (i = 0; i < num; i++)
+    {
+	fprintf (stderr, "%d %d %d %d \n",
+	         rects[i].x1, rects[i].y1, rects[i].x2, rects[i].y2);
+    }
+    
+    fprintf (stderr, "\n");
+
+    return(num);
+}
+
+
+PIXMAN_EXPORT void
+PREFIX (_init) (region_type_t *region)
+{
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_region_empty_data;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_init_rect) (region_type_t *	region,
+                     int		x,
+		     int		y,
+		     unsigned int	width,
+		     unsigned int	height)
+{
+    region->extents.x1 = x;
+    region->extents.y1 = y;
+    region->extents.x2 = x + width;
+    region->extents.y2 = y + height;
+
+    if (!GOOD_RECT (&region->extents))
+    {
+        if (BAD_RECT (&region->extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+        PREFIX (_init) (region);
+        return;
+    }
+
+    region->data = NULL;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_init_with_extents) (region_type_t *region, box_type_t *extents)
+{
+    if (!GOOD_RECT (extents))
+    {
+        if (BAD_RECT (extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+        PREFIX (_init) (region);
+        return;
+    }
+    region->extents = *extents;
+
+    region->data = NULL;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_fini) (region_type_t *region)
+{
+    GOOD (region);
+    FREE_DATA (region);
+}
+
+PIXMAN_EXPORT int
+PREFIX (_n_rects) (region_type_t *region)
+{
+    return PIXREGION_NUMRECTS (region);
+}
+
+PIXMAN_EXPORT box_type_t *
+PREFIX (_rectangles) (region_type_t *region,
+                      int               *n_rects)
+{
+    if (n_rects)
+	*n_rects = PIXREGION_NUMRECTS (region);
+
+    return PIXREGION_RECTS (region);
+}
+
+static pixman_bool_t
+pixman_break (region_type_t *region)
+{
+    FREE_DATA (region);
+
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_broken_data;
+
+    return FALSE;
+}
+
+static pixman_bool_t
+pixman_rect_alloc (region_type_t * region,
+                   int             n)
+{
+    region_data_type_t *data;
+
+    if (!region->data)
+    {
+	n++;
+	region->data = alloc_data (n);
+
+	if (!region->data)
+	    return pixman_break (region);
+
+	region->data->numRects = 1;
+	*PIXREGION_BOXPTR (region) = region->extents;
+    }
+    else if (!region->data->size)
+    {
+	region->data = alloc_data (n);
+
+	if (!region->data)
+	    return pixman_break (region);
+
+	region->data->numRects = 0;
+    }
+    else
+    {
+	size_t data_size;
+
+	if (n == 1)
+	{
+	    n = region->data->numRects;
+	    if (n > 500) /* XXX pick numbers out of a hat */
+		n = 250;
+	}
+
+	n += region->data->numRects;
+	data_size = PIXREGION_SZOF (n);
+
+	if (!data_size)
+	{
+	    data = NULL;
+	}
+	else
+	{
+	    data = (region_data_type_t *)
+		realloc (region->data, PIXREGION_SZOF (n));
+	}
+	
+	if (!data)
+	    return pixman_break (region);
+	
+	region->data = data;
+    }
+    
+    region->data->size = n;
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_copy) (region_type_t *dst, region_type_t *src)
+{
+    GOOD (dst);
+    GOOD (src);
+
+    if (dst == src)
+	return TRUE;
+    
+    dst->extents = src->extents;
+
+    if (!src->data || !src->data->size)
+    {
+	FREE_DATA (dst);
+	dst->data = src->data;
+	return TRUE;
+    }
+    
+    if (!dst->data || (dst->data->size < src->data->numRects))
+    {
+	FREE_DATA (dst);
+
+	dst->data = alloc_data (src->data->numRects);
+
+	if (!dst->data)
+	    return pixman_break (dst);
+
+	dst->data->size = src->data->numRects;
+    }
+
+    dst->data->numRects = src->data->numRects;
+
+    memmove ((char *)PIXREGION_BOXPTR (dst), (char *)PIXREGION_BOXPTR (src),
+             dst->data->numRects * sizeof(box_type_t));
+
+    return TRUE;
+}
+
+/*======================================================================
+ *	    Generic Region Operator
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_coalesce --
+ *	Attempt to merge the boxes in the current band with those in the
+ *	previous one.  We are guaranteed that the current band extends to
+ *      the end of the rects array.  Used only by pixman_op.
+ *
+ * Results:
+ *	The new index for the previous band.
+ *
+ * Side Effects:
+ *	If coalescing takes place:
+ *	    - rectangles in the previous band will have their y2 fields
+ *	      altered.
+ *	    - region->data->numRects will be decreased.
+ *
+ *-----------------------------------------------------------------------
+ */
+static inline int
+pixman_coalesce (region_type_t * region,      /* Region to coalesce		 */
+		 int             prev_start,  /* Index of start of previous band */
+		 int             cur_start)   /* Index of start of current band  */
+{
+    box_type_t *prev_box;       /* Current box in previous band	     */
+    box_type_t *cur_box;        /* Current box in current band       */
+    int numRects;               /* Number rectangles in both bands   */
+    int y2;                     /* Bottom of current band	     */
+
+    /*
+     * Figure out how many rectangles are in the band.
+     */
+    numRects = cur_start - prev_start;
+    critical_if_fail (numRects == region->data->numRects - cur_start);
+
+    if (!numRects) return cur_start;
+
+    /*
+     * The bands may only be coalesced if the bottom of the previous
+     * matches the top scanline of the current.
+     */
+    prev_box = PIXREGION_BOX (region, prev_start);
+    cur_box = PIXREGION_BOX (region, cur_start);
+    if (prev_box->y2 != cur_box->y1) return cur_start;
+
+    /*
+     * Make sure the bands have boxes in the same places. This
+     * assumes that boxes have been added in such a way that they
+     * cover the most area possible. I.e. two boxes in a band must
+     * have some horizontal space between them.
+     */
+    y2 = cur_box->y2;
+
+    do
+    {
+	if ((prev_box->x1 != cur_box->x1) || (prev_box->x2 != cur_box->x2))
+	    return (cur_start);
+	
+	prev_box++;
+	cur_box++;
+	numRects--;
+    }
+    while (numRects);
+
+    /*
+     * The bands may be merged, so set the bottom y of each box
+     * in the previous band to the bottom y of the current band.
+     */
+    numRects = cur_start - prev_start;
+    region->data->numRects -= numRects;
+
+    do
+    {
+	prev_box--;
+	prev_box->y2 = y2;
+	numRects--;
+    }
+    while (numRects);
+
+    return prev_start;
+}
+
+/* Quicky macro to avoid trivial reject procedure calls to pixman_coalesce */
+
+#define COALESCE(new_reg, prev_band, cur_band)                          \
+    do									\
+    {									\
+	if (cur_band - prev_band == new_reg->data->numRects - cur_band)	\
+	    prev_band = pixman_coalesce (new_reg, prev_band, cur_band);	\
+	else								\
+	    prev_band = cur_band;					\
+    } while (0)
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_append_non_o --
+ *	Handle a non-overlapping band for the union and subtract operations.
+ *      Just adds the (top/bottom-clipped) rectangles into the region.
+ *      Doesn't have to check for subsumption or anything.
+ *
+ * Results:
+ *	None.
+ *
+ * Side Effects:
+ *	region->data->numRects is incremented and the rectangles overwritten
+ *	with the rectangles we're passed.
+ *
+ *-----------------------------------------------------------------------
+ */
+static inline pixman_bool_t
+pixman_region_append_non_o (region_type_t * region,
+			    box_type_t *    r,
+			    box_type_t *    r_end,
+			    int             y1,
+			    int             y2)
+{
+    box_type_t *next_rect;
+    int new_rects;
+
+    new_rects = r_end - r;
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (new_rects != 0);
+
+    /* Make sure we have enough space for all rectangles to be added */
+    RECTALLOC (region, new_rects);
+    next_rect = PIXREGION_TOP (region);
+    region->data->numRects += new_rects;
+
+    do
+    {
+	critical_if_fail (r->x1 < r->x2);
+	ADDRECT (next_rect, r->x1, y1, r->x2, y2);
+	r++;
+    }
+    while (r != r_end);
+
+    return TRUE;
+}
+
+#define FIND_BAND(r, r_band_end, r_end, ry1)			     \
+    do								     \
+    {								     \
+	ry1 = r->y1;						     \
+	r_band_end = r + 1;					     \
+	while ((r_band_end != r_end) && (r_band_end->y1 == ry1)) {   \
+	    r_band_end++;					     \
+	}							     \
+    } while (0)
+
+#define APPEND_REGIONS(new_reg, r, r_end)				\
+    do									\
+    {									\
+	int new_rects;							\
+	if ((new_rects = r_end - r)) {					\
+	    RECTALLOC_BAIL (new_reg, new_rects, bail);			\
+	    memmove ((char *)PIXREGION_TOP (new_reg), (char *)r,	\
+		     new_rects * sizeof(box_type_t));			\
+	    new_reg->data->numRects += new_rects;			\
+	}								\
+    } while (0)
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_op --
+ *	Apply an operation to two regions. Called by pixman_region_union, pixman_region_inverse,
+ *	pixman_region_subtract, pixman_region_intersect....  Both regions MUST have at least one
+ *      rectangle, and cannot be the same object.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	The new region is overwritten.
+ *	overlap set to TRUE if overlap_func ever returns TRUE.
+ *
+ * Notes:
+ *	The idea behind this function is to view the two regions as sets.
+ *	Together they cover a rectangle of area that this function divides
+ *	into horizontal bands where points are covered only by one region
+ *	or by both. For the first case, the non_overlap_func is called with
+ *	each the band and the band's upper and lower extents. For the
+ *	second, the overlap_func is called to process the entire band. It
+ *	is responsible for clipping the rectangles in the band, though
+ *	this function provides the boundaries.
+ *	At the end of each band, the new region is coalesced, if possible,
+ *	to reduce the number of rectangles in the region.
+ *
+ *-----------------------------------------------------------------------
+ */
+
+typedef pixman_bool_t (*overlap_proc_ptr) (region_type_t *region,
+					   box_type_t *   r1,
+					   box_type_t *   r1_end,
+					   box_type_t *   r2,
+					   box_type_t *   r2_end,
+					   int            y1,
+					   int            y2);
+
+static pixman_bool_t
+pixman_op (region_type_t *  new_reg,               /* Place to store result	    */
+	   region_type_t *  reg1,                  /* First region in operation     */
+	   region_type_t *  reg2,                  /* 2d region in operation        */
+	   overlap_proc_ptr overlap_func,          /* Function to call for over-
+						    * lapping bands		    */
+	   int              append_non1,           /* Append non-overlapping bands  
+						    * in region 1 ?
+						    */
+	   int              append_non2            /* Append non-overlapping bands
+						    * in region 2 ?
+						    */
+    )
+{
+    box_type_t *r1;                 /* Pointer into first region     */
+    box_type_t *r2;                 /* Pointer into 2d region	     */
+    box_type_t *r1_end;             /* End of 1st region	     */
+    box_type_t *r2_end;             /* End of 2d region		     */
+    int ybot;                       /* Bottom of intersection	     */
+    int ytop;                       /* Top of intersection	     */
+    region_data_type_t *old_data;   /* Old data for new_reg	     */
+    int prev_band;                  /* Index of start of
+				     * previous band in new_reg       */
+    int cur_band;                   /* Index of start of current
+				     * band in new_reg		     */
+    box_type_t * r1_band_end;       /* End of current band in r1     */
+    box_type_t * r2_band_end;       /* End of current band in r2     */
+    int top;                        /* Top of non-overlapping band   */
+    int bot;                        /* Bottom of non-overlapping band*/
+    int r1y1;                       /* Temps for r1->y1 and r2->y1   */
+    int r2y1;
+    int new_size;
+    int numRects;
+
+    /*
+     * Break any region computed from a broken region
+     */
+    if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2))
+	return pixman_break (new_reg);
+
+    /*
+     * Initialization:
+     *	set r1, r2, r1_end and r2_end appropriately, save the rectangles
+     * of the destination region until the end in case it's one of
+     * the two source regions, then mark the "new" region empty, allocating
+     * another array of rectangles for it to use.
+     */
+
+    r1 = PIXREGION_RECTS (reg1);
+    new_size = PIXREGION_NUMRECTS (reg1);
+    r1_end = r1 + new_size;
+
+    numRects = PIXREGION_NUMRECTS (reg2);
+    r2 = PIXREGION_RECTS (reg2);
+    r2_end = r2 + numRects;
+    
+    critical_if_fail (r1 != r1_end);
+    critical_if_fail (r2 != r2_end);
+
+    old_data = (region_data_type_t *)NULL;
+
+    if (((new_reg == reg1) && (new_size > 1)) ||
+        ((new_reg == reg2) && (numRects > 1)))
+    {
+        old_data = new_reg->data;
+        new_reg->data = pixman_region_empty_data;
+    }
+
+    /* guess at new size */
+    if (numRects > new_size)
+	new_size = numRects;
+
+    new_size <<= 1;
+
+    if (!new_reg->data)
+	new_reg->data = pixman_region_empty_data;
+    else if (new_reg->data->size)
+	new_reg->data->numRects = 0;
+
+    if (new_size > new_reg->data->size)
+    {
+        if (!pixman_rect_alloc (new_reg, new_size))
+        {
+            free (old_data);
+            return FALSE;
+	}
+    }
+
+    /*
+     * Initialize ybot.
+     * In the upcoming loop, ybot and ytop serve different functions depending
+     * on whether the band being handled is an overlapping or non-overlapping
+     * band.
+     *  In the case of a non-overlapping band (only one of the regions
+     * has points in the band), ybot is the bottom of the most recent
+     * intersection and thus clips the top of the rectangles in that band.
+     * ytop is the top of the next intersection between the two regions and
+     * serves to clip the bottom of the rectangles in the current band.
+     *	For an overlapping band (where the two regions intersect), ytop clips
+     * the top of the rectangles of both regions and ybot clips the bottoms.
+     */
+
+    ybot = MIN (r1->y1, r2->y1);
+
+    /*
+     * prev_band serves to mark the start of the previous band so rectangles
+     * can be coalesced into larger rectangles. qv. pixman_coalesce, above.
+     * In the beginning, there is no previous band, so prev_band == cur_band
+     * (cur_band is set later on, of course, but the first band will always
+     * start at index 0). prev_band and cur_band must be indices because of
+     * the possible expansion, and resultant moving, of the new region's
+     * array of rectangles.
+     */
+    prev_band = 0;
+
+    do
+    {
+        /*
+	 * This algorithm proceeds one source-band (as opposed to a
+	 * destination band, which is determined by where the two regions
+	 * intersect) at a time. r1_band_end and r2_band_end serve to mark the
+	 * rectangle after the last one in the current band for their
+	 * respective regions.
+	 */
+        critical_if_fail (r1 != r1_end);
+        critical_if_fail (r2 != r2_end);
+
+        FIND_BAND (r1, r1_band_end, r1_end, r1y1);
+        FIND_BAND (r2, r2_band_end, r2_end, r2y1);
+
+        /*
+	 * First handle the band that doesn't intersect, if any.
+	 *
+	 * Note that attention is restricted to one band in the
+	 * non-intersecting region at once, so if a region has n
+	 * bands between the current position and the next place it overlaps
+	 * the other, this entire loop will be passed through n times.
+	 */
+        if (r1y1 < r2y1)
+        {
+            if (append_non1)
+            {
+                top = MAX (r1y1, ybot);
+                bot = MIN (r1->y2, r2y1);
+                if (top != bot)
+                {
+                    cur_band = new_reg->data->numRects;
+                    if (!pixman_region_append_non_o (new_reg, r1, r1_band_end, top, bot))
+			goto bail;
+                    COALESCE (new_reg, prev_band, cur_band);
+		}
+	    }
+            ytop = r2y1;
+	}
+        else if (r2y1 < r1y1)
+        {
+            if (append_non2)
+            {
+                top = MAX (r2y1, ybot);
+                bot = MIN (r2->y2, r1y1);
+		
+                if (top != bot)
+                {
+                    cur_band = new_reg->data->numRects;
+
+                    if (!pixman_region_append_non_o (new_reg, r2, r2_band_end, top, bot))
+			goto bail;
+
+                    COALESCE (new_reg, prev_band, cur_band);
+		}
+	    }
+            ytop = r1y1;
+	}
+        else
+        {
+            ytop = r1y1;
+	}
+
+        /*
+	 * Now see if we've hit an intersecting band. The two bands only
+	 * intersect if ybot > ytop
+	 */
+        ybot = MIN (r1->y2, r2->y2);
+        if (ybot > ytop)
+        {
+            cur_band = new_reg->data->numRects;
+
+            if (!(*overlap_func)(new_reg,
+                                 r1, r1_band_end,
+                                 r2, r2_band_end,
+                                 ytop, ybot))
+	    {
+		goto bail;
+	    }
+	    
+            COALESCE (new_reg, prev_band, cur_band);
+	}
+
+        /*
+	 * If we've finished with a band (y2 == ybot) we skip forward
+	 * in the region to the next band.
+	 */
+        if (r1->y2 == ybot)
+	    r1 = r1_band_end;
+
+        if (r2->y2 == ybot)
+	    r2 = r2_band_end;
+
+    }
+    while (r1 != r1_end && r2 != r2_end);
+
+    /*
+     * Deal with whichever region (if any) still has rectangles left.
+     *
+     * We only need to worry about banding and coalescing for the very first
+     * band left.  After that, we can just group all remaining boxes,
+     * regardless of how many bands, into one final append to the list.
+     */
+
+    if ((r1 != r1_end) && append_non1)
+    {
+        /* Do first non_overlap1Func call, which may be able to coalesce */
+        FIND_BAND (r1, r1_band_end, r1_end, r1y1);
+	
+        cur_band = new_reg->data->numRects;
+	
+        if (!pixman_region_append_non_o (new_reg,
+                                         r1, r1_band_end,
+                                         MAX (r1y1, ybot), r1->y2))
+	{
+	    goto bail;
+	}
+	
+        COALESCE (new_reg, prev_band, cur_band);
+
+        /* Just append the rest of the boxes  */
+        APPEND_REGIONS (new_reg, r1_band_end, r1_end);
+    }
+    else if ((r2 != r2_end) && append_non2)
+    {
+        /* Do first non_overlap2Func call, which may be able to coalesce */
+        FIND_BAND (r2, r2_band_end, r2_end, r2y1);
+
+	cur_band = new_reg->data->numRects;
+
+        if (!pixman_region_append_non_o (new_reg,
+                                         r2, r2_band_end,
+                                         MAX (r2y1, ybot), r2->y2))
+	{
+	    goto bail;
+	}
+
+        COALESCE (new_reg, prev_band, cur_band);
+
+        /* Append rest of boxes */
+        APPEND_REGIONS (new_reg, r2_band_end, r2_end);
+    }
+
+    free (old_data);
+
+    if (!(numRects = new_reg->data->numRects))
+    {
+        FREE_DATA (new_reg);
+        new_reg->data = pixman_region_empty_data;
+    }
+    else if (numRects == 1)
+    {
+        new_reg->extents = *PIXREGION_BOXPTR (new_reg);
+        FREE_DATA (new_reg);
+        new_reg->data = (region_data_type_t *)NULL;
+    }
+    else
+    {
+        DOWNSIZE (new_reg, numRects);
+    }
+
+    return TRUE;
+
+bail:
+    free (old_data);
+
+    return pixman_break (new_reg);
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_set_extents --
+ *	Reset the extents of a region to what they should be. Called by
+ *	pixman_region_subtract and pixman_region_intersect as they can't
+ *      figure it out along the way or do so easily, as pixman_region_union can.
+ *
+ * Results:
+ *	None.
+ *
+ * Side Effects:
+ *	The region's 'extents' structure is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+static void
+pixman_set_extents (region_type_t *region)
+{
+    box_type_t *box, *box_end;
+
+    if (!region->data)
+	return;
+
+    if (!region->data->size)
+    {
+        region->extents.x2 = region->extents.x1;
+        region->extents.y2 = region->extents.y1;
+        return;
+    }
+
+    box = PIXREGION_BOXPTR (region);
+    box_end = PIXREGION_END (region);
+
+    /*
+     * Since box is the first rectangle in the region, it must have the
+     * smallest y1 and since box_end is the last rectangle in the region,
+     * it must have the largest y2, because of banding. Initialize x1 and
+     * x2 from  box and box_end, resp., as good things to initialize them
+     * to...
+     */
+    region->extents.x1 = box->x1;
+    region->extents.y1 = box->y1;
+    region->extents.x2 = box_end->x2;
+    region->extents.y2 = box_end->y2;
+
+    critical_if_fail (region->extents.y1 < region->extents.y2);
+
+    while (box <= box_end)
+    {
+        if (box->x1 < region->extents.x1)
+	    region->extents.x1 = box->x1;
+        if (box->x2 > region->extents.x2)
+	    region->extents.x2 = box->x2;
+        box++;
+    }
+
+    critical_if_fail (region->extents.x1 < region->extents.x2);
+}
+
+/*======================================================================
+ *	    Region Intersection
+ *====================================================================*/
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_intersect_o --
+ *	Handle an overlapping band for pixman_region_intersect.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	Rectangles may be added to the region.
+ *
+ *-----------------------------------------------------------------------
+ */
+/*ARGSUSED*/
+static pixman_bool_t
+pixman_region_intersect_o (region_type_t *region,
+                           box_type_t *   r1,
+                           box_type_t *   r1_end,
+                           box_type_t *   r2,
+                           box_type_t *   r2_end,
+                           int            y1,
+                           int            y2)
+{
+    int x1;
+    int x2;
+    box_type_t *        next_rect;
+
+    next_rect = PIXREGION_TOP (region);
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    do
+    {
+        x1 = MAX (r1->x1, r2->x1);
+        x2 = MIN (r1->x2, r2->x2);
+
+        /*
+	 * If there's any overlap between the two rectangles, add that
+	 * overlap to the new region.
+	 */
+        if (x1 < x2)
+	    NEWRECT (region, next_rect, x1, y1, x2, y2);
+
+        /*
+	 * Advance the pointer(s) with the leftmost right side, since the next
+	 * rectangle on that list may still overlap the other region's
+	 * current rectangle.
+	 */
+        if (r1->x2 == x2)
+        {
+            r1++;
+	}
+        if (r2->x2 == x2)
+        {
+            r2++;
+	}
+    }
+    while ((r1 != r1_end) && (r2 != r2_end));
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_intersect) (region_type_t *     new_reg,
+                     region_type_t *        reg1,
+                     region_type_t *        reg2)
+{
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+
+    /* check for trivial reject */
+    if (PIXREGION_NIL (reg1) || PIXREGION_NIL (reg2) ||
+        !EXTENTCHECK (&reg1->extents, &reg2->extents))
+    {
+        /* Covers about 20% of all cases */
+        FREE_DATA (new_reg);
+        new_reg->extents.x2 = new_reg->extents.x1;
+        new_reg->extents.y2 = new_reg->extents.y1;
+        if (PIXREGION_NAR (reg1) || PIXREGION_NAR (reg2))
+        {
+            new_reg->data = pixman_broken_data;
+            return FALSE;
+	}
+        else
+	{
+	    new_reg->data = pixman_region_empty_data;
+	}
+    }
+    else if (!reg1->data && !reg2->data)
+    {
+        /* Covers about 80% of cases that aren't trivially rejected */
+        new_reg->extents.x1 = MAX (reg1->extents.x1, reg2->extents.x1);
+        new_reg->extents.y1 = MAX (reg1->extents.y1, reg2->extents.y1);
+        new_reg->extents.x2 = MIN (reg1->extents.x2, reg2->extents.x2);
+        new_reg->extents.y2 = MIN (reg1->extents.y2, reg2->extents.y2);
+
+        FREE_DATA (new_reg);
+
+	new_reg->data = (region_data_type_t *)NULL;
+    }
+    else if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
+    {
+        return PREFIX (_copy) (new_reg, reg1);
+    }
+    else if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
+    {
+        return PREFIX (_copy) (new_reg, reg2);
+    }
+    else if (reg1 == reg2)
+    {
+        return PREFIX (_copy) (new_reg, reg1);
+    }
+    else
+    {
+        /* General purpose intersection */
+
+        if (!pixman_op (new_reg, reg1, reg2, pixman_region_intersect_o, FALSE, FALSE))
+	    return FALSE;
+	
+        pixman_set_extents (new_reg);
+    }
+
+    GOOD (new_reg);
+    return(TRUE);
+}
+
+#define MERGERECT(r)							\
+    do									\
+    {									\
+        if (r->x1 <= x2)						\
+	{								\
+            /* Merge with current rectangle */				\
+            if (x2 < r->x2)						\
+		x2 = r->x2;						\
+	}								\
+	else								\
+	{								\
+            /* Add current rectangle, start new one */			\
+            NEWRECT (region, next_rect, x1, y1, x2, y2);		\
+            x1 = r->x1;							\
+            x2 = r->x2;							\
+	}								\
+        r++;								\
+    } while (0)
+
+/*======================================================================
+ *	    Region Union
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_union_o --
+ *	Handle an overlapping band for the union operation. Picks the
+ *	left-most rectangle each time and merges it into the region.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	region is overwritten.
+ *	overlap is set to TRUE if any boxes overlap.
+ *
+ *-----------------------------------------------------------------------
+ */
+static pixman_bool_t
+pixman_region_union_o (region_type_t *region,
+		       box_type_t *   r1,
+		       box_type_t *   r1_end,
+		       box_type_t *   r2,
+		       box_type_t *   r2_end,
+		       int            y1,
+		       int            y2)
+{
+    box_type_t *next_rect;
+    int x1;            /* left and right side of current union */
+    int x2;
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    next_rect = PIXREGION_TOP (region);
+
+    /* Start off current rectangle */
+    if (r1->x1 < r2->x1)
+    {
+        x1 = r1->x1;
+        x2 = r1->x2;
+        r1++;
+    }
+    else
+    {
+        x1 = r2->x1;
+        x2 = r2->x2;
+        r2++;
+    }
+    while (r1 != r1_end && r2 != r2_end)
+    {
+        if (r1->x1 < r2->x1)
+	    MERGERECT (r1);
+	else
+	    MERGERECT (r2);
+    }
+
+    /* Finish off whoever (if any) is left */
+    if (r1 != r1_end)
+    {
+        do
+        {
+            MERGERECT (r1);
+	}
+        while (r1 != r1_end);
+    }
+    else if (r2 != r2_end)
+    {
+        do
+        {
+            MERGERECT (r2);
+	}
+        while (r2 != r2_end);
+    }
+
+    /* Add current rectangle */
+    NEWRECT (region, next_rect, x1, y1, x2, y2);
+
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX(_intersect_rect) (region_type_t *dest,
+			 region_type_t *source,
+			 int x, int y,
+			 unsigned int width,
+			 unsigned int height)
+{
+    region_type_t region;
+
+    region.data = NULL;
+    region.extents.x1 = x;
+    region.extents.y1 = y;
+    region.extents.x2 = x + width;
+    region.extents.y2 = y + height;
+
+    return PREFIX(_intersect) (dest, source, &region);
+}
+
+/* Convenience function for performing union of region with a
+ * single rectangle
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_union_rect) (region_type_t *dest,
+                      region_type_t *source,
+                      int            x,
+		      int            y,
+                      unsigned int   width,
+		      unsigned int   height)
+{
+    region_type_t region;
+
+    region.extents.x1 = x;
+    region.extents.y1 = y;
+    region.extents.x2 = x + width;
+    region.extents.y2 = y + height;
+
+    if (!GOOD_RECT (&region.extents))
+    {
+        if (BAD_RECT (&region.extents))
+            _pixman_log_error (FUNC, "Invalid rectangle passed");
+	return PREFIX (_copy) (dest, source);
+    }
+
+    region.data = NULL;
+
+    return PREFIX (_union) (dest, source, &region);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_union) (region_type_t *new_reg,
+                 region_type_t *reg1,
+                 region_type_t *reg2)
+{
+    /* Return TRUE if some overlap
+     * between reg1, reg2
+     */
+    GOOD (reg1);
+    GOOD (reg2);
+    GOOD (new_reg);
+
+    /*  checks all the simple cases */
+
+    /*
+     * Region 1 and 2 are the same
+     */
+    if (reg1 == reg2)
+        return PREFIX (_copy) (new_reg, reg1);
+
+    /*
+     * Region 1 is empty
+     */
+    if (PIXREGION_NIL (reg1))
+    {
+        if (PIXREGION_NAR (reg1))
+	    return pixman_break (new_reg);
+
+        if (new_reg != reg2)
+	    return PREFIX (_copy) (new_reg, reg2);
+
+	return TRUE;
+    }
+
+    /*
+     * Region 2 is empty
+     */
+    if (PIXREGION_NIL (reg2))
+    {
+        if (PIXREGION_NAR (reg2))
+	    return pixman_break (new_reg);
+
+	if (new_reg != reg1)
+	    return PREFIX (_copy) (new_reg, reg1);
+
+	return TRUE;
+    }
+
+    /*
+     * Region 1 completely subsumes region 2
+     */
+    if (!reg1->data && SUBSUMES (&reg1->extents, &reg2->extents))
+    {
+        if (new_reg != reg1)
+	    return PREFIX (_copy) (new_reg, reg1);
+
+	return TRUE;
+    }
+
+    /*
+     * Region 2 completely subsumes region 1
+     */
+    if (!reg2->data && SUBSUMES (&reg2->extents, &reg1->extents))
+    {
+        if (new_reg != reg2)
+	    return PREFIX (_copy) (new_reg, reg2);
+
+	return TRUE;
+    }
+
+    if (!pixman_op (new_reg, reg1, reg2, pixman_region_union_o, TRUE, TRUE))
+	return FALSE;
+
+    new_reg->extents.x1 = MIN (reg1->extents.x1, reg2->extents.x1);
+    new_reg->extents.y1 = MIN (reg1->extents.y1, reg2->extents.y1);
+    new_reg->extents.x2 = MAX (reg1->extents.x2, reg2->extents.x2);
+    new_reg->extents.y2 = MAX (reg1->extents.y2, reg2->extents.y2);
+    
+    GOOD (new_reg);
+
+    return TRUE;
+}
+
+/*======================================================================
+ *	    Batch Rectangle Union
+ *====================================================================*/
+
+#define EXCHANGE_RECTS(a, b)	\
+    {                           \
+        box_type_t t;		\
+        t = rects[a];           \
+        rects[a] = rects[b];    \
+        rects[b] = t;           \
+    }
+
+static void
+quick_sort_rects (
+    box_type_t rects[],
+    int        numRects)
+{
+    int y1;
+    int x1;
+    int i, j;
+    box_type_t *r;
+
+    /* Always called with numRects > 1 */
+
+    do
+    {
+        if (numRects == 2)
+        {
+            if (rects[0].y1 > rects[1].y1 ||
+                (rects[0].y1 == rects[1].y1 && rects[0].x1 > rects[1].x1))
+	    {
+		EXCHANGE_RECTS (0, 1);
+	    }
+
+            return;
+	}
+
+        /* Choose partition element, stick in location 0 */
+        EXCHANGE_RECTS (0, numRects >> 1);
+        y1 = rects[0].y1;
+        x1 = rects[0].x1;
+
+        /* Partition array */
+        i = 0;
+        j = numRects;
+
+        do
+        {
+            r = &(rects[i]);
+            do
+            {
+                r++;
+                i++;
+	    }
+	    while (i != numRects && (r->y1 < y1 || (r->y1 == y1 && r->x1 < x1)));
+
+	    r = &(rects[j]);
+            do
+            {
+                r--;
+                j--;
+	    }
+            while (y1 < r->y1 || (y1 == r->y1 && x1 < r->x1));
+	    
+            if (i < j)
+		EXCHANGE_RECTS (i, j);
+	}
+        while (i < j);
+
+        /* Move partition element back to middle */
+        EXCHANGE_RECTS (0, j);
+
+        /* Recurse */
+        if (numRects - j - 1 > 1)
+	    quick_sort_rects (&rects[j + 1], numRects - j - 1);
+
+        numRects = j;
+    }
+    while (numRects > 1);
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_validate --
+ *
+ *      Take a ``region'' which is a non-y-x-banded random collection of
+ *      rectangles, and compute a nice region which is the union of all the
+ *      rectangles.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *      The passed-in ``region'' may be modified.
+ *	overlap set to TRUE if any retangles overlapped,
+ *      else FALSE;
+ *
+ * Strategy:
+ *      Step 1. Sort the rectangles into ascending order with primary key y1
+ *		and secondary key x1.
+ *
+ *      Step 2. Split the rectangles into the minimum number of proper y-x
+ *		banded regions.  This may require horizontally merging
+ *		rectangles, and vertically coalescing bands.  With any luck,
+ *		this step in an identity transformation (ala the Box widget),
+ *		or a coalescing into 1 box (ala Menus).
+ *
+ *	Step 3. Merge the separate regions down to a single region by calling
+ *		pixman_region_union.  Maximize the work each pixman_region_union call does by using
+ *		a binary merge.
+ *
+ *-----------------------------------------------------------------------
+ */
+
+static pixman_bool_t
+validate (region_type_t * badreg)
+{
+    /* Descriptor for regions under construction  in Step 2. */
+    typedef struct
+    {
+        region_type_t reg;
+        int prev_band;
+        int cur_band;
+    } region_info_t;
+
+    region_info_t stack_regions[64];
+
+    int numRects;                   /* Original numRects for badreg	    */
+    region_info_t *ri;              /* Array of current regions		    */
+    int num_ri;                     /* Number of entries used in ri	    */
+    int size_ri;                    /* Number of entries available in ri    */
+    int i;                          /* Index into rects			    */
+    int j;                          /* Index into ri			    */
+    region_info_t *rit;             /* &ri[j]				    */
+    region_type_t *reg;             /* ri[j].reg			    */
+    box_type_t *box;                /* Current box in rects		    */
+    box_type_t *ri_box;             /* Last box in ri[j].reg		    */
+    region_type_t *hreg;            /* ri[j_half].reg			    */
+    pixman_bool_t ret = TRUE;
+
+    if (!badreg->data)
+    {
+        GOOD (badreg);
+        return TRUE;
+    }
+    
+    numRects = badreg->data->numRects;
+    if (!numRects)
+    {
+        if (PIXREGION_NAR (badreg))
+	    return FALSE;
+        GOOD (badreg);
+        return TRUE;
+    }
+    
+    if (badreg->extents.x1 < badreg->extents.x2)
+    {
+        if ((numRects) == 1)
+        {
+            FREE_DATA (badreg);
+            badreg->data = (region_data_type_t *) NULL;
+	}
+        else
+        {
+            DOWNSIZE (badreg, numRects);
+	}
+
+        GOOD (badreg);
+
+	return TRUE;
+    }
+
+    /* Step 1: Sort the rects array into ascending (y1, x1) order */
+    quick_sort_rects (PIXREGION_BOXPTR (badreg), numRects);
+
+    /* Step 2: Scatter the sorted array into the minimum number of regions */
+
+    /* Set up the first region to be the first rectangle in badreg */
+    /* Note that step 2 code will never overflow the ri[0].reg rects array */
+    ri = stack_regions;
+    size_ri = sizeof (stack_regions) / sizeof (stack_regions[0]);
+    num_ri = 1;
+    ri[0].prev_band = 0;
+    ri[0].cur_band = 0;
+    ri[0].reg = *badreg;
+    box = PIXREGION_BOXPTR (&ri[0].reg);
+    ri[0].reg.extents = *box;
+    ri[0].reg.data->numRects = 1;
+    badreg->extents = *pixman_region_empty_box;
+    badreg->data = pixman_region_empty_data;
+
+    /* Now scatter rectangles into the minimum set of valid regions.  If the
+     * next rectangle to be added to a region would force an existing rectangle
+     * in the region to be split up in order to maintain y-x banding, just
+     * forget it.  Try the next region.  If it doesn't fit cleanly into any
+     * region, make a new one.
+     */
+
+    for (i = numRects; --i > 0;)
+    {
+        box++;
+        /* Look for a region to append box to */
+        for (j = num_ri, rit = ri; --j >= 0; rit++)
+        {
+            reg = &rit->reg;
+            ri_box = PIXREGION_END (reg);
+
+            if (box->y1 == ri_box->y1 && box->y2 == ri_box->y2)
+            {
+                /* box is in same band as ri_box.  Merge or append it */
+                if (box->x1 <= ri_box->x2)
+                {
+                    /* Merge it with ri_box */
+                    if (box->x2 > ri_box->x2)
+			ri_box->x2 = box->x2;
+		}
+                else
+                {
+                    RECTALLOC_BAIL (reg, 1, bail);
+                    *PIXREGION_TOP (reg) = *box;
+                    reg->data->numRects++;
+		}
+		
+                goto next_rect;   /* So sue me */
+	    }
+            else if (box->y1 >= ri_box->y2)
+            {
+                /* Put box into new band */
+                if (reg->extents.x2 < ri_box->x2)
+		    reg->extents.x2 = ri_box->x2;
+		
+                if (reg->extents.x1 > box->x1)
+		    reg->extents.x1 = box->x1;
+		
+                COALESCE (reg, rit->prev_band, rit->cur_band);
+                rit->cur_band = reg->data->numRects;
+                RECTALLOC_BAIL (reg, 1, bail);
+                *PIXREGION_TOP (reg) = *box;
+                reg->data->numRects++;
+
+                goto next_rect;
+	    }
+            /* Well, this region was inappropriate.  Try the next one. */
+	} /* for j */
+
+        /* Uh-oh.  No regions were appropriate.  Create a new one. */
+        if (size_ri == num_ri)
+        {
+            size_t data_size;
+
+            /* Oops, allocate space for new region information */
+            size_ri <<= 1;
+
+            data_size = size_ri * sizeof(region_info_t);
+            if (data_size / size_ri != sizeof(region_info_t))
+		goto bail;
+
+            if (ri == stack_regions)
+            {
+                rit = malloc (data_size);
+                if (!rit)
+		    goto bail;
+                memcpy (rit, ri, num_ri * sizeof (region_info_t));
+	    }
+            else
+            {
+                rit = (region_info_t *) realloc (ri, data_size);
+                if (!rit)
+		    goto bail;
+	    }
+            ri = rit;
+            rit = &ri[num_ri];
+	}
+        num_ri++;
+        rit->prev_band = 0;
+        rit->cur_band = 0;
+        rit->reg.extents = *box;
+        rit->reg.data = (region_data_type_t *)NULL;
+
+	/* MUST force allocation */
+        if (!pixman_rect_alloc (&rit->reg, (i + num_ri) / num_ri))
+	    goto bail;
+	
+    next_rect: ;
+    } /* for i */
+
+    /* Make a final pass over each region in order to COALESCE and set
+     * extents.x2 and extents.y2
+     */
+    for (j = num_ri, rit = ri; --j >= 0; rit++)
+    {
+        reg = &rit->reg;
+        ri_box = PIXREGION_END (reg);
+        reg->extents.y2 = ri_box->y2;
+
+        if (reg->extents.x2 < ri_box->x2)
+	    reg->extents.x2 = ri_box->x2;
+	
+        COALESCE (reg, rit->prev_band, rit->cur_band);
+
+	if (reg->data->numRects == 1) /* keep unions happy below */
+        {
+            FREE_DATA (reg);
+            reg->data = (region_data_type_t *)NULL;
+	}
+    }
+
+    /* Step 3: Union all regions into a single region */
+    while (num_ri > 1)
+    {
+        int half = num_ri / 2;
+        for (j = num_ri & 1; j < (half + (num_ri & 1)); j++)
+        {
+            reg = &ri[j].reg;
+            hreg = &ri[j + half].reg;
+
+            if (!pixman_op (reg, reg, hreg, pixman_region_union_o, TRUE, TRUE))
+		ret = FALSE;
+
+            if (hreg->extents.x1 < reg->extents.x1)
+		reg->extents.x1 = hreg->extents.x1;
+
+            if (hreg->extents.y1 < reg->extents.y1)
+		reg->extents.y1 = hreg->extents.y1;
+
+            if (hreg->extents.x2 > reg->extents.x2)
+		reg->extents.x2 = hreg->extents.x2;
+
+            if (hreg->extents.y2 > reg->extents.y2)
+		reg->extents.y2 = hreg->extents.y2;
+
+            FREE_DATA (hreg);
+	}
+
+        num_ri -= half;
+
+	if (!ret)
+	    goto bail;
+    }
+
+    *badreg = ri[0].reg;
+
+    if (ri != stack_regions)
+	free (ri);
+
+    GOOD (badreg);
+    return ret;
+
+bail:
+    for (i = 0; i < num_ri; i++)
+	FREE_DATA (&ri[i].reg);
+
+    if (ri != stack_regions)
+	free (ri);
+
+    return pixman_break (badreg);
+}
+
+/*======================================================================
+ *                Region Subtraction
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_subtract_o --
+ *	Overlapping band subtraction. x1 is the left-most point not yet
+ *	checked.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	region may have rectangles added to it.
+ *
+ *-----------------------------------------------------------------------
+ */
+/*ARGSUSED*/
+static pixman_bool_t
+pixman_region_subtract_o (region_type_t * region,
+			  box_type_t *    r1,
+			  box_type_t *    r1_end,
+			  box_type_t *    r2,
+			  box_type_t *    r2_end,
+			  int             y1,
+			  int             y2)
+{
+    box_type_t *        next_rect;
+    int x1;
+
+    x1 = r1->x1;
+
+    critical_if_fail (y1 < y2);
+    critical_if_fail (r1 != r1_end && r2 != r2_end);
+
+    next_rect = PIXREGION_TOP (region);
+
+    do
+    {
+        if (r2->x2 <= x1)
+        {
+            /*
+	     * Subtrahend entirely to left of minuend: go to next subtrahend.
+	     */
+            r2++;
+	}
+        else if (r2->x1 <= x1)
+        {
+            /*
+	     * Subtrahend precedes minuend: nuke left edge of minuend.
+	     */
+            x1 = r2->x2;
+            if (x1 >= r1->x2)
+            {
+                /*
+		 * Minuend completely covered: advance to next minuend and
+		 * reset left fence to edge of new minuend.
+		 */
+                r1++;
+                if (r1 != r1_end)
+		    x1 = r1->x1;
+	    }
+            else
+            {
+                /*
+		 * Subtrahend now used up since it doesn't extend beyond
+		 * minuend
+		 */
+                r2++;
+	    }
+	}
+        else if (r2->x1 < r1->x2)
+        {
+            /*
+	     * Left part of subtrahend covers part of minuend: add uncovered
+	     * part of minuend to region and skip to next subtrahend.
+	     */
+            critical_if_fail (x1 < r2->x1);
+            NEWRECT (region, next_rect, x1, y1, r2->x1, y2);
+
+            x1 = r2->x2;
+            if (x1 >= r1->x2)
+            {
+                /*
+		 * Minuend used up: advance to new...
+		 */
+                r1++;
+                if (r1 != r1_end)
+		    x1 = r1->x1;
+	    }
+            else
+            {
+                /*
+		 * Subtrahend used up
+		 */
+                r2++;
+	    }
+	}
+        else
+        {
+            /*
+	     * Minuend used up: add any remaining piece before advancing.
+	     */
+            if (r1->x2 > x1)
+		NEWRECT (region, next_rect, x1, y1, r1->x2, y2);
+
+            r1++;
+
+	    if (r1 != r1_end)
+		x1 = r1->x1;
+	}
+    }
+    while ((r1 != r1_end) && (r2 != r2_end));
+
+    /*
+     * Add remaining minuend rectangles to region.
+     */
+    while (r1 != r1_end)
+    {
+        critical_if_fail (x1 < r1->x2);
+
+        NEWRECT (region, next_rect, x1, y1, r1->x2, y2);
+
+        r1++;
+        if (r1 != r1_end)
+	    x1 = r1->x1;
+    }
+    return TRUE;
+}
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_subtract --
+ *	Subtract reg_s from reg_m and leave the result in reg_d.
+ *	S stands for subtrahend, M for minuend and D for difference.
+ *
+ * Results:
+ *	TRUE if successful.
+ *
+ * Side Effects:
+ *	reg_d is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_subtract) (region_type_t *reg_d,
+                    region_type_t *reg_m,
+                    region_type_t *reg_s)
+{
+    GOOD (reg_m);
+    GOOD (reg_s);
+    GOOD (reg_d);
+    
+    /* check for trivial rejects */
+    if (PIXREGION_NIL (reg_m) || PIXREGION_NIL (reg_s) ||
+        !EXTENTCHECK (&reg_m->extents, &reg_s->extents))
+    {
+        if (PIXREGION_NAR (reg_s))
+	    return pixman_break (reg_d);
+	
+        return PREFIX (_copy) (reg_d, reg_m);
+    }
+    else if (reg_m == reg_s)
+    {
+        FREE_DATA (reg_d);
+        reg_d->extents.x2 = reg_d->extents.x1;
+        reg_d->extents.y2 = reg_d->extents.y1;
+        reg_d->data = pixman_region_empty_data;
+
+        return TRUE;
+    }
+
+    /* Add those rectangles in region 1 that aren't in region 2,
+       do yucky subtraction for overlaps, and
+       just throw away rectangles in region 2 that aren't in region 1 */
+    if (!pixman_op (reg_d, reg_m, reg_s, pixman_region_subtract_o, TRUE, FALSE))
+	return FALSE;
+
+    /*
+     * Can't alter reg_d's extents before we call pixman_op because
+     * it might be one of the source regions and pixman_op depends
+     * on the extents of those regions being unaltered. Besides, this
+     * way there's no checking against rectangles that will be nuked
+     * due to coalescing, so we have to examine fewer rectangles.
+     */
+    pixman_set_extents (reg_d);
+    GOOD (reg_d);
+    return TRUE;
+}
+
+/*======================================================================
+ *	    Region Inversion
+ *====================================================================*/
+
+/*-
+ *-----------------------------------------------------------------------
+ * pixman_region_inverse --
+ *	Take a region and a box and return a region that is everything
+ *	in the box but not in the region. The careful reader will note
+ *	that this is the same as subtracting the region from the box...
+ *
+ * Results:
+ *	TRUE.
+ *
+ * Side Effects:
+ *	new_reg is overwritten.
+ *
+ *-----------------------------------------------------------------------
+ */
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_inverse) (region_type_t *new_reg,  /* Destination region */
+		   region_type_t *reg1,     /* Region to invert */
+		   box_type_t *   inv_rect) /* Bounding box for inversion */
+{
+    region_type_t inv_reg; /* Quick and dirty region made from the
+			    * bounding box */
+    GOOD (reg1);
+    GOOD (new_reg);
+    
+    /* check for trivial rejects */
+    if (PIXREGION_NIL (reg1) || !EXTENTCHECK (inv_rect, &reg1->extents))
+    {
+        if (PIXREGION_NAR (reg1))
+	    return pixman_break (new_reg);
+	
+        new_reg->extents = *inv_rect;
+        FREE_DATA (new_reg);
+        new_reg->data = (region_data_type_t *)NULL;
+	
+        return TRUE;
+    }
+
+    /* Add those rectangles in region 1 that aren't in region 2,
+     * do yucky subtraction for overlaps, and
+     * just throw away rectangles in region 2 that aren't in region 1
+     */
+    inv_reg.extents = *inv_rect;
+    inv_reg.data = (region_data_type_t *)NULL;
+    if (!pixman_op (new_reg, &inv_reg, reg1, pixman_region_subtract_o, TRUE, FALSE))
+	return FALSE;
+
+    /*
+     * Can't alter new_reg's extents before we call pixman_op because
+     * it might be one of the source regions and pixman_op depends
+     * on the extents of those regions being unaltered. Besides, this
+     * way there's no checking against rectangles that will be nuked
+     * due to coalescing, so we have to examine fewer rectangles.
+     */
+    pixman_set_extents (new_reg);
+    GOOD (new_reg);
+    return TRUE;
+}
+
+/* In time O(log n), locate the first box whose y2 is greater than y.
+ * Return @end if no such box exists.
+ */
+static box_type_t *
+find_box_for_y (box_type_t *begin, box_type_t *end, int y)
+{
+    box_type_t *mid;
+
+    if (end == begin)
+	return end;
+
+    if (end - begin == 1)
+    {
+	if (begin->y2 > y)
+	    return begin;
+	else
+	    return end;
+    }
+
+    mid = begin + (end - begin) / 2;
+    if (mid->y2 > y)
+    {
+	/* If no box is found in [begin, mid], the function
+	 * will return @mid, which is then known to be the
+	 * correct answer.
+	 */
+	return find_box_for_y (begin, mid, y);
+    }
+    else
+    {
+	return find_box_for_y (mid, end, y);
+    }
+}
+
+/*
+ *   rect_in(region, rect)
+ *   This routine takes a pointer to a region and a pointer to a box
+ *   and determines if the box is outside/inside/partly inside the region.
+ *
+ *   The idea is to travel through the list of rectangles trying to cover the
+ *   passed box with them. Anytime a piece of the rectangle isn't covered
+ *   by a band of rectangles, part_out is set TRUE. Any time a rectangle in
+ *   the region covers part of the box, part_in is set TRUE. The process ends
+ *   when either the box has been completely covered (we reached a band that
+ *   doesn't overlap the box, part_in is TRUE and part_out is false), the
+ *   box has been partially covered (part_in == part_out == TRUE -- because of
+ *   the banding, the first time this is true we know the box is only
+ *   partially in the region) or is outside the region (we reached a band
+ *   that doesn't overlap the box at all and part_in is false)
+ */
+PIXMAN_EXPORT pixman_region_overlap_t
+PREFIX (_contains_rectangle) (region_type_t *  region,
+			      box_type_t *     prect)
+{
+    box_type_t *     pbox;
+    box_type_t *     pbox_end;
+    int part_in, part_out;
+    int numRects;
+    int x, y;
+
+    GOOD (region);
+
+    numRects = PIXREGION_NUMRECTS (region);
+
+    /* useful optimization */
+    if (!numRects || !EXTENTCHECK (&region->extents, prect))
+	return(PIXMAN_REGION_OUT);
+
+    if (numRects == 1)
+    {
+        /* We know that it must be PIXMAN_REGION_IN or PIXMAN_REGION_PART */
+        if (SUBSUMES (&region->extents, prect))
+	    return(PIXMAN_REGION_IN);
+        else
+	    return(PIXMAN_REGION_PART);
+    }
+
+    part_out = FALSE;
+    part_in = FALSE;
+
+    /* (x,y) starts at upper left of rect, moving to the right and down */
+    x = prect->x1;
+    y = prect->y1;
+
+    /* can stop when both part_out and part_in are TRUE, or we reach prect->y2 */
+    for (pbox = PIXREGION_BOXPTR (region), pbox_end = pbox + numRects;
+	 pbox != pbox_end;
+	 pbox++)
+    {
+	/* getting up to speed or skipping remainder of band */
+	if (pbox->y2 <= y)
+	{
+	    if ((pbox = find_box_for_y (pbox, pbox_end, y)) == pbox_end)
+		break;
+	}
+
+        if (pbox->y1 > y)
+        {
+            part_out = TRUE;     /* missed part of rectangle above */
+            if (part_in || (pbox->y1 >= prect->y2))
+		break;
+            y = pbox->y1;       /* x guaranteed to be == prect->x1 */
+	}
+
+        if (pbox->x2 <= x)
+	    continue;           /* not far enough over yet */
+
+        if (pbox->x1 > x)
+        {
+            part_out = TRUE;     /* missed part of rectangle to left */
+            if (part_in)
+		break;
+	}
+
+        if (pbox->x1 < prect->x2)
+        {
+            part_in = TRUE;      /* definitely overlap */
+            if (part_out)
+		break;
+	}
+
+        if (pbox->x2 >= prect->x2)
+        {
+            y = pbox->y2;       /* finished with this band */
+            if (y >= prect->y2)
+		break;
+            x = prect->x1;      /* reset x out to left again */
+	}
+        else
+        {
+            /*
+	     * Because boxes in a band are maximal width, if the first box
+	     * to overlap the rectangle doesn't completely cover it in that
+	     * band, the rectangle must be partially out, since some of it
+	     * will be uncovered in that band. part_in will have been set true
+	     * by now...
+	     */
+            part_out = TRUE;
+            break;
+	}
+    }
+
+    if (part_in)
+    {
+        if (y < prect->y2)
+	    return PIXMAN_REGION_PART;
+        else
+	    return PIXMAN_REGION_IN;
+    }
+    else
+    {
+        return PIXMAN_REGION_OUT;
+    }
+}
+
+/* PREFIX(_translate) (region, x, y)
+ * translates in place
+ */
+
+PIXMAN_EXPORT void
+PREFIX (_translate) (region_type_t *region, int x, int y)
+{
+    overflow_int_t x1, x2, y1, y2;
+    int nbox;
+    box_type_t * pbox;
+
+    GOOD (region);
+    region->extents.x1 = x1 = region->extents.x1 + x;
+    region->extents.y1 = y1 = region->extents.y1 + y;
+    region->extents.x2 = x2 = region->extents.x2 + x;
+    region->extents.y2 = y2 = region->extents.y2 + y;
+    
+    if (((x1 - PIXMAN_REGION_MIN) | (y1 - PIXMAN_REGION_MIN) | (PIXMAN_REGION_MAX - x2) | (PIXMAN_REGION_MAX - y2)) >= 0)
+    {
+        if (region->data && (nbox = region->data->numRects))
+        {
+            for (pbox = PIXREGION_BOXPTR (region); nbox--; pbox++)
+            {
+                pbox->x1 += x;
+                pbox->y1 += y;
+                pbox->x2 += x;
+                pbox->y2 += y;
+	    }
+	}
+        return;
+    }
+
+    if (((x2 - PIXMAN_REGION_MIN) | (y2 - PIXMAN_REGION_MIN) | (PIXMAN_REGION_MAX - x1) | (PIXMAN_REGION_MAX - y1)) <= 0)
+    {
+        region->extents.x2 = region->extents.x1;
+        region->extents.y2 = region->extents.y1;
+        FREE_DATA (region);
+        region->data = pixman_region_empty_data;
+        return;
+    }
+
+    if (x1 < PIXMAN_REGION_MIN)
+	region->extents.x1 = PIXMAN_REGION_MIN;
+    else if (x2 > PIXMAN_REGION_MAX)
+	region->extents.x2 = PIXMAN_REGION_MAX;
+
+    if (y1 < PIXMAN_REGION_MIN)
+	region->extents.y1 = PIXMAN_REGION_MIN;
+    else if (y2 > PIXMAN_REGION_MAX)
+	region->extents.y2 = PIXMAN_REGION_MAX;
+
+    if (region->data && (nbox = region->data->numRects))
+    {
+        box_type_t * pbox_out;
+
+        for (pbox_out = pbox = PIXREGION_BOXPTR (region); nbox--; pbox++)
+        {
+            pbox_out->x1 = x1 = pbox->x1 + x;
+            pbox_out->y1 = y1 = pbox->y1 + y;
+            pbox_out->x2 = x2 = pbox->x2 + x;
+            pbox_out->y2 = y2 = pbox->y2 + y;
+
+            if (((x2 - PIXMAN_REGION_MIN) | (y2 - PIXMAN_REGION_MIN) |
+                 (PIXMAN_REGION_MAX - x1) | (PIXMAN_REGION_MAX - y1)) <= 0)
+            {
+                region->data->numRects--;
+                continue;
+	    }
+
+            if (x1 < PIXMAN_REGION_MIN)
+		pbox_out->x1 = PIXMAN_REGION_MIN;
+            else if (x2 > PIXMAN_REGION_MAX)
+		pbox_out->x2 = PIXMAN_REGION_MAX;
+
+            if (y1 < PIXMAN_REGION_MIN)
+		pbox_out->y1 = PIXMAN_REGION_MIN;
+            else if (y2 > PIXMAN_REGION_MAX)
+		pbox_out->y2 = PIXMAN_REGION_MAX;
+
+            pbox_out++;
+	}
+
+        if (pbox_out != pbox)
+        {
+            if (region->data->numRects == 1)
+            {
+                region->extents = *PIXREGION_BOXPTR (region);
+                FREE_DATA (region);
+                region->data = (region_data_type_t *)NULL;
+	    }
+            else
+	    {
+		pixman_set_extents (region);
+	    }
+	}
+    }
+
+    GOOD (region);
+}
+
+PIXMAN_EXPORT void
+PREFIX (_reset) (region_type_t *region, box_type_t *box)
+{
+    GOOD (region);
+
+    critical_if_fail (GOOD_RECT (box));
+
+    region->extents = *box;
+
+    FREE_DATA (region);
+
+    region->data = NULL;
+}
+
+PIXMAN_EXPORT void
+PREFIX (_clear) (region_type_t *region)
+{
+    GOOD (region);
+    FREE_DATA (region);
+
+    region->extents = *pixman_region_empty_box;
+    region->data = pixman_region_empty_data;
+}
+
+/* box is "return" value */
+PIXMAN_EXPORT int
+PREFIX (_contains_point) (region_type_t * region,
+                          int x, int y,
+                          box_type_t * box)
+{
+    box_type_t *pbox, *pbox_end;
+    int numRects;
+
+    GOOD (region);
+    numRects = PIXREGION_NUMRECTS (region);
+
+    if (!numRects || !INBOX (&region->extents, x, y))
+	return(FALSE);
+
+    if (numRects == 1)
+    {
+        if (box)
+	    *box = region->extents;
+
+        return(TRUE);
+    }
+
+    pbox = PIXREGION_BOXPTR (region);
+    pbox_end = pbox + numRects;
+
+    pbox = find_box_for_y (pbox, pbox_end, y);
+
+    for (;pbox != pbox_end; pbox++)
+    {
+        if ((y < pbox->y1) || (x < pbox->x1))
+	    break;              /* missed it */
+
+        if (x >= pbox->x2)
+	    continue;           /* not there yet */
+
+        if (box)
+	    *box = *pbox;
+
+        return(TRUE);
+    }
+
+    return(FALSE);
+}
+
+PIXMAN_EXPORT int
+PREFIX (_not_empty) (region_type_t * region)
+{
+    GOOD (region);
+
+    return(!PIXREGION_NIL (region));
+}
+
+PIXMAN_EXPORT box_type_t *
+PREFIX (_extents) (region_type_t * region)
+{
+    GOOD (region);
+
+    return(&region->extents);
+}
+
+/*
+ * Clip a list of scanlines to a region.  The caller has allocated the
+ * space.  FSorted is non-zero if the scanline origins are in ascending order.
+ *
+ * returns the number of new, clipped scanlines.
+ */
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_selfcheck) (region_type_t *reg)
+{
+    int i, numRects;
+
+    if ((reg->extents.x1 > reg->extents.x2) ||
+        (reg->extents.y1 > reg->extents.y2))
+    {
+	return FALSE;
+    }
+
+    numRects = PIXREGION_NUMRECTS (reg);
+    if (!numRects)
+    {
+	return ((reg->extents.x1 == reg->extents.x2) &&
+	        (reg->extents.y1 == reg->extents.y2) &&
+	        (reg->data->size || (reg->data == pixman_region_empty_data)));
+    }
+    else if (numRects == 1)
+    {
+	return (!reg->data);
+    }
+    else
+    {
+        box_type_t * pbox_p, * pbox_n;
+        box_type_t box;
+
+        pbox_p = PIXREGION_RECTS (reg);
+        box = *pbox_p;
+        box.y2 = pbox_p[numRects - 1].y2;
+        pbox_n = pbox_p + 1;
+
+        for (i = numRects; --i > 0; pbox_p++, pbox_n++)
+        {
+            if ((pbox_n->x1 >= pbox_n->x2) ||
+                (pbox_n->y1 >= pbox_n->y2))
+	    {
+		return FALSE;
+	    }
+
+            if (pbox_n->x1 < box.x1)
+		box.x1 = pbox_n->x1;
+	    
+            if (pbox_n->x2 > box.x2)
+		box.x2 = pbox_n->x2;
+	    
+            if ((pbox_n->y1 < pbox_p->y1) ||
+                ((pbox_n->y1 == pbox_p->y1) &&
+                 ((pbox_n->x1 < pbox_p->x2) || (pbox_n->y2 != pbox_p->y2))))
+	    {
+		return FALSE;
+	    }
+	}
+
+        return ((box.x1 == reg->extents.x1) &&
+                (box.x2 == reg->extents.x2) &&
+                (box.y1 == reg->extents.y1) &&
+                (box.y2 == reg->extents.y2));
+    }
+}
+
+PIXMAN_EXPORT pixman_bool_t
+PREFIX (_init_rects) (region_type_t *region,
+                      const box_type_t *boxes, int count)
+{
+    box_type_t *rects;
+    int displacement;
+    int i;
+
+    /* if it's 1, then we just want to set the extents, so call
+     * the existing method. */
+    if (count == 1)
+    {
+        PREFIX (_init_rect) (region,
+                             boxes[0].x1,
+                             boxes[0].y1,
+                             boxes[0].x2 - boxes[0].x1,
+                             boxes[0].y2 - boxes[0].y1);
+        return TRUE;
+    }
+
+    PREFIX (_init) (region);
+
+    /* if it's 0, don't call pixman_rect_alloc -- 0 rectangles is
+     * a special case, and causing pixman_rect_alloc would cause
+     * us to leak memory (because the 0-rect case should be the
+     * static pixman_region_empty_data data).
+     */
+    if (count == 0)
+	return TRUE;
+
+    if (!pixman_rect_alloc (region, count))
+	return FALSE;
+
+    rects = PIXREGION_RECTS (region);
+
+    /* Copy in the rects */
+    memcpy (rects, boxes, sizeof(box_type_t) * count);
+    region->data->numRects = count;
+
+    /* Eliminate empty and malformed rectangles */
+    displacement = 0;
+
+    for (i = 0; i < count; ++i)
+    {
+        box_type_t *box = &rects[i];
+
+        if (box->x1 >= box->x2 || box->y1 >= box->y2)
+	    displacement++;
+        else if (displacement)
+	    rects[i - displacement] = rects[i];
+    }
+
+    region->data->numRects -= displacement;
+
+    /* If eliminating empty rectangles caused there
+     * to be only 0 or 1 rectangles, deal with that.
+     */
+    if (region->data->numRects == 0)
+    {
+        FREE_DATA (region);
+        PREFIX (_init) (region);
+
+        return TRUE;
+    }
+
+    if (region->data->numRects == 1)
+    {
+        region->extents = rects[0];
+
+        FREE_DATA (region);
+        region->data = NULL;
+
+        GOOD (region);
+
+        return TRUE;
+    }
+
+    /* Validate */
+    region->extents.x1 = region->extents.x2 = 0;
+
+    return validate (region);
+}
+
+#define READ(_ptr) (*(_ptr))
+
+static inline box_type_t *
+bitmap_addrect (region_type_t *reg,
+                box_type_t *r,
+                box_type_t **first_rect,
+                int rx1, int ry1,
+                int rx2, int ry2)
+{
+    if ((rx1 < rx2) && (ry1 < ry2) &&
+	(!(reg->data->numRects &&
+	   ((r-1)->y1 == ry1) && ((r-1)->y2 == ry2) &&
+	   ((r-1)->x1 <= rx1) && ((r-1)->x2 >= rx2))))
+    {
+	if (reg->data->numRects == reg->data->size)
+	{
+	    if (!pixman_rect_alloc (reg, 1))
+		return NULL;
+	    *first_rect = PIXREGION_BOXPTR(reg);
+	    r = *first_rect + reg->data->numRects;
+	}
+	r->x1 = rx1;
+	r->y1 = ry1;
+	r->x2 = rx2;
+	r->y2 = ry2;
+	reg->data->numRects++;
+	if (r->x1 < reg->extents.x1)
+	    reg->extents.x1 = r->x1;
+	if (r->x2 > reg->extents.x2)
+	    reg->extents.x2 = r->x2;
+	r++;
+    }
+    return r;
+}
+
+/* Convert bitmap clip mask into clipping region.
+ * First, goes through each line and makes boxes by noting the transitions
+ * from 0 to 1 and 1 to 0.
+ * Then it coalesces the current line with the previous if they have boxes
+ * at the same X coordinates.
+ * Stride is in number of uint32_t per line.
+ */
+PIXMAN_EXPORT void
+PREFIX (_init_from_image) (region_type_t *region,
+                           pixman_image_t *image)
+{
+    uint32_t mask0 = 0xffffffff & ~SCREEN_SHIFT_RIGHT(0xffffffff, 1);
+    box_type_t *first_rect, *rects, *prect_line_start;
+    box_type_t *old_rect, *new_rect;
+    uint32_t *pw, w, *pw_line, *pw_line_end;
+    int	irect_prev_start, irect_line_start;
+    int	h, base, rx1 = 0, crects;
+    int	ib;
+    pixman_bool_t in_box, same;
+    int width, height, stride;
+
+    PREFIX(_init) (region);
+
+    critical_if_fail (region->data);
+
+    return_if_fail (image->type == BITS);
+    return_if_fail (image->bits.format == PIXMAN_a1);
+
+    pw_line = pixman_image_get_data (image);
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+    stride = pixman_image_get_stride (image) / 4;
+
+    first_rect = PIXREGION_BOXPTR(region);
+    rects = first_rect;
+
+    region->extents.x1 = width - 1;
+    region->extents.x2 = 0;
+    irect_prev_start = -1;
+    for (h = 0; h < height; h++)
+    {
+        pw = pw_line;
+        pw_line += stride;
+        irect_line_start = rects - first_rect;
+
+        /* If the Screen left most bit of the word is set, we're starting in
+         * a box */
+        if (READ(pw) & mask0)
+        {
+            in_box = TRUE;
+            rx1 = 0;
+        }
+        else
+        {
+            in_box = FALSE;
+        }
+
+        /* Process all words which are fully in the pixmap */
+        pw_line_end = pw + (width >> 5);
+        for (base = 0; pw < pw_line_end; base += 32)
+        {
+            w = READ(pw++);
+            if (in_box)
+            {
+                if (!~w)
+                    continue;
+            }
+            else
+            {
+                if (!w)
+                    continue;
+            }
+            for (ib = 0; ib < 32; ib++)
+            {
+                /* If the Screen left most bit of the word is set, we're
+                 * starting a box */
+                if (w & mask0)
+                {
+                    if (!in_box)
+                    {
+                        rx1 = base + ib;
+                        /* start new box */
+                        in_box = TRUE;
+                    }
+                }
+                else
+                {
+                    if (in_box)
+                    {
+                        /* end box */
+                        rects = bitmap_addrect (region, rects, &first_rect,
+                                                rx1, h, base + ib, h + 1);
+                        if (rects == NULL)
+                            goto error;
+                        in_box = FALSE;
+                    }
+                }
+                /* Shift the word VISUALLY left one. */
+                w = SCREEN_SHIFT_LEFT(w, 1);
+            }
+        }
+
+        if (width & 31)
+        {
+            /* Process final partial word on line */
+             w = READ(pw++);
+            for (ib = 0; ib < (width & 31); ib++)
+            {
+                /* If the Screen left most bit of the word is set, we're
+                 * starting a box */
+                if (w & mask0)
+                {
+                    if (!in_box)
+                    {
+                        rx1 = base + ib;
+                        /* start new box */
+                        in_box = TRUE;
+                    }
+                }
+                else
+                {
+                    if (in_box)
+                    {
+                        /* end box */
+                        rects = bitmap_addrect(region, rects, &first_rect,
+					       rx1, h, base + ib, h + 1);
+			if (rects == NULL)
+			    goto error;
+                        in_box = FALSE;
+                    }
+                }
+                /* Shift the word VISUALLY left one. */
+                w = SCREEN_SHIFT_LEFT(w, 1);
+            }
+        }
+        /* If scanline ended with last bit set, end the box */
+        if (in_box)
+        {
+            rects = bitmap_addrect(region, rects, &first_rect,
+				   rx1, h, base + (width & 31), h + 1);
+	    if (rects == NULL)
+		goto error;
+        }
+        /* if all rectangles on this line have the same x-coords as
+         * those on the previous line, then add 1 to all the previous  y2s and
+         * throw away all the rectangles from this line
+         */
+        same = FALSE;
+        if (irect_prev_start != -1)
+        {
+            crects = irect_line_start - irect_prev_start;
+            if (crects != 0 &&
+                crects == ((rects - first_rect) - irect_line_start))
+            {
+                old_rect = first_rect + irect_prev_start;
+                new_rect = prect_line_start = first_rect + irect_line_start;
+                same = TRUE;
+                while (old_rect < prect_line_start)
+                {
+                    if ((old_rect->x1 != new_rect->x1) ||
+                        (old_rect->x2 != new_rect->x2))
+                    {
+                          same = FALSE;
+                          break;
+                    }
+                    old_rect++;
+                    new_rect++;
+                }
+                if (same)
+                {
+                    old_rect = first_rect + irect_prev_start;
+                    while (old_rect < prect_line_start)
+                    {
+                        old_rect->y2 += 1;
+                        old_rect++;
+                    }
+                    rects -= crects;
+                    region->data->numRects -= crects;
+                }
+            }
+        }
+        if(!same)
+            irect_prev_start = irect_line_start;
+    }
+    if (!region->data->numRects)
+    {
+        region->extents.x1 = region->extents.x2 = 0;
+    }
+    else
+    {
+        region->extents.y1 = PIXREGION_BOXPTR(region)->y1;
+        region->extents.y2 = PIXREGION_END(region)->y2;
+        if (region->data->numRects == 1)
+        {
+            free (region->data);
+            region->data = NULL;
+        }
+    }
+
+ error:
+    return;
+}
diff --git a/src/pixman/pixman/pixman-region16.c b/src/pixman/pixman/pixman-region16.c
new file mode 100644
index 0000000..d88d338
--- /dev/null
+++ b/src/pixman/pixman/pixman-region16.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without
+ * fee, provided that the above copyright notice appear in all copies
+ * and that both that copyright notice and this permission notice
+ * appear in supporting documentation, and that the name of
+ * Red Hat, Inc. not be used in advertising or publicity pertaining to
+ * distribution of the software without specific, written prior
+ * permission. Red Hat, Inc. makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * RED HAT, INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL,
+ * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
+ * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Soren Sandmann <sandmann@redhat.com>
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#undef PIXMAN_DISABLE_DEPRECATED
+
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+typedef pixman_box16_t		box_type_t;
+typedef pixman_region16_data_t	region_data_type_t;
+typedef pixman_region16_t	region_type_t;
+typedef int32_t                 overflow_int_t;
+
+typedef struct {
+    int x, y;
+} point_type_t;
+
+#define PREFIX(x) pixman_region##x
+
+#define PIXMAN_REGION_MAX INT16_MAX
+#define PIXMAN_REGION_MIN INT16_MIN
+
+#include "pixman-region.c"
+
+/* This function exists only to make it possible to preserve the X ABI -
+ * it should go away at first opportunity.
+ *
+ * The problem is that the X ABI exports the three structs and has used
+ * them through macros. So the X server calls this function with
+ * the addresses of those structs which makes the existing code continue to
+ * work.
+ */
+PIXMAN_EXPORT void
+pixman_region_set_static_pointers (pixman_box16_t *empty_box,
+				   pixman_region16_data_t *empty_data,
+				   pixman_region16_data_t *broken_data)
+{
+    pixman_region_empty_box = empty_box;
+    pixman_region_empty_data = empty_data;
+    pixman_broken_data = broken_data;
+}
diff --git a/src/pixman/pixman/pixman-region32.c b/src/pixman/pixman/pixman-region32.c
new file mode 100644
index 0000000..abd6b1a
--- /dev/null
+++ b/src/pixman/pixman/pixman-region32.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software
+ * and its documentation for any purpose is hereby granted without
+ * fee, provided that the above copyright notice appear in all copies
+ * and that both that copyright notice and this permission notice
+ * appear in supporting documentation, and that the name of
+ * Red Hat, Inc. not be used in advertising or publicity pertaining to
+ * distribution of the software without specific, written prior
+ * permission. Red Hat, Inc. makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * RED HAT, INC. DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL RED HAT, INC. BE LIABLE FOR ANY SPECIAL,
+ * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
+ * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR
+ * IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author: Soren Sandmann <sandmann@redhat.com>
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+typedef pixman_box32_t		box_type_t;
+typedef pixman_region32_data_t	region_data_type_t;
+typedef pixman_region32_t	region_type_t;
+typedef int64_t                 overflow_int_t;
+
+typedef struct {
+    int x, y;
+} point_type_t;
+
+#define PREFIX(x) pixman_region32##x
+
+#define PIXMAN_REGION_MAX INT32_MAX
+#define PIXMAN_REGION_MIN INT32_MIN
+
+#include "pixman-region.c"
diff --git a/src/pixman/pixman/pixman-solid-fill.c b/src/pixman/pixman/pixman-solid-fill.c
new file mode 100644
index 0000000..5f9fef6
--- /dev/null
+++ b/src/pixman/pixman/pixman-solid-fill.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007, 2009 Red Hat, Inc.
+ * Copyright © 2009 Soren Sandmann
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+static uint32_t
+color_to_uint32 (const pixman_color_t *color)
+{
+    return
+        (color->alpha >> 8 << 24) |
+        (color->red >> 8 << 16) |
+        (color->green & 0xff00) |
+        (color->blue >> 8);
+}
+
+static argb_t
+color_to_float (const pixman_color_t *color)
+{
+    argb_t result;
+
+    result.a = pixman_unorm_to_float (color->alpha, 16);
+    result.r = pixman_unorm_to_float (color->red, 16);
+    result.g = pixman_unorm_to_float (color->green, 16);
+    result.b = pixman_unorm_to_float (color->blue, 16);
+
+    return result;
+}
+
+PIXMAN_EXPORT pixman_image_t *
+pixman_image_create_solid_fill (const pixman_color_t *color)
+{
+    pixman_image_t *img = _pixman_image_allocate ();
+
+    if (!img)
+	return NULL;
+
+    img->type = SOLID;
+    img->solid.color = *color;
+    img->solid.color_32 = color_to_uint32 (color);
+    img->solid.color_float = color_to_float (color);
+
+    return img;
+}
+
diff --git a/src/pixman/pixman/pixman-sse2.c b/src/pixman/pixman/pixman-sse2.c
new file mode 100644
index 0000000..a6e7808
--- /dev/null
+++ b/src/pixman/pixman/pixman-sse2.c
@@ -0,0 +1,6540 @@
+/*
+ * Copyright © 2008 Rodrigo Kumpera
+ * Copyright © 2008 André Tupinambá
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Rodrigo Kumpera (kumpera@gmail.com)
+ *          André Tupinambá (andrelrt@gmail.com)
+ *
+ * Based on work by Owen Taylor and Søren Sandmann
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
+#define PSHUFD_IS_FAST 0
+
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
+#include <emmintrin.h> /* for SSE2 intrinsics */
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+static __m128i mask_0080;
+static __m128i mask_00ff;
+static __m128i mask_0101;
+static __m128i mask_ffff;
+static __m128i mask_ff000000;
+static __m128i mask_alpha;
+
+static __m128i mask_565_r;
+static __m128i mask_565_g1, mask_565_g2;
+static __m128i mask_565_b;
+static __m128i mask_red;
+static __m128i mask_green;
+static __m128i mask_blue;
+
+static __m128i mask_565_fix_rb;
+static __m128i mask_565_fix_g;
+
+static __m128i mask_565_rb;
+static __m128i mask_565_pack_multiplier;
+
+static force_inline __m128i
+unpack_32_1x128 (uint32_t data)
+{
+    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
+}
+
+static force_inline void
+unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
+{
+    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
+    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
+}
+
+static force_inline __m128i
+unpack_565_to_8888 (__m128i lo)
+{
+    __m128i r, g, b, rb, t;
+
+    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
+    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
+    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
+
+    rb = _mm_or_si128 (r, b);
+    t  = _mm_and_si128 (rb, mask_565_fix_rb);
+    t  = _mm_srli_epi32 (t, 5);
+    rb = _mm_or_si128 (rb, t);
+
+    t  = _mm_and_si128 (g, mask_565_fix_g);
+    t  = _mm_srli_epi32 (t, 6);
+    g  = _mm_or_si128 (g, t);
+
+    return _mm_or_si128 (rb, g);
+}
+
+static force_inline void
+unpack_565_128_4x128 (__m128i  data,
+                      __m128i* data0,
+                      __m128i* data1,
+                      __m128i* data2,
+                      __m128i* data3)
+{
+    __m128i lo, hi;
+
+    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
+    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
+
+    lo = unpack_565_to_8888 (lo);
+    hi = unpack_565_to_8888 (hi);
+
+    unpack_128_2x128 (lo, data0, data1);
+    unpack_128_2x128 (hi, data2, data3);
+}
+
+static force_inline uint16_t
+pack_565_32_16 (uint32_t pixel)
+{
+    return (uint16_t) (((pixel >> 8) & 0xf800) |
+		       ((pixel >> 5) & 0x07e0) |
+		       ((pixel >> 3) & 0x001f));
+}
+
+static force_inline __m128i
+pack_2x128_128 (__m128i lo, __m128i hi)
+{
+    return _mm_packus_epi16 (lo, hi);
+}
+
+static force_inline __m128i
+pack_565_2packedx128_128 (__m128i lo, __m128i hi)
+{
+    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
+    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
+
+    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
+    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
+
+    __m128i g0 = _mm_and_si128 (lo, mask_green);
+    __m128i g1 = _mm_and_si128 (hi, mask_green);
+
+    t0 = _mm_or_si128 (t0, g0);
+    t1 = _mm_or_si128 (t1, g1);
+
+    /* Simulates _mm_packus_epi32 */
+    t0 = _mm_slli_epi32 (t0, 16 - 5);
+    t1 = _mm_slli_epi32 (t1, 16 - 5);
+    t0 = _mm_srai_epi32 (t0, 16);
+    t1 = _mm_srai_epi32 (t1, 16);
+    return _mm_packs_epi32 (t0, t1);
+}
+
+static force_inline __m128i
+pack_565_2x128_128 (__m128i lo, __m128i hi)
+{
+    __m128i data;
+    __m128i r, g1, g2, b;
+
+    data = pack_2x128_128 (lo, hi);
+
+    r  = _mm_and_si128 (data, mask_565_r);
+    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
+    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
+    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
+
+    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
+}
+
+static force_inline __m128i
+pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
+{
+    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
+			     pack_565_2x128_128 (*xmm2, *xmm3));
+}
+
+static force_inline int
+is_opaque (__m128i x)
+{
+    __m128i ffs = _mm_cmpeq_epi8 (x, x);
+
+    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
+}
+
+static force_inline int
+is_zero (__m128i x)
+{
+    return _mm_movemask_epi8 (
+	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
+}
+
+static force_inline int
+is_transparent (__m128i x)
+{
+    return (_mm_movemask_epi8 (
+		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
+}
+
+static force_inline __m128i
+expand_pixel_32_1x128 (uint32_t data)
+{
+    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
+}
+
+static force_inline __m128i
+expand_alpha_1x128 (__m128i data)
+{
+    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
+						     _MM_SHUFFLE (3, 3, 3, 3)),
+				_MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_2x128 (__m128i  data_lo,
+                    __m128i  data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
+
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline void
+expand_alpha_rev_2x128 (__m128i  data_lo,
+                        __m128i  data_hi,
+                        __m128i* alpha_lo,
+                        __m128i* alpha_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
+    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline void
+pix_multiply_2x128 (__m128i* data_lo,
+                    __m128i* data_hi,
+                    __m128i* alpha_lo,
+                    __m128i* alpha_hi,
+                    __m128i* ret_lo,
+                    __m128i* ret_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
+    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
+    lo = _mm_adds_epu16 (lo, mask_0080);
+    hi = _mm_adds_epu16 (hi, mask_0080);
+    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
+    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
+}
+
+static force_inline void
+pix_add_multiply_2x128 (__m128i* src_lo,
+                        __m128i* src_hi,
+                        __m128i* alpha_dst_lo,
+                        __m128i* alpha_dst_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi,
+                        __m128i* alpha_src_lo,
+                        __m128i* alpha_src_hi,
+                        __m128i* ret_lo,
+                        __m128i* ret_hi)
+{
+    __m128i t1_lo, t1_hi;
+    __m128i t2_lo, t2_hi;
+
+    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
+    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
+
+    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
+    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
+}
+
+static force_inline void
+negate_2x128 (__m128i  data_lo,
+              __m128i  data_hi,
+              __m128i* neg_lo,
+              __m128i* neg_hi)
+{
+    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
+    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
+}
+
+static force_inline void
+invert_colors_2x128 (__m128i  data_lo,
+                     __m128i  data_hi,
+                     __m128i* inv_lo,
+                     __m128i* inv_hi)
+{
+    __m128i lo, hi;
+
+    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
+    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
+    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline void
+over_2x128 (__m128i* src_lo,
+            __m128i* src_hi,
+            __m128i* alpha_lo,
+            __m128i* alpha_hi,
+            __m128i* dst_lo,
+            __m128i* dst_hi)
+{
+    __m128i t1, t2;
+
+    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
+
+    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
+
+    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
+    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
+}
+
+static force_inline void
+over_rev_non_pre_2x128 (__m128i  src_lo,
+                        __m128i  src_hi,
+                        __m128i* dst_lo,
+                        __m128i* dst_hi)
+{
+    __m128i lo, hi;
+    __m128i alpha_lo, alpha_hi;
+
+    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
+
+    lo = _mm_or_si128 (alpha_lo, mask_alpha);
+    hi = _mm_or_si128 (alpha_hi, mask_alpha);
+
+    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
+
+    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
+
+    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
+}
+
+static force_inline void
+in_over_2x128 (__m128i* src_lo,
+               __m128i* src_hi,
+               __m128i* alpha_lo,
+               __m128i* alpha_hi,
+               __m128i* mask_lo,
+               __m128i* mask_hi,
+               __m128i* dst_lo,
+               __m128i* dst_hi)
+{
+    __m128i s_lo, s_hi;
+    __m128i a_lo, a_hi;
+
+    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
+    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
+
+    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
+}
+
+/* load 4 pixels from a 16-byte boundary aligned address */
+static force_inline __m128i
+load_128_aligned (__m128i* src)
+{
+    return _mm_load_si128 (src);
+}
+
+/* load 4 pixels from a unaligned address */
+static force_inline __m128i
+load_128_unaligned (const __m128i* src)
+{
+    return _mm_loadu_si128 (src);
+}
+
+/* save 4 pixels using Write Combining memory on a 16-byte
+ * boundary aligned address
+ */
+static force_inline void
+save_128_write_combining (__m128i* dst,
+                          __m128i  data)
+{
+    _mm_stream_si128 (dst, data);
+}
+
+/* save 4 pixels on a 16-byte boundary aligned address */
+static force_inline void
+save_128_aligned (__m128i* dst,
+                  __m128i  data)
+{
+    _mm_store_si128 (dst, data);
+}
+
+/* save 4 pixels on a unaligned address */
+static force_inline void
+save_128_unaligned (__m128i* dst,
+                    __m128i  data)
+{
+    _mm_storeu_si128 (dst, data);
+}
+
+static force_inline __m128i
+load_32_1x128 (uint32_t data)
+{
+    return _mm_cvtsi32_si128 (data);
+}
+
+static force_inline __m128i
+expand_alpha_rev_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+expand_pixel_8_1x128 (uint8_t data)
+{
+    return _mm_shufflelo_epi16 (
+	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m128i
+pix_multiply_1x128 (__m128i data,
+		    __m128i alpha)
+{
+    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
+					    mask_0080),
+			    mask_0101);
+}
+
+static force_inline __m128i
+pix_add_multiply_1x128 (__m128i* src,
+			__m128i* alpha_dst,
+			__m128i* dst,
+			__m128i* alpha_src)
+{
+    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
+    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
+
+    return _mm_adds_epu8 (t1, t2);
+}
+
+static force_inline __m128i
+negate_1x128 (__m128i data)
+{
+    return _mm_xor_si128 (data, mask_00ff);
+}
+
+static force_inline __m128i
+invert_colors_1x128 (__m128i data)
+{
+    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline __m128i
+over_1x128 (__m128i src, __m128i alpha, __m128i dst)
+{
+    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
+}
+
+static force_inline __m128i
+in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
+{
+    return over_1x128 (pix_multiply_1x128 (*src, *mask),
+		       pix_multiply_1x128 (*alpha, *mask),
+		       *dst);
+}
+
+static force_inline __m128i
+over_rev_non_pre_1x128 (__m128i src, __m128i dst)
+{
+    __m128i alpha = expand_alpha_1x128 (src);
+
+    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
+					   _mm_or_si128 (alpha, mask_alpha)),
+		       alpha,
+		       dst);
+}
+
+static force_inline uint32_t
+pack_1x128_32 (__m128i data)
+{
+    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
+}
+
+static force_inline __m128i
+expand565_16_1x128 (uint16_t pixel)
+{
+    __m128i m = _mm_cvtsi32_si128 (pixel);
+
+    m = unpack_565_to_8888 (m);
+
+    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
+}
+
+static force_inline uint32_t
+core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint8_t a;
+    __m128i xmms;
+
+    a = src >> 24;
+
+    if (a == 0xff)
+    {
+	return src;
+    }
+    else if (src)
+    {
+	xmms = unpack_32_1x128 (src);
+	return pack_1x128_32 (
+	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
+			unpack_32_1x128 (dst)));
+    }
+
+    return dst;
+}
+
+static force_inline uint32_t
+combine1 (const uint32_t *ps, const uint32_t *pm)
+{
+    uint32_t s = *ps;
+
+    if (pm)
+    {
+	__m128i ms, mm;
+
+	mm = unpack_32_1x128 (*pm);
+	mm = expand_alpha_1x128 (mm);
+
+	ms = unpack_32_1x128 (s);
+	ms = pix_multiply_1x128 (ms, mm);
+
+	s = pack_1x128_32 (ms);
+    }
+
+    return s;
+}
+
+static force_inline __m128i
+combine4 (const __m128i *ps, const __m128i *pm)
+{
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_msk_lo, xmm_msk_hi;
+    __m128i s;
+
+    if (pm)
+    {
+	xmm_msk_lo = load_128_unaligned (pm);
+
+	if (is_transparent (xmm_msk_lo))
+	    return _mm_setzero_si128 ();
+    }
+
+    s = load_128_unaligned (ps);
+
+    if (pm)
+    {
+	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
+
+	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_msk_lo, &xmm_msk_hi,
+			    &xmm_src_lo, &xmm_src_hi);
+
+	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
+    }
+
+    return s;
+}
+
+static force_inline void
+core_combine_over_u_sse2_mask (uint32_t *	  pd,
+			       const uint32_t*    ps,
+			       const uint32_t*    pm,
+			       int                w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((uintptr_t)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	pm++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i mask = load_128_unaligned ((__m128i *)pm);
+
+	if (!is_zero (mask))
+	{
+	    __m128i src;
+	    __m128i src_hi, src_lo;
+	    __m128i mask_hi, mask_lo;
+	    __m128i alpha_hi, alpha_lo;
+
+	    src = load_128_unaligned ((__m128i *)ps);
+
+	    if (is_opaque (_mm_and_si128 (src, mask)))
+	    {
+		save_128_aligned ((__m128i *)pd, src);
+	    }
+	    else
+	    {
+		__m128i dst = load_128_aligned ((__m128i *)pd);
+		__m128i dst_hi, dst_lo;
+
+		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
+		unpack_128_2x128 (src, &src_lo, &src_hi);
+
+		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
+		pix_multiply_2x128 (&src_lo, &src_hi,
+				    &mask_lo, &mask_hi,
+				    &src_lo, &src_hi);
+
+		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+		expand_alpha_2x128 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+
+		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_128_aligned (
+		    (__m128i *)pd,
+		    pack_2x128_128 (dst_lo, dst_hi));
+	    }
+	}
+
+	pm += 4;
+	ps += 4;
+	pd += 4;
+	w -= 4;
+    }
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	pm++;
+
+	w--;
+    }
+}
+
+static force_inline void
+core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
+				  const uint32_t*    ps,
+				  int                w)
+{
+    uint32_t s, d;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((uintptr_t)pd & 15))
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i src;
+	__m128i src_hi, src_lo, dst_hi, dst_lo;
+	__m128i alpha_hi, alpha_lo;
+
+	src = load_128_unaligned ((__m128i *)ps);
+
+	if (!is_zero (src))
+	{
+	    if (is_opaque (src))
+	    {
+		save_128_aligned ((__m128i *)pd, src);
+	    }
+	    else
+	    {
+		__m128i dst = load_128_aligned ((__m128i *)pd);
+
+		unpack_128_2x128 (src, &src_lo, &src_hi);
+		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
+
+		expand_alpha_2x128 (src_lo, src_hi,
+				    &alpha_lo, &alpha_hi);
+		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
+			    &dst_lo, &dst_hi);
+
+		save_128_aligned (
+		    (__m128i *)pd,
+		    pack_2x128_128 (dst_lo, dst_hi));
+	    }
+	}
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+    }
+    while (w)
+    {
+	d = *pd;
+	s = *ps;
+
+	if (s)
+	    *pd = core_combine_over_u_pixel_sse2 (s, d);
+	pd++;
+	ps++;
+
+	w--;
+    }
+}
+
+static force_inline void
+sse2_combine_over_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    if (pm)
+	core_combine_over_u_sse2_mask (pd, ps, pm, w);
+    else
+	core_combine_over_u_sse2_no_mask (pd, ps, w);
+}
+
+static void
+sse2_combine_over_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    /* Align dst on a 16-byte boundary */
+    while (w &&
+           ((uintptr_t)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	/* I'm loading unaligned because I'm not sure
+	 * about the address alignment.
+	 */
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+
+	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+		    &xmm_alpha_lo, &xmm_alpha_hi,
+		    &xmm_src_lo, &xmm_src_hi);
+
+	/* rebuid the 4 pixel data and save*/
+	save_128_aligned ((__m128i*)pd,
+			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
+
+	w -= 4;
+	ps += 4;
+	pd += 4;
+
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps, pm);
+
+	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+}
+
+static force_inline uint32_t
+core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
+{
+    uint32_t maska = src >> 24;
+
+    if (maska == 0)
+    {
+	return 0;
+    }
+    else if (maska != 0xff)
+    {
+	return pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (dst),
+				expand_alpha_1x128 (unpack_32_1x128 (src))));
+    }
+
+    return dst;
+}
+
+static void
+sse2_combine_in_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               pd,
+                   const uint32_t *         ps,
+                   const uint32_t *         pm,
+                   int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    while (w && ((uintptr_t)pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned ((__m128i*)pd,
+			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static void
+sse2_combine_in_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               pd,
+                           const uint32_t *         ps,
+                           const uint32_t *         pm,
+                           int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    while (w && ((uintptr_t)pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static void
+sse2_combine_out_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
+{
+    while (w && ((uintptr_t)pd & 15))
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
+
+	if (pm)
+	    pm++;
+	ps++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst_lo, xmm_dst_hi;
+
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+
+	w -= 4;
+    }
+
+    while (w)
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
+	ps++;
+	if (pm)
+	    pm++;
+	w--;
+    }
+}
+
+static void
+sse2_combine_out_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
+{
+    while (w && ((uintptr_t)pd & 15))
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	__m128i xmm_src_lo, xmm_src_hi;
+	__m128i xmm_dst_lo, xmm_dst_hi;
+
+	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	uint32_t s = combine1 (ps, pm);
+	uint32_t d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (s), negate_1x128 (
+		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static force_inline uint32_t
+core_combine_atop_u_pixel_sse2 (uint32_t src,
+                                uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
+    __m128i da = expand_alpha_1x128 (d);
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
+}
+
+static void
+sse2_combine_atop_u (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+    while (w && ((uintptr_t)pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static force_inline uint32_t
+core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
+                                        uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
+}
+
+static void
+sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+    while (w && ((uintptr_t)pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
+	ps++;
+	w--;
+	if (pm)
+	    pm++;
+    }
+}
+
+static force_inline uint32_t
+core_combine_xor_u_pixel_sse2 (uint32_t src,
+                               uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
+}
+
+static void
+sse2_combine_xor_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int w = width;
+    uint32_t s, d;
+    uint32_t* pd = dst;
+    const uint32_t* ps = src;
+    const uint32_t* pm = mask;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+
+    while (w && ((uintptr_t)pd & 15))
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
+	xmm_dst = load_128_aligned ((__m128i*) pd);
+
+	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
+		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	w -= 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static force_inline void
+sse2_combine_add_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dst,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int w = width;
+    uint32_t s, d;
+    uint32_t* pd = dst;
+    const uint32_t* ps = src;
+    const uint32_t* pm = mask;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	ps++;
+	if (pm)
+	    pm++;
+	*pd++ = _mm_cvtsi128_si32 (
+	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i s;
+
+	s = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+	save_128_aligned (
+	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
+
+	pd += 4;
+	ps += 4;
+	if (pm)
+	    pm += 4;
+	w -= 4;
+    }
+
+    while (w--)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	ps++;
+	*pd++ = _mm_cvtsi128_si32 (
+	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
+	if (pm)
+	    pm++;
+    }
+}
+
+static force_inline uint32_t
+core_combine_saturate_u_pixel_sse2 (uint32_t src,
+                                    uint32_t dst)
+{
+    __m128i ms = unpack_32_1x128 (src);
+    __m128i md = unpack_32_1x128 (dst);
+    uint32_t sa = src >> 24;
+    uint32_t da = ~dst >> 24;
+
+    if (sa > da)
+    {
+	ms = pix_multiply_1x128 (
+	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
+    }
+
+    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
+}
+
+static void
+sse2_combine_saturate_u (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               pd,
+                         const uint32_t *         ps,
+                         const uint32_t *         pm,
+                         int                      w)
+{
+    uint32_t s, d;
+
+    uint32_t pack_cmp;
+    __m128i xmm_src, xmm_dst;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	w--;
+	ps++;
+	if (pm)
+	    pm++;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst = load_128_aligned  ((__m128i*)pd);
+	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
+
+	pack_cmp = _mm_movemask_epi8 (
+	    _mm_cmpgt_epi32 (
+		_mm_srli_epi32 (xmm_src, 24),
+		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
+
+	/* if some alpha src is grater than respective ~alpha dst */
+	if (pack_cmp)
+	{
+	    s = combine1 (ps++, pm);
+	    d = *pd;
+	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	    if (pm)
+		pm++;
+
+	    s = combine1 (ps++, pm);
+	    d = *pd;
+	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	    if (pm)
+		pm++;
+
+	    s = combine1 (ps++, pm);
+	    d = *pd;
+	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	    if (pm)
+		pm++;
+
+	    s = combine1 (ps++, pm);
+	    d = *pd;
+	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	    if (pm)
+		pm++;
+	}
+	else
+	{
+	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
+
+	    pd += 4;
+	    ps += 4;
+	    if (pm)
+		pm += 4;
+	}
+
+	w -= 4;
+    }
+
+    while (w--)
+    {
+	s = combine1 (ps, pm);
+	d = *pd;
+
+	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
+	ps++;
+	if (pm)
+	    pm++;
+    }
+}
+
+static void
+sse2_combine_src_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, m;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
+	w--;
+    }
+}
+
+static force_inline uint32_t
+core_combine_over_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
+{
+    __m128i s = unpack_32_1x128 (src);
+    __m128i expAlpha = expand_alpha_1x128 (s);
+    __m128i unpk_mask = unpack_32_1x128 (mask);
+    __m128i unpk_dst  = unpack_32_1x128 (dst);
+
+    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
+}
+
+static void
+sse2_combine_over_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+
+	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+		       &xmm_alpha_lo, &xmm_alpha_hi,
+		       &xmm_mask_lo, &xmm_mask_hi,
+		       &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+}
+
+static force_inline uint32_t
+core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
+{
+    __m128i d = unpack_32_1x128 (dst);
+
+    return pack_1x128_32 (
+	over_1x128 (d, expand_alpha_1x128 (d),
+		    pix_multiply_1x128 (unpack_32_1x128 (src),
+					unpack_32_1x128 (mask))));
+}
+
+static void
+sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_mask_lo, &xmm_mask_hi);
+
+	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+		    &xmm_alpha_lo, &xmm_alpha_hi,
+		    &xmm_mask_lo, &xmm_mask_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+}
+
+static void
+sse2_combine_in_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               pd,
+                    const uint32_t *         ps,
+                    const uint32_t *         pm,
+                    int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		expand_alpha_1x128 (unpack_32_1x128 (d))));
+
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		expand_alpha_1x128 (unpack_32_1x128 (d))));
+
+	w--;
+    }
+}
+
+static void
+sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               pd,
+                            const uint32_t *         ps,
+                            const uint32_t *         pm,
+                            int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		pix_multiply_1x128 (unpack_32_1x128 (m),
+				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		pix_multiply_1x128 (unpack_32_1x128 (m),
+				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
+	w--;
+    }
+}
+
+static void
+sse2_combine_out_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
+		      &xmm_alpha_lo, &xmm_alpha_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
+		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
+
+	w--;
+    }
+}
+
+static void
+sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               pd,
+                             const uint32_t *         ps,
+                             const uint32_t *         pm,
+                             int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		negate_1x128 (pix_multiply_1x128 (
+				 unpack_32_1x128 (m),
+				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi);
+
+	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_alpha_lo, &xmm_alpha_hi,
+			    &xmm_mask_lo, &xmm_mask_hi);
+
+	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+		      &xmm_mask_lo, &xmm_mask_hi);
+
+	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    pix_multiply_1x128 (
+		unpack_32_1x128 (d),
+		negate_1x128 (pix_multiply_1x128 (
+				 unpack_32_1x128 (m),
+				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
+	w--;
+    }
+}
+
+static force_inline uint32_t
+core_combine_atop_ca_pixel_sse2 (uint32_t src,
+                                 uint32_t mask,
+                                 uint32_t dst)
+{
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+    __m128i sa = expand_alpha_1x128 (s);
+    __m128i da = expand_alpha_1x128 (d);
+
+    s = pix_multiply_1x128 (s, m);
+    m = negate_1x128 (pix_multiply_1x128 (m, sa));
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
+}
+
+static void
+sse2_combine_atop_ca (pixman_implementation_t *imp,
+                      pixman_op_t              op,
+                      uint32_t *               pd,
+                      const uint32_t *         ps,
+                      const uint32_t *         pm,
+                      int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_src_lo, &xmm_src_hi);
+	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi);
+
+	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+}
+
+static force_inline uint32_t
+core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
+                                         uint32_t mask,
+                                         uint32_t dst)
+{
+    __m128i m = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
+    __m128i sa = expand_alpha_1x128 (s);
+
+    s = pix_multiply_1x128 (s, m);
+    m = pix_multiply_1x128 (m, sa);
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
+}
+
+static void
+sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               pd,
+                              const uint32_t *         ps,
+                              const uint32_t *         pm,
+                              int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_src_lo, &xmm_src_hi);
+	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi);
+
+	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+}
+
+static force_inline uint32_t
+core_combine_xor_ca_pixel_sse2 (uint32_t src,
+                                uint32_t mask,
+                                uint32_t dst)
+{
+    __m128i a = unpack_32_1x128 (mask);
+    __m128i s = unpack_32_1x128 (src);
+    __m128i d = unpack_32_1x128 (dst);
+
+    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
+				       a, expand_alpha_1x128 (s)));
+    __m128i dest      = pix_multiply_1x128 (s, a);
+    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
+
+    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
+                                                &alpha_dst,
+                                                &dest,
+                                                &alpha_src));
+}
+
+static void
+sse2_combine_xor_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
+    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
+	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
+			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_src_lo, &xmm_src_hi);
+	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi);
+
+	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
+		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
+	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
+		      &xmm_mask_lo, &xmm_mask_hi);
+
+	pix_add_multiply_2x128 (
+	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
+	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
+	    &xmm_dst_lo, &xmm_dst_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
+	w--;
+    }
+}
+
+static void
+sse2_combine_add_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               pd,
+                     const uint32_t *         ps,
+                     const uint32_t *         pm,
+                     int                      w)
+{
+    uint32_t s, m, d;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask_lo, xmm_mask_hi;
+
+    while (w && (uintptr_t)pd & 15)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+					       unpack_32_1x128 (m)),
+			   unpack_32_1x128 (d)));
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
+	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
+	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
+
+	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+			    &xmm_mask_lo, &xmm_mask_hi,
+			    &xmm_src_lo, &xmm_src_hi);
+
+	save_128_aligned (
+	    (__m128i*)pd, pack_2x128_128 (
+		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
+		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
+
+	ps += 4;
+	pd += 4;
+	pm += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	s = *ps++;
+	m = *pm++;
+	d = *pd;
+
+	*pd++ = pack_1x128_32 (
+	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
+					       unpack_32_1x128 (m)),
+			   unpack_32_1x128 (d)));
+	w--;
+    }
+}
+
+static force_inline __m128i
+create_mask_16_128 (uint16_t mask)
+{
+    return _mm_set1_epi16 (mask);
+}
+
+/* Work around a code generation bug in Sun Studio 12. */
+#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
+# define create_mask_2x32_128(mask0, mask1)				\
+    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
+#else
+static force_inline __m128i
+create_mask_2x32_128 (uint32_t mask0,
+                      uint32_t mask1)
+{
+    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
+}
+#endif
+
+static void
+sse2_composite_over_n_8888 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst, d;
+    int32_t w;
+    int dst_stride;
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+    while (height--)
+    {
+	dst = dst_line;
+
+	dst_line += dst_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    d = *dst;
+	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+						xmm_alpha,
+						unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    over_2x128 (&xmm_src, &xmm_src,
+			&xmm_alpha, &xmm_alpha,
+			&xmm_dst_lo, &xmm_dst_hi);
+
+	    /* rebuid the 4 pixel data and save*/
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    w -= 4;
+	    dst += 4;
+	}
+
+	while (w)
+	{
+	    d = *dst;
+	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
+						xmm_alpha,
+						unpack_32_1x128 (d)));
+	    w--;
+	}
+
+    }
+}
+
+static void
+sse2_composite_over_n_0565 (pixman_implementation_t *imp,
+                            pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    int32_t w;
+    int dst_stride;
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+
+    while (height--)
+    {
+	dst = dst_line;
+
+	dst_line += dst_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    d = *dst;
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (over_1x128 (xmm_src,
+					   xmm_alpha,
+					   expand565_16_1x128 (d))));
+	    w--;
+	}
+
+	while (w >= 8)
+	{
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+	    over_2x128 (&xmm_src, &xmm_src,
+			&xmm_alpha, &xmm_alpha,
+			&xmm_dst0, &xmm_dst1);
+	    over_2x128 (&xmm_src, &xmm_src,
+			&xmm_alpha, &xmm_alpha,
+			&xmm_dst2, &xmm_dst3);
+
+	    xmm_dst = pack_565_4x128_128 (
+		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+	    save_128_aligned ((__m128i*)dst, xmm_dst);
+
+	    dst += 8;
+	    w -= 8;
+	}
+
+	while (w--)
+	{
+	    d = *dst;
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
+					   expand565_16_1x128 (d))));
+	}
+    }
+
+}
+
+static void
+sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
+				   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    __m128i xmm_src;
+    __m128i xmm_dst;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = _mm_unpacklo_epi8 (
+	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    mmx_src   = xmm_src;
+
+    while (height--)
+    {
+	int w = width;
+	const uint32_t *pm = (uint32_t *)mask_line;
+	uint32_t *pd = (uint32_t *)dst_line;
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+
+	while (w && (uintptr_t)pd & 15)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+				   mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+	    pack_cmp =
+		_mm_movemask_epi8 (
+		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
+	    if (pack_cmp != 0xffff)
+	    {
+		xmm_dst = load_128_aligned ((__m128i*)pd);
+
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		pix_multiply_2x128 (&xmm_src, &xmm_src,
+				    &xmm_mask_lo, &xmm_mask_hi,
+				    &xmm_mask_lo, &xmm_mask_hi);
+		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
+
+		save_128_aligned (
+		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
+	    }
+
+	    pd += 4;
+	    pm += 4;
+	    w -= 4;
+	}
+
+	while (w)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (
+		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
+				   mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, d;
+    uint32_t    *mask_line, m;
+    uint32_t pack_cmp;
+    int dst_stride, mask_stride;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = _mm_unpacklo_epi8 (
+	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	int w = width;
+	const uint32_t *pm = (uint32_t *)mask_line;
+	uint32_t *pd = (uint32_t *)dst_line;
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+
+	while (w && (uintptr_t)pd & 15)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                  &mmx_alpha,
+		                                  &mmx_mask,
+		                                  &mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)pm);
+
+	    pack_cmp =
+		_mm_movemask_epi8 (
+		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    /* if all bits in mask are zero, pack_cmp are equal to 0xffff */
+	    if (pack_cmp != 0xffff)
+	    {
+		xmm_dst = load_128_aligned ((__m128i*)pd);
+
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    pd += 4;
+	    pm += 4;
+	    w -= 4;
+	}
+
+	while (w)
+	{
+	    m = *pm++;
+
+	    if (m)
+	    {
+		d = *pd;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*pd = pack_1x128_32 (
+		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
+	    }
+
+	    pd++;
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int32_t w;
+    int dst_stride, src_stride;
+
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+
+    xmm_mask = create_mask_16_128 (mask >> 24);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    uint32_t s = *src++;
+
+	    if (s)
+	    {
+		uint32_t d = *dst;
+		
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha    = expand_alpha_1x128 (ms);
+		__m128i dest     = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
+		
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+	    }
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    xmm_src = load_128_unaligned ((__m128i*)src);
+
+	    if (!is_zero (xmm_src))
+	    {
+		xmm_dst = load_128_aligned ((__m128i*)dst);
+		
+		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				    &xmm_alpha_lo, &xmm_alpha_hi);
+		
+		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			       &xmm_alpha_lo, &xmm_alpha_hi,
+			       &xmm_mask, &xmm_mask,
+			       &xmm_dst_lo, &xmm_dst_hi);
+		
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+		
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+	}
+
+	while (w)
+	{
+	    uint32_t s = *src++;
+
+	    if (s)
+	    {
+		uint32_t d = *dst;
+		
+		__m128i ms = unpack_32_1x128 (s);
+		__m128i alpha = expand_alpha_1x128 (ms);
+		__m128i mask  = xmm_mask;
+		__m128i dest  = unpack_32_1x128 (d);
+		
+		*dst = pack_1x128_32 (
+		    in_over_1x128 (&ms, &alpha, &mask, &dest));
+	    }
+
+	    dst++;
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    s = *src++;
+	    *dst = convert_8888_to_0565 (s);
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 8)
+	{
+	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
+	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
+
+	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
+
+	    w -= 8;
+	    src += 8;
+	    dst += 8;
+	}
+
+	while (w)
+	{
+	    s = *src++;
+	    *dst = convert_8888_to_0565 (s);
+	    dst++;
+	    w--;
+	}
+    }
+}
+
+static void
+sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
+			      pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int32_t w;
+    int dst_stride, src_stride;
+
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    *dst++ = *src++ | 0xff000000;
+	    w--;
+	}
+
+	while (w >= 16)
+	{
+	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
+	    
+	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
+	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
+	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
+	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
+	    
+	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
+	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
+	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
+	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
+	    
+	    dst += 16;
+	    src += 16;
+	    w -= 16;
+	}
+
+	while (w)
+	{
+	    *dst++ = *src++ | 0xff000000;
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    uint32_t mask;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    __m128i xmm_mask, xmm_alpha;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
+
+    xmm_mask = create_mask_16_128 (mask >> 24);
+    xmm_alpha = mask_00ff;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    uint32_t s = (*src++) | 0xff000000;
+	    uint32_t d = *dst;
+
+	    __m128i src   = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
+
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&src, &alpha, &mask, &dest));
+
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    xmm_src = _mm_or_si128 (
+		load_128_unaligned ((__m128i*)src), mask_ff000000);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			   &xmm_alpha, &xmm_alpha,
+			   &xmm_mask, &xmm_mask,
+			   &xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    dst += 4;
+	    src += 4;
+	    w -= 4;
+
+	}
+
+	while (w)
+	{
+	    uint32_t s = (*src++) | 0xff000000;
+	    uint32_t d = *dst;
+
+	    __m128i src  = unpack_32_1x128 (s);
+	    __m128i alpha = xmm_alpha;
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
+
+	    *dst++ = pack_1x128_32 (
+		in_over_1x128 (&src, &alpha, &mask, &dest));
+
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    int dst_stride, src_stride;
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    dst = dst_line;
+    src = src_line;
+
+    while (height--)
+    {
+	sse2_combine_over_u (imp, op, dst, src, NULL, width);
+
+	dst += dst_stride;
+	src += src_stride;
+    }
+}
+
+static force_inline uint16_t
+composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
+{
+    __m128i ms;
+
+    ms = unpack_32_1x128 (src);
+    return pack_565_32_16 (
+	pack_1x128_32 (
+	    over_1x128 (
+		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
+}
+
+static void
+sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
+                               pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	src = src_line;
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+	w = width;
+
+	/* Align dst on a 16-byte boundary */
+	while (w &&
+	       ((uintptr_t)dst & 15))
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = composite_over_8888_0565pixel (s, d);
+	    w--;
+	}
+
+	/* It's a 8 pixel loop */
+	while (w >= 8)
+	{
+	    /* I'm loading unaligned because I'm not sure
+	     * about the address alignment.
+	     */
+	    xmm_src = load_128_unaligned ((__m128i*) src);
+	    xmm_dst = load_128_aligned ((__m128i*) dst);
+
+	    /* Unpacking */
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    /* I'm loading next 4 pixels from memory
+	     * before to optimze the memory read.
+	     */
+	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst0, &xmm_dst1);
+
+	    /* Unpacking */
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst2, &xmm_dst3);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	    src += 8;
+	}
+
+	while (w--)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = composite_over_8888_0565pixel (s, d);
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m, d;
+
+    __m128i xmm_src, xmm_alpha, xmm_def;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    xmm_def = create_mask_2x32_128 (src, src);
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src   = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    uint8_t m = *mask++;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                   &mmx_alpha,
+		                                   &mmx_mask,
+		                                   &mmx_dest));
+	    }
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 4)
+	{
+	    m = *((uint32_t*)mask);
+
+	    if (srca == 0xff && m == 0xffffffff)
+	    {
+		save_128_aligned ((__m128i*)dst, xmm_def);
+	    }
+	    else if (m)
+	    {
+		xmm_dst = load_128_aligned ((__m128i*) dst);
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+
+	while (w)
+	{
+	    uint8_t m = *mask++;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = expand_pixel_8_1x128 (m);
+		mmx_dest = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
+		                                   &mmx_alpha,
+		                                   &mmx_mask,
+		                                   &mmx_dest));
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+static pixman_bool_t
+sse2_fill (pixman_implementation_t *imp,
+           uint32_t *               bits,
+           int                      stride,
+           int                      bpp,
+           int                      x,
+           int                      y,
+           int                      width,
+           int                      height,
+           uint32_t		    filler)
+{
+    uint32_t byte_width;
+    uint8_t *byte_line;
+
+    __m128i xmm_def;
+
+    if (bpp == 8)
+    {
+	uint8_t b;
+	uint16_t w;
+
+	stride = stride * (int) sizeof (uint32_t) / 1;
+	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+	byte_width = width;
+	stride *= 1;
+
+	b = filler & 0xff;
+	w = (b << 8) | b;
+	filler = (w << 16) | w;
+    }
+    else if (bpp == 16)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 2;
+	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+	byte_width = 2 * width;
+	stride *= 2;
+
+        filler = (filler & 0xffff) * 0x00010001;
+    }
+    else if (bpp == 32)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 4;
+	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+	byte_width = 4 * width;
+	stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+
+    xmm_def = create_mask_2x32_128 (filler, filler);
+
+    while (height--)
+    {
+	int w;
+	uint8_t *d = byte_line;
+	byte_line += stride;
+	w = byte_width;
+
+	if (w >= 1 && ((uintptr_t)d & 1))
+	{
+	    *(uint8_t *)d = filler;
+	    w -= 1;
+	    d += 1;
+	}
+
+	while (w >= 2 && ((uintptr_t)d & 3))
+	{
+	    *(uint16_t *)d = filler;
+	    w -= 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((uintptr_t)d & 15))
+	{
+	    *(uint32_t *)d = filler;
+
+	    w -= 4;
+	    d += 4;
+	}
+
+	while (w >= 128)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
+
+	    d += 128;
+	    w -= 128;
+	}
+
+	if (w >= 64)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
+	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
+
+	    d += 64;
+	    w -= 64;
+	}
+
+	if (w >= 32)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
+
+	    d += 32;
+	    w -= 32;
+	}
+
+	if (w >= 16)
+	{
+	    save_128_aligned ((__m128i*)(d),     xmm_def);
+
+	    d += 16;
+	    w -= 16;
+	}
+
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = filler;
+
+	    w -= 4;
+	    d += 4;
+	}
+
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = filler;
+	    w -= 2;
+	    d += 2;
+	}
+
+	if (w >= 1)
+	{
+	    *(uint8_t *)d = filler;
+	    w -= 1;
+	    d += 1;
+	}
+    }
+
+    return TRUE;
+}
+
+static void
+sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t    *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m;
+
+    __m128i xmm_src, xmm_def;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+    {
+	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
+		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
+		   dest_x, dest_y, width, height, 0);
+	return;
+    }
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    xmm_def = create_mask_2x32_128 (src, src);
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    uint8_t m = *mask++;
+
+	    if (m)
+	    {
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
+	    }
+	    else
+	    {
+		*dst = 0;
+	    }
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 4)
+	{
+	    m = *((uint32_t*)mask);
+
+	    if (srca == 0xff && m == 0xffffffff)
+	    {
+		save_128_aligned ((__m128i*)dst, xmm_def);
+	    }
+	    else if (m)
+	    {
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		pix_multiply_2x128 (&xmm_src, &xmm_src,
+				    &xmm_mask_lo, &xmm_mask_hi,
+				    &xmm_mask_lo, &xmm_mask_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
+	    }
+	    else
+	    {
+		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+
+	while (w)
+	{
+	    uint8_t m = *mask++;
+
+	    if (m)
+	    {
+		*dst = pack_1x128_32 (
+		    pix_multiply_1x128 (
+			xmm_src, expand_pixel_8_1x128 (m)));
+	    }
+	    else
+	    {
+		*dst = 0;
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t m;
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    m = *mask++;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 8)
+	{
+	    xmm_dst = load_128_aligned ((__m128i*) dst);
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+
+	    m = *((uint32_t*)mask);
+	    mask += 4;
+
+	    if (m)
+	    {
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst0, &xmm_dst1);
+	    }
+
+	    m = *((uint32_t*)mask);
+	    mask += 4;
+
+	    if (m)
+	    {
+		xmm_mask = unpack_32_1x128 (m);
+		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
+
+		/* Unpacking */
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst2, &xmm_dst3);
+	    }
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	}
+
+	while (w)
+	{
+	    m = *mask++;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t opaque, zero;
+
+    __m128i ms;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    ms = unpack_32_1x128 (s);
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+	    w--;
+	}
+
+	while (w >= 8)
+	{
+	    /* First round */
+	    xmm_src = load_128_unaligned ((__m128i*)src);
+	    xmm_dst = load_128_aligned  ((__m128i*)dst);
+
+	    opaque = is_opaque (xmm_src);
+	    zero = is_zero (xmm_src);
+
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+	    /* preload next round*/
+	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
+
+	    if (opaque)
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst0, &xmm_dst1);
+	    }
+	    else if (!zero)
+	    {
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst0, &xmm_dst1);
+	    }
+
+	    /* Second round */
+	    opaque = is_opaque (xmm_src);
+	    zero = is_zero (xmm_src);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+
+	    if (opaque)
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst2, &xmm_dst3);
+	    }
+	    else if (!zero)
+	    {
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst2, &xmm_dst3);
+	    }
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    src += 8;
+	    dst += 8;
+	}
+
+	while (w)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    ms = unpack_32_1x128 (s);
+
+	    *dst++ = pack_565_32_16 (
+		pack_1x128_32 (
+		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst, d;
+    uint32_t    *src_line, *src, s;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint32_t opaque, zero;
+
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
+
+	    opaque = is_opaque (xmm_src_hi);
+	    zero = is_zero (xmm_src_hi);
+
+	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+
+	    if (opaque)
+	    {
+		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
+				     &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+	    else if (!zero)
+	    {
+		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
+
+		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
+					&xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+
+	while (w)
+	{
+	    s = *src++;
+	    d = *dst;
+
+	    *dst++ = pack_1x128_32 (
+		over_rev_non_pre_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
+                                    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t    *dst_line, *dst, d;
+    uint32_t    *mask_line, *mask, m;
+    int dst_stride, mask_stride;
+    int w;
+    uint32_t pack_cmp;
+
+    __m128i xmm_src, xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
+
+    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+    xmm_alpha = expand_alpha_1x128 (xmm_src);
+    mmx_src = xmm_src;
+    mmx_alpha = xmm_alpha;
+
+    while (height--)
+    {
+	w = width;
+	mask = mask_line;
+	dst = dst_line;
+	mask_line += mask_stride;
+	dst_line += dst_stride;
+
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    m = *(uint32_t *) mask;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	    mask++;
+	}
+
+	while (w >= 8)
+	{
+	    /* First round */
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    pack_cmp = _mm_movemask_epi8 (
+		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    unpack_565_128_4x128 (xmm_dst,
+				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+	    /* preload next round */
+	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
+
+	    /* preload next round */
+	    if (pack_cmp != 0xffff)
+	    {
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst0, &xmm_dst1);
+	    }
+
+	    /* Second round */
+	    pack_cmp = _mm_movemask_epi8 (
+		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
+
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+
+	    if (pack_cmp != 0xffff)
+	    {
+		in_over_2x128 (&xmm_src, &xmm_src,
+			       &xmm_alpha, &xmm_alpha,
+			       &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst2, &xmm_dst3);
+	    }
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_565_4x128_128 (
+		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
+
+	    w -= 8;
+	    dst += 8;
+	    mask += 8;
+	}
+
+	while (w)
+	{
+	    m = *(uint32_t *) mask;
+
+	    if (m)
+	    {
+		d = *dst;
+		mmx_mask = unpack_32_1x128 (m);
+		mmx_dest = expand565_16_1x128 (d);
+
+		*dst = pack_565_32_16 (
+		    pack_1x128_32 (
+			in_over_1x128 (
+			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
+	    }
+
+	    w--;
+	    dst++;
+	    mask++;
+	}
+    }
+
+}
+
+static void
+sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
+                         pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    uint32_t d, m;
+    uint32_t src;
+    int32_t w;
+
+    __m128i xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    m = (uint32_t) *mask++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (xmm_alpha,
+				       unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	while (w >= 16)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+				&xmm_mask_lo, &xmm_mask_hi,
+				&xmm_mask_lo, &xmm_mask_hi);
+
+	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
+				&xmm_dst_lo, &xmm_dst_hi,
+				&xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    mask += 16;
+	    dst += 16;
+	    w -= 16;
+	}
+
+	while (w)
+	{
+	    m = (uint32_t) *mask++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_in_n_8 (pixman_implementation_t *imp,
+		       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    int dst_stride;
+    uint32_t d;
+    uint32_t src;
+    int32_t w;
+
+    __m128i xmm_alpha;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    src = src >> 24;
+
+    if (src == 0xff)
+	return;
+
+    if (src == 0x00)
+    {
+	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+		     8, dest_x, dest_y, width, height, src);
+
+	return;
+    }
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	while (w >= 16)
+	{
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    
+	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+				&xmm_dst_lo, &xmm_dst_hi,
+				&xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    dst += 16;
+	    w -= 16;
+	}
+
+	while (w)
+	{
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    xmm_alpha,
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_in_8_8 (pixman_implementation_t *imp,
+                       pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int src_stride, dst_stride;
+    int32_t w;
+    uint32_t s, d;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    s = (uint32_t) *src++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (
+		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	while (w >= 16)
+	{
+	    xmm_src = load_128_unaligned ((__m128i*)src);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
+				&xmm_dst_lo, &xmm_dst_hi,
+				&xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    src += 16;
+	    dst += 16;
+	    w -= 16;
+	}
+
+	while (w)
+	{
+	    s = (uint32_t) *src++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
+			  pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+    uint32_t m, d;
+
+    __m128i xmm_alpha;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    m = (uint32_t) *mask++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
+	    w--;
+	}
+
+	while (w >= 16)
+	{
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
+				&xmm_mask_lo, &xmm_mask_hi,
+				&xmm_mask_lo, &xmm_mask_hi);
+
+	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+
+	    mask += 16;
+	    dst += 16;
+	    w -= 16;
+	}
+
+	while (w)
+	{
+	    m = (uint32_t) *mask++;
+	    d = (uint32_t) *dst;
+
+	    *dst++ = (uint8_t) pack_1x128_32 (
+		_mm_adds_epu16 (
+		    pix_multiply_1x128 (
+			xmm_alpha, unpack_32_1x128 (m)),
+		    unpack_32_1x128 (d)));
+
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_add_n_8 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    int dst_stride;
+    int32_t w;
+    uint32_t src;
+
+    __m128i xmm_src;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    src >>= 24;
+
+    if (src == 0x00)
+	return;
+
+    if (src == 0xff)
+    {
+	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
+		     8, dest_x, dest_y, width, height, 0xff);
+
+	return;
+    }
+
+    src = (src << 24) | (src << 16) | (src << 8) | src;
+    xmm_src = _mm_set_epi32 (src, src, src, src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 16)
+	{
+	    save_128_aligned (
+		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
+
+	    dst += 16;
+	    w -= 16;
+	}
+
+	while (w)
+	{
+	    *dst = (uint8_t)_mm_cvtsi128_si32 (
+		_mm_adds_epu8 (
+		    xmm_src,
+		    _mm_cvtsi32_si128 (*dst)));
+
+	    w--;
+	    dst++;
+	}
+    }
+
+}
+
+static void
+sse2_composite_add_8_8 (pixman_implementation_t *imp,
+			pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint8_t     *dst_line, *dst;
+    uint8_t     *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+    uint16_t t;
+
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	src = src_line;
+
+	dst_line += dst_stride;
+	src_line += src_stride;
+	w = width;
+
+	/* Small head */
+	while (w && (uintptr_t)dst & 3)
+	{
+	    t = (*dst) + (*src++);
+	    *dst++ = t | (0 - (t >> 8));
+	    w--;
+	}
+
+	sse2_combine_add_u (imp, op,
+			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
+
+	/* Small tail */
+	dst += w & 0xfffc;
+	src += w & 0xfffc;
+
+	w &= 3;
+
+	while (w)
+	{
+	    t = (*dst) + (*src++);
+	    *dst++ = t | (0 - (t >> 8));
+	    w--;
+	}
+    }
+
+}
+
+static void
+sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *dst_line, *dst;
+    uint32_t    *src_line, *src;
+    int dst_stride, src_stride;
+
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+
+	sse2_combine_add_u (imp, op, dst, src, NULL, width);
+    }
+}
+
+static void
+sse2_composite_add_n_8888 (pixman_implementation_t *imp,
+			   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst, src;
+    int dst_stride;
+
+    __m128i xmm_src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+
+    if (src == ~0)
+    {
+	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
+		     dest_x, dest_y, width, height, ~0);
+
+	return;
+    }
+
+    xmm_src = _mm_set_epi32 (src, src, src, src);
+    while (height--)
+    {
+	int w = width;
+	uint32_t d;
+
+	dst = dst_line;
+	dst_line += dst_stride;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    d = *dst;
+	    *dst++ =
+		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    save_128_aligned
+		((__m128i*)dst,
+		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
+
+	    dst += 4;
+	    w -= 4;
+	}
+
+	while (w--)
+	{
+	    d = *dst;
+	    *dst++ =
+		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
+						  _mm_cvtsi32_si128 (d)));
+	}
+    }
+}
+
+static void
+sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
+			     pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t     *dst_line, *dst;
+    uint8_t     *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    uint32_t src;
+
+    __m128i xmm_src;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+    if (src == 0)
+	return;
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    uint8_t m = *mask++;
+	    if (m)
+	    {
+		*dst = pack_1x128_32
+		    (_mm_adds_epu16
+		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+		      unpack_32_1x128 (*dst)));
+	    }
+	    dst++;
+	    w--;
+	}
+
+	while (w >= 4)
+	{
+	    uint32_t m = *(uint32_t*)mask;
+	    if (m)
+	    {
+		__m128i xmm_mask_lo, xmm_mask_hi;
+		__m128i xmm_dst_lo, xmm_dst_hi;
+
+		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
+		__m128i xmm_mask =
+		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
+				       _mm_setzero_si128 ());
+
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
+					&xmm_mask_lo, &xmm_mask_hi);
+
+		pix_multiply_2x128 (&xmm_src, &xmm_src,
+				    &xmm_mask_lo, &xmm_mask_hi,
+				    &xmm_mask_lo, &xmm_mask_hi);
+
+		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
+		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
+
+		save_128_aligned (
+		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+
+	    w -= 4;
+	    dst += 4;
+	    mask += 4;
+	}
+
+	while (w)
+	{
+	    uint8_t m = *mask++;
+	    if (m)
+	    {
+		*dst = pack_1x128_32
+		    (_mm_adds_epu16
+		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
+		      unpack_32_1x128 (*dst)));
+	    }
+	    dst++;
+	    w--;
+	}
+    }
+}
+
+static pixman_bool_t
+sse2_blt (pixman_implementation_t *imp,
+          uint32_t *               src_bits,
+          uint32_t *               dst_bits,
+          int                      src_stride,
+          int                      dst_stride,
+          int                      src_bpp,
+          int                      dst_bpp,
+          int                      src_x,
+          int                      src_y,
+          int                      dest_x,
+          int                      dest_y,
+          int                      width,
+          int                      height)
+{
+    uint8_t *   src_bytes;
+    uint8_t *   dst_bytes;
+    int byte_width;
+
+    if (src_bpp != dst_bpp)
+	return FALSE;
+
+    if (src_bpp == 16)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
+	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+	byte_width = 2 * width;
+	src_stride *= 2;
+	dst_stride *= 2;
+    }
+    else if (src_bpp == 32)
+    {
+	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
+	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
+	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
+	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
+	byte_width = 4 * width;
+	src_stride *= 4;
+	dst_stride *= 4;
+    }
+    else
+    {
+	return FALSE;
+    }
+
+    while (height--)
+    {
+	int w;
+	uint8_t *s = src_bytes;
+	uint8_t *d = dst_bytes;
+	src_bytes += src_stride;
+	dst_bytes += dst_stride;
+	w = byte_width;
+
+	while (w >= 2 && ((uintptr_t)d & 3))
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+
+	while (w >= 4 && ((uintptr_t)d & 15))
+	{
+	    *(uint32_t *)d = *(uint32_t *)s;
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+
+	while (w >= 64)
+	{
+	    __m128i xmm0, xmm1, xmm2, xmm3;
+
+	    xmm0 = load_128_unaligned ((__m128i*)(s));
+	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
+	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
+	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
+
+	    save_128_aligned ((__m128i*)(d),    xmm0);
+	    save_128_aligned ((__m128i*)(d + 16), xmm1);
+	    save_128_aligned ((__m128i*)(d + 32), xmm2);
+	    save_128_aligned ((__m128i*)(d + 48), xmm3);
+
+	    s += 64;
+	    d += 64;
+	    w -= 64;
+	}
+
+	while (w >= 16)
+	{
+	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
+
+	    w -= 16;
+	    d += 16;
+	    s += 16;
+	}
+
+	while (w >= 4)
+	{
+	    *(uint32_t *)d = *(uint32_t *)s;
+
+	    w -= 4;
+	    s += 4;
+	    d += 4;
+	}
+
+	if (w >= 2)
+	{
+	    *(uint16_t *)d = *(uint16_t *)s;
+	    w -= 2;
+	    s += 2;
+	    d += 2;
+	}
+    }
+
+    return TRUE;
+}
+
+static void
+sse2_composite_copy_area (pixman_implementation_t *imp,
+                          pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    sse2_blt (imp, src_image->bits.bits,
+	      dest_image->bits.bits,
+	      src_image->bits.rowstride,
+	      dest_image->bits.rowstride,
+	      PIXMAN_FORMAT_BPP (src_image->bits.format),
+	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
+	      src_x, src_y, dest_x, dest_y, width, height);
+}
+
+static void
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+    __m128i ms;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        while (w && (uintptr_t)dst & 15)
+        {
+            s = 0xff000000 | *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+            ms = unpack_32_1x128 (s);
+
+            if (m != 0xff)
+            {
+		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		__m128i md = unpack_32_1x128 (d);
+
+                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
+            }
+
+            *dst++ = pack_1x128_32 (ms);
+            w--;
+        }
+
+        while (w >= 4)
+        {
+            m = *(uint32_t*) mask;
+            xmm_src = _mm_or_si128 (
+		load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+            if (m == 0xffffffff)
+            {
+                save_128_aligned ((__m128i*)dst, xmm_src);
+            }
+            else
+            {
+                xmm_dst = load_128_aligned ((__m128i*)dst);
+
+                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+                expand_alpha_rev_2x128 (
+		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
+			       &xmm_dst_lo, &xmm_dst_hi);
+
+                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+            }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+            m = (uint32_t) *mask++;
+
+            if (m)
+            {
+                s = 0xff000000 | *src;
+
+                if (m == 0xff)
+                {
+                    *dst = s;
+                }
+                else
+                {
+		    __m128i ma, md, ms;
+
+                    d = *dst;
+
+		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
+		    md = unpack_32_1x128 (d);
+		    ms = unpack_32_1x128 (s);
+
+                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
+                }
+
+            }
+
+            src++;
+            dst++;
+            w--;
+        }
+    }
+
+}
+
+static void
+sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
+                                 pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint8_t         *mask, *mask_line;
+    uint32_t m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        while (w && (uintptr_t)dst & 15)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+            m = *(uint32_t *) mask;
+
+	    if (m)
+	    {
+		xmm_src = load_128_unaligned ((__m128i*)src);
+
+		if (m == 0xffffffff && is_opaque (xmm_src))
+		{
+		    save_128_aligned ((__m128i *)dst, xmm_src);
+		}
+		else
+		{
+		    xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		}
+	    }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (uint32_t) *mask++;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+    }
+
+}
+
+static void
+sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
+				    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t    *dst_line, *dst;
+    __m128i xmm_src;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_dsta_hi, xmm_dsta_lo;
+    int dst_stride;
+    int32_t w;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    xmm_src = expand_pixel_32_1x128 (src);
+
+    while (height--)
+    {
+	dst = dst_line;
+
+	dst_line += dst_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 15)
+	{
+	    __m128i vd;
+
+	    vd = unpack_32_1x128 (*dst);
+
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 4)
+	{
+	    __m128i tmp_lo, tmp_hi;
+
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
+
+	    tmp_lo = xmm_src;
+	    tmp_hi = xmm_src;
+
+	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
+			&xmm_dsta_lo, &xmm_dsta_hi,
+			&tmp_lo, &tmp_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
+
+	    w -= 4;
+	    dst += 4;
+	}
+
+	while (w)
+	{
+	    __m128i vd;
+
+	    vd = unpack_32_1x128 (*dst);
+
+	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
+					      xmm_src));
+	    w--;
+	    dst++;
+	}
+
+    }
+
+}
+
+static void
+sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
+				    pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t    *src, *src_line, s;
+    uint32_t    *dst, *dst_line, d;
+    uint32_t    *mask, *mask_line;
+    uint32_t    m;
+    int src_stride, mask_stride, dst_stride;
+    int32_t w;
+
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+    PIXMAN_IMAGE_GET_LINE (
+	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+        src = src_line;
+        src_line += src_stride;
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+
+        w = width;
+
+        while (w && (uintptr_t)dst & 15)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (*mask++) >> 24;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+
+        while (w >= 4)
+        {
+	    xmm_mask = load_128_unaligned ((__m128i*)mask);
+
+	    if (!is_transparent (xmm_mask))
+	    {
+		xmm_src = load_128_unaligned ((__m128i*)src);
+
+		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
+		{
+		    save_128_aligned ((__m128i *)dst, xmm_src);
+		}
+		else
+		{
+		    xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+		}
+	    }
+
+            src += 4;
+            dst += 4;
+            mask += 4;
+            w -= 4;
+        }
+
+        while (w)
+        {
+	    uint32_t sa;
+
+            s = *src++;
+            m = (*mask++) >> 24;
+            d = *dst;
+
+	    sa = s >> 24;
+
+	    if (m)
+	    {
+		if (sa == 0xff && m == 0xff)
+		{
+		    *dst = s;
+		}
+		else
+		{
+		    __m128i ms, md, ma, msa;
+
+		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		    ms = unpack_32_1x128 (s);
+		    md = unpack_32_1x128 (d);
+
+		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+		}
+	    }
+
+	    dst++;
+            w--;
+        }
+    }
+
+}
+
+/* A variant of 'sse2_combine_over_u' with minor tweaks */
+static force_inline void
+scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
+                                             const uint32_t* ps,
+                                             int32_t         w,
+                                             pixman_fixed_t  vx,
+                                             pixman_fixed_t  unit_x,
+                                             pixman_fixed_t  src_width_fixed,
+                                             pixman_bool_t   fully_transparent_src)
+{
+    uint32_t s, d;
+    const uint32_t* pm = NULL;
+
+    __m128i xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_src_lo, xmm_src_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (fully_transparent_src)
+	return;
+
+    /* Align dst on a 16-byte boundary */
+    while (w && ((uintptr_t)pd & 15))
+    {
+	d = *pd;
+	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (pm)
+	    pm++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i tmp;
+	uint32_t tmp1, tmp2, tmp3, tmp4;
+
+	tmp1 = *(ps + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp2 = *(ps + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp3 = *(ps + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp4 = *(ps + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
+
+	if (is_opaque (xmm_src_hi))
+	{
+	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
+	}
+	else if (!is_zero (xmm_src_hi))
+	{
+	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
+
+	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+	    expand_alpha_2x128 (
+		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+
+	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			&xmm_alpha_lo, &xmm_alpha_hi,
+			&xmm_dst_lo, &xmm_dst_hi);
+
+	    /* rebuid the 4 pixel data and save*/
+	    save_128_aligned ((__m128i*)pd,
+			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	w -= 4;
+	pd += 4;
+	if (pm)
+	    pm += 4;
+    }
+
+    while (w)
+    {
+	d = *pd;
+	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
+	if (pm)
+	    pm++;
+
+	w--;
+    }
+}
+
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, COVER)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, NONE)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, PAD)
+FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
+		       scaled_nearest_scanline_sse2_8888_8888_OVER,
+		       uint32_t, uint32_t, NORMAL)
+
+static force_inline void
+scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
+					       uint32_t *       dst,
+					       const uint32_t * src,
+					       int32_t          w,
+					       pixman_fixed_t   vx,
+					       pixman_fixed_t   unit_x,
+					       pixman_fixed_t   src_width_fixed,
+					       pixman_bool_t    zero_src)
+{
+    __m128i xmm_mask;
+    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+    if (zero_src || (*mask >> 24) == 0)
+	return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    while (w && (uintptr_t)dst & 15)
+    {
+	uint32_t s = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	if (s)
+	{
+	    uint32_t d = *dst;
+
+	    __m128i ms = unpack_32_1x128 (s);
+	    __m128i alpha     = expand_alpha_1x128 (ms);
+	    __m128i dest      = xmm_mask;
+	    __m128i alpha_dst = unpack_32_1x128 (d);
+
+	    *dst = pack_1x128_32 (
+		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+	}
+	dst++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	uint32_t tmp1, tmp2, tmp3, tmp4;
+
+	tmp1 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp2 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp3 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+	tmp4 = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
+
+	if (!is_zero (xmm_src))
+	{
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+			        &xmm_alpha_lo, &xmm_alpha_hi);
+
+	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			   &xmm_alpha_lo, &xmm_alpha_hi,
+			   &xmm_mask, &xmm_mask,
+			   &xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned (
+		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	dst += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	uint32_t s = *(src + pixman_fixed_to_int (vx));
+	vx += unit_x;
+	while (vx >= 0)
+	    vx -= src_width_fixed;
+
+	if (s)
+	{
+	    uint32_t d = *dst;
+
+	    __m128i ms = unpack_32_1x128 (s);
+	    __m128i alpha = expand_alpha_1x128 (ms);
+	    __m128i mask  = xmm_mask;
+	    __m128i dest  = unpack_32_1x128 (d);
+
+	    *dst = pack_1x128_32 (
+		in_over_1x128 (&ms, &alpha, &mask, &dest));
+	}
+
+	dst++;
+	w--;
+    }
+
+}
+
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
+
+#if PSHUFD_IS_FAST
+
+/***********************************************************************************/
+
+# define BILINEAR_DECLARE_VARIABLES						\
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+					   unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4);		\
+    const __m128i xmm_zero = _mm_setzero_si128 ();				\
+    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3,	\
+				   vx + unit_x * 2, -(vx + 1) - unit_x * 2,	\
+				   vx + unit_x * 1, -(vx + 1) - unit_x * 1,	\
+				   vx + unit_x * 0, -(vx + 1) - unit_x * 0);	\
+    __m128i xmm_wh_state;
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_)			\
+do {										\
+    int phase = phase_;								\
+    __m128i xmm_wh, xmm_a, xmm_b;						\
+    /* fetch 2x2 pixel block into sse2 registers */				\
+    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
+    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
+    vx += unit_x;								\
+    /* vertical interpolation */						\
+    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
+    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
+    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);						\
+    /* calculate horizontal weights */						\
+    if (phase <= 0)								\
+    {										\
+	xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
+	xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4);		\
+	phase = 0;								\
+    }										\
+    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase,	\
+							   phase, phase));	\
+    /* horizontal interpolation */						\
+    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
+		xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh);		\
+    /* shift the result */							\
+    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
+} while (0)
+
+#else /************************************************************************/
+
+# define BILINEAR_DECLARE_VARIABLES						\
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
+    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
+					  unit_x, -unit_x, unit_x, -unit_x);	\
+    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4,		\
+					   unit_x * 4, -unit_x * 4);		\
+    const __m128i xmm_zero = _mm_setzero_si128 ();				\
+    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
+				   vx, -(vx + 1), vx, -(vx + 1))
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase)			\
+do {										\
+    __m128i xmm_wh, xmm_a, xmm_b;						\
+    /* fetch 2x2 pixel block into sse2 registers */				\
+    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]);		\
+    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]);		\
+    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */		\
+    vx += unit_x;								\
+    /* vertical interpolation */						\
+    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt);	\
+    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb);	\
+    xmm_a = _mm_add_epi16 (xmm_a, xmm_b);					\
+    /* calculate horizontal weights */						\
+    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,			\
+					16 - BILINEAR_INTERPOLATION_BITS));	\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
+    /* horizontal interpolation */						\
+    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a);	\
+    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh);		\
+    /* shift the result */							\
+    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2);		\
+} while (0)
+
+/***********************************************************************************/
+
+#endif
+
+#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix);					\
+do {										\
+	__m128i xmm_pix;							\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1);			\
+	xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix);				\
+	xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix);				\
+	pix = _mm_cvtsi128_si32 (xmm_pix);					\
+} while(0)
+
+#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix);					\
+do {										\
+	__m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4;				\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2);			\
+	BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3);			\
+	xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2);			\
+	xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4);			\
+	pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3);				\
+} while(0)
+
+#define BILINEAR_SKIP_ONE_PIXEL()						\
+do {										\
+    vx += unit_x;								\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1);					\
+} while(0)
+
+#define BILINEAR_SKIP_FOUR_PIXELS()						\
+do {										\
+    vx += unit_x * 4;								\
+    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4);					\
+} while(0)
+
+/***********************************************************************************/
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx_,
+					     pixman_fixed_t   unit_x_,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2;
+
+    while (w && ((uintptr_t)dst & 15))
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst++ = pix1;
+	w--;
+    }
+
+    while ((w -= 4) >= 0) {
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+	_mm_store_si128 ((__m128i *)dst, xmm_src);
+	dst += 4;
+    }
+
+    if (w & 2)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	*dst++ = pix1;
+	*dst++ = pix2;
+    }
+
+    if (w & 1)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst = pix1;
+    }
+
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx_,
+					     pixman_fixed_t   unit_x_,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2;
+
+    while (w && ((uintptr_t)dst & 15))
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst++ = pix1 | 0xFF000000;
+	w--;
+    }
+
+    while ((w -= 4) >= 0) {
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+	_mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
+	dst += 4;
+    }
+
+    if (w & 2)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
+	*dst++ = pix1 | 0xFF000000;
+	*dst++ = pix2 | 0xFF000000;
+    }
+
+    if (w & 1)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	*dst = pix1 | 0xFF000000;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
+			       scaled_bilinear_scanline_sse2_x888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
+					      const uint32_t * mask,
+					      const uint32_t * src_top,
+					      const uint32_t * src_bottom,
+					      int32_t          w,
+					      int              wt,
+					      int              wb,
+					      pixman_fixed_t   vx_,
+					      pixman_fixed_t   unit_x_,
+					      pixman_fixed_t   max_vx,
+					      pixman_bool_t    zero_src)
+{
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2;
+
+    while (w && ((uintptr_t)dst & 15))
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+	if (pix1)
+	{
+	    pix2 = *dst;
+	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+	}
+
+	w--;
+	dst++;
+    }
+
+    while (w  >= 4)
+    {
+	__m128i xmm_src;
+	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
+	__m128i xmm_alpha_hi, xmm_alpha_lo;
+
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+
+	if (!is_zero (xmm_src))
+	{
+	    if (is_opaque (xmm_src))
+	    {
+		save_128_aligned ((__m128i *)dst, xmm_src);
+	    }
+	    else
+	    {
+		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
+		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
+			    &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+	}
+
+	w -= 4;
+	dst += 4;
+    }
+
+    while (w)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+
+	if (pix1)
+	{
+	    pix2 = *dst;
+	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
+	}
+
+	w--;
+	dst++;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_NONE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_NONE)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
+						const uint8_t  * mask,
+						const uint32_t * src_top,
+						const uint32_t * src_bottom,
+						int32_t          w,
+						int              wt,
+						int              wb,
+						pixman_fixed_t   vx_,
+						pixman_fixed_t   unit_x_,
+						pixman_fixed_t   max_vx,
+						pixman_bool_t    zero_src)
+{
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1, pix2;
+    uint32_t m;
+
+    while (w && ((uintptr_t)dst & 15))
+    {
+	uint32_t sa;
+
+	m = (uint32_t) *mask++;
+
+	if (m)
+	{
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	    sa = pix1 >> 24;
+
+	    if (sa == 0xff && m == 0xff)
+	    {
+		*dst = pix1;
+	    }
+	    else
+	    {
+		__m128i ms, md, ma, msa;
+
+		pix2 = *dst;
+		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		ms = unpack_32_1x128 (pix1);
+		md = unpack_32_1x128 (pix2);
+
+		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+	    }
+	}
+	else
+	{
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	}
+
+	w--;
+	dst++;
+    }
+
+    while (w >= 4)
+    {
+	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
+	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+	m = *(uint32_t*)mask;
+
+	if (m)
+	{
+	    BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+
+	    if (m == 0xffffffff && is_opaque (xmm_src))
+	    {
+		save_128_aligned ((__m128i *)dst, xmm_src);
+	    }
+	    else
+	    {
+		xmm_dst = load_128_aligned ((__m128i *)dst);
+
+		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
+		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
+			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
+
+		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	    }
+	}
+	else
+	{
+	    BILINEAR_SKIP_FOUR_PIXELS ();
+	}
+
+	w -= 4;
+	dst += 4;
+	mask += 4;
+    }
+
+    while (w)
+    {
+	uint32_t sa;
+
+	m = (uint32_t) *mask++;
+
+	if (m)
+	{
+	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	    sa = pix1 >> 24;
+
+	    if (sa == 0xff && m == 0xff)
+	    {
+		*dst = pix1;
+	    }
+	    else
+	    {
+		__m128i ms, md, ma, msa;
+
+		pix2 = *dst;
+		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
+		ms = unpack_32_1x128 (pix1);
+		md = unpack_32_1x128 (pix2);
+
+		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
+
+		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
+	    }
+	}
+	else
+	{
+	    BILINEAR_SKIP_ONE_PIXEL ();
+	}
+
+	w--;
+	dst++;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       COVER, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       PAD, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NONE, FLAG_HAVE_NON_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
+			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
+			       uint32_t, uint8_t, uint32_t,
+			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
+						const uint32_t * mask,
+						const uint32_t * src_top,
+						const uint32_t * src_bottom,
+						int32_t          w,
+						int              wt,
+						int              wb,
+						pixman_fixed_t   vx_,
+						pixman_fixed_t   unit_x_,
+						pixman_fixed_t   max_vx,
+						pixman_bool_t    zero_src)
+{
+    intptr_t vx = vx_;
+    intptr_t unit_x = unit_x_;
+    BILINEAR_DECLARE_VARIABLES;
+    uint32_t pix1;
+    __m128i xmm_mask;
+
+    if (zero_src || (*mask >> 24) == 0)
+	return;
+
+    xmm_mask = create_mask_16_128 (*mask >> 24);
+
+    while (w && ((uintptr_t)dst & 15))
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	if (pix1)
+	{
+		uint32_t d = *dst;
+
+		__m128i ms = unpack_32_1x128 (pix1);
+		__m128i alpha     = expand_alpha_1x128 (ms);
+		__m128i dest      = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32
+			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+	}
+
+	dst++;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	__m128i xmm_src;
+	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
+
+	if (!is_zero (xmm_src))
+	{
+	    __m128i xmm_src_lo, xmm_src_hi;
+	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+	    __m128i xmm_alpha_lo, xmm_alpha_hi;
+
+	    xmm_dst = load_128_aligned ((__m128i*)dst);
+
+	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
+				&xmm_alpha_lo, &xmm_alpha_hi);
+
+	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
+			   &xmm_alpha_lo, &xmm_alpha_hi,
+			   &xmm_mask, &xmm_mask,
+			   &xmm_dst_lo, &xmm_dst_hi);
+
+	    save_128_aligned
+		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+	}
+
+	dst += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
+	if (pix1)
+	{
+		uint32_t d = *dst;
+
+		__m128i ms = unpack_32_1x128 (pix1);
+		__m128i alpha     = expand_alpha_1x128 (ms);
+		__m128i dest      = xmm_mask;
+		__m128i alpha_dst = unpack_32_1x128 (d);
+
+		*dst = pack_1x128_32
+			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
+	}
+
+	dst++;
+	w--;
+    }
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FLAG_HAVE_SOLID_MASK)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
+			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
+			       uint32_t, uint32_t, uint32_t,
+			       NORMAL, FLAG_HAVE_SOLID_MASK)
+
+static const pixman_fast_path_t sse2_fast_paths[] =
+{
+    /* PIXMAN_OP_OVER */
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    
+    /* PIXMAN_OP_OVER_REVERSE */
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
+
+    /* PIXMAN_OP_ADD */
+    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
+
+    /* PIXMAN_OP_SRC */
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
+
+    /* PIXMAN_OP_IN */
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
+    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
+
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_COVER  (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_PAD    (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
+    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
+
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
+
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
+    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
+
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
+    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
+
+    { PIXMAN_OP_NONE },
+};
+
+static uint32_t *
+sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    __m128i ff000000 = mask_ff000000;
+    uint32_t *dst = iter->buffer;
+    uint32_t *src = (uint32_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w && ((uintptr_t)dst) & 0x0f)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    while (w >= 4)
+    {
+	save_128_aligned (
+	    (__m128i *)dst, _mm_or_si128 (
+		load_128_unaligned ((__m128i *)src), ff000000));
+
+	dst += 4;
+	src += 4;
+	w -= 4;
+    }
+
+    while (w)
+    {
+	*dst++ = (*src++) | 0xff000000;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint16_t *src = (uint16_t *)iter->bits;
+    __m128i ff000000 = mask_ff000000;
+
+    iter->bits += iter->stride;
+
+    while (w && ((uintptr_t)dst) & 0x0f)
+    {
+	uint16_t s = *src++;
+
+	*dst++ = convert_0565_to_8888 (s);
+	w--;
+    }
+
+    while (w >= 8)
+    {
+	__m128i lo, hi, s;
+
+	s = _mm_loadu_si128 ((__m128i *)src);
+
+	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
+	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
+
+	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
+	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
+
+	dst += 8;
+	src += 8;
+	w -= 8;
+    }
+
+    while (w)
+    {
+	uint16_t s = *src++;
+
+	*dst++ = convert_0565_to_8888 (s);
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int w = iter->width;
+    uint32_t *dst = iter->buffer;
+    uint8_t *src = iter->bits;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+
+    iter->bits += iter->stride;
+
+    while (w && (((uintptr_t)dst) & 15))
+    {
+        *dst++ = *(src++) << 24;
+        w--;
+    }
+
+    while (w >= 16)
+    {
+	xmm0 = _mm_loadu_si128((__m128i *)src);
+
+	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
+	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
+	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
+	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
+	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
+	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
+
+	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
+	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
+	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
+	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
+
+	dst += 16;
+	src += 16;
+	w -= 16;
+    }
+
+    while (w)
+    {
+	*dst++ = *(src++) << 24;
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+#define IMAGE_FLAGS							\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+static const pixman_iter_info_t sse2_iters[] = 
+{
+    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
+    },
+    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
+    },
+    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
+      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
+    },
+    { PIXMAN_null },
+};
+
+#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+pixman_implementation_t *
+_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
+
+    /* SSE2 constants */
+    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
+    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
+    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
+    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
+    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
+    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
+    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
+    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
+    mask_0080 = create_mask_16_128 (0x0080);
+    mask_00ff = create_mask_16_128 (0x00ff);
+    mask_0101 = create_mask_16_128 (0x0101);
+    mask_ffff = create_mask_16_128 (0xffff);
+    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
+    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
+    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
+    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
+
+    /* Set up function pointers */
+    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
+
+    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
+
+    imp->blt = sse2_blt;
+    imp->fill = sse2_fill;
+
+    imp->iter_info = sse2_iters;
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-ssse3.c b/src/pixman/pixman/pixman-ssse3.c
new file mode 100644
index 0000000..680d6b9
--- /dev/null
+++ b/src/pixman/pixman/pixman-ssse3.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright © 2013 Soren Sandmann Pedersen
+ * Copyright © 2013 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Soren Sandmann (soren.sandmann@gmail.com)
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include "pixman-private.h"
+#include "pixman-inlines.h"
+
+typedef struct
+{
+    int		y;
+    uint64_t *	buffer;
+} line_t;
+
+typedef struct
+{
+    line_t		lines[2];
+    pixman_fixed_t	y;
+    pixman_fixed_t	x;
+    uint64_t		data[1];
+} bilinear_info_t;
+
+static void
+ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
+			int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
+{
+    uint32_t *bits = image->bits + y * image->rowstride;
+    __m128i vx = _mm_set_epi16 (
+	- (x + 1), x, - (x + 1), x,
+	- (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
+    __m128i vux = _mm_set_epi16 (
+	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
+	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
+    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
+    __m128i *b = (__m128i *)line->buffer;
+    __m128i vrl0, vrl1;
+
+    while ((n -= 2) >= 0)
+    {
+	__m128i vw, vr, s;
+
+	vrl1 = _mm_loadl_epi64 (
+	    (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
+	/* vrl1: R1, L1 */
+
+    final_pixel:
+	vrl0 = _mm_loadl_epi64 (
+	    (__m128i *)(bits + pixman_fixed_to_int (x)));
+	/* vrl0: R0, L0 */
+
+	/* The weights are based on vx which is a vector of 
+	 *
+	 *    - (x + 1), x, - (x + 1), x,
+	 *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
+	 *
+	 * so the 16 bit weights end up like this:
+	 *
+	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 *
+	 * and after shifting and packing, we get these bytes:
+	 *
+	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *
+	 * which means the first and the second input pixel 
+	 * have to be interleaved like this:
+	 *
+	 *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+	 *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+	 *
+	 * before maddubsw can be used.
+	 */
+
+	vw = _mm_add_epi16 (
+	    vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
+	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 */
+
+	vw = _mm_packus_epi16 (vw, vw);
+	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
+	 *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
+	 */
+	vx = _mm_add_epi16 (vx, vux);
+
+	x += 2 * ux;
+
+	vr = _mm_unpacklo_epi16 (vrl1, vrl0);
+	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
+
+	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
+	/* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
+
+	vr = _mm_unpackhi_epi8 (vr, s);
+	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
+	 *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
+	 */
+
+	vr = _mm_maddubs_epi16 (vr, vw);
+
+	/* When the weight is 0, the inverse weight is
+	 * 128 which can't be represented in a signed byte.
+	 * As a result maddubsw computes the following:
+	 *
+	 *     r = l * -128 + r * 0
+	 *
+	 * rather than the desired
+	 *
+	 *     r = l * 128 + r * 0
+	 *
+	 * We fix this by taking the absolute value of the
+	 * result.
+	 */
+	vr = _mm_abs_epi16 (vr);
+
+	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
+	_mm_store_si128 (b++, vr);
+    }
+
+    if (n == -1)
+    {
+	vrl1 = _mm_setzero_si128();
+	goto final_pixel;
+    }
+
+    line->y = y;
+}
+
+static uint32_t *
+ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
+{
+    pixman_fixed_t fx, ux;
+    bilinear_info_t *info = iter->data;
+    line_t *line0, *line1;
+    int y0, y1;
+    int32_t dist_y;
+    __m128i vw;
+    int i;
+
+    fx = info->x;
+    ux = iter->image->common.transform->matrix[0][0];
+
+    y0 = pixman_fixed_to_int (info->y);
+    y1 = y0 + 1;
+
+    line0 = &info->lines[y0 & 0x01];
+    line1 = &info->lines[y1 & 0x01];
+
+    if (line0->y != y0)
+    {
+	ssse3_fetch_horizontal (
+	    &iter->image->bits, line0, y0, fx, ux, iter->width);
+    }
+
+    if (line1->y != y1)
+    {
+	ssse3_fetch_horizontal (
+	    &iter->image->bits, line1, y1, fx, ux, iter->width);
+    }
+
+    dist_y = pixman_fixed_to_bilinear_weight (info->y);
+    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
+
+    vw = _mm_set_epi16 (
+	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
+
+    for (i = 0; i + 3 < iter->width; i += 4)
+    {
+	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
+	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
+	__m128i r0, r1, tmp, p;
+
+	r0 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot0, top0), vw);
+	tmp = _mm_cmplt_epi16 (bot0, top0);
+	tmp = _mm_and_si128 (tmp, vw);
+	r0 = _mm_sub_epi16 (r0, tmp);
+	r0 = _mm_add_epi16 (r0, top0);
+	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+
+	r1 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot1, top1), vw);
+	tmp = _mm_cmplt_epi16 (bot1, top1);
+	tmp = _mm_and_si128 (tmp, vw);
+	r1 = _mm_sub_epi16 (r1, tmp);
+	r1 = _mm_add_epi16 (r1, top1);
+	r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
+	r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
+
+	p = _mm_packus_epi16 (r0, r1);
+
+	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
+    }
+
+    while (i < iter->width)
+    {
+	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
+	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
+	__m128i r0, tmp, p;
+
+	r0 = _mm_mulhi_epu16 (
+	    _mm_sub_epi16 (bot0, top0), vw);
+	tmp = _mm_cmplt_epi16 (bot0, top0);
+	tmp = _mm_and_si128 (tmp, vw);
+	r0 = _mm_sub_epi16 (r0, tmp);
+	r0 = _mm_add_epi16 (r0, top0);
+	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
+	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
+	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
+	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
+
+	p = _mm_packus_epi16 (r0, r0);
+
+	if (iter->width - i == 1)
+	{
+	    *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
+	    i++;
+	}
+	else
+	{
+	    _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
+	    i += 2;
+	}
+    }
+    
+    info->y += iter->image->common.transform->matrix[1][1];
+
+    return iter->buffer;
+}
+
+static void
+ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
+{
+    free (iter->data);
+}
+
+static void
+ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
+{
+    int width = iter->width;
+    bilinear_info_t *info;
+    pixman_vector_t v;
+
+    /* Reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (iter->image->common.transform, &v))
+	goto fail;
+
+    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
+    if (!info)
+	goto fail;
+
+    info->x = v.vector[0] - pixman_fixed_1 / 2;
+    info->y = v.vector[1] - pixman_fixed_1 / 2;
+
+#define ALIGN(addr)							\
+    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
+
+    /* It is safe to set the y coordinates to -1 initially
+     * because COVER_CLIP_BILINEAR ensures that we will only
+     * be asked to fetch lines in the [0, height) interval
+     */
+    info->lines[0].y = -1;
+    info->lines[0].buffer = ALIGN (&(info->data[0]));
+    info->lines[1].y = -1;
+    info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
+
+    iter->get_scanline = ssse3_fetch_bilinear_cover;
+    iter->fini = ssse3_bilinear_cover_iter_fini;
+
+    iter->data = info;
+    return;
+
+fail:
+    /* Something went wrong, either a bad matrix or OOM; in such cases,
+     * we don't guarantee any particular rendering.
+     */
+    _pixman_log_error (
+	FUNC, "Allocation failure or bad matrix, skipping rendering\n");
+    
+    iter->get_scanline = _pixman_iter_get_scanline_noop;
+    iter->fini = NULL;
+}
+
+static const pixman_iter_info_t ssse3_iters[] = 
+{
+    { PIXMAN_a8r8g8b8,
+      (FAST_PATH_STANDARD_FLAGS			|
+       FAST_PATH_SCALE_TRANSFORM		|
+       FAST_PATH_BILINEAR_FILTER		|
+       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
+      ITER_NARROW | ITER_SRC,
+      ssse3_bilinear_cover_iter_init,
+      NULL, NULL
+    },
+
+    { PIXMAN_null },
+};
+
+static const pixman_fast_path_t ssse3_fast_paths[] =
+{
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp =
+	_pixman_implementation_create (fallback, ssse3_fast_paths);
+
+    imp->iter_info = ssse3_iters;
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-timer.c b/src/pixman/pixman/pixman-timer.c
new file mode 100644
index 0000000..f5ae18e
--- /dev/null
+++ b/src/pixman/pixman/pixman-timer.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * RED HAT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL RED HAT
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "pixman-private.h"
+
+#ifdef PIXMAN_TIMERS
+
+static pixman_timer_t *timers;
+
+static void
+dump_timers (void)
+{
+    pixman_timer_t *timer;
+
+    for (timer = timers; timer != NULL; timer = timer->next)
+    {
+	printf ("%s:   total: %llu     n: %llu      avg: %f\n",
+	        timer->name,
+	        timer->total,
+	        timer->n_times,
+	        timer->total / (double)timer->n_times);
+    }
+}
+
+void
+pixman_timer_register (pixman_timer_t *timer)
+{
+    static int initialized;
+
+    int atexit (void (*function)(void));
+
+    if (!initialized)
+    {
+	atexit (dump_timers);
+	initialized = 1;
+    }
+
+    timer->next = timers;
+    timers = timer;
+}
+
+#endif
diff --git a/src/pixman/pixman/pixman-trap.c b/src/pixman/pixman/pixman-trap.c
new file mode 100644
index 0000000..91766fd
--- /dev/null
+++ b/src/pixman/pixman/pixman-trap.c
@@ -0,0 +1,711 @@
+/*
+ * Copyright © 2002 Keith Packard, member of The XFree86 Project, Inc.
+ * Copyright © 2004 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+
+/*
+ * Compute the smallest value greater than or equal to y which is on a
+ * grid row.
+ */
+
+PIXMAN_EXPORT pixman_fixed_t
+pixman_sample_ceil_y (pixman_fixed_t y, int n)
+{
+    pixman_fixed_t f = pixman_fixed_frac (y);
+    pixman_fixed_t i = pixman_fixed_floor (y);
+
+    f = DIV (f - Y_FRAC_FIRST (n) + (STEP_Y_SMALL (n) - pixman_fixed_e), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+	Y_FRAC_FIRST (n);
+    
+    if (f > Y_FRAC_LAST (n))
+    {
+	if (pixman_fixed_to_int (i) == 0x7fff)
+	{
+	    f = 0xffff; /* saturate */
+	}
+	else
+	{
+	    f = Y_FRAC_FIRST (n);
+	    i += pixman_fixed_1;
+	}
+    }
+    return (i | f);
+}
+
+/*
+ * Compute the largest value strictly less than y which is on a
+ * grid row.
+ */
+PIXMAN_EXPORT pixman_fixed_t
+pixman_sample_floor_y (pixman_fixed_t y,
+                       int            n)
+{
+    pixman_fixed_t f = pixman_fixed_frac (y);
+    pixman_fixed_t i = pixman_fixed_floor (y);
+
+    f = DIV (f - pixman_fixed_e - Y_FRAC_FIRST (n), STEP_Y_SMALL (n)) * STEP_Y_SMALL (n) +
+	Y_FRAC_FIRST (n);
+
+    if (f < Y_FRAC_FIRST (n))
+    {
+	if (pixman_fixed_to_int (i) == 0x8000)
+	{
+	    f = 0; /* saturate */
+	}
+	else
+	{
+	    f = Y_FRAC_LAST (n);
+	    i -= pixman_fixed_1;
+	}
+    }
+    return (i | f);
+}
+
+/*
+ * Step an edge by any amount (including negative values)
+ */
+PIXMAN_EXPORT void
+pixman_edge_step (pixman_edge_t *e,
+                  int            n)
+{
+    pixman_fixed_48_16_t ne;
+
+    e->x += n * e->stepx;
+
+    ne = e->e + n * (pixman_fixed_48_16_t) e->dx;
+
+    if (n >= 0)
+    {
+	if (ne > 0)
+	{
+	    int nx = (ne + e->dy - 1) / e->dy;
+	    e->e = ne - nx * (pixman_fixed_48_16_t) e->dy;
+	    e->x += nx * e->signdx;
+	}
+    }
+    else
+    {
+	if (ne <= -e->dy)
+	{
+	    int nx = (-ne) / e->dy;
+	    e->e = ne + nx * (pixman_fixed_48_16_t) e->dy;
+	    e->x -= nx * e->signdx;
+	}
+    }
+}
+
+/*
+ * A private routine to initialize the multi-step
+ * elements of an edge structure
+ */
+static void
+_pixman_edge_multi_init (pixman_edge_t * e,
+                         int             n,
+                         pixman_fixed_t *stepx_p,
+                         pixman_fixed_t *dx_p)
+{
+    pixman_fixed_t stepx;
+    pixman_fixed_48_16_t ne;
+
+    ne = n * (pixman_fixed_48_16_t) e->dx;
+    stepx = n * e->stepx;
+
+    if (ne > 0)
+    {
+	int nx = ne / e->dy;
+	ne -= nx * (pixman_fixed_48_16_t)e->dy;
+	stepx += nx * e->signdx;
+    }
+
+    *dx_p = ne;
+    *stepx_p = stepx;
+}
+
+/*
+ * Initialize one edge structure given the line endpoints and a
+ * starting y value
+ */
+PIXMAN_EXPORT void
+pixman_edge_init (pixman_edge_t *e,
+                  int            n,
+                  pixman_fixed_t y_start,
+                  pixman_fixed_t x_top,
+                  pixman_fixed_t y_top,
+                  pixman_fixed_t x_bot,
+                  pixman_fixed_t y_bot)
+{
+    pixman_fixed_t dx, dy;
+
+    e->x = x_top;
+    e->e = 0;
+    dx = x_bot - x_top;
+    dy = y_bot - y_top;
+    e->dy = dy;
+    e->dx = 0;
+
+    if (dy)
+    {
+	if (dx >= 0)
+	{
+	    e->signdx = 1;
+	    e->stepx = dx / dy;
+	    e->dx = dx % dy;
+	    e->e = -dy;
+	}
+	else
+	{
+	    e->signdx = -1;
+	    e->stepx = -(-dx / dy);
+	    e->dx = -dx % dy;
+	    e->e = 0;
+	}
+
+	_pixman_edge_multi_init (e, STEP_Y_SMALL (n),
+				 &e->stepx_small, &e->dx_small);
+
+	_pixman_edge_multi_init (e, STEP_Y_BIG (n),
+				 &e->stepx_big, &e->dx_big);
+    }
+    pixman_edge_step (e, y_start - y_top);
+}
+
+/*
+ * Initialize one edge structure given a line, starting y value
+ * and a pixel offset for the line
+ */
+PIXMAN_EXPORT void
+pixman_line_fixed_edge_init (pixman_edge_t *            e,
+                             int                        n,
+                             pixman_fixed_t             y,
+                             const pixman_line_fixed_t *line,
+                             int                        x_off,
+                             int                        y_off)
+{
+    pixman_fixed_t x_off_fixed = pixman_int_to_fixed (x_off);
+    pixman_fixed_t y_off_fixed = pixman_int_to_fixed (y_off);
+    const pixman_point_fixed_t *top, *bot;
+
+    if (line->p1.y <= line->p2.y)
+    {
+	top = &line->p1;
+	bot = &line->p2;
+    }
+    else
+    {
+	top = &line->p2;
+	bot = &line->p1;
+    }
+    
+    pixman_edge_init (e, n, y,
+                      top->x + x_off_fixed,
+                      top->y + y_off_fixed,
+                      bot->x + x_off_fixed,
+                      bot->y + y_off_fixed);
+}
+
+PIXMAN_EXPORT void
+pixman_add_traps (pixman_image_t *     image,
+                  int16_t              x_off,
+                  int16_t              y_off,
+                  int                  ntrap,
+                  const pixman_trap_t *traps)
+{
+    int bpp;
+    int height;
+
+    pixman_fixed_t x_off_fixed;
+    pixman_fixed_t y_off_fixed;
+    pixman_edge_t l, r;
+    pixman_fixed_t t, b;
+
+    _pixman_image_validate (image);
+    
+    height = image->bits.height;
+    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
+
+    x_off_fixed = pixman_int_to_fixed (x_off);
+    y_off_fixed = pixman_int_to_fixed (y_off);
+
+    while (ntrap--)
+    {
+	t = traps->top.y + y_off_fixed;
+	if (t < 0)
+	    t = 0;
+	t = pixman_sample_ceil_y (t, bpp);
+
+	b = traps->bot.y + y_off_fixed;
+	if (pixman_fixed_to_int (b) >= height)
+	    b = pixman_int_to_fixed (height) - 1;
+	b = pixman_sample_floor_y (b, bpp);
+
+	if (b >= t)
+	{
+	    /* initialize edge walkers */
+	    pixman_edge_init (&l, bpp, t,
+	                      traps->top.l + x_off_fixed,
+	                      traps->top.y + y_off_fixed,
+	                      traps->bot.l + x_off_fixed,
+	                      traps->bot.y + y_off_fixed);
+
+	    pixman_edge_init (&r, bpp, t,
+	                      traps->top.r + x_off_fixed,
+	                      traps->top.y + y_off_fixed,
+	                      traps->bot.r + x_off_fixed,
+	                      traps->bot.y + y_off_fixed);
+
+	    pixman_rasterize_edges (image, &l, &r, t, b);
+	}
+
+	traps++;
+    }
+}
+
+#if 0
+static void
+dump_image (pixman_image_t *image,
+            const char *    title)
+{
+    int i, j;
+
+    if (!image->type == BITS)
+	printf ("%s is not a regular image\n", title);
+
+    if (!image->bits.format == PIXMAN_a8)
+	printf ("%s is not an alpha mask\n", title);
+
+    printf ("\n\n\n%s: \n", title);
+
+    for (i = 0; i < image->bits.height; ++i)
+    {
+	uint8_t *line =
+	    (uint8_t *)&(image->bits.bits[i * image->bits.rowstride]);
+
+	for (j = 0; j < image->bits.width; ++j)
+	    printf ("%c", line[j] ? '#' : ' ');
+
+	printf ("\n");
+    }
+}
+#endif
+
+PIXMAN_EXPORT void
+pixman_add_trapezoids (pixman_image_t *          image,
+                       int16_t                   x_off,
+                       int                       y_off,
+                       int                       ntraps,
+                       const pixman_trapezoid_t *traps)
+{
+    int i;
+
+#if 0
+    dump_image (image, "before");
+#endif
+
+    for (i = 0; i < ntraps; ++i)
+    {
+	const pixman_trapezoid_t *trap = &(traps[i]);
+
+	if (!pixman_trapezoid_valid (trap))
+	    continue;
+
+	pixman_rasterize_trapezoid (image, trap, x_off, y_off);
+    }
+
+#if 0
+    dump_image (image, "after");
+#endif
+}
+
+PIXMAN_EXPORT void
+pixman_rasterize_trapezoid (pixman_image_t *          image,
+                            const pixman_trapezoid_t *trap,
+                            int                       x_off,
+                            int                       y_off)
+{
+    int bpp;
+    int height;
+
+    pixman_fixed_t y_off_fixed;
+    pixman_edge_t l, r;
+    pixman_fixed_t t, b;
+
+    return_if_fail (image->type == BITS);
+
+    _pixman_image_validate (image);
+    
+    if (!pixman_trapezoid_valid (trap))
+	return;
+
+    height = image->bits.height;
+    bpp = PIXMAN_FORMAT_BPP (image->bits.format);
+
+    y_off_fixed = pixman_int_to_fixed (y_off);
+
+    t = trap->top + y_off_fixed;
+    if (t < 0)
+	t = 0;
+    t = pixman_sample_ceil_y (t, bpp);
+
+    b = trap->bottom + y_off_fixed;
+    if (pixman_fixed_to_int (b) >= height)
+	b = pixman_int_to_fixed (height) - 1;
+    b = pixman_sample_floor_y (b, bpp);
+    
+    if (b >= t)
+    {
+	/* initialize edge walkers */
+	pixman_line_fixed_edge_init (&l, bpp, t, &trap->left, x_off, y_off);
+	pixman_line_fixed_edge_init (&r, bpp, t, &trap->right, x_off, y_off);
+
+	pixman_rasterize_edges (image, &l, &r, t, b);
+    }
+}
+
+static const pixman_bool_t zero_src_has_no_effect[PIXMAN_N_OPERATORS] =
+{
+    FALSE,	/* Clear		0			0    */
+    FALSE,	/* Src			1			0    */
+    TRUE,	/* Dst			0			1    */
+    TRUE,	/* Over			1			1-Aa */
+    TRUE,	/* OverReverse		1-Ab			1    */
+    FALSE,	/* In			Ab			0    */
+    FALSE,	/* InReverse		0			Aa   */
+    FALSE,	/* Out			1-Ab			0    */
+    TRUE,	/* OutReverse		0			1-Aa */
+    TRUE,	/* Atop			Ab			1-Aa */
+    FALSE,	/* AtopReverse		1-Ab			Aa   */
+    TRUE,	/* Xor			1-Ab			1-Aa */
+    TRUE,	/* Add			1			1    */
+};
+
+static pixman_bool_t
+get_trap_extents (pixman_op_t op, pixman_image_t *dest,
+		  const pixman_trapezoid_t *traps, int n_traps,
+		  pixman_box32_t *box)
+{
+    int i;
+
+    /* When the operator is such that a zero source has an
+     * effect on the underlying image, we have to
+     * composite across the entire destination
+     */
+    if (!zero_src_has_no_effect [op])
+    {
+	box->x1 = 0;
+	box->y1 = 0;
+	box->x2 = dest->bits.width;
+	box->y2 = dest->bits.height;
+	return TRUE;
+    }
+    
+    box->x1 = INT32_MAX;
+    box->y1 = INT32_MAX;
+    box->x2 = INT32_MIN;
+    box->y2 = INT32_MIN;
+	
+    for (i = 0; i < n_traps; ++i)
+    {
+	const pixman_trapezoid_t *trap = &(traps[i]);
+	int y1, y2;
+	    
+	if (!pixman_trapezoid_valid (trap))
+	    continue;
+	    
+	y1 = pixman_fixed_to_int (trap->top);
+	if (y1 < box->y1)
+	    box->y1 = y1;
+	    
+	y2 = pixman_fixed_to_int (pixman_fixed_ceil (trap->bottom));
+	if (y2 > box->y2)
+	    box->y2 = y2;
+	    
+#define EXTEND_MIN(x)							\
+	if (pixman_fixed_to_int ((x)) < box->x1)			\
+	    box->x1 = pixman_fixed_to_int ((x));
+#define EXTEND_MAX(x)							\
+	if (pixman_fixed_to_int (pixman_fixed_ceil ((x))) > box->x2)	\
+	    box->x2 = pixman_fixed_to_int (pixman_fixed_ceil ((x)));
+	    
+#define EXTEND(x)							\
+	EXTEND_MIN(x);							\
+	EXTEND_MAX(x);
+	    
+	EXTEND(trap->left.p1.x);
+	EXTEND(trap->left.p2.x);
+	EXTEND(trap->right.p1.x);
+	EXTEND(trap->right.p2.x);
+    }
+	
+    if (box->x1 >= box->x2 || box->y1 >= box->y2)
+	return FALSE;
+
+    return TRUE;
+}
+
+/*
+ * pixman_composite_trapezoids()
+ *
+ * All the trapezoids are conceptually rendered to an infinitely big image.
+ * The (0, 0) coordinates of this image are then aligned with the (x, y)
+ * coordinates of the source image, and then both images are aligned with
+ * the (x, y) coordinates of the destination. Then these three images are
+ * composited across the entire destination.
+ */
+PIXMAN_EXPORT void
+pixman_composite_trapezoids (pixman_op_t		op,
+			     pixman_image_t *		src,
+			     pixman_image_t *		dst,
+			     pixman_format_code_t	mask_format,
+			     int			x_src,
+			     int			y_src,
+			     int			x_dst,
+			     int			y_dst,
+			     int			n_traps,
+			     const pixman_trapezoid_t *	traps)
+{
+    int i;
+
+    return_if_fail (PIXMAN_FORMAT_TYPE (mask_format) == PIXMAN_TYPE_A);
+    
+    if (n_traps <= 0)
+	return;
+
+    _pixman_image_validate (src);
+    _pixman_image_validate (dst);
+
+    if (op == PIXMAN_OP_ADD &&
+	(src->common.flags & FAST_PATH_IS_OPAQUE)		&&
+	(mask_format == dst->common.extended_format_code)	&&
+	!(dst->common.have_clip_region))
+    {
+	for (i = 0; i < n_traps; ++i)
+	{
+	    const pixman_trapezoid_t *trap = &(traps[i]);
+	    
+	    if (!pixman_trapezoid_valid (trap))
+		continue;
+	    
+	    pixman_rasterize_trapezoid (dst, trap, x_dst, y_dst);
+	}
+    }
+    else
+    {
+	pixman_image_t *tmp;
+	pixman_box32_t box;
+	int i;
+
+	if (!get_trap_extents (op, dst, traps, n_traps, &box))
+	    return;
+	
+	if (!(tmp = pixman_image_create_bits (
+		  mask_format, box.x2 - box.x1, box.y2 - box.y1, NULL, -1)))
+	    return;
+	
+	for (i = 0; i < n_traps; ++i)
+	{
+	    const pixman_trapezoid_t *trap = &(traps[i]);
+	    
+	    if (!pixman_trapezoid_valid (trap))
+		continue;
+	    
+	    pixman_rasterize_trapezoid (tmp, trap, - box.x1, - box.y1);
+	}
+	
+	pixman_image_composite (op, src, tmp, dst,
+				x_src + box.x1, y_src + box.y1,
+				0, 0,
+				x_dst + box.x1, y_dst + box.y1,
+				box.x2 - box.x1, box.y2 - box.y1);
+	
+	pixman_image_unref (tmp);
+    }
+}
+
+static int
+greater_y (const pixman_point_fixed_t *a, const pixman_point_fixed_t *b)
+{
+    if (a->y == b->y)
+	return a->x > b->x;
+    return a->y > b->y;
+}
+
+/*
+ * Note that the definition of this function is a bit odd because
+ * of the X coordinate space (y increasing downwards).
+ */
+static int
+clockwise (const pixman_point_fixed_t *ref,
+	   const pixman_point_fixed_t *a,
+	   const pixman_point_fixed_t *b)
+{
+    pixman_point_fixed_t	ad, bd;
+
+    ad.x = a->x - ref->x;
+    ad.y = a->y - ref->y;
+    bd.x = b->x - ref->x;
+    bd.y = b->y - ref->y;
+
+    return ((pixman_fixed_32_32_t) bd.y * ad.x -
+	    (pixman_fixed_32_32_t) ad.y * bd.x) < 0;
+}
+
+static void
+triangle_to_trapezoids (const pixman_triangle_t *tri, pixman_trapezoid_t *traps)
+{
+    const pixman_point_fixed_t *top, *left, *right, *tmp;
+
+    top = &tri->p1;
+    left = &tri->p2;
+    right = &tri->p3;
+
+    if (greater_y (top, left))
+    {
+	tmp = left;
+	left = top;
+	top = tmp;
+    }
+
+    if (greater_y (top, right))
+    {
+	tmp = right;
+	right = top;
+	top = tmp;
+    }
+
+    if (clockwise (top, right, left))
+    {
+	tmp = right;
+	right = left;
+	left = tmp;
+    }
+    
+    /*
+     * Two cases:
+     *
+     *		+		+
+     *	       / \             / \
+     *	      /   \           /	  \
+     *	     /     +         +	   \
+     *      /    --           --    \
+     *     /   --               --   \
+     *    / ---                   --- \
+     *	 +--                         --+
+     */
+
+    traps->top = top->y;
+    traps->left.p1 = *top;
+    traps->left.p2 = *left;
+    traps->right.p1 = *top;
+    traps->right.p2 = *right;
+
+    if (right->y < left->y)
+	traps->bottom = right->y;
+    else
+	traps->bottom = left->y;
+
+    traps++;
+
+    *traps = *(traps - 1);
+    
+    if (right->y < left->y)
+    {
+	traps->top = right->y;
+	traps->bottom = left->y;
+	traps->right.p1 = *right;
+	traps->right.p2 = *left;
+    }
+    else
+    {
+	traps->top = left->y;
+	traps->bottom = right->y;
+	traps->left.p1 = *left;
+	traps->left.p2 = *right;
+    }
+}
+
+static pixman_trapezoid_t *
+convert_triangles (int n_tris, const pixman_triangle_t *tris)
+{
+    pixman_trapezoid_t *traps;
+    int i;
+
+    if (n_tris <= 0)
+	return NULL;
+    
+    traps = pixman_malloc_ab (n_tris, 2 * sizeof (pixman_trapezoid_t));
+    if (!traps)
+	return NULL;
+
+    for (i = 0; i < n_tris; ++i)
+	triangle_to_trapezoids (&(tris[i]), traps + 2 * i);
+
+    return traps;
+}
+
+PIXMAN_EXPORT void
+pixman_composite_triangles (pixman_op_t			op,
+			    pixman_image_t *		src,
+			    pixman_image_t *		dst,
+			    pixman_format_code_t	mask_format,
+			    int				x_src,
+			    int				y_src,
+			    int				x_dst,
+			    int				y_dst,
+			    int				n_tris,
+			    const pixman_triangle_t *	tris)
+{
+    pixman_trapezoid_t *traps;
+
+    if ((traps = convert_triangles (n_tris, tris)))
+    {
+	pixman_composite_trapezoids (op, src, dst, mask_format,
+				     x_src, y_src, x_dst, y_dst,
+				     n_tris * 2, traps);
+	
+	free (traps);
+    }
+}
+
+PIXMAN_EXPORT void
+pixman_add_triangles (pixman_image_t          *image,
+		      int32_t	               x_off,
+		      int32_t	               y_off,
+		      int	               n_tris,
+		      const pixman_triangle_t *tris)
+{
+    pixman_trapezoid_t *traps;
+
+    if ((traps = convert_triangles (n_tris, tris)))
+    {
+	pixman_add_trapezoids (image, x_off, y_off,
+			       n_tris * 2, traps);
+
+	free (traps);
+    }
+}
diff --git a/src/pixman/pixman/pixman-utils.c b/src/pixman/pixman/pixman-utils.c
new file mode 100644
index 0000000..4a3a835
--- /dev/null
+++ b/src/pixman/pixman/pixman-utils.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 1999 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "pixman-private.h"
+
+pixman_bool_t
+_pixman_multiply_overflows_size (size_t a, size_t b)
+{
+    return a >= SIZE_MAX / b;
+}
+
+pixman_bool_t
+_pixman_multiply_overflows_int (unsigned int a, unsigned int b)
+{
+    return a >= INT32_MAX / b;
+}
+
+pixman_bool_t
+_pixman_addition_overflows_int (unsigned int a, unsigned int b)
+{
+    return a > INT32_MAX - b;
+}
+
+void *
+pixman_malloc_ab_plus_c (unsigned int a, unsigned int b, unsigned int c)
+{
+    if (!b || a >= INT32_MAX / b || (a * b) > INT32_MAX - c)
+	return NULL;
+
+    return malloc (a * b + c);
+}
+
+void *
+pixman_malloc_ab (unsigned int a,
+                  unsigned int b)
+{
+    if (a >= INT32_MAX / b)
+	return NULL;
+
+    return malloc (a * b);
+}
+
+void *
+pixman_malloc_abc (unsigned int a,
+                   unsigned int b,
+                   unsigned int c)
+{
+    if (a >= INT32_MAX / b)
+	return NULL;
+    else if (a * b >= INT32_MAX / c)
+	return NULL;
+    else
+	return malloc (a * b * c);
+}
+
+static force_inline uint16_t
+float_to_unorm (float f, int n_bits)
+{
+    uint32_t u;
+
+    if (f > 1.0)
+	f = 1.0;
+    if (f < 0.0)
+	f = 0.0;
+
+    u = f * (1 << n_bits);
+    u -= (u >> n_bits);
+
+    return u;
+}
+
+static force_inline float
+unorm_to_float (uint16_t u, int n_bits)
+{
+    uint32_t m = ((1 << n_bits) - 1);
+
+    return (u & m) * (1.f / (float)m);
+}
+
+/*
+ * This function expands images from a8r8g8b8 to argb_t.  To preserve
+ * precision, it needs to know from which source format the a8r8g8b8 pixels
+ * originally came.
+ *
+ * For example, if the source was PIXMAN_x1r5g5b5 and the red component
+ * contained bits 12345, then the 8-bit value is 12345123.  To correctly
+ * expand this to floating point, it should be 12345 / 31.0 and not
+ * 12345123 / 255.0.
+ */
+void
+pixman_expand_to_float (argb_t               *dst,
+			const uint32_t       *src,
+			pixman_format_code_t  format,
+			int                   width)
+{
+    static const float multipliers[16] = {
+	0.0f,
+	1.0f / ((1 <<  1) - 1),
+	1.0f / ((1 <<  2) - 1),
+	1.0f / ((1 <<  3) - 1),
+	1.0f / ((1 <<  4) - 1),
+	1.0f / ((1 <<  5) - 1),
+	1.0f / ((1 <<  6) - 1),
+	1.0f / ((1 <<  7) - 1),
+	1.0f / ((1 <<  8) - 1),
+	1.0f / ((1 <<  9) - 1),
+	1.0f / ((1 << 10) - 1),
+	1.0f / ((1 << 11) - 1),
+	1.0f / ((1 << 12) - 1),
+	1.0f / ((1 << 13) - 1),
+	1.0f / ((1 << 14) - 1),
+	1.0f / ((1 << 15) - 1),
+    };
+    int a_size, r_size, g_size, b_size;
+    int a_shift, r_shift, g_shift, b_shift;
+    float a_mul, r_mul, g_mul, b_mul;
+    uint32_t a_mask, r_mask, g_mask, b_mask;
+    int i;
+
+    if (!PIXMAN_FORMAT_VIS (format))
+	format = PIXMAN_a8r8g8b8;
+
+    /*
+     * Determine the sizes of each component and the masks and shifts
+     * required to extract them from the source pixel.
+     */
+    a_size = PIXMAN_FORMAT_A (format);
+    r_size = PIXMAN_FORMAT_R (format);
+    g_size = PIXMAN_FORMAT_G (format);
+    b_size = PIXMAN_FORMAT_B (format);
+
+    a_shift = 32 - a_size;
+    r_shift = 24 - r_size;
+    g_shift = 16 - g_size;
+    b_shift =  8 - b_size;
+
+    a_mask = ((1 << a_size) - 1);
+    r_mask = ((1 << r_size) - 1);
+    g_mask = ((1 << g_size) - 1);
+    b_mask = ((1 << b_size) - 1);
+
+    a_mul = multipliers[a_size];
+    r_mul = multipliers[r_size];
+    g_mul = multipliers[g_size];
+    b_mul = multipliers[b_size];
+
+    /* Start at the end so that we can do the expansion in place
+     * when src == dst
+     */
+    for (i = width - 1; i >= 0; i--)
+    {
+	const uint32_t pixel = src[i];
+
+	dst[i].a = a_mask? ((pixel >> a_shift) & a_mask) * a_mul : 1.0f;
+	dst[i].r = ((pixel >> r_shift) & r_mask) * r_mul;
+	dst[i].g = ((pixel >> g_shift) & g_mask) * g_mul;
+	dst[i].b = ((pixel >> b_shift) & b_mask) * b_mul;
+    }
+}
+
+uint16_t
+pixman_float_to_unorm (float f, int n_bits)
+{
+    return float_to_unorm (f, n_bits);
+}
+
+float
+pixman_unorm_to_float (uint16_t u, int n_bits)
+{
+    return unorm_to_float (u, n_bits);
+}
+
+void
+pixman_contract_from_float (uint32_t     *dst,
+			    const argb_t *src,
+			    int           width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	uint8_t a, r, g, b;
+
+	a = float_to_unorm (src[i].a, 8);
+	r = float_to_unorm (src[i].r, 8);
+	g = float_to_unorm (src[i].g, 8);
+	b = float_to_unorm (src[i].b, 8);
+
+	dst[i] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
+    }
+}
+
+uint32_t *
+_pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask)
+{
+    return iter->buffer;
+}
+
+void
+_pixman_iter_init_bits_stride (pixman_iter_t *iter, const pixman_iter_info_t *info)
+{
+    pixman_image_t *image = iter->image;
+    uint8_t *b = (uint8_t *)image->bits.bits;
+    int s = image->bits.rowstride * 4;
+
+    iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (info->format) / 8;
+    iter->stride = s;
+}
+
+#define N_TMP_BOXES (16)
+
+pixman_bool_t
+pixman_region16_copy_from_region32 (pixman_region16_t *dst,
+                                    pixman_region32_t *src)
+{
+    int n_boxes, i;
+    pixman_box32_t *boxes32;
+    pixman_box16_t *boxes16;
+    pixman_bool_t retval;
+
+    boxes32 = pixman_region32_rectangles (src, &n_boxes);
+
+    boxes16 = pixman_malloc_ab (n_boxes, sizeof (pixman_box16_t));
+
+    if (!boxes16)
+	return FALSE;
+
+    for (i = 0; i < n_boxes; ++i)
+    {
+	boxes16[i].x1 = boxes32[i].x1;
+	boxes16[i].y1 = boxes32[i].y1;
+	boxes16[i].x2 = boxes32[i].x2;
+	boxes16[i].y2 = boxes32[i].y2;
+    }
+
+    pixman_region_fini (dst);
+    retval = pixman_region_init_rects (dst, boxes16, n_boxes);
+    free (boxes16);
+    return retval;
+}
+
+pixman_bool_t
+pixman_region32_copy_from_region16 (pixman_region32_t *dst,
+                                    pixman_region16_t *src)
+{
+    int n_boxes, i;
+    pixman_box16_t *boxes16;
+    pixman_box32_t *boxes32;
+    pixman_box32_t tmp_boxes[N_TMP_BOXES];
+    pixman_bool_t retval;
+
+    boxes16 = pixman_region_rectangles (src, &n_boxes);
+
+    if (n_boxes > N_TMP_BOXES)
+	boxes32 = pixman_malloc_ab (n_boxes, sizeof (pixman_box32_t));
+    else
+	boxes32 = tmp_boxes;
+
+    if (!boxes32)
+	return FALSE;
+
+    for (i = 0; i < n_boxes; ++i)
+    {
+	boxes32[i].x1 = boxes16[i].x1;
+	boxes32[i].y1 = boxes16[i].y1;
+	boxes32[i].x2 = boxes16[i].x2;
+	boxes32[i].y2 = boxes16[i].y2;
+    }
+
+    pixman_region32_fini (dst);
+    retval = pixman_region32_init_rects (dst, boxes32, n_boxes);
+
+    if (boxes32 != tmp_boxes)
+	free (boxes32);
+
+    return retval;
+}
+
+/* This function is exported for the sake of the test suite and not part
+ * of the ABI.
+ */
+PIXMAN_EXPORT pixman_implementation_t *
+_pixman_internal_only_get_implementation (void)
+{
+    return get_implementation ();
+}
+
+void
+_pixman_log_error (const char *function, const char *message)
+{
+    static int n_messages = 0;
+
+    if (n_messages < 10)
+    {
+	fprintf (stderr,
+		 "*** BUG ***\n"
+		 "In %s: %s\n"
+		 "Set a breakpoint on '_pixman_log_error' to debug\n\n",
+                 function, message);
+
+	n_messages++;
+    }
+}
diff --git a/src/pixman/pixman/pixman-version.h.in b/src/pixman/pixman/pixman-version.h.in
new file mode 100644
index 0000000..256b2e6
--- /dev/null
+++ b/src/pixman/pixman/pixman-version.h.in
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author: Carl D. Worth <cworth@cworth.org>
+ */
+
+#ifndef PIXMAN_VERSION_H__
+#define PIXMAN_VERSION_H__
+
+#ifndef PIXMAN_H__
+#  error pixman-version.h should only be included by pixman.h
+#endif
+
+#define PIXMAN_VERSION_MAJOR @PIXMAN_VERSION_MAJOR@
+#define PIXMAN_VERSION_MINOR @PIXMAN_VERSION_MINOR@
+#define PIXMAN_VERSION_MICRO @PIXMAN_VERSION_MICRO@
+
+#define PIXMAN_VERSION_STRING "@PIXMAN_VERSION_MAJOR@.@PIXMAN_VERSION_MINOR@.@PIXMAN_VERSION_MICRO@"
+
+#define PIXMAN_VERSION_ENCODE(major, minor, micro) (	\
+	  ((major) * 10000)				\
+	+ ((minor) *   100)				\
+	+ ((micro) *     1))
+
+#define PIXMAN_VERSION PIXMAN_VERSION_ENCODE(	\
+	PIXMAN_VERSION_MAJOR,			\
+	PIXMAN_VERSION_MINOR,			\
+	PIXMAN_VERSION_MICRO)
+
+#endif /* PIXMAN_VERSION_H__ */
diff --git a/src/pixman/pixman/pixman-vmx.c b/src/pixman/pixman/pixman-vmx.c
new file mode 100644
index 0000000..c33631c
--- /dev/null
+++ b/src/pixman/pixman/pixman-vmx.c
@@ -0,0 +1,2026 @@
+/*
+ * Copyright © 2007 Luca Barbato
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Luca Barbato not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Luca Barbato makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Luca Barbato (lu_zero@gentoo.org)
+ *
+ * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include <altivec.h>
+
+#define AVV(x...) {x}
+
+static force_inline vector unsigned int
+splat_alpha (vector unsigned int pix)
+{
+    return vec_perm (pix, pix,
+		     (vector unsigned char)AVV (
+			 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
+			 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
+}
+
+static force_inline vector unsigned int
+pix_multiply (vector unsigned int p, vector unsigned int a)
+{
+    vector unsigned short hi, lo, mod;
+
+    /* unpack to short */
+    hi = (vector unsigned short)
+	vec_mergeh ((vector unsigned char)AVV (0),
+		    (vector unsigned char)p);
+
+    mod = (vector unsigned short)
+	vec_mergeh ((vector unsigned char)AVV (0),
+		    (vector unsigned char)a);
+
+    hi = vec_mladd (hi, mod, (vector unsigned short)
+                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
+                         0x0080, 0x0080, 0x0080, 0x0080));
+
+    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
+
+    hi = vec_sr (hi, vec_splat_u16 (8));
+
+    /* unpack to short */
+    lo = (vector unsigned short)
+	vec_mergel ((vector unsigned char)AVV (0),
+		    (vector unsigned char)p);
+    mod = (vector unsigned short)
+	vec_mergel ((vector unsigned char)AVV (0),
+		    (vector unsigned char)a);
+
+    lo = vec_mladd (lo, mod, (vector unsigned short)
+                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
+                         0x0080, 0x0080, 0x0080, 0x0080));
+
+    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
+
+    lo = vec_sr (lo, vec_splat_u16 (8));
+
+    return (vector unsigned int)vec_packsu (hi, lo);
+}
+
+static force_inline vector unsigned int
+pix_add (vector unsigned int a, vector unsigned int b)
+{
+    return (vector unsigned int)vec_adds ((vector unsigned char)a,
+                                          (vector unsigned char)b);
+}
+
+static force_inline vector unsigned int
+pix_add_mul (vector unsigned int x,
+             vector unsigned int a,
+             vector unsigned int y,
+             vector unsigned int b)
+{
+    vector unsigned int t1, t2;
+
+    t1 = pix_multiply (x, a);
+    t2 = pix_multiply (y, b);
+
+    return pix_add (t1, t2);
+}
+
+static force_inline vector unsigned int
+negate (vector unsigned int src)
+{
+    return vec_nor (src, src);
+}
+
+/* dest*~srca + src */
+static force_inline vector unsigned int
+over (vector unsigned int src,
+      vector unsigned int srca,
+      vector unsigned int dest)
+{
+    vector unsigned char tmp = (vector unsigned char)
+	pix_multiply (dest, negate (srca));
+
+    tmp = vec_adds ((vector unsigned char)src, tmp);
+    return (vector unsigned int)tmp;
+}
+
+/* in == pix_multiply */
+#define in_over(src, srca, mask, dest)					\
+    over (pix_multiply (src, mask),					\
+          pix_multiply (srca, mask), dest)
+
+
+#define COMPUTE_SHIFT_MASK(source)					\
+    source ## _mask = vec_lvsl (0, source);
+
+#define COMPUTE_SHIFT_MASKS(dest, source)				\
+    source ## _mask = vec_lvsl (0, source);
+
+#define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
+    mask ## _mask = vec_lvsl (0, mask);					\
+    source ## _mask = vec_lvsl (0, source);
+
+/* notice you have to declare temp vars...
+ * Note: tmp3 and tmp4 must remain untouched!
+ */
+
+#define LOAD_VECTORS(dest, source)			  \
+    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
+    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
+    v ## source = (typeof(v ## source))			  \
+	vec_perm (tmp1, tmp2, source ## _mask);		  \
+    v ## dest = (typeof(v ## dest))vec_ld (0, dest);
+
+#define LOAD_VECTORSC(dest, source, mask)		  \
+    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
+    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
+    v ## source = (typeof(v ## source))			  \
+	vec_perm (tmp1, tmp2, source ## _mask);		  \
+    tmp1 = (typeof(tmp1))vec_ld (0, mask);		  \
+    v ## dest = (typeof(v ## dest))vec_ld (0, dest);	  \
+    tmp2 = (typeof(tmp2))vec_ld (15, mask);		  \
+    v ## mask = (typeof(v ## mask))			  \
+	vec_perm (tmp1, tmp2, mask ## _mask);
+
+#define LOAD_VECTORSM(dest, source, mask)				\
+    LOAD_VECTORSC (dest, source, mask)					\
+    v ## source = pix_multiply (v ## source,				\
+                                splat_alpha (v ## mask));
+
+#define STORE_VECTOR(dest)						\
+    vec_st ((vector unsigned int) v ## dest, 0, dest);
+
+static void
+vmx_combine_over_u_no_mask (uint32_t *      dest,
+                            const uint32_t *src,
+                            int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORS (dest, src);
+
+	vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_over_u_mask (uint32_t *      dest,
+                         const uint32_t *src,
+                         const uint32_t *mask,
+                         int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = over (vsrc, splat_alpha (vsrc), vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_over_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    if (mask)
+	vmx_combine_over_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_over_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
+                                    const uint32_t *src,
+                                    int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORS (dest, src);
+
+	vdest = over (vdest, splat_alpha (vdest), vsrc);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ia = ALPHA_8 (~dest[i]);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_over_reverse_u_mask (uint32_t *      dest,
+                                 const uint32_t *src,
+                                 const uint32_t *mask,
+                                 int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = over (vdest, splat_alpha (vdest), vsrc);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ia = ALPHA_8 (~dest[i]);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_over_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    if (mask)
+	vmx_combine_over_reverse_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_over_reverse_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_in_u_no_mask (uint32_t *      dest,
+                          const uint32_t *src,
+                          int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8 (s, a);
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t a = ALPHA_8 (dest[i]);
+
+	UN8x4_MUL_UN8 (s, a);
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_u_mask (uint32_t *      dest,
+                       const uint32_t *src,
+                       const uint32_t *mask,
+                       int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_multiply (vsrc, splat_alpha (vdest));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t a = ALPHA_8 (dest[i]);
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_u (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  uint32_t *               dest,
+                  const uint32_t *         src,
+                  const uint32_t *         mask,
+                  int                      width)
+{
+    if (mask)
+	vmx_combine_in_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_in_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
+                                  const uint32_t *src,
+                                  int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t d = *dest;
+	uint32_t a = ALPHA_8 (*src++);
+
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t d = dest[i];
+	uint32_t a = ALPHA_8 (src[i]);
+
+	UN8x4_MUL_UN8 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_in_reverse_u_mask (uint32_t *      dest,
+                               const uint32_t *src,
+                               const uint32_t *mask,
+                               int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t d = *dest;
+	uint32_t a = *src++;
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (a);
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_multiply (vdest, splat_alpha (vsrc));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t d = dest[i];
+	uint32_t a = src[i];
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (a);
+	UN8x4_MUL_UN8 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_in_reverse_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *               dest,
+                          const uint32_t *         src,
+                          const uint32_t *         mask,
+                          int                      width)
+{
+    if (mask)
+	vmx_combine_in_reverse_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_in_reverse_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_out_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (~(*dest));
+
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t a = ALPHA_8 (~dest[i]);
+
+	UN8x4_MUL_UN8 (s, a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_out_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t a = ALPHA_8 (~(*dest));
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t a = ALPHA_8 (~dest[i]);
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_MUL_UN8 (s, a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_out_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    if (mask)
+	vmx_combine_out_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_out_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
+                                   const uint32_t *src,
+                                   int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t d = *dest;
+	uint32_t a = ALPHA_8 (~(*src++));
+
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t d = dest[i];
+	uint32_t a = ALPHA_8 (~src[i]);
+
+	UN8x4_MUL_UN8 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_out_reverse_u_mask (uint32_t *      dest,
+                                const uint32_t *src,
+                                const uint32_t *mask,
+                                int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t d = *dest;
+	uint32_t a = *src++;
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (~a);
+	UN8x4_MUL_UN8 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t d = dest[i];
+	uint32_t a = src[i];
+
+	UN8x4_MUL_UN8 (a, m);
+	a = ALPHA_8 (~a);
+	UN8x4_MUL_UN8 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_out_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    if (mask)
+	vmx_combine_out_reverse_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_out_reverse_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_atop_u_no_mask (uint32_t *      dest,
+                            const uint32_t *src,
+                            int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+			     vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_u_mask (uint32_t *      dest,
+                         const uint32_t *src,
+                         const uint32_t *mask,
+                         int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+			     vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t dest_a = ALPHA_8 (d);
+	uint32_t src_ia;
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    if (mask)
+	vmx_combine_atop_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_atop_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
+                                    const uint32_t *src,
+                                    int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_a = ALPHA_8 (s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+			     vsrc, splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t src_a = ALPHA_8 (s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
+                                 const uint32_t *src,
+                                 const uint32_t *mask,
+                                 int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_a;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_a = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
+			     vsrc, splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t src_a;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_a = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    if (mask)
+	vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_atop_reverse_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_xor_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_ia = ALPHA_8 (~s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+			     vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t src_ia = ALPHA_8 (~s);
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_xor_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t src_ia;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
+			     vdest, splat_alpha (negate (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t src_ia;
+	uint32_t dest_ia = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8 (s, m);
+
+	src_ia = ALPHA_8 (~s);
+
+	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_xor_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    if (mask)
+	vmx_combine_xor_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_xor_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_add_u_no_mask (uint32_t *      dest,
+                           const uint32_t *src,
+                           int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc;
+    vector unsigned char tmp1, tmp2, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKS (dest, src);
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORS (dest, src);
+
+	vdest = pix_add (vsrc, vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+
+	UN8x4_ADD_UN8x4 (d, s);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_add_u_mask (uint32_t *      dest,
+                        const uint32_t *src,
+                        const uint32_t *mask,
+                        int             width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, src_mask, mask_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t m = ALPHA_8 (*mask++);
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_ADD_UN8x4 (d, s);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSM (dest, src, mask);
+
+	vdest = pix_add (vsrc, vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t m = ALPHA_8 (mask[i]);
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+
+	UN8x4_MUL_UN8 (s, m);
+	UN8x4_ADD_UN8x4 (d, s);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_add_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    if (mask)
+	vmx_combine_add_u_mask (dest, src, mask, width);
+    else
+	vmx_combine_add_u_no_mask (dest, src, width);
+}
+
+static void
+vmx_combine_src_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+
+	UN8x4_MUL_UN8x4 (s, a);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (vsrc, vmask);
+
+	STORE_VECTOR (dest);
+
+	mask += 4;
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+
+	UN8x4_MUL_UN8x4 (s, a);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_over_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
+
+	STORE_VECTOR (dest);
+
+	mask += 4;
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t ida = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
+
+	STORE_VECTOR (dest);
+
+	mask += 4;
+	src += 4;
+	dest += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t ida = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_ca (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t da = ALPHA_8 (*dest);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t da = ALPHA_8 (dest[i]);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (*src++);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, a);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (src[i]);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_out_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (
+	    pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (s, da);
+
+	dest[i] = s;
+    }
+}
+
+static void
+vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, ~a);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_multiply (
+	    vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s);
+
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4 (d, ~a);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_atop_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask, vsrca;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vsrca = splat_alpha (vsrc);
+
+	vsrc = pix_multiply (vsrc, vmask);
+	vmask = pix_multiply (vmask, vsrca);
+
+	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
+			     negate (vmask), vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_add_mul (vdest,
+			     pix_multiply (vmask, splat_alpha (vsrc)),
+			     pix_multiply (vsrc, vmask),
+			     negate (splat_alpha (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_xor_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	*dest++ = d;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_add_mul (vdest,
+			     negate (pix_multiply (vmask, splat_alpha (vsrc))),
+			     pix_multiply (vsrc, vmask),
+			     negate (splat_alpha (vdest)));
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+	uint32_t sa = ALPHA_8 (s);
+	uint32_t da = ALPHA_8 (~d);
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_MUL_UN8 (a, sa);
+	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
+
+	dest[i] = d;
+    }
+}
+
+static void
+vmx_combine_add_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    int i;
+    vector unsigned int vdest, vsrc, vmask;
+    vector unsigned char tmp1, tmp2, mask_mask, src_mask;
+
+    while (width && ((uintptr_t)dest & 15))
+    {
+	uint32_t a = *mask++;
+	uint32_t s = *src++;
+	uint32_t d = *dest;
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_ADD_UN8x4 (s, d);
+
+	*dest++ = s;
+	width--;
+    }
+
+    COMPUTE_SHIFT_MASKC (dest, src, mask);
+
+    /* printf ("%s\n",__PRETTY_FUNCTION__); */
+    for (i = width / 4; i > 0; i--)
+    {
+	LOAD_VECTORSC (dest, src, mask);
+
+	vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
+
+	STORE_VECTOR (dest);
+
+	src += 4;
+	dest += 4;
+	mask += 4;
+    }
+
+    for (i = width % 4; --i >= 0;)
+    {
+	uint32_t a = mask[i];
+	uint32_t s = src[i];
+	uint32_t d = dest[i];
+
+	UN8x4_MUL_UN8x4 (s, a);
+	UN8x4_ADD_UN8x4 (s, d);
+
+	dest[i] = s;
+    }
+}
+
+static const pixman_fast_path_t vmx_fast_paths[] =
+{
+    {   PIXMAN_OP_NONE	},
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
+{
+    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
+
+    /* Set up function pointers */
+
+    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
+
+    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman-x86.c b/src/pixman/pixman/pixman-x86.c
new file mode 100644
index 0000000..05297c4
--- /dev/null
+++ b/src/pixman/pixman/pixman-x86.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+#if defined(USE_X86_MMX) || defined (USE_SSE2) || defined (USE_SSSE3)
+
+/* The CPU detection code needs to be in a file not compiled with
+ * "-mmmx -msse", as gcc would generate CMOV instructions otherwise
+ * that would lead to SIGILL instructions on old CPUs that don't have
+ * it.
+ */
+
+typedef enum
+{
+    X86_MMX			= (1 << 0),
+    X86_MMX_EXTENSIONS		= (1 << 1),
+    X86_SSE			= (1 << 2) | X86_MMX_EXTENSIONS,
+    X86_SSE2			= (1 << 3),
+    X86_CMOV			= (1 << 4),
+    X86_SSSE3			= (1 << 5)
+} cpu_features_t;
+
+#ifdef HAVE_GETISAX
+
+#include <sys/auxv.h>
+
+static cpu_features_t
+detect_cpu_features (void)
+{
+    cpu_features_t features = 0;
+    unsigned int result = 0;
+
+    if (getisax (&result, 1))
+    {
+	if (result & AV_386_CMOV)
+	    features |= X86_CMOV;
+	if (result & AV_386_MMX)
+	    features |= X86_MMX;
+	if (result & AV_386_AMD_MMX)
+	    features |= X86_MMX_EXTENSIONS;
+	if (result & AV_386_SSE)
+	    features |= X86_SSE;
+	if (result & AV_386_SSE2)
+	    features |= X86_SSE2;
+	if (result & AV_386_SSSE3)
+	    features |= X86_SSSE3;
+    }
+
+    return features;
+}
+
+#else
+
+#define _PIXMAN_X86_64							\
+    (defined(__amd64__) || defined(__x86_64__) || defined(_M_AMD64))
+
+static pixman_bool_t
+have_cpuid (void)
+{
+#if _PIXMAN_X86_64 || defined (_MSC_VER)
+
+    return TRUE;
+
+#elif defined (__GNUC__)
+    uint32_t result;
+
+    __asm__ volatile (
+        "pushf"				"\n\t"
+        "pop %%eax"			"\n\t"
+        "mov %%eax, %%ecx"		"\n\t"
+        "xor $0x00200000, %%eax"	"\n\t"
+        "push %%eax"			"\n\t"
+        "popf"				"\n\t"
+        "pushf"				"\n\t"
+        "pop %%eax"			"\n\t"
+        "xor %%ecx, %%eax"		"\n\t"
+	"mov %%eax, %0"			"\n\t"
+	: "=r" (result)
+	:
+	: "%eax", "%ecx");
+
+    return !!result;
+
+#else
+#error "Unknown compiler"
+#endif
+}
+
+static void
+pixman_cpuid (uint32_t feature,
+	      uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
+{
+#if defined (__GNUC__)
+
+#if _PIXMAN_X86_64
+    __asm__ volatile (
+        "cpuid"				"\n\t"
+	: "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
+	: "a" (feature));
+#else
+    /* On x86-32 we need to be careful about the handling of %ebx
+     * and %esp. We can't declare either one as clobbered
+     * since they are special registers (%ebx is the "PIC
+     * register" holding an offset to global data, %esp the
+     * stack pointer), so we need to make sure that %ebx is
+     * preserved, and that %esp has its original value when
+     * accessing the output operands.
+     */
+    __asm__ volatile (
+	"xchg %%ebx, %1"		"\n\t"
+	"cpuid"				"\n\t"
+	"xchg %%ebx, %1"		"\n\t"
+	: "=a" (*a), "=r" (*b), "=c" (*c), "=d" (*d)
+	: "a" (feature));
+#endif
+
+#elif defined (_MSC_VER)
+    int info[4];
+
+    __cpuid (info, feature);
+
+    *a = info[0];
+    *b = info[1];
+    *c = info[2];
+    *d = info[3];
+#else
+#error Unknown compiler
+#endif
+}
+
+static cpu_features_t
+detect_cpu_features (void)
+{
+    uint32_t a, b, c, d;
+    cpu_features_t features = 0;
+
+    if (!have_cpuid())
+	return features;
+
+    /* Get feature bits */
+    pixman_cpuid (0x01, &a, &b, &c, &d);
+    if (d & (1 << 15))
+	features |= X86_CMOV;
+    if (d & (1 << 23))
+	features |= X86_MMX;
+    if (d & (1 << 25))
+	features |= X86_SSE;
+    if (d & (1 << 26))
+	features |= X86_SSE2;
+    if (c & (1 << 9))
+	features |= X86_SSSE3;
+
+    /* Check for AMD specific features */
+    if ((features & X86_MMX) && !(features & X86_SSE))
+    {
+	char vendor[13];
+
+	/* Get vendor string */
+	memset (vendor, 0, sizeof vendor);
+
+	pixman_cpuid (0x00, &a, &b, &c, &d);
+	memcpy (vendor + 0, &b, 4);
+	memcpy (vendor + 4, &d, 4);
+	memcpy (vendor + 8, &c, 4);
+
+	if (strcmp (vendor, "AuthenticAMD") == 0 ||
+	    strcmp (vendor, "Geode by NSC") == 0)
+	{
+	    pixman_cpuid (0x80000000, &a, &b, &c, &d);
+	    if (a >= 0x80000001)
+	    {
+		pixman_cpuid (0x80000001, &a, &b, &c, &d);
+
+		if (d & (1 << 22))
+		    features |= X86_MMX_EXTENSIONS;
+	    }
+	}
+    }
+
+    return features;
+}
+
+#endif
+
+static pixman_bool_t
+have_feature (cpu_features_t feature)
+{
+    static pixman_bool_t initialized;
+    static cpu_features_t features;
+
+    if (!initialized)
+    {
+	features = detect_cpu_features();
+	initialized = TRUE;
+    }
+
+    return (features & feature) == feature;
+}
+
+#endif
+
+pixman_implementation_t *
+_pixman_x86_get_implementations (pixman_implementation_t *imp)
+{
+#define MMX_BITS  (X86_MMX | X86_MMX_EXTENSIONS)
+#define SSE2_BITS (X86_MMX | X86_MMX_EXTENSIONS | X86_SSE | X86_SSE2)
+#define SSSE3_BITS (X86_SSE | X86_SSE2 | X86_SSSE3)
+
+#ifdef USE_X86_MMX
+    if (!_pixman_disabled ("mmx") && have_feature (MMX_BITS))
+	imp = _pixman_implementation_create_mmx (imp);
+#endif
+
+#ifdef USE_SSE2
+    if (!_pixman_disabled ("sse2") && have_feature (SSE2_BITS))
+	imp = _pixman_implementation_create_sse2 (imp);
+#endif
+
+#ifdef USE_SSSE3
+    if (!_pixman_disabled ("ssse3") && have_feature (SSSE3_BITS))
+	imp = _pixman_implementation_create_ssse3 (imp);
+#endif
+
+    return imp;
+}
diff --git a/src/pixman/pixman/pixman.c b/src/pixman/pixman/pixman.c
new file mode 100644
index 0000000..9555cea
--- /dev/null
+++ b/src/pixman/pixman/pixman.c
@@ -0,0 +1,1135 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Author:  Keith Packard, SuSE, Inc.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include "pixman-private.h"
+
+#include <stdlib.h>
+
+pixman_implementation_t *global_implementation;
+
+#ifdef TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR
+static void __attribute__((constructor))
+pixman_constructor (void)
+{
+    global_implementation = _pixman_choose_implementation ();
+}
+#endif
+
+typedef struct operator_info_t operator_info_t;
+
+struct operator_info_t
+{
+    uint8_t	opaque_info[4];
+};
+
+#define PACK(neither, src, dest, both)			\
+    {{	    (uint8_t)PIXMAN_OP_ ## neither,		\
+	    (uint8_t)PIXMAN_OP_ ## src,			\
+	    (uint8_t)PIXMAN_OP_ ## dest,		\
+	    (uint8_t)PIXMAN_OP_ ## both		}}
+
+static const operator_info_t operator_table[] =
+{
+    /*    Neither Opaque         Src Opaque             Dst Opaque             Both Opaque */
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (OVER,                  SRC,                   OVER,                  SRC),
+    PACK (OVER_REVERSE,          OVER_REVERSE,          DST,                   DST),
+    PACK (IN,                    IN,                    SRC,                   SRC),
+    PACK (IN_REVERSE,            DST,                   IN_REVERSE,            DST),
+    PACK (OUT,                   OUT,                   CLEAR,                 CLEAR),
+    PACK (OUT_REVERSE,           CLEAR,                 OUT_REVERSE,           CLEAR),
+    PACK (ATOP,                  IN,                    OVER,                  SRC),
+    PACK (ATOP_REVERSE,          OVER_REVERSE,          IN_REVERSE,            DST),
+    PACK (XOR,                   OUT,                   OUT_REVERSE,           CLEAR),
+    PACK (ADD,                   ADD,                   ADD,                   ADD),
+    PACK (SATURATE,              OVER_REVERSE,          DST,                   DST),
+
+    {{ 0 /* 0x0e */ }},
+    {{ 0 /* 0x0f */ }},
+
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (DISJOINT_OVER,         DISJOINT_OVER,         DISJOINT_OVER,         DISJOINT_OVER),
+    PACK (DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE, DISJOINT_OVER_REVERSE),
+    PACK (DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN,           DISJOINT_IN),
+    PACK (DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE,   DISJOINT_IN_REVERSE),
+    PACK (DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT,          DISJOINT_OUT),
+    PACK (DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE,  DISJOINT_OUT_REVERSE),
+    PACK (DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP,         DISJOINT_ATOP),
+    PACK (DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE, DISJOINT_ATOP_REVERSE),
+    PACK (DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR,          DISJOINT_XOR),
+
+    {{ 0 /* 0x1c */ }},
+    {{ 0 /* 0x1d */ }},
+    {{ 0 /* 0x1e */ }},
+    {{ 0 /* 0x1f */ }},
+
+    PACK (CLEAR,                 CLEAR,                 CLEAR,                 CLEAR),
+    PACK (SRC,                   SRC,                   SRC,                   SRC),
+    PACK (DST,                   DST,                   DST,                   DST),
+    PACK (CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER,         CONJOINT_OVER),
+    PACK (CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE, CONJOINT_OVER_REVERSE),
+    PACK (CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN,           CONJOINT_IN),
+    PACK (CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE,   CONJOINT_IN_REVERSE),
+    PACK (CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT,          CONJOINT_OUT),
+    PACK (CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE,  CONJOINT_OUT_REVERSE),
+    PACK (CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP,         CONJOINT_ATOP),
+    PACK (CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE, CONJOINT_ATOP_REVERSE),
+    PACK (CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR,          CONJOINT_XOR),
+
+    {{ 0 /* 0x2c */ }},
+    {{ 0 /* 0x2d */ }},
+    {{ 0 /* 0x2e */ }},
+    {{ 0 /* 0x2f */ }},
+
+    PACK (MULTIPLY,              MULTIPLY,              MULTIPLY,              MULTIPLY),
+    PACK (SCREEN,                SCREEN,                SCREEN,                SCREEN),
+    PACK (OVERLAY,               OVERLAY,               OVERLAY,               OVERLAY),
+    PACK (DARKEN,                DARKEN,                DARKEN,                DARKEN),
+    PACK (LIGHTEN,               LIGHTEN,               LIGHTEN,               LIGHTEN),
+    PACK (COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE,           COLOR_DODGE),
+    PACK (COLOR_BURN,            COLOR_BURN,            COLOR_BURN,            COLOR_BURN),
+    PACK (HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT,            HARD_LIGHT),
+    PACK (SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT,            SOFT_LIGHT),
+    PACK (DIFFERENCE,            DIFFERENCE,            DIFFERENCE,            DIFFERENCE),
+    PACK (EXCLUSION,             EXCLUSION,             EXCLUSION,             EXCLUSION),
+    PACK (HSL_HUE,               HSL_HUE,               HSL_HUE,               HSL_HUE),
+    PACK (HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION,        HSL_SATURATION),
+    PACK (HSL_COLOR,             HSL_COLOR,             HSL_COLOR,             HSL_COLOR),
+    PACK (HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY,        HSL_LUMINOSITY),
+};
+
+/*
+ * Optimize the current operator based on opacity of source or destination
+ * The output operator should be mathematically equivalent to the source.
+ */
+static pixman_op_t
+optimize_operator (pixman_op_t     op,
+		   uint32_t        src_flags,
+		   uint32_t        mask_flags,
+		   uint32_t        dst_flags)
+{
+    pixman_bool_t is_source_opaque, is_dest_opaque;
+
+#define OPAQUE_SHIFT 13
+    
+    COMPILE_TIME_ASSERT (FAST_PATH_IS_OPAQUE == (1 << OPAQUE_SHIFT));
+    
+    is_dest_opaque = (dst_flags & FAST_PATH_IS_OPAQUE);
+    is_source_opaque = ((src_flags & mask_flags) & FAST_PATH_IS_OPAQUE);
+
+    is_dest_opaque >>= OPAQUE_SHIFT - 1;
+    is_source_opaque >>= OPAQUE_SHIFT;
+
+    return operator_table[op].opaque_info[is_dest_opaque | is_source_opaque];
+}
+
+/*
+ * Computing composite region
+ */
+static inline pixman_bool_t
+clip_general_image (pixman_region32_t * region,
+                    pixman_region32_t * clip,
+                    int                 dx,
+                    int                 dy)
+{
+    if (pixman_region32_n_rects (region) == 1 &&
+        pixman_region32_n_rects (clip) == 1)
+    {
+	pixman_box32_t *  rbox = pixman_region32_rectangles (region, NULL);
+	pixman_box32_t *  cbox = pixman_region32_rectangles (clip, NULL);
+	int v;
+
+	if (rbox->x1 < (v = cbox->x1 + dx))
+	    rbox->x1 = v;
+	if (rbox->x2 > (v = cbox->x2 + dx))
+	    rbox->x2 = v;
+	if (rbox->y1 < (v = cbox->y1 + dy))
+	    rbox->y1 = v;
+	if (rbox->y2 > (v = cbox->y2 + dy))
+	    rbox->y2 = v;
+	if (rbox->x1 >= rbox->x2 || rbox->y1 >= rbox->y2)
+	{
+	    pixman_region32_init (region);
+	    return FALSE;
+	}
+    }
+    else if (!pixman_region32_not_empty (clip))
+    {
+	return FALSE;
+    }
+    else
+    {
+	if (dx || dy)
+	    pixman_region32_translate (region, -dx, -dy);
+
+	if (!pixman_region32_intersect (region, region, clip))
+	    return FALSE;
+
+	if (dx || dy)
+	    pixman_region32_translate (region, dx, dy);
+    }
+
+    return pixman_region32_not_empty (region);
+}
+
+static inline pixman_bool_t
+clip_source_image (pixman_region32_t * region,
+                   pixman_image_t *    image,
+                   int                 dx,
+                   int                 dy)
+{
+    /* Source clips are ignored, unless they are explicitly turned on
+     * and the clip in question was set by an X client. (Because if
+     * the clip was not set by a client, then it is a hierarchy
+     * clip and those should always be ignored for sources).
+     */
+    if (!image->common.clip_sources || !image->common.client_clip)
+	return TRUE;
+
+    return clip_general_image (region,
+                               &image->common.clip_region,
+                               dx, dy);
+}
+
+/*
+ * returns FALSE if the final region is empty.  Indistinguishable from
+ * an allocation failure, but rendering ignores those anyways.
+ */
+pixman_bool_t
+_pixman_compute_composite_region32 (pixman_region32_t * region,
+				    pixman_image_t *    src_image,
+				    pixman_image_t *    mask_image,
+				    pixman_image_t *    dest_image,
+				    int32_t             src_x,
+				    int32_t             src_y,
+				    int32_t             mask_x,
+				    int32_t             mask_y,
+				    int32_t             dest_x,
+				    int32_t             dest_y,
+				    int32_t             width,
+				    int32_t             height)
+{
+    region->extents.x1 = dest_x;
+    region->extents.x2 = dest_x + width;
+    region->extents.y1 = dest_y;
+    region->extents.y2 = dest_y + height;
+
+    region->extents.x1 = MAX (region->extents.x1, 0);
+    region->extents.y1 = MAX (region->extents.y1, 0);
+    region->extents.x2 = MIN (region->extents.x2, dest_image->bits.width);
+    region->extents.y2 = MIN (region->extents.y2, dest_image->bits.height);
+
+    region->data = 0;
+
+    /* Check for empty operation */
+    if (region->extents.x1 >= region->extents.x2 ||
+        region->extents.y1 >= region->extents.y2)
+    {
+	region->extents.x1 = 0;
+	region->extents.x2 = 0;
+	region->extents.y1 = 0;
+	region->extents.y2 = 0;
+	return FALSE;
+    }
+
+    if (dest_image->common.have_clip_region)
+    {
+	if (!clip_general_image (region, &dest_image->common.clip_region, 0, 0))
+	    return FALSE;
+    }
+
+    if (dest_image->common.alpha_map)
+    {
+	if (!pixman_region32_intersect_rect (region, region,
+					     dest_image->common.alpha_origin_x,
+					     dest_image->common.alpha_origin_y,
+					     dest_image->common.alpha_map->width,
+					     dest_image->common.alpha_map->height))
+	{
+	    return FALSE;
+	}
+	if (!pixman_region32_not_empty (region))
+	    return FALSE;
+	if (dest_image->common.alpha_map->common.have_clip_region)
+	{
+	    if (!clip_general_image (region, &dest_image->common.alpha_map->common.clip_region,
+				     -dest_image->common.alpha_origin_x,
+				     -dest_image->common.alpha_origin_y))
+	    {
+		return FALSE;
+	    }
+	}
+    }
+
+    /* clip against src */
+    if (src_image->common.have_clip_region)
+    {
+	if (!clip_source_image (region, src_image, dest_x - src_x, dest_y - src_y))
+	    return FALSE;
+    }
+    if (src_image->common.alpha_map && src_image->common.alpha_map->common.have_clip_region)
+    {
+	if (!clip_source_image (region, (pixman_image_t *)src_image->common.alpha_map,
+	                        dest_x - (src_x - src_image->common.alpha_origin_x),
+	                        dest_y - (src_y - src_image->common.alpha_origin_y)))
+	{
+	    return FALSE;
+	}
+    }
+    /* clip against mask */
+    if (mask_image && mask_image->common.have_clip_region)
+    {
+	if (!clip_source_image (region, mask_image, dest_x - mask_x, dest_y - mask_y))
+	    return FALSE;
+
+	if (mask_image->common.alpha_map && mask_image->common.alpha_map->common.have_clip_region)
+	{
+	    if (!clip_source_image (region, (pixman_image_t *)mask_image->common.alpha_map,
+	                            dest_x - (mask_x - mask_image->common.alpha_origin_x),
+	                            dest_y - (mask_y - mask_image->common.alpha_origin_y)))
+	    {
+		return FALSE;
+	    }
+	}
+    }
+
+    return TRUE;
+}
+
+typedef struct
+{
+    pixman_fixed_48_16_t	x1;
+    pixman_fixed_48_16_t	y1;
+    pixman_fixed_48_16_t	x2;
+    pixman_fixed_48_16_t	y2;
+} box_48_16_t;
+
+static pixman_bool_t
+compute_transformed_extents (pixman_transform_t *transform,
+			     const pixman_box32_t *extents,
+			     box_48_16_t *transformed)
+{
+    pixman_fixed_48_16_t tx1, ty1, tx2, ty2;
+    pixman_fixed_t x1, y1, x2, y2;
+    int i;
+
+    x1 = pixman_int_to_fixed (extents->x1) + pixman_fixed_1 / 2;
+    y1 = pixman_int_to_fixed (extents->y1) + pixman_fixed_1 / 2;
+    x2 = pixman_int_to_fixed (extents->x2) - pixman_fixed_1 / 2;
+    y2 = pixman_int_to_fixed (extents->y2) - pixman_fixed_1 / 2;
+
+    if (!transform)
+    {
+	transformed->x1 = x1;
+	transformed->y1 = y1;
+	transformed->x2 = x2;
+	transformed->y2 = y2;
+
+	return TRUE;
+    }
+
+    tx1 = ty1 = INT64_MAX;
+    tx2 = ty2 = INT64_MIN;
+
+    for (i = 0; i < 4; ++i)
+    {
+	pixman_fixed_48_16_t tx, ty;
+	pixman_vector_t v;
+
+	v.vector[0] = (i & 0x01)? x1 : x2;
+	v.vector[1] = (i & 0x02)? y1 : y2;
+	v.vector[2] = pixman_fixed_1;
+
+	if (!pixman_transform_point (transform, &v))
+	    return FALSE;
+
+	tx = (pixman_fixed_48_16_t)v.vector[0];
+	ty = (pixman_fixed_48_16_t)v.vector[1];
+
+	if (tx < tx1)
+	    tx1 = tx;
+	if (ty < ty1)
+	    ty1 = ty;
+	if (tx > tx2)
+	    tx2 = tx;
+	if (ty > ty2)
+	    ty2 = ty;
+    }
+
+    transformed->x1 = tx1;
+    transformed->y1 = ty1;
+    transformed->x2 = tx2;
+    transformed->y2 = ty2;
+
+    return TRUE;
+}
+
+#define IS_16BIT(x) (((x) >= INT16_MIN) && ((x) <= INT16_MAX))
+#define ABS(f)      (((f) < 0)?  (-(f)) : (f))
+#define IS_16_16(f) (((f) >= pixman_min_fixed_48_16 && ((f) <= pixman_max_fixed_48_16)))
+
+static pixman_bool_t
+analyze_extent (pixman_image_t       *image,
+		const pixman_box32_t *extents,
+		uint32_t             *flags)
+{
+    pixman_transform_t *transform;
+    pixman_fixed_t x_off, y_off;
+    pixman_fixed_t width, height;
+    pixman_fixed_t *params;
+    box_48_16_t transformed;
+    pixman_box32_t exp_extents;
+
+    if (!image)
+	return TRUE;
+
+    /* Some compositing functions walk one step
+     * outside the destination rectangle, so we
+     * check here that the expanded-by-one source
+     * extents in destination space fits in 16 bits
+     */
+    if (!IS_16BIT (extents->x1 - 1)		||
+	!IS_16BIT (extents->y1 - 1)		||
+	!IS_16BIT (extents->x2 + 1)		||
+	!IS_16BIT (extents->y2 + 1))
+    {
+	return FALSE;
+    }
+
+    transform = image->common.transform;
+    if (image->common.type == BITS)
+    {
+	/* During repeat mode calculations we might convert the
+	 * width/height of an image to fixed 16.16, so we need
+	 * them to be smaller than 16 bits.
+	 */
+	if (image->bits.width >= 0x7fff	|| image->bits.height >= 0x7fff)
+	    return FALSE;
+
+	if ((image->common.flags & FAST_PATH_ID_TRANSFORM) == FAST_PATH_ID_TRANSFORM &&
+	    extents->x1 >= 0 &&
+	    extents->y1 >= 0 &&
+	    extents->x2 <= image->bits.width &&
+	    extents->y2 <= image->bits.height)
+	{
+	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+	    return TRUE;
+	}
+
+	switch (image->common.filter)
+	{
+	case PIXMAN_FILTER_CONVOLUTION:
+	    params = image->common.filter_params;
+	    x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1);
+	    y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1);
+	    width = params[0];
+	    height = params[1];
+	    break;
+
+	case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
+	    params = image->common.filter_params;
+	    x_off = - pixman_fixed_e - ((params[0] - pixman_fixed_1) >> 1);
+	    y_off = - pixman_fixed_e - ((params[1] - pixman_fixed_1) >> 1);
+	    width = params[0];
+	    height = params[1];
+	    break;
+	    
+	case PIXMAN_FILTER_GOOD:
+	case PIXMAN_FILTER_BEST:
+	case PIXMAN_FILTER_BILINEAR:
+	    x_off = - pixman_fixed_1 / 2;
+	    y_off = - pixman_fixed_1 / 2;
+	    width = pixman_fixed_1;
+	    height = pixman_fixed_1;
+	    break;
+
+	case PIXMAN_FILTER_FAST:
+	case PIXMAN_FILTER_NEAREST:
+	    x_off = - pixman_fixed_e;
+	    y_off = - pixman_fixed_e;
+	    width = 0;
+	    height = 0;
+	    break;
+
+	default:
+	    return FALSE;
+	}
+    }
+    else
+    {
+	x_off = 0;
+	y_off = 0;
+	width = 0;
+	height = 0;
+    }
+
+    if (!compute_transformed_extents (transform, extents, &transformed))
+	return FALSE;
+
+    /* Expand the source area by a tiny bit so account of different rounding that
+     * may happen during sampling. Note that (8 * pixman_fixed_e) is very far from
+     * 0.5 so this won't cause the area computed to be overly pessimistic.
+     */
+    transformed.x1 -= 8 * pixman_fixed_e;
+    transformed.y1 -= 8 * pixman_fixed_e;
+    transformed.x2 += 8 * pixman_fixed_e;
+    transformed.y2 += 8 * pixman_fixed_e;
+
+    if (image->common.type == BITS)
+    {
+	if (pixman_fixed_to_int (transformed.x1) >= 0			&&
+	    pixman_fixed_to_int (transformed.y1) >= 0			&&
+	    pixman_fixed_to_int (transformed.x2) < image->bits.width	&&
+	    pixman_fixed_to_int (transformed.y2) < image->bits.height)
+	{
+	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
+	}
+
+	if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_1 / 2) >= 0		  &&
+	    pixman_fixed_to_int (transformed.y1 - pixman_fixed_1 / 2) >= 0		  &&
+	    pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2) < image->bits.width &&
+	    pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2) < image->bits.height)
+	{
+	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR;
+	}
+    }
+
+    /* Check we don't overflow when the destination extents are expanded by one.
+     * This ensures that compositing functions can simply walk the source space
+     * using 16.16 variables without worrying about overflow.
+     */
+    exp_extents = *extents;
+    exp_extents.x1 -= 1;
+    exp_extents.y1 -= 1;
+    exp_extents.x2 += 1;
+    exp_extents.y2 += 1;
+
+    if (!compute_transformed_extents (transform, &exp_extents, &transformed))
+	return FALSE;
+    
+    if (!IS_16_16 (transformed.x1 + x_off - 8 * pixman_fixed_e)	||
+	!IS_16_16 (transformed.y1 + y_off - 8 * pixman_fixed_e)	||
+	!IS_16_16 (transformed.x2 + x_off + 8 * pixman_fixed_e + width)	||
+	!IS_16_16 (transformed.y2 + y_off + 8 * pixman_fixed_e + height))
+    {
+	return FALSE;
+    }
+
+    return TRUE;
+}
+
+/*
+ * Work around GCC bug causing crashes in Mozilla with SSE2
+ *
+ * When using -msse, gcc generates movdqa instructions assuming that
+ * the stack is 16 byte aligned. Unfortunately some applications, such
+ * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
+ * causes the movdqa instructions to fail.
+ *
+ * The __force_align_arg_pointer__ makes gcc generate a prologue that
+ * realigns the stack pointer to 16 bytes.
+ *
+ * On x86-64 this is not necessary because the standard ABI already
+ * calls for a 16 byte aligned stack.
+ *
+ * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
+ */
+#if defined (USE_SSE2) && defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
+__attribute__((__force_align_arg_pointer__))
+#endif
+PIXMAN_EXPORT void
+pixman_image_composite32 (pixman_op_t      op,
+                          pixman_image_t * src,
+                          pixman_image_t * mask,
+                          pixman_image_t * dest,
+                          int32_t          src_x,
+                          int32_t          src_y,
+                          int32_t          mask_x,
+                          int32_t          mask_y,
+                          int32_t          dest_x,
+                          int32_t          dest_y,
+                          int32_t          width,
+                          int32_t          height)
+{
+    pixman_format_code_t src_format, mask_format, dest_format;
+    pixman_region32_t region;
+    pixman_box32_t extents;
+    pixman_implementation_t *imp;
+    pixman_composite_func_t func;
+    pixman_composite_info_t info;
+    const pixman_box32_t *pbox;
+    int n;
+
+    _pixman_image_validate (src);
+    if (mask)
+	_pixman_image_validate (mask);
+    _pixman_image_validate (dest);
+
+    src_format = src->common.extended_format_code;
+    info.src_flags = src->common.flags;
+
+    if (mask && !(mask->common.flags & FAST_PATH_IS_OPAQUE))
+    {
+	mask_format = mask->common.extended_format_code;
+	info.mask_flags = mask->common.flags;
+    }
+    else
+    {
+	mask_format = PIXMAN_null;
+	info.mask_flags = FAST_PATH_IS_OPAQUE | FAST_PATH_NO_ALPHA_MAP;
+    }
+
+    dest_format = dest->common.extended_format_code;
+    info.dest_flags = dest->common.flags;
+
+    /* Check for pixbufs */
+    if ((mask_format == PIXMAN_a8r8g8b8 || mask_format == PIXMAN_a8b8g8r8) &&
+	(src->type == BITS && src->bits.bits == mask->bits.bits)	   &&
+	(src->common.repeat == mask->common.repeat)			   &&
+	(info.src_flags & info.mask_flags & FAST_PATH_ID_TRANSFORM)	   &&
+	(src_x == mask_x && src_y == mask_y))
+    {
+	if (src_format == PIXMAN_x8b8g8r8)
+	    src_format = mask_format = PIXMAN_pixbuf;
+	else if (src_format == PIXMAN_x8r8g8b8)
+	    src_format = mask_format = PIXMAN_rpixbuf;
+    }
+
+    pixman_region32_init (&region);
+
+    if (!_pixman_compute_composite_region32 (
+	    &region, src, mask, dest,
+	    src_x, src_y, mask_x, mask_y, dest_x, dest_y, width, height))
+    {
+	goto out;
+    }
+
+    extents = *pixman_region32_extents (&region);
+
+    extents.x1 -= dest_x - src_x;
+    extents.y1 -= dest_y - src_y;
+    extents.x2 -= dest_x - src_x;
+    extents.y2 -= dest_y - src_y;
+
+    if (!analyze_extent (src, &extents, &info.src_flags))
+	goto out;
+
+    extents.x1 -= src_x - mask_x;
+    extents.y1 -= src_y - mask_y;
+    extents.x2 -= src_x - mask_x;
+    extents.y2 -= src_y - mask_y;
+
+    if (!analyze_extent (mask, &extents, &info.mask_flags))
+	goto out;
+
+    /* If the clip is within the source samples, and the samples are
+     * opaque, then the source is effectively opaque.
+     */
+#define NEAREST_OPAQUE	(FAST_PATH_SAMPLES_OPAQUE |			\
+			 FAST_PATH_NEAREST_FILTER |			\
+			 FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+#define BILINEAR_OPAQUE	(FAST_PATH_SAMPLES_OPAQUE |			\
+			 FAST_PATH_BILINEAR_FILTER |			\
+			 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR)
+
+    if ((info.src_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+	(info.src_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    {
+	info.src_flags |= FAST_PATH_IS_OPAQUE;
+    }
+
+    if ((info.mask_flags & NEAREST_OPAQUE) == NEAREST_OPAQUE ||
+	(info.mask_flags & BILINEAR_OPAQUE) == BILINEAR_OPAQUE)
+    {
+	info.mask_flags |= FAST_PATH_IS_OPAQUE;
+    }
+
+    /*
+     * Check if we can replace our operator by a simpler one
+     * if the src or dest are opaque. The output operator should be
+     * mathematically equivalent to the source.
+     */
+    info.op = optimize_operator (op, info.src_flags, info.mask_flags, info.dest_flags);
+
+    _pixman_implementation_lookup_composite (
+	get_implementation (), info.op,
+	src_format, info.src_flags,
+	mask_format, info.mask_flags,
+	dest_format, info.dest_flags,
+	&imp, &func);
+
+    info.src_image = src;
+    info.mask_image = mask;
+    info.dest_image = dest;
+
+    pbox = pixman_region32_rectangles (&region, &n);
+
+    while (n--)
+    {
+	info.src_x = pbox->x1 + src_x - dest_x;
+	info.src_y = pbox->y1 + src_y - dest_y;
+	info.mask_x = pbox->x1 + mask_x - dest_x;
+	info.mask_y = pbox->y1 + mask_y - dest_y;
+	info.dest_x = pbox->x1;
+	info.dest_y = pbox->y1;
+	info.width = pbox->x2 - pbox->x1;
+	info.height = pbox->y2 - pbox->y1;
+
+	func (imp, &info);
+
+	pbox++;
+    }
+
+out:
+    pixman_region32_fini (&region);
+}
+
+PIXMAN_EXPORT void
+pixman_image_composite (pixman_op_t      op,
+                        pixman_image_t * src,
+                        pixman_image_t * mask,
+                        pixman_image_t * dest,
+                        int16_t          src_x,
+                        int16_t          src_y,
+                        int16_t          mask_x,
+                        int16_t          mask_y,
+                        int16_t          dest_x,
+                        int16_t          dest_y,
+                        uint16_t         width,
+                        uint16_t         height)
+{
+    pixman_image_composite32 (op, src, mask, dest, src_x, src_y, 
+                              mask_x, mask_y, dest_x, dest_y, width, height);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_blt (uint32_t *src_bits,
+            uint32_t *dst_bits,
+            int       src_stride,
+            int       dst_stride,
+            int       src_bpp,
+            int       dst_bpp,
+            int       src_x,
+            int       src_y,
+            int       dest_x,
+            int       dest_y,
+            int       width,
+            int       height)
+{
+    return _pixman_implementation_blt (get_implementation(),
+				       src_bits, dst_bits, src_stride, dst_stride,
+                                       src_bpp, dst_bpp,
+                                       src_x, src_y,
+                                       dest_x, dest_y,
+                                       width, height);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_fill (uint32_t *bits,
+             int       stride,
+             int       bpp,
+             int       x,
+             int       y,
+             int       width,
+             int       height,
+             uint32_t  filler)
+{
+    return _pixman_implementation_fill (
+	get_implementation(), bits, stride, bpp, x, y, width, height, filler);
+}
+
+static uint32_t
+color_to_uint32 (const pixman_color_t *color)
+{
+    return
+        (color->alpha >> 8 << 24) |
+        (color->red >> 8 << 16) |
+        (color->green & 0xff00) |
+        (color->blue >> 8);
+}
+
+static pixman_bool_t
+color_to_pixel (const pixman_color_t *color,
+                uint32_t *            pixel,
+                pixman_format_code_t  format)
+{
+    uint32_t c = color_to_uint32 (color);
+
+    if (!(format == PIXMAN_a8r8g8b8     ||
+          format == PIXMAN_x8r8g8b8     ||
+          format == PIXMAN_a8b8g8r8     ||
+          format == PIXMAN_x8b8g8r8     ||
+          format == PIXMAN_b8g8r8a8     ||
+          format == PIXMAN_b8g8r8x8     ||
+          format == PIXMAN_r8g8b8a8     ||
+          format == PIXMAN_r8g8b8x8     ||
+          format == PIXMAN_r5g6b5       ||
+          format == PIXMAN_b5g6r5       ||
+          format == PIXMAN_a8           ||
+          format == PIXMAN_a1))
+    {
+	return FALSE;
+    }
+
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_ABGR)
+    {
+	c = ((c & 0xff000000) >>  0) |
+	    ((c & 0x00ff0000) >> 16) |
+	    ((c & 0x0000ff00) >>  0) |
+	    ((c & 0x000000ff) << 16);
+    }
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_BGRA)
+    {
+	c = ((c & 0xff000000) >> 24) |
+	    ((c & 0x00ff0000) >>  8) |
+	    ((c & 0x0000ff00) <<  8) |
+	    ((c & 0x000000ff) << 24);
+    }
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_RGBA)
+	c = ((c & 0xff000000) >> 24) | (c << 8);
+
+    if (format == PIXMAN_a1)
+	c = c >> 31;
+    else if (format == PIXMAN_a8)
+	c = c >> 24;
+    else if (format == PIXMAN_r5g6b5 ||
+             format == PIXMAN_b5g6r5)
+	c = convert_8888_to_0565 (c);
+
+#if 0
+    printf ("color: %x %x %x %x\n", color->alpha, color->red, color->green, color->blue);
+    printf ("pixel: %x\n", c);
+#endif
+
+    *pixel = c;
+    return TRUE;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_fill_rectangles (pixman_op_t                 op,
+                              pixman_image_t *            dest,
+			      const pixman_color_t *      color,
+                              int                         n_rects,
+                              const pixman_rectangle16_t *rects)
+{
+    pixman_box32_t stack_boxes[6];
+    pixman_box32_t *boxes;
+    pixman_bool_t result;
+    int i;
+
+    if (n_rects > 6)
+    {
+        boxes = pixman_malloc_ab (sizeof (pixman_box32_t), n_rects);
+        if (boxes == NULL)
+            return FALSE;
+    }
+    else
+    {
+        boxes = stack_boxes;
+    }
+
+    for (i = 0; i < n_rects; ++i)
+    {
+        boxes[i].x1 = rects[i].x;
+        boxes[i].y1 = rects[i].y;
+        boxes[i].x2 = boxes[i].x1 + rects[i].width;
+        boxes[i].y2 = boxes[i].y1 + rects[i].height;
+    }
+
+    result = pixman_image_fill_boxes (op, dest, color, n_rects, boxes);
+
+    if (boxes != stack_boxes)
+        free (boxes);
+    
+    return result;
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_image_fill_boxes (pixman_op_t           op,
+                         pixman_image_t *      dest,
+                         const pixman_color_t *color,
+                         int                   n_boxes,
+                         const pixman_box32_t *boxes)
+{
+    pixman_image_t *solid;
+    pixman_color_t c;
+    int i;
+
+    _pixman_image_validate (dest);
+    
+    if (color->alpha == 0xffff)
+    {
+        if (op == PIXMAN_OP_OVER)
+            op = PIXMAN_OP_SRC;
+    }
+
+    if (op == PIXMAN_OP_CLEAR)
+    {
+        c.red = 0;
+        c.green = 0;
+        c.blue = 0;
+        c.alpha = 0;
+
+        color = &c;
+
+        op = PIXMAN_OP_SRC;
+    }
+
+    if (op == PIXMAN_OP_SRC)
+    {
+        uint32_t pixel;
+
+        if (color_to_pixel (color, &pixel, dest->bits.format))
+        {
+            pixman_region32_t fill_region;
+            int n_rects, j;
+            pixman_box32_t *rects;
+
+            if (!pixman_region32_init_rects (&fill_region, boxes, n_boxes))
+                return FALSE;
+
+            if (dest->common.have_clip_region)
+            {
+                if (!pixman_region32_intersect (&fill_region,
+                                                &fill_region,
+                                                &dest->common.clip_region))
+                    return FALSE;
+            }
+
+            rects = pixman_region32_rectangles (&fill_region, &n_rects);
+            for (j = 0; j < n_rects; ++j)
+            {
+                const pixman_box32_t *rect = &(rects[j]);
+                pixman_fill (dest->bits.bits, dest->bits.rowstride, PIXMAN_FORMAT_BPP (dest->bits.format),
+                             rect->x1, rect->y1, rect->x2 - rect->x1, rect->y2 - rect->y1,
+                             pixel);
+            }
+
+            pixman_region32_fini (&fill_region);
+            return TRUE;
+        }
+    }
+
+    solid = pixman_image_create_solid_fill (color);
+    if (!solid)
+        return FALSE;
+
+    for (i = 0; i < n_boxes; ++i)
+    {
+        const pixman_box32_t *box = &(boxes[i]);
+
+        pixman_image_composite32 (op, solid, NULL, dest,
+                                  0, 0, 0, 0,
+                                  box->x1, box->y1,
+                                  box->x2 - box->x1, box->y2 - box->y1);
+    }
+
+    pixman_image_unref (solid);
+
+    return TRUE;
+}
+
+/**
+ * pixman_version:
+ *
+ * Returns the version of the pixman library encoded in a single
+ * integer as per %PIXMAN_VERSION_ENCODE. The encoding ensures that
+ * later versions compare greater than earlier versions.
+ *
+ * A run-time comparison to check that pixman's version is greater than
+ * or equal to version X.Y.Z could be performed as follows:
+ *
+ * <informalexample><programlisting>
+ * if (pixman_version() >= PIXMAN_VERSION_ENCODE(X,Y,Z)) {...}
+ * </programlisting></informalexample>
+ *
+ * See also pixman_version_string() as well as the compile-time
+ * equivalents %PIXMAN_VERSION and %PIXMAN_VERSION_STRING.
+ *
+ * Return value: the encoded version.
+ **/
+PIXMAN_EXPORT int
+pixman_version (void)
+{
+    return PIXMAN_VERSION;
+}
+
+/**
+ * pixman_version_string:
+ *
+ * Returns the version of the pixman library as a human-readable string
+ * of the form "X.Y.Z".
+ *
+ * See also pixman_version() as well as the compile-time equivalents
+ * %PIXMAN_VERSION_STRING and %PIXMAN_VERSION.
+ *
+ * Return value: a string containing the version.
+ **/
+PIXMAN_EXPORT const char*
+pixman_version_string (void)
+{
+    return PIXMAN_VERSION_STRING;
+}
+
+/**
+ * pixman_format_supported_source:
+ * @format: A pixman_format_code_t format
+ *
+ * Return value: whether the provided format code is a supported
+ * format for a pixman surface used as a source in
+ * rendering.
+ *
+ * Currently, all pixman_format_code_t values are supported.
+ **/
+PIXMAN_EXPORT pixman_bool_t
+pixman_format_supported_source (pixman_format_code_t format)
+{
+    switch (format)
+    {
+    /* 32 bpp formats */
+    case PIXMAN_a2b10g10r10:
+    case PIXMAN_x2b10g10r10:
+    case PIXMAN_a2r10g10b10:
+    case PIXMAN_x2r10g10b10:
+    case PIXMAN_a8r8g8b8:
+    case PIXMAN_a8r8g8b8_sRGB:
+    case PIXMAN_x8r8g8b8:
+    case PIXMAN_a8b8g8r8:
+    case PIXMAN_x8b8g8r8:
+    case PIXMAN_b8g8r8a8:
+    case PIXMAN_b8g8r8x8:
+    case PIXMAN_r8g8b8a8:
+    case PIXMAN_r8g8b8x8:
+    case PIXMAN_r8g8b8:
+    case PIXMAN_b8g8r8:
+    case PIXMAN_r5g6b5:
+    case PIXMAN_b5g6r5:
+    case PIXMAN_x14r6g6b6:
+    /* 16 bpp formats */
+    case PIXMAN_a1r5g5b5:
+    case PIXMAN_x1r5g5b5:
+    case PIXMAN_a1b5g5r5:
+    case PIXMAN_x1b5g5r5:
+    case PIXMAN_a4r4g4b4:
+    case PIXMAN_x4r4g4b4:
+    case PIXMAN_a4b4g4r4:
+    case PIXMAN_x4b4g4r4:
+    /* 8bpp formats */
+    case PIXMAN_a8:
+    case PIXMAN_r3g3b2:
+    case PIXMAN_b2g3r3:
+    case PIXMAN_a2r2g2b2:
+    case PIXMAN_a2b2g2r2:
+    case PIXMAN_c8:
+    case PIXMAN_g8:
+    case PIXMAN_x4a4:
+    /* Collides with PIXMAN_c8
+       case PIXMAN_x4c4:
+     */
+    /* Collides with PIXMAN_g8
+       case PIXMAN_x4g4:
+     */
+    /* 4bpp formats */
+    case PIXMAN_a4:
+    case PIXMAN_r1g2b1:
+    case PIXMAN_b1g2r1:
+    case PIXMAN_a1r1g1b1:
+    case PIXMAN_a1b1g1r1:
+    case PIXMAN_c4:
+    case PIXMAN_g4:
+    /* 1bpp formats */
+    case PIXMAN_a1:
+    case PIXMAN_g1:
+    /* YUV formats */
+    case PIXMAN_yuy2:
+    case PIXMAN_yv12:
+	return TRUE;
+
+    default:
+	return FALSE;
+    }
+}
+
+/**
+ * pixman_format_supported_destination:
+ * @format: A pixman_format_code_t format
+ *
+ * Return value: whether the provided format code is a supported
+ * format for a pixman surface used as a destination in
+ * rendering.
+ *
+ * Currently, all pixman_format_code_t values are supported
+ * except for the YUV formats.
+ **/
+PIXMAN_EXPORT pixman_bool_t
+pixman_format_supported_destination (pixman_format_code_t format)
+{
+    /* YUV formats cannot be written to at the moment */
+    if (format == PIXMAN_yuy2 || format == PIXMAN_yv12)
+	return FALSE;
+
+    return pixman_format_supported_source (format);
+}
+
+PIXMAN_EXPORT pixman_bool_t
+pixman_compute_composite_region (pixman_region16_t * region,
+                                 pixman_image_t *    src_image,
+                                 pixman_image_t *    mask_image,
+                                 pixman_image_t *    dest_image,
+                                 int16_t             src_x,
+                                 int16_t             src_y,
+                                 int16_t             mask_x,
+                                 int16_t             mask_y,
+                                 int16_t             dest_x,
+                                 int16_t             dest_y,
+                                 uint16_t            width,
+                                 uint16_t            height)
+{
+    pixman_region32_t r32;
+    pixman_bool_t retval;
+
+    pixman_region32_init (&r32);
+
+    retval = _pixman_compute_composite_region32 (
+	&r32, src_image, mask_image, dest_image,
+	src_x, src_y, mask_x, mask_y, dest_x, dest_y,
+	width, height);
+
+    if (retval)
+    {
+	if (!pixman_region16_copy_from_region32 (region, &r32))
+	    retval = FALSE;
+    }
+
+    pixman_region32_fini (&r32);
+    return retval;
+}
diff --git a/src/pixman/pixman/pixman.h b/src/pixman/pixman/pixman.h
new file mode 100644
index 0000000..509ba5e
--- /dev/null
+++ b/src/pixman/pixman/pixman.h
@@ -0,0 +1,1111 @@
+/***********************************************************
+
+Copyright 1987, 1998  The Open Group
+
+Permission to use, copy, modify, distribute, and sell this software and its
+documentation for any purpose is hereby granted without fee, provided that
+the above copyright notice appear in all copies and that both that
+copyright notice and this permission notice appear in supporting
+documentation.
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of The Open Group shall not be
+used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization from The Open Group.
+
+Copyright 1987 by Digital Equipment Corporation, Maynard, Massachusetts.
+
+                        All Rights Reserved
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose and without fee is hereby granted,
+provided that the above copyright notice appear in all copies and that
+both that copyright notice and this permission notice appear in
+supporting documentation, and that the name of Digital not be
+used in advertising or publicity pertaining to distribution of the
+software without specific, written prior permission.
+
+DIGITAL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
+ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL
+DIGITAL BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR
+ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+SOFTWARE.
+
+******************************************************************/
+/*
+ * Copyright © 1998, 2004 Keith Packard
+ * Copyright   2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Keith Packard not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Keith Packard makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * KEITH PACKARD DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL KEITH PACKARD BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef PIXMAN_H__
+#define PIXMAN_H__
+
+#include <pixman-version.h>
+
+#ifdef  __cplusplus
+#define PIXMAN_BEGIN_DECLS extern "C" {
+#define PIXMAN_END_DECLS }
+#else
+#define PIXMAN_BEGIN_DECLS
+#define PIXMAN_END_DECLS
+#endif
+
+PIXMAN_BEGIN_DECLS
+
+/*
+ * Standard integers
+ */
+
+#if !defined (PIXMAN_DONT_DEFINE_STDINT)
+
+#if defined (_SVR4) || defined (SVR4) || defined (__OpenBSD__) || defined (_sgi) || defined (__sun) || defined (sun) || defined (__digital__) || defined (__HP_cc)
+#  include <inttypes.h>
+/* VS 2010 (_MSC_VER 1600) has stdint.h */
+#elif defined (_MSC_VER) && _MSC_VER < 1600
+typedef __int8 int8_t;
+typedef unsigned __int8 uint8_t;
+typedef __int16 int16_t;
+typedef unsigned __int16 uint16_t;
+typedef __int32 int32_t;
+typedef unsigned __int32 uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+#elif defined (_AIX)
+#  include <sys/inttypes.h>
+#else
+#  include <stdint.h>
+#endif
+
+#endif
+
+/*
+ * Boolean
+ */
+typedef int pixman_bool_t;
+
+/*
+ * Fixpoint numbers
+ */
+typedef int64_t			pixman_fixed_32_32_t;
+typedef pixman_fixed_32_32_t	pixman_fixed_48_16_t;
+typedef uint32_t		pixman_fixed_1_31_t;
+typedef uint32_t		pixman_fixed_1_16_t;
+typedef int32_t			pixman_fixed_16_16_t;
+typedef pixman_fixed_16_16_t	pixman_fixed_t;
+
+#define pixman_fixed_e			((pixman_fixed_t) 1)
+#define pixman_fixed_1			(pixman_int_to_fixed(1))
+#define pixman_fixed_1_minus_e		(pixman_fixed_1 - pixman_fixed_e)
+#define pixman_fixed_minus_1		(pixman_int_to_fixed(-1))
+#define pixman_fixed_to_int(f)		((int) ((f) >> 16))
+#define pixman_int_to_fixed(i)		((pixman_fixed_t) ((i) << 16))
+#define pixman_fixed_to_double(f)	(double) ((f) / (double) pixman_fixed_1)
+#define pixman_double_to_fixed(d)	((pixman_fixed_t) ((d) * 65536.0))
+#define pixman_fixed_frac(f)		((f) & pixman_fixed_1_minus_e)
+#define pixman_fixed_floor(f)		((f) & ~pixman_fixed_1_minus_e)
+#define pixman_fixed_ceil(f)		pixman_fixed_floor ((f) + pixman_fixed_1_minus_e)
+#define pixman_fixed_fraction(f)	((f) & pixman_fixed_1_minus_e)
+#define pixman_fixed_mod_2(f)		((f) & (pixman_fixed1 | pixman_fixed_1_minus_e))
+#define pixman_max_fixed_48_16		((pixman_fixed_48_16_t) 0x7fffffff)
+#define pixman_min_fixed_48_16		(-((pixman_fixed_48_16_t) 1 << 31))
+
+/*
+ * Misc structs
+ */
+typedef struct pixman_color pixman_color_t;
+typedef struct pixman_point_fixed pixman_point_fixed_t;
+typedef struct pixman_line_fixed pixman_line_fixed_t;
+typedef struct pixman_vector pixman_vector_t;
+typedef struct pixman_transform pixman_transform_t;
+
+struct pixman_color
+{
+    uint16_t	red;
+    uint16_t    green;
+    uint16_t    blue;
+    uint16_t    alpha;
+};
+
+struct pixman_point_fixed
+{
+    pixman_fixed_t	x;
+    pixman_fixed_t	y;
+};
+
+struct pixman_line_fixed
+{
+    pixman_point_fixed_t	p1, p2;
+};
+
+/*
+ * Fixed point matrices
+ */
+
+struct pixman_vector
+{
+    pixman_fixed_t	vector[3];
+};
+
+struct pixman_transform
+{
+    pixman_fixed_t	matrix[3][3];
+};
+
+/* forward declaration (sorry) */
+struct pixman_box16;
+typedef  union pixman_image		pixman_image_t;
+
+void          pixman_transform_init_identity    (struct pixman_transform       *matrix);
+pixman_bool_t pixman_transform_point_3d         (const struct pixman_transform *transform,
+						 struct pixman_vector          *vector);
+pixman_bool_t pixman_transform_point            (const struct pixman_transform *transform,
+						 struct pixman_vector          *vector);
+pixman_bool_t pixman_transform_multiply         (struct pixman_transform       *dst,
+						 const struct pixman_transform *l,
+						 const struct pixman_transform *r);
+void          pixman_transform_init_scale       (struct pixman_transform       *t,
+						 pixman_fixed_t                 sx,
+						 pixman_fixed_t                 sy);
+pixman_bool_t pixman_transform_scale            (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 sx,
+						 pixman_fixed_t                 sy);
+void          pixman_transform_init_rotate      (struct pixman_transform       *t,
+						 pixman_fixed_t                 cos,
+						 pixman_fixed_t                 sin);
+pixman_bool_t pixman_transform_rotate           (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 c,
+						 pixman_fixed_t                 s);
+void          pixman_transform_init_translate   (struct pixman_transform       *t,
+						 pixman_fixed_t                 tx,
+						 pixman_fixed_t                 ty);
+pixman_bool_t pixman_transform_translate        (struct pixman_transform       *forward,
+						 struct pixman_transform       *reverse,
+						 pixman_fixed_t                 tx,
+						 pixman_fixed_t                 ty);
+pixman_bool_t pixman_transform_bounds           (const struct pixman_transform *matrix,
+						 struct pixman_box16           *b);
+pixman_bool_t pixman_transform_invert           (struct pixman_transform       *dst,
+						 const struct pixman_transform *src);
+pixman_bool_t pixman_transform_is_identity      (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_scale         (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_int_translate (const struct pixman_transform *t);
+pixman_bool_t pixman_transform_is_inverse       (const struct pixman_transform *a,
+						 const struct pixman_transform *b);
+
+/*
+ * Floating point matrices
+ */
+typedef struct pixman_f_transform pixman_f_transform_t;
+typedef struct pixman_f_vector pixman_f_vector_t;
+
+struct pixman_f_vector
+{
+    double  v[3];
+};
+
+struct pixman_f_transform
+{
+    double  m[3][3];
+};
+
+pixman_bool_t pixman_transform_from_pixman_f_transform (struct pixman_transform         *t,
+							const struct pixman_f_transform *ft);
+void          pixman_f_transform_from_pixman_transform (struct pixman_f_transform       *ft,
+							const struct pixman_transform   *t);
+pixman_bool_t pixman_f_transform_invert                (struct pixman_f_transform       *dst,
+							const struct pixman_f_transform *src);
+pixman_bool_t pixman_f_transform_point                 (const struct pixman_f_transform *t,
+							struct pixman_f_vector          *v);
+void          pixman_f_transform_point_3d              (const struct pixman_f_transform *t,
+							struct pixman_f_vector          *v);
+void          pixman_f_transform_multiply              (struct pixman_f_transform       *dst,
+							const struct pixman_f_transform *l,
+							const struct pixman_f_transform *r);
+void          pixman_f_transform_init_scale            (struct pixman_f_transform       *t,
+							double                           sx,
+							double                           sy);
+pixman_bool_t pixman_f_transform_scale                 (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           sx,
+							double                           sy);
+void          pixman_f_transform_init_rotate           (struct pixman_f_transform       *t,
+							double                           cos,
+							double                           sin);
+pixman_bool_t pixman_f_transform_rotate                (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           c,
+							double                           s);
+void          pixman_f_transform_init_translate        (struct pixman_f_transform       *t,
+							double                           tx,
+							double                           ty);
+pixman_bool_t pixman_f_transform_translate             (struct pixman_f_transform       *forward,
+							struct pixman_f_transform       *reverse,
+							double                           tx,
+							double                           ty);
+pixman_bool_t pixman_f_transform_bounds                (const struct pixman_f_transform *t,
+							struct pixman_box16             *b);
+void          pixman_f_transform_init_identity         (struct pixman_f_transform       *t);
+
+typedef enum
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_PAD,
+    PIXMAN_REPEAT_REFLECT
+} pixman_repeat_t;
+
+typedef enum
+{
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_CONVOLUTION,
+
+    /* The SEPARABLE_CONVOLUTION filter takes the following parameters:
+     *
+     *         width:           integer given as 16.16 fixpoint number
+     *         height:          integer given as 16.16 fixpoint number
+     *         x_phase_bits:	integer given as 16.16 fixpoint
+     *         y_phase_bits:	integer given as 16.16 fixpoint
+     *         xtables:         (1 << x_phase_bits) tables of size width
+     *         ytables:         (1 << y_phase_bits) tables of size height
+     *
+     * When sampling at (x, y), the location is first rounded to one of
+     * n_x_phases * n_y_phases subpixel positions. These subpixel positions
+     * determine an xtable and a ytable to use.
+     *
+     * Conceptually a width x height matrix is then formed in which each entry
+     * is the product of the corresponding entries in the x and y tables.
+     * This matrix is then aligned with the image pixels such that its center
+     * is as close as possible to the subpixel location chosen earlier. Then
+     * the image is convolved with the matrix and the resulting pixel returned.
+     */
+    PIXMAN_FILTER_SEPARABLE_CONVOLUTION
+} pixman_filter_t;
+
+typedef enum
+{
+    PIXMAN_OP_CLEAR			= 0x00,
+    PIXMAN_OP_SRC			= 0x01,
+    PIXMAN_OP_DST			= 0x02,
+    PIXMAN_OP_OVER			= 0x03,
+    PIXMAN_OP_OVER_REVERSE		= 0x04,
+    PIXMAN_OP_IN			= 0x05,
+    PIXMAN_OP_IN_REVERSE		= 0x06,
+    PIXMAN_OP_OUT			= 0x07,
+    PIXMAN_OP_OUT_REVERSE		= 0x08,
+    PIXMAN_OP_ATOP			= 0x09,
+    PIXMAN_OP_ATOP_REVERSE		= 0x0a,
+    PIXMAN_OP_XOR			= 0x0b,
+    PIXMAN_OP_ADD			= 0x0c,
+    PIXMAN_OP_SATURATE			= 0x0d,
+
+    PIXMAN_OP_DISJOINT_CLEAR		= 0x10,
+    PIXMAN_OP_DISJOINT_SRC		= 0x11,
+    PIXMAN_OP_DISJOINT_DST		= 0x12,
+    PIXMAN_OP_DISJOINT_OVER		= 0x13,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE	= 0x14,
+    PIXMAN_OP_DISJOINT_IN		= 0x15,
+    PIXMAN_OP_DISJOINT_IN_REVERSE	= 0x16,
+    PIXMAN_OP_DISJOINT_OUT		= 0x17,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE	= 0x18,
+    PIXMAN_OP_DISJOINT_ATOP		= 0x19,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE	= 0x1a,
+    PIXMAN_OP_DISJOINT_XOR		= 0x1b,
+
+    PIXMAN_OP_CONJOINT_CLEAR		= 0x20,
+    PIXMAN_OP_CONJOINT_SRC		= 0x21,
+    PIXMAN_OP_CONJOINT_DST		= 0x22,
+    PIXMAN_OP_CONJOINT_OVER		= 0x23,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE	= 0x24,
+    PIXMAN_OP_CONJOINT_IN		= 0x25,
+    PIXMAN_OP_CONJOINT_IN_REVERSE	= 0x26,
+    PIXMAN_OP_CONJOINT_OUT		= 0x27,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE	= 0x28,
+    PIXMAN_OP_CONJOINT_ATOP		= 0x29,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE	= 0x2a,
+    PIXMAN_OP_CONJOINT_XOR		= 0x2b,
+
+    PIXMAN_OP_MULTIPLY                  = 0x30,
+    PIXMAN_OP_SCREEN                    = 0x31,
+    PIXMAN_OP_OVERLAY                   = 0x32,
+    PIXMAN_OP_DARKEN                    = 0x33,
+    PIXMAN_OP_LIGHTEN                   = 0x34,
+    PIXMAN_OP_COLOR_DODGE               = 0x35,
+    PIXMAN_OP_COLOR_BURN                = 0x36,
+    PIXMAN_OP_HARD_LIGHT                = 0x37,
+    PIXMAN_OP_SOFT_LIGHT                = 0x38,
+    PIXMAN_OP_DIFFERENCE                = 0x39,
+    PIXMAN_OP_EXCLUSION                 = 0x3a,
+    PIXMAN_OP_HSL_HUE			= 0x3b,
+    PIXMAN_OP_HSL_SATURATION		= 0x3c,
+    PIXMAN_OP_HSL_COLOR			= 0x3d,
+    PIXMAN_OP_HSL_LUMINOSITY		= 0x3e
+
+#ifdef PIXMAN_USE_INTERNAL_API
+    ,
+    PIXMAN_N_OPERATORS,
+    PIXMAN_OP_NONE = PIXMAN_N_OPERATORS
+#endif
+} pixman_op_t;
+
+/*
+ * Regions
+ */
+typedef struct pixman_region16_data	pixman_region16_data_t;
+typedef struct pixman_box16		pixman_box16_t;
+typedef struct pixman_rectangle16	pixman_rectangle16_t;
+typedef struct pixman_region16		pixman_region16_t;
+
+struct pixman_region16_data {
+    long		size;
+    long		numRects;
+/*  pixman_box16_t	rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle16
+{
+    int16_t	x, y;
+    uint16_t	width, height;
+};
+
+struct pixman_box16
+{
+    int16_t x1, y1, x2, y2;
+};
+
+struct pixman_region16
+{
+    pixman_box16_t          extents;
+    pixman_region16_data_t *data;
+};
+
+typedef enum
+{
+    PIXMAN_REGION_OUT,
+    PIXMAN_REGION_IN,
+    PIXMAN_REGION_PART
+} pixman_region_overlap_t;
+
+/* This function exists only to make it possible to preserve
+ * the X ABI - it should go away at first opportunity.
+ */
+void pixman_region_set_static_pointers (pixman_box16_t         *empty_box,
+					pixman_region16_data_t *empty_data,
+					pixman_region16_data_t *broken_data);
+
+/* creation/destruction */
+void                    pixman_region_init               (pixman_region16_t *region);
+void                    pixman_region_init_rect          (pixman_region16_t *region,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t           pixman_region_init_rects         (pixman_region16_t *region,
+							  const pixman_box16_t *boxes,
+							  int                count);
+void                    pixman_region_init_with_extents  (pixman_region16_t *region,
+							  pixman_box16_t    *extents);
+void                    pixman_region_init_from_image    (pixman_region16_t *region,
+							  pixman_image_t    *image);
+void                    pixman_region_fini               (pixman_region16_t *region);
+
+
+/* manipulation */
+void                    pixman_region_translate          (pixman_region16_t *region,
+							  int                x,
+							  int                y);
+pixman_bool_t           pixman_region_copy               (pixman_region16_t *dest,
+							  pixman_region16_t *source);
+pixman_bool_t           pixman_region_intersect          (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union              (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_region16_t *reg2);
+pixman_bool_t           pixman_region_union_rect         (pixman_region16_t *dest,
+							  pixman_region16_t *source,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t		pixman_region_intersect_rect     (pixman_region16_t *dest,
+							  pixman_region16_t *source,
+							  int                x,
+							  int                y,
+							  unsigned int       width,
+							  unsigned int       height);
+pixman_bool_t           pixman_region_subtract           (pixman_region16_t *reg_d,
+							  pixman_region16_t *reg_m,
+							  pixman_region16_t *reg_s);
+pixman_bool_t           pixman_region_inverse            (pixman_region16_t *new_reg,
+							  pixman_region16_t *reg1,
+							  pixman_box16_t    *inv_rect);
+pixman_bool_t           pixman_region_contains_point     (pixman_region16_t *region,
+							  int                x,
+							  int                y,
+							  pixman_box16_t    *box);
+pixman_region_overlap_t pixman_region_contains_rectangle (pixman_region16_t *region,
+							  pixman_box16_t    *prect);
+pixman_bool_t           pixman_region_not_empty          (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_extents            (pixman_region16_t *region);
+int                     pixman_region_n_rects            (pixman_region16_t *region);
+pixman_box16_t *        pixman_region_rectangles         (pixman_region16_t *region,
+							  int               *n_rects);
+pixman_bool_t           pixman_region_equal              (pixman_region16_t *region1,
+							  pixman_region16_t *region2);
+pixman_bool_t           pixman_region_selfcheck          (pixman_region16_t *region);
+void                    pixman_region_reset              (pixman_region16_t *region,
+							  pixman_box16_t    *box);
+void			pixman_region_clear		 (pixman_region16_t *region);
+/*
+ * 32 bit regions
+ */
+typedef struct pixman_region32_data	pixman_region32_data_t;
+typedef struct pixman_box32		pixman_box32_t;
+typedef struct pixman_rectangle32	pixman_rectangle32_t;
+typedef struct pixman_region32		pixman_region32_t;
+
+struct pixman_region32_data {
+    long		size;
+    long		numRects;
+/*  pixman_box32_t	rects[size];   in memory but not explicitly declared */
+};
+
+struct pixman_rectangle32
+{
+    int32_t x, y;
+    uint32_t width, height;
+};
+
+struct pixman_box32
+{
+    int32_t x1, y1, x2, y2;
+};
+
+struct pixman_region32
+{
+    pixman_box32_t          extents;
+    pixman_region32_data_t  *data;
+};
+
+/* creation/destruction */
+void                    pixman_region32_init               (pixman_region32_t *region);
+void                    pixman_region32_init_rect          (pixman_region32_t *region,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_init_rects         (pixman_region32_t *region,
+							    const pixman_box32_t *boxes,
+							    int                count);
+void                    pixman_region32_init_with_extents  (pixman_region32_t *region,
+							    pixman_box32_t    *extents);
+void                    pixman_region32_init_from_image    (pixman_region32_t *region,
+							    pixman_image_t    *image);
+void                    pixman_region32_fini               (pixman_region32_t *region);
+
+
+/* manipulation */
+void                    pixman_region32_translate          (pixman_region32_t *region,
+							    int                x,
+							    int                y);
+pixman_bool_t           pixman_region32_copy               (pixman_region32_t *dest,
+							    pixman_region32_t *source);
+pixman_bool_t           pixman_region32_intersect          (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_region32_t *reg2);
+pixman_bool_t           pixman_region32_union              (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_region32_t *reg2);
+pixman_bool_t		pixman_region32_intersect_rect     (pixman_region32_t *dest,
+							    pixman_region32_t *source,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_union_rect         (pixman_region32_t *dest,
+							    pixman_region32_t *source,
+							    int                x,
+							    int                y,
+							    unsigned int       width,
+							    unsigned int       height);
+pixman_bool_t           pixman_region32_subtract           (pixman_region32_t *reg_d,
+							    pixman_region32_t *reg_m,
+							    pixman_region32_t *reg_s);
+pixman_bool_t           pixman_region32_inverse            (pixman_region32_t *new_reg,
+							    pixman_region32_t *reg1,
+							    pixman_box32_t    *inv_rect);
+pixman_bool_t           pixman_region32_contains_point     (pixman_region32_t *region,
+							    int                x,
+							    int                y,
+							    pixman_box32_t    *box);
+pixman_region_overlap_t pixman_region32_contains_rectangle (pixman_region32_t *region,
+							    pixman_box32_t    *prect);
+pixman_bool_t           pixman_region32_not_empty          (pixman_region32_t *region);
+pixman_box32_t *        pixman_region32_extents            (pixman_region32_t *region);
+int                     pixman_region32_n_rects            (pixman_region32_t *region);
+pixman_box32_t *        pixman_region32_rectangles         (pixman_region32_t *region,
+							    int               *n_rects);
+pixman_bool_t           pixman_region32_equal              (pixman_region32_t *region1,
+							    pixman_region32_t *region2);
+pixman_bool_t           pixman_region32_selfcheck          (pixman_region32_t *region);
+void                    pixman_region32_reset              (pixman_region32_t *region,
+							    pixman_box32_t    *box);
+void			pixman_region32_clear		   (pixman_region32_t *region);
+
+
+/* Copy / Fill / Misc */
+pixman_bool_t pixman_blt                (uint32_t           *src_bits,
+					 uint32_t           *dst_bits,
+					 int                 src_stride,
+					 int                 dst_stride,
+					 int                 src_bpp,
+					 int                 dst_bpp,
+					 int                 src_x,
+					 int                 src_y,
+					 int                 dest_x,
+					 int                 dest_y,
+					 int                 width,
+					 int                 height);
+pixman_bool_t pixman_fill               (uint32_t           *bits,
+					 int                 stride,
+					 int                 bpp,
+					 int                 x,
+					 int                 y,
+					 int                 width,
+					 int                 height,
+					 uint32_t            _xor);
+
+int           pixman_version            (void);
+const char*   pixman_version_string     (void);
+
+/*
+ * Images
+ */
+typedef struct pixman_indexed		pixman_indexed_t;
+typedef struct pixman_gradient_stop	pixman_gradient_stop_t;
+
+typedef uint32_t (* pixman_read_memory_func_t) (const void *src, int size);
+typedef void     (* pixman_write_memory_func_t) (void *dst, uint32_t value, int size);
+
+typedef void     (* pixman_image_destroy_func_t) (pixman_image_t *image, void *data);
+
+struct pixman_gradient_stop {
+    pixman_fixed_t x;
+    pixman_color_t color;
+};
+
+#define PIXMAN_MAX_INDEXED  256 /* XXX depth must be <= 8 */
+
+#if PIXMAN_MAX_INDEXED <= 256
+typedef uint8_t pixman_index_type;
+#endif
+
+struct pixman_indexed
+{
+    pixman_bool_t       color;
+    uint32_t		rgba[PIXMAN_MAX_INDEXED];
+    pixman_index_type	ent[32768];
+};
+
+/*
+ * While the protocol is generous in format support, the
+ * sample implementation allows only packed RGB and GBR
+ * representations for data to simplify software rendering,
+ */
+#define PIXMAN_FORMAT(bpp,type,a,r,g,b)	(((bpp) << 24) |  \
+					 ((type) << 16) | \
+					 ((a) << 12) |	  \
+					 ((r) << 8) |	  \
+					 ((g) << 4) |	  \
+					 ((b)))
+
+#define PIXMAN_FORMAT_BPP(f)	(((f) >> 24)       )
+#define PIXMAN_FORMAT_TYPE(f)	(((f) >> 16) & 0xff)
+#define PIXMAN_FORMAT_A(f)	(((f) >> 12) & 0x0f)
+#define PIXMAN_FORMAT_R(f)	(((f) >>  8) & 0x0f)
+#define PIXMAN_FORMAT_G(f)	(((f) >>  4) & 0x0f)
+#define PIXMAN_FORMAT_B(f)	(((f)      ) & 0x0f)
+#define PIXMAN_FORMAT_RGB(f)	(((f)      ) & 0xfff)
+#define PIXMAN_FORMAT_VIS(f)	(((f)      ) & 0xffff)
+#define PIXMAN_FORMAT_DEPTH(f)	(PIXMAN_FORMAT_A(f) +	\
+				 PIXMAN_FORMAT_R(f) +	\
+				 PIXMAN_FORMAT_G(f) +	\
+				 PIXMAN_FORMAT_B(f))
+
+#define PIXMAN_TYPE_OTHER	0
+#define PIXMAN_TYPE_A		1
+#define PIXMAN_TYPE_ARGB	2
+#define PIXMAN_TYPE_ABGR	3
+#define PIXMAN_TYPE_COLOR	4
+#define PIXMAN_TYPE_GRAY	5
+#define PIXMAN_TYPE_YUY2	6
+#define PIXMAN_TYPE_YV12	7
+#define PIXMAN_TYPE_BGRA	8
+#define PIXMAN_TYPE_RGBA	9
+#define PIXMAN_TYPE_ARGB_SRGB	10
+
+#define PIXMAN_FORMAT_COLOR(f)				\
+	(PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ARGB ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_ABGR ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_BGRA ||	\
+	 PIXMAN_FORMAT_TYPE(f) == PIXMAN_TYPE_RGBA)
+
+/* 32bpp formats */
+typedef enum {
+    PIXMAN_a8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,8,8,8,8),
+    PIXMAN_x8r8g8b8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_a8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,8,8,8,8),
+    PIXMAN_x8b8g8r8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,8,8,8),
+    PIXMAN_b8g8r8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,8,8,8,8),
+    PIXMAN_b8g8r8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_BGRA,0,8,8,8),
+    PIXMAN_r8g8b8a8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,8,8,8,8),
+    PIXMAN_r8g8b8x8 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_RGBA,0,8,8,8),
+    PIXMAN_x14r6g6b6 =	 PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,6,6,6),
+    PIXMAN_x2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,0,10,10,10),
+    PIXMAN_a2r10g10b10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB,2,10,10,10),
+    PIXMAN_x2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,0,10,10,10),
+    PIXMAN_a2b10g10r10 = PIXMAN_FORMAT(32,PIXMAN_TYPE_ABGR,2,10,10,10),
+
+/* sRGB formats */
+    PIXMAN_a8r8g8b8_sRGB = PIXMAN_FORMAT(32,PIXMAN_TYPE_ARGB_SRGB,8,8,8,8),
+
+/* 24bpp formats */
+    PIXMAN_r8g8b8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ARGB,0,8,8,8),
+    PIXMAN_b8g8r8 =	 PIXMAN_FORMAT(24,PIXMAN_TYPE_ABGR,0,8,8,8),
+
+/* 16bpp formats */
+    PIXMAN_r5g6b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,6,5),
+    PIXMAN_b5g6r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,6,5),
+
+    PIXMAN_a1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,1,5,5,5),
+    PIXMAN_x1r5g5b5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,5,5,5),
+    PIXMAN_a1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,1,5,5,5),
+    PIXMAN_x1b5g5r5 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,5,5,5),
+    PIXMAN_a4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,4,4,4,4),
+    PIXMAN_x4r4g4b4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ARGB,0,4,4,4),
+    PIXMAN_a4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,4,4,4,4),
+    PIXMAN_x4b4g4r4 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_ABGR,0,4,4,4),
+
+/* 8bpp formats */
+    PIXMAN_a8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,8,0,0,0),
+    PIXMAN_r3g3b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,0,3,3,2),
+    PIXMAN_b2g3r3 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,0,3,3,2),
+    PIXMAN_a2r2g2b2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ARGB,2,2,2,2),
+    PIXMAN_a2b2g2r2 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_ABGR,2,2,2,2),
+
+    PIXMAN_c8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g8 =		 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+    PIXMAN_x4a4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_A,4,0,0,0),
+
+    PIXMAN_x4c4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_x4g4 =	 PIXMAN_FORMAT(8,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* 4bpp formats */
+    PIXMAN_a4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_A,4,0,0,0),
+    PIXMAN_r1g2b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,0,1,2,1),
+    PIXMAN_b1g2r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,0,1,2,1),
+    PIXMAN_a1r1g1b1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ARGB,1,1,1,1),
+    PIXMAN_a1b1g1r1 =	 PIXMAN_FORMAT(4,PIXMAN_TYPE_ABGR,1,1,1,1),
+
+    PIXMAN_c4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_COLOR,0,0,0,0),
+    PIXMAN_g4 =		 PIXMAN_FORMAT(4,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* 1bpp formats */
+    PIXMAN_a1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_A,1,0,0,0),
+
+    PIXMAN_g1 =		 PIXMAN_FORMAT(1,PIXMAN_TYPE_GRAY,0,0,0,0),
+
+/* YUV formats */
+    PIXMAN_yuy2 =	 PIXMAN_FORMAT(16,PIXMAN_TYPE_YUY2,0,0,0,0),
+    PIXMAN_yv12 =	 PIXMAN_FORMAT(12,PIXMAN_TYPE_YV12,0,0,0,0)
+} pixman_format_code_t;
+
+/* Querying supported format values. */
+pixman_bool_t pixman_format_supported_destination (pixman_format_code_t format);
+pixman_bool_t pixman_format_supported_source      (pixman_format_code_t format);
+
+/* Constructors */
+pixman_image_t *pixman_image_create_solid_fill       (const pixman_color_t         *color);
+pixman_image_t *pixman_image_create_linear_gradient  (const pixman_point_fixed_t   *p1,
+						      const pixman_point_fixed_t   *p2,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_radial_gradient  (const pixman_point_fixed_t   *inner,
+						      const pixman_point_fixed_t   *outer,
+						      pixman_fixed_t                inner_radius,
+						      pixman_fixed_t                outer_radius,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_conical_gradient (const pixman_point_fixed_t   *center,
+						      pixman_fixed_t                angle,
+						      const pixman_gradient_stop_t *stops,
+						      int                           n_stops);
+pixman_image_t *pixman_image_create_bits             (pixman_format_code_t          format,
+						      int                           width,
+						      int                           height,
+						      uint32_t                     *bits,
+						      int                           rowstride_bytes);
+pixman_image_t *pixman_image_create_bits_no_clear    (pixman_format_code_t format,
+						      int                  width,
+						      int                  height,
+						      uint32_t *           bits,
+						      int                  rowstride_bytes);
+
+/* Destructor */
+pixman_image_t *pixman_image_ref                     (pixman_image_t               *image);
+pixman_bool_t   pixman_image_unref                   (pixman_image_t               *image);
+
+void		pixman_image_set_destroy_function    (pixman_image_t		   *image,
+						      pixman_image_destroy_func_t   function,
+						      void			   *data);
+void *		pixman_image_get_destroy_data        (pixman_image_t		   *image);
+
+/* Set properties */
+pixman_bool_t   pixman_image_set_clip_region         (pixman_image_t               *image,
+						      pixman_region16_t            *region);
+pixman_bool_t   pixman_image_set_clip_region32       (pixman_image_t               *image,
+						      pixman_region32_t            *region);
+void		pixman_image_set_has_client_clip     (pixman_image_t               *image,
+						      pixman_bool_t		    clien_clip);
+pixman_bool_t   pixman_image_set_transform           (pixman_image_t               *image,
+						      const pixman_transform_t     *transform);
+void            pixman_image_set_repeat              (pixman_image_t               *image,
+						      pixman_repeat_t               repeat);
+pixman_bool_t   pixman_image_set_filter              (pixman_image_t               *image,
+						      pixman_filter_t               filter,
+						      const pixman_fixed_t         *filter_params,
+						      int                           n_filter_params);
+void		pixman_image_set_source_clipping     (pixman_image_t		   *image,
+						      pixman_bool_t                 source_clipping);
+void            pixman_image_set_alpha_map           (pixman_image_t               *image,
+						      pixman_image_t               *alpha_map,
+						      int16_t                       x,
+						      int16_t                       y);
+void            pixman_image_set_component_alpha     (pixman_image_t               *image,
+						      pixman_bool_t                 component_alpha);
+pixman_bool_t   pixman_image_get_component_alpha     (pixman_image_t               *image);
+void		pixman_image_set_accessors	     (pixman_image_t		   *image,
+						      pixman_read_memory_func_t	    read_func,
+						      pixman_write_memory_func_t    write_func);
+void		pixman_image_set_indexed	     (pixman_image_t		   *image,
+						      const pixman_indexed_t	   *indexed);
+uint32_t       *pixman_image_get_data                (pixman_image_t               *image);
+int		pixman_image_get_width               (pixman_image_t               *image);
+int             pixman_image_get_height              (pixman_image_t               *image);
+int		pixman_image_get_stride              (pixman_image_t               *image); /* in bytes */
+int		pixman_image_get_depth               (pixman_image_t		   *image);
+pixman_format_code_t pixman_image_get_format	     (pixman_image_t		   *image);
+
+typedef enum
+{
+    PIXMAN_KERNEL_IMPULSE,
+    PIXMAN_KERNEL_BOX,
+    PIXMAN_KERNEL_LINEAR,
+    PIXMAN_KERNEL_CUBIC,
+    PIXMAN_KERNEL_GAUSSIAN,
+    PIXMAN_KERNEL_LANCZOS2,
+    PIXMAN_KERNEL_LANCZOS3,
+    PIXMAN_KERNEL_LANCZOS3_STRETCHED       /* Jim Blinn's 'nice' filter */
+} pixman_kernel_t;
+
+/* Create the parameter list for a SEPARABLE_CONVOLUTION filter
+ * with the given kernels and scale parameters.
+ */
+pixman_fixed_t *
+pixman_filter_create_separable_convolution (int             *n_values,
+					    pixman_fixed_t   scale_x,
+					    pixman_fixed_t   scale_y,
+					    pixman_kernel_t  reconstruct_x,
+					    pixman_kernel_t  reconstruct_y,
+					    pixman_kernel_t  sample_x,
+					    pixman_kernel_t  sample_y,
+					    int              subsample_bits_x,
+					    int              subsample_bits_y);
+
+pixman_bool_t	pixman_image_fill_rectangles	     (pixman_op_t		    op,
+						      pixman_image_t		   *image,
+						      const pixman_color_t	   *color,
+						      int			    n_rects,
+						      const pixman_rectangle16_t   *rects);
+pixman_bool_t   pixman_image_fill_boxes              (pixman_op_t                   op,
+                                                      pixman_image_t               *dest,
+                                                      const pixman_color_t         *color,
+                                                      int                           n_boxes,
+                                                      const pixman_box32_t         *boxes);
+
+/* Composite */
+pixman_bool_t pixman_compute_composite_region (pixman_region16_t *region,
+					       pixman_image_t    *src_image,
+					       pixman_image_t    *mask_image,
+					       pixman_image_t    *dest_image,
+					       int16_t            src_x,
+					       int16_t            src_y,
+					       int16_t            mask_x,
+					       int16_t            mask_y,
+					       int16_t            dest_x,
+					       int16_t            dest_y,
+					       uint16_t           width,
+					       uint16_t           height);
+void          pixman_image_composite          (pixman_op_t        op,
+					       pixman_image_t    *src,
+					       pixman_image_t    *mask,
+					       pixman_image_t    *dest,
+					       int16_t            src_x,
+					       int16_t            src_y,
+					       int16_t            mask_x,
+					       int16_t            mask_y,
+					       int16_t            dest_x,
+					       int16_t            dest_y,
+					       uint16_t           width,
+					       uint16_t           height);
+void          pixman_image_composite32        (pixman_op_t        op,
+					       pixman_image_t    *src,
+					       pixman_image_t    *mask,
+					       pixman_image_t    *dest,
+					       int32_t            src_x,
+					       int32_t            src_y,
+					       int32_t            mask_x,
+					       int32_t            mask_y,
+					       int32_t            dest_x,
+					       int32_t            dest_y,
+					       int32_t            width,
+					       int32_t            height);
+
+/* Executive Summary: This function is a no-op that only exists
+ * for historical reasons.
+ *
+ * There used to be a bug in the X server where it would rely on
+ * out-of-bounds accesses when it was asked to composite with a
+ * window as the source. It would create a pixman image pointing
+ * to some bogus position in memory, but then set a clip region
+ * to the position where the actual bits were.
+ *
+ * Due to a bug in old versions of pixman, where it would not clip
+ * against the image bounds when a clip region was set, this would
+ * actually work. So when the pixman bug was fixed, a workaround was
+ * added to allow certain out-of-bound accesses. This function disabled
+ * those workarounds.
+ *
+ * Since 0.21.2, pixman doesn't do these workarounds anymore, so now this
+ * function is a no-op.
+ */
+void pixman_disable_out_of_bounds_workaround (void);
+
+/*
+ * Glyphs
+ */
+typedef struct pixman_glyph_cache_t pixman_glyph_cache_t;
+typedef struct
+{
+    int		x, y;
+    const void *glyph;
+} pixman_glyph_t;
+
+pixman_glyph_cache_t *pixman_glyph_cache_create       (void);
+void                  pixman_glyph_cache_destroy      (pixman_glyph_cache_t *cache);
+void                  pixman_glyph_cache_freeze       (pixman_glyph_cache_t *cache);
+void                  pixman_glyph_cache_thaw         (pixman_glyph_cache_t *cache);
+const void *          pixman_glyph_cache_lookup       (pixman_glyph_cache_t *cache,
+						       void                 *font_key,
+						       void                 *glyph_key);
+const void *          pixman_glyph_cache_insert       (pixman_glyph_cache_t *cache,
+						       void                 *font_key,
+						       void                 *glyph_key,
+						       int		     origin_x,
+						       int                   origin_y,
+						       pixman_image_t       *glyph_image);
+void                  pixman_glyph_cache_remove       (pixman_glyph_cache_t *cache,
+						       void                 *font_key,
+						       void                 *glyph_key);
+void                  pixman_glyph_get_extents        (pixman_glyph_cache_t *cache,
+						       int                   n_glyphs,
+						       pixman_glyph_t       *glyphs,
+						       pixman_box32_t       *extents);
+pixman_format_code_t  pixman_glyph_get_mask_format    (pixman_glyph_cache_t *cache,
+						       int		     n_glyphs,
+						       const pixman_glyph_t *glyphs);
+void                  pixman_composite_glyphs         (pixman_op_t           op,
+						       pixman_image_t       *src,
+						       pixman_image_t       *dest,
+						       pixman_format_code_t  mask_format,
+						       int32_t               src_x,
+						       int32_t               src_y,
+						       int32_t		     mask_x,
+						       int32_t		     mask_y,
+						       int32_t               dest_x,
+						       int32_t               dest_y,
+						       int32_t		     width,
+						       int32_t		     height,
+						       pixman_glyph_cache_t *cache,
+						       int		     n_glyphs,
+						       const pixman_glyph_t *glyphs);
+void                  pixman_composite_glyphs_no_mask (pixman_op_t           op,
+						       pixman_image_t       *src,
+						       pixman_image_t       *dest,
+						       int32_t               src_x,
+						       int32_t               src_y,
+						       int32_t               dest_x,
+						       int32_t               dest_y,
+						       pixman_glyph_cache_t *cache,
+						       int		     n_glyphs,
+						       const pixman_glyph_t *glyphs);
+
+/*
+ * Trapezoids
+ */
+typedef struct pixman_edge pixman_edge_t;
+typedef struct pixman_trapezoid pixman_trapezoid_t;
+typedef struct pixman_trap pixman_trap_t;
+typedef struct pixman_span_fix pixman_span_fix_t;
+typedef struct pixman_triangle pixman_triangle_t;
+
+/*
+ * An edge structure.  This represents a single polygon edge
+ * and can be quickly stepped across small or large gaps in the
+ * sample grid
+ */
+struct pixman_edge
+{
+    pixman_fixed_t	x;
+    pixman_fixed_t	e;
+    pixman_fixed_t	stepx;
+    pixman_fixed_t	signdx;
+    pixman_fixed_t	dy;
+    pixman_fixed_t	dx;
+
+    pixman_fixed_t	stepx_small;
+    pixman_fixed_t	stepx_big;
+    pixman_fixed_t	dx_small;
+    pixman_fixed_t	dx_big;
+};
+
+struct pixman_trapezoid
+{
+    pixman_fixed_t	top, bottom;
+    pixman_line_fixed_t	left, right;
+};
+
+struct pixman_triangle
+{
+    pixman_point_fixed_t p1, p2, p3;
+};
+
+/* whether 't' is a well defined not obviously empty trapezoid */
+#define pixman_trapezoid_valid(t)				   \
+    ((t)->left.p1.y != (t)->left.p2.y &&			   \
+     (t)->right.p1.y != (t)->right.p2.y &&			   \
+     ((t)->bottom > (t)->top))
+
+struct pixman_span_fix
+{
+    pixman_fixed_t	l, r, y;
+};
+
+struct pixman_trap
+{
+    pixman_span_fix_t	top, bot;
+};
+
+pixman_fixed_t pixman_sample_ceil_y        (pixman_fixed_t             y,
+					    int                        bpp);
+pixman_fixed_t pixman_sample_floor_y       (pixman_fixed_t             y,
+					    int                        bpp);
+void           pixman_edge_step            (pixman_edge_t             *e,
+					    int                        n);
+void           pixman_edge_init            (pixman_edge_t             *e,
+					    int                        bpp,
+					    pixman_fixed_t             y_start,
+					    pixman_fixed_t             x_top,
+					    pixman_fixed_t             y_top,
+					    pixman_fixed_t             x_bot,
+					    pixman_fixed_t             y_bot);
+void           pixman_line_fixed_edge_init (pixman_edge_t             *e,
+					    int                        bpp,
+					    pixman_fixed_t             y,
+					    const pixman_line_fixed_t *line,
+					    int                        x_off,
+					    int                        y_off);
+void           pixman_rasterize_edges      (pixman_image_t            *image,
+					    pixman_edge_t             *l,
+					    pixman_edge_t             *r,
+					    pixman_fixed_t             t,
+					    pixman_fixed_t             b);
+void           pixman_add_traps            (pixman_image_t            *image,
+					    int16_t                    x_off,
+					    int16_t                    y_off,
+					    int                        ntrap,
+					    const pixman_trap_t       *traps);
+void           pixman_add_trapezoids       (pixman_image_t            *image,
+					    int16_t                    x_off,
+					    int                        y_off,
+					    int                        ntraps,
+					    const pixman_trapezoid_t  *traps);
+void           pixman_rasterize_trapezoid  (pixman_image_t            *image,
+					    const pixman_trapezoid_t  *trap,
+					    int                        x_off,
+					    int                        y_off);
+void          pixman_composite_trapezoids (pixman_op_t		       op,
+					   pixman_image_t *	       src,
+					   pixman_image_t *	       dst,
+					   pixman_format_code_t	       mask_format,
+					   int			       x_src,
+					   int			       y_src,
+					   int			       x_dst,
+					   int			       y_dst,
+					   int			       n_traps,
+					   const pixman_trapezoid_t *  traps);
+void          pixman_composite_triangles (pixman_op_t		       op,
+					  pixman_image_t *	       src,
+					  pixman_image_t *	       dst,
+					  pixman_format_code_t	       mask_format,
+					  int			       x_src,
+					  int			       y_src,
+					  int			       x_dst,
+					  int			       y_dst,
+					  int			       n_tris,
+					  const pixman_triangle_t *    tris);
+void	      pixman_add_triangles       (pixman_image_t              *image,
+					  int32_t	               x_off,
+					  int32_t	               y_off,
+					  int	                       n_tris,
+					  const pixman_triangle_t     *tris);
+
+PIXMAN_END_DECLS
+
+#endif /* PIXMAN_H__ */
diff --git a/src/pixman/pixman/rounding.txt b/src/pixman/pixman/rounding.txt
new file mode 100644
index 0000000..b52b084
--- /dev/null
+++ b/src/pixman/pixman/rounding.txt
@@ -0,0 +1,167 @@
+*** General notes about rounding
+
+Suppose a function is sampled at positions [k + o] where k is an
+integer and o is a fractional offset 0 <= o < 1.
+
+To round a value to the nearest sample, breaking ties by rounding up,
+we can do this:
+
+   round(x) = floor(x - o + 0.5) + o
+
+That is, first subtract o to let us pretend that the samples are at
+integer coordinates, then add 0.5 and floor to round to nearest
+integer, then add the offset back in.
+
+To break ties by rounding down:
+
+    round(x) = ceil(x - o - 0.5) + o
+
+or if we have an epsilon value:
+
+    round(x) = floor(x - o + 0.5 - e) + o
+
+To always round *up* to the next sample:
+
+    round_up(x) = ceil(x - o) + o
+
+To always round *down* to the previous sample:
+
+    round_down(x) = floor(x - o) + o
+
+If a set of samples is stored in an array, you get from the sample
+position to an index by subtracting the position of the first sample
+in the array:
+
+    index(s) = s - first_sample
+
+
+*** Application to pixman
+
+In pixman, images are sampled with o = 0.5, that is, pixels are
+located midways between integers. We usually break ties by rounding
+down (i.e., "round towards north-west").
+
+
+-- NEAREST filtering:
+
+The NEAREST filter simply picks the closest pixel to the given
+position:
+
+    round(x) = floor(x - 0.5 + 0.5 - e) + 0.5 = floor (x - e) + 0.5
+
+The first sample of a pixman image has position 0.5, so to find the
+index in the pixel array, we have to subtract 0.5:
+
+    floor (x - e) + 0.5 - 0.5 = floor (x - e).
+
+Therefore a 16.16 fixed-point image location is turned into a pixel
+value with NEAREST filtering by doing this:
+
+    pixels[((y - e) >> 16) * stride + ((x - e) >> 16)]
+
+where stride is the number of pixels allocated per scanline and e =
+0x0001.
+
+
+-- CONVOLUTION filtering:
+
+A convolution matrix is considered a sampling of a function f at
+values surrounding 0. For example, this convolution matrix:
+
+	[a, b, c, d]
+
+is interpreted as the values of a function f:
+
+   	a = f(-1.5)
+        b = f(-0.5)
+        c = f(0.5)
+        d = f(1.5)
+
+The sample offset in this case is o = 0.5 and the first sample has
+position s0 = -1.5. If the matrix is:
+
+        [a, b, c, d, e]
+
+the sample offset is o = 0 and the first sample has position s0 =
+-2.0. In general we have 
+
+      s0 = (- width / 2.0 + 0.5).
+
+and
+
+      o = frac (s0)
+
+To evaluate f at a position between the samples, we round to the
+closest sample, and then we subtract the position of the first sample
+to get the index in the matrix:
+
+	f(t) = matrix[floor(t - o + 0.5) + o - s0]
+
+Note that in this case we break ties by rounding up.
+
+If we write s0 = m + o, where m is an integer, this is equivalent to
+
+        f(t) = matrix[floor(t - o + 0.5) + o - (m + o)]
+	     = matrix[floor(t - o + 0.5 - m) + o - o]
+	     = matrix[floor(t - s0 + 0.5)]
+
+The convolution filter in pixman positions f such that 0 aligns with
+the given position x. For a given pixel x0 in the image, the closest
+sample of f is then computed by taking (x - x0) and rounding that to
+the closest index:
+
+	i = floor ((x0 - x) - s0 + 0.5)
+
+To perform the convolution, we have to find the first pixel x0 whose
+corresponding sample has index 0. We can write x0 = k + 0.5, where k
+is an integer:
+
+         0 = floor(k + 0.5 - x - s0 + 0.5)
+
+	   = k + floor(1 - x - s0)
+
+	   = k - ceil(x + s0 - 1)
+
+	   = k - floor(x + s0 - e)
+
+	   = k - floor(x - (width - 1) / 2.0 - e)
+
+And so the final formula for the index k of x0 in the image is:
+
+    	    k = floor(x - (width - 1) / 2.0 - e)
+
+Computing the result is then simply a matter of convolving all the
+pixels starting at k with all the samples in the matrix.
+
+
+--- SEPARABLE_CONVOLUTION
+
+For this filter, x is first rounded to one of n regularly spaced
+subpixel positions. This subpixel position determines which of n
+convolution matrices is being used.
+
+Then, as in a regular convolution filter, the first pixel to be used
+is determined:
+
+    	k = floor (x - (width - 1) / 2.0 - e)
+
+and then the image pixels starting there are convolved with the chosen
+matrix. If we write x = xi + frac, where xi is an integer, we get
+
+	k = xi + floor (frac - (width - 1) / 2.0 - e)
+
+so the location of k relative to x is given by:
+
+    (k + 0.5 - x) = xi + floor (frac - (width - 1) / 2.0 - e) + 0.5 - x
+
+                  = floor (frac - (width - 1) / 2.0 - e) + 0.5 - frac
+
+which means the contents of the matrix corresponding to (frac) should
+contain width samplings of the function, with the first sample at:
+
+       floor (frac - (width - 1) / 2.0 - e) + 0.5 - frac
+
+This filter is called separable because each of the k x k convolution
+matrices is specified with two k-wide vectors, one for each dimension,
+where each entry in the matrix is computed as the product of the
+corresponding entries in the vectors.
diff --git a/src/pixman/pixman/solaris-hwcap.mapfile b/src/pixman/pixman/solaris-hwcap.mapfile
new file mode 100644
index 0000000..87efce1
--- /dev/null
+++ b/src/pixman/pixman/solaris-hwcap.mapfile
@@ -0,0 +1,30 @@
+###############################################################################
+#
+# Copyright 2009, Oracle and/or its affiliates. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+#
+###############################################################################
+#
+# Override the linker's detection of CMOV/MMX/SSE instructions so this
+# library isn't flagged as only usable on CPU's with those ISA's, since it
+# checks at runtime for availability before calling them
+
+hwcap_1 = V0x0 FPU OVERRIDE;
diff --git a/src/pixman/test/Makefile.am b/src/pixman/test/Makefile.am
new file mode 100644
index 0000000..88dc36d
--- /dev/null
+++ b/src/pixman/test/Makefile.am
@@ -0,0 +1,13 @@
+include $(top_srcdir)/test/Makefile.sources
+
+AM_CFLAGS = $(OPENMP_CFLAGS) $(PTHREAD_CFLAGS)
+AM_LDFLAGS = $(OPENMP_CFLAGS) $(TESTPROGS_EXTRA_LDFLAGS) $(PTHREAD_LDFLAGS)
+LDADD = libutils.la $(top_builddir)/pixman/libpixman-1.la -lm  $(PNG_LIBS) $(PTHREAD_LIBS)
+AM_CPPFLAGS = -I$(top_srcdir)/pixman -I$(top_builddir)/pixman $(PNG_CFLAGS)
+
+libutils_la_SOURCES = $(libutils_sources) $(libutils_headers)
+
+noinst_LTLIBRARIES = libutils.la
+noinst_PROGRAMS = $(TESTPROGRAMS) $(OTHERPROGRAMS)
+
+TESTS = $(TESTPROGRAMS)
diff --git a/src/pixman/test/Makefile.sources b/src/pixman/test/Makefile.sources
new file mode 100644
index 0000000..2ae5d9f
--- /dev/null
+++ b/src/pixman/test/Makefile.sources
@@ -0,0 +1,49 @@
+# Tests (sorted by expected completion time)
+TESTPROGRAMS =			\
+	prng-test		\
+	a1-trap-test		\
+	pdf-op-test		\
+	region-test		\
+	region-translate-test	\
+	combiner-test		\
+	pixel-test		\
+	fetch-test		\
+	rotate-test		\
+	oob-test		\
+	infinite-loop		\
+	trap-crasher		\
+	alpha-loop		\
+	thread-test		\
+	scaling-crash-test	\
+	scaling-helpers-test	\
+	gradient-crash-test	\
+	region-contains-test	\
+	alphamap		\
+	matrix-test		\
+	stress-test		\
+	composite-traps-test	\
+	blitters-test		\
+	glyph-test		\
+	scaling-test		\
+	affine-test		\
+	composite		\
+	$(NULL)
+
+# Other programs
+OTHERPROGRAMS =                 \
+	lowlevel-blt-bench	\
+	radial-perf-test	\
+        check-formats           \
+	scaling-bench		\
+	$(NULL)
+
+# Utility functions
+libutils_sources =		\
+	utils.c			\
+	utils-prng.c		\
+	$(NULL)
+
+libutils_headers =		\
+	utils.h			\
+	utils-prng.h		\
+	$(NULL)
diff --git a/src/pixman/test/Makefile.win32 b/src/pixman/test/Makefile.win32
new file mode 100644
index 0000000..6cfb4a7
--- /dev/null
+++ b/src/pixman/test/Makefile.win32
@@ -0,0 +1,54 @@
+default: all
+
+top_srcdir = ..
+include $(top_srcdir)/test/Makefile.sources
+include $(top_srcdir)/Makefile.win32.common
+
+TEST_LDADD = \
+	$(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib \
+	$(CFG_VAR)/libutils.lib \
+	$(NULL)
+
+libutils_OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(libutils_sources))
+
+SOURCES = $(patsubst %,   %.c,              $(TESTPROGRAMS) $(OTHERPROGRAMS))
+OBJECTS = $(patsubst %.c, $(CFG_VAR)/%.obj, $(SOURCES))
+TESTS   = $(patsubst %,   $(CFG_VAR)/%.exe, $(TESTPROGRAMS))
+OTHERS  = $(patsubst %,   $(CFG_VAR)/%.exe, $(OTHERPROGRAMS))
+
+all: pixman inform $(TESTS) $(OTHERS)
+
+check: pixman inform $(TESTS)
+	@failures=0 ; \
+	total=0 ; \
+	for test in $(TESTS) ; \
+	do \
+		total=`expr $$total + 1` ; \
+		if ./$$test ; \
+		then echo "PASS: $$test" ; \
+		else echo "FAIL: $$test" ; \
+		     failures=`expr $$failures + 1` ; \
+		fi ; \
+	done ; \
+	if test $$failures -eq 0 ; \
+	then banner="All $$total tests passed" ; \
+	else banner="$$failures of $$total tests failed" ; \
+	fi ; \
+	dashes=`echo "$$banner" | sed s/./=/g`; \
+	echo "$$dashes" ; \
+	echo "$$banner" ; \
+	echo "$$dashes" ; \
+	test $$failures -eq 0
+
+$(CFG_VAR)/libutils.lib: $(libutils_OBJECTS)
+	@$(AR) $(PIXMAN_ARFLAGS) -OUT:$@ $^
+
+$(CFG_VAR)/%.exe: $(CFG_VAR)/%.obj $(TEST_LDADD)
+	@$(LD) $(PIXMAN_LDFLAGS) -OUT:$@ $^
+
+$(top_builddir)/pixman/$(CFG_VAR)/$(LIBRARY).lib: pixman
+
+pixman:
+	@$(MAKE) -C $(top_builddir)/pixman -f Makefile.win32
+
+.PHONY: all check pixman
diff --git a/src/pixman/test/a1-trap-test.c b/src/pixman/test/a1-trap-test.c
new file mode 100644
index 0000000..c2b4883
--- /dev/null
+++ b/src/pixman/test/a1-trap-test.c
@@ -0,0 +1,58 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 20
+#define HEIGHT 20
+
+    pixman_image_t *src_img;
+    pixman_image_t *mask_img;
+    pixman_image_t *dest_img;
+    pixman_trap_t trap;
+    pixman_color_t red = { 0xffff, 0x0000, 0x0000, 0xffff };
+    uint32_t *bits = malloc (WIDTH * HEIGHT * 4);
+    uint32_t *mbits = malloc (WIDTH * HEIGHT);
+
+    memset (mbits, 0, WIDTH * HEIGHT);
+    memset (bits, 0xff, WIDTH * HEIGHT * 4);
+    
+    trap.top.l = pixman_double_to_fixed (0.5);
+    trap.top.r = pixman_double_to_fixed (1.5);
+    trap.top.y = pixman_double_to_fixed (0.5);
+
+    trap.bot.l = pixman_double_to_fixed (0.5);
+    trap.bot.r = pixman_double_to_fixed (1.5);
+    trap.bot.y = pixman_double_to_fixed (1.5);
+
+    mask_img = pixman_image_create_bits (
+	PIXMAN_a1, WIDTH, HEIGHT, mbits, WIDTH);
+    src_img = pixman_image_create_solid_fill (&red);
+    dest_img = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, WIDTH, HEIGHT, bits, WIDTH * 4);
+    
+    pixman_add_traps (mask_img, 0, 0, 1, &trap);
+
+    pixman_image_composite (PIXMAN_OP_OVER,
+			    src_img, mask_img, dest_img,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    assert (bits[0] == 0xffff0000);
+    assert (bits[1] == 0xffffffff);
+    assert (bits[1 * WIDTH + 0] == 0xffffffff);
+    assert (bits[1 * WIDTH + 1] == 0xffffffff);
+
+    /* The check-formats test depends on operator_name() and format_name() returning
+     * these precise formats, so if those change, check-formats.c must be updated too.
+     */
+    assert (
+        strcmp (operator_name (PIXMAN_OP_DISJOINT_OVER), "PIXMAN_OP_DISJOINT_OVER") == 0);
+    assert (
+        strcmp (format_name (PIXMAN_r5g6b5), "r5g6b5") == 0);
+    
+    return 0;
+}
diff --git a/src/pixman/test/affine-test.c b/src/pixman/test/affine-test.c
new file mode 100644
index 0000000..8e19023
--- /dev/null
+++ b/src/pixman/test/affine-test.c
@@ -0,0 +1,324 @@
+/*
+ * Test program, which can detect some problems with affine transformations
+ * in pixman. Testing is done by running lots of random SRC and OVER
+ * compositing operations a8r8g8b8, x8a8r8g8b8, r5g6b5 and a8 color formats
+ * with random scaled, rotated and translated transforms.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  16
+#define MAX_SRC_HEIGHT 16
+#define MAX_DST_WIDTH  16
+#define MAX_DST_HEIGHT 16
+#define MAX_STRIDE     4
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int      testnum,
+		int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    pixman_region16_t  clip;
+    int                src_width, src_height;
+    int                dst_width, dst_height;
+    int                src_stride, dst_stride;
+    int                src_x, src_y;
+    int                dst_x, dst_y;
+    int                src_bpp;
+    int                dst_bpp;
+    int                w, h;
+    pixman_fixed_t     scale_x = 65536, scale_y = 65536;
+    pixman_fixed_t     translate_x = 0, translate_y = 0;
+    pixman_op_t        op;
+    pixman_repeat_t    repeat = PIXMAN_REPEAT_NONE;
+    pixman_format_code_t src_fmt, dst_fmt;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    uint32_t           crc32;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    prng_srand (testnum);
+
+    src_bpp = (prng_rand_n (2) == 0) ? 2 : 4;
+    dst_bpp = (prng_rand_n (2) == 0) ? 2 : 4;
+    op = (prng_rand_n (2) == 0) ? PIXMAN_OP_SRC : PIXMAN_OP_OVER;
+
+    src_width = prng_rand_n (MAX_SRC_WIDTH) + 1;
+    src_height = prng_rand_n (MAX_SRC_HEIGHT) + 1;
+    dst_width = prng_rand_n (MAX_DST_WIDTH) + 1;
+    dst_height = prng_rand_n (MAX_DST_HEIGHT) + 1;
+    src_stride = src_width * src_bpp + prng_rand_n (MAX_STRIDE) * src_bpp;
+    dst_stride = dst_width * dst_bpp + prng_rand_n (MAX_STRIDE) * dst_bpp;
+
+    if (src_stride & 3)
+	src_stride += 2;
+
+    if (dst_stride & 3)
+	dst_stride += 2;
+
+    src_x = -(src_width / 4) + prng_rand_n (src_width * 3 / 2);
+    src_y = -(src_height / 4) + prng_rand_n (src_height * 3 / 2);
+    dst_x = -(dst_width / 4) + prng_rand_n (dst_width * 3 / 2);
+    dst_y = -(dst_height / 4) + prng_rand_n (dst_height * 3 / 2);
+    w = prng_rand_n (dst_width * 3 / 2 - dst_x);
+    h = prng_rand_n (dst_height * 3 / 2 - dst_y);
+
+    srcbuf = (uint32_t *)malloc (src_stride * src_height);
+    dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
+
+    prng_randmemset (srcbuf, src_stride * src_height, 0);
+    prng_randmemset (dstbuf, dst_stride * dst_height, 0);
+
+    if (prng_rand_n (2) == 0)
+    {
+	srcbuf += (src_stride / 4) * (src_height - 1);
+	src_stride = - src_stride;
+    }
+
+    if (prng_rand_n (2) == 0)
+    {
+	dstbuf += (dst_stride / 4) * (dst_height - 1);
+	dst_stride = - dst_stride;
+    }
+    
+    src_fmt = src_bpp == 4 ? (prng_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    dst_fmt = dst_bpp == 4 ? (prng_rand_n (2) == 0 ?
+                              PIXMAN_a8r8g8b8 : PIXMAN_x8r8g8b8) : PIXMAN_r5g6b5;
+
+    src_img = pixman_image_create_bits (
+        src_fmt, src_width, src_height, srcbuf, src_stride);
+
+    dst_img = pixman_image_create_bits (
+        dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
+
+    image_endian_swap (src_img);
+    image_endian_swap (dst_img);
+
+    pixman_transform_init_identity (&transform);
+
+    if (prng_rand_n (3) > 0)
+    {
+	scale_x = -65536 * 3 + prng_rand_n (65536 * 6);
+	if (prng_rand_n (2))
+	    scale_y = -65536 * 3 + prng_rand_n (65536 * 6);
+	else
+	    scale_y = scale_x;
+	pixman_transform_init_scale (&transform, scale_x, scale_y);
+    }
+    if (prng_rand_n (3) > 0)
+    {
+	translate_x = -65536 * 3 + prng_rand_n (6 * 65536);
+	if (prng_rand_n (2))
+	    translate_y = -65536 * 3 + prng_rand_n (6 * 65536);
+	else
+	    translate_y = translate_x;
+	pixman_transform_translate (&transform, NULL, translate_x, translate_y);
+    }
+
+    if (prng_rand_n (4) > 0)
+    {
+	int c, s, tx = 0, ty = 0;
+	switch (prng_rand_n (4))
+	{
+	case 0:
+	    /* 90 degrees */
+	    c = 0;
+	    s = pixman_fixed_1;
+	    tx = pixman_int_to_fixed (MAX_SRC_HEIGHT);
+	    break;
+	case 1:
+	    /* 180 degrees */
+	    c = -pixman_fixed_1;
+	    s = 0;
+	    tx = pixman_int_to_fixed (MAX_SRC_WIDTH);
+	    ty = pixman_int_to_fixed (MAX_SRC_HEIGHT);
+	    break;
+	case 2:
+	    /* 270 degrees */
+	    c = 0;
+	    s = -pixman_fixed_1;
+	    ty = pixman_int_to_fixed (MAX_SRC_WIDTH);
+	    break;
+	default:
+	    /* arbitrary rotation */
+	    c = prng_rand_n (2 * 65536) - 65536;
+	    s = prng_rand_n (2 * 65536) - 65536;
+	    break;
+	}
+	pixman_transform_rotate (&transform, NULL, c, s);
+	pixman_transform_translate (&transform, NULL, tx, ty);
+    }
+
+    if (prng_rand_n (8) == 0)
+    {
+	/* Flip random bits */
+	int maxflipcount = 8;
+	while (maxflipcount--)
+	{
+	    int i = prng_rand_n (2);
+	    int j = prng_rand_n (3);
+	    int bitnum = prng_rand_n (32);
+	    transform.matrix[i][j] ^= 1 << bitnum;
+	    if (prng_rand_n (2))
+		break;
+	}
+    }
+
+    pixman_image_set_transform (src_img, &transform);
+
+    switch (prng_rand_n (4))
+    {
+    case 0:
+	repeat = PIXMAN_REPEAT_NONE;
+	break;
+
+    case 1:
+	repeat = PIXMAN_REPEAT_NORMAL;
+	break;
+
+    case 2:
+	repeat = PIXMAN_REPEAT_PAD;
+	break;
+
+    case 3:
+	repeat = PIXMAN_REPEAT_REFLECT;
+	break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (src_img, repeat);
+
+    if (prng_rand_n (2))
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (verbose)
+    {
+#define M(r,c)								\
+	transform.matrix[r][c]
+
+	printf ("src_fmt=%s, dst_fmt=%s\n", format_name (src_fmt), format_name (dst_fmt));
+	printf ("op=%s, repeat=%d, transform=\n",
+	        operator_name (op), repeat);
+	printf (" { { { 0x%08x, 0x%08x, 0x%08x },\n"
+		"     { 0x%08x, 0x%08x, 0x%08x },\n"
+		"     { 0x%08x, 0x%08x, 0x%08x },\n"
+		" } };\n",
+		M(0,0), M(0,1), M(0,2),
+		M(1,0), M(1,1), M(1,2),
+		M(2,0), M(2,1), M(2,2));
+	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+	        src_width, src_height, dst_width, dst_height);
+	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+	        src_x, src_y, dst_x, dst_y);
+	printf ("w=%d, h=%d\n", w, h);
+    }
+
+    if (prng_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = prng_rand_n (2) + 1;
+
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = prng_rand_n (src_width);
+	    clip_boxes[i].y1 = prng_rand_n (src_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + prng_rand_n (src_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + prng_rand_n (src_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("source clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (src_img, &clip);
+	pixman_image_set_source_clipping (src_img, 1);
+	pixman_region_fini (&clip);
+    }
+
+    if (prng_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = prng_rand_n (2) + 1;
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = prng_rand_n (dst_width);
+	    clip_boxes[i].y1 = prng_rand_n (dst_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + prng_rand_n (dst_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + prng_rand_n (dst_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("destination clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (dst_img, &clip);
+	pixman_region_fini (&clip);
+    }
+
+    pixman_image_composite (op, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, dst_x, dst_y, w, h);
+
+    crc32 = compute_crc32_for_image (0, dst_img);
+    
+    if (verbose)
+	print_image (dst_img);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+
+    if (src_stride < 0)
+	srcbuf += (src_stride / 4) * (src_height - 1);
+
+    if (dst_stride < 0)
+	dstbuf += (dst_stride / 4) * (dst_height - 1);
+    
+    free (srcbuf);
+    free (dstbuf);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0xBE724CFE
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0x79BBE501
+#else
+#define CHECKSUM 0x00000000
+#endif
+
+int
+main (int argc, const char *argv[])
+{
+    pixman_disable_out_of_bounds_workaround ();
+
+    return fuzzer_test_main ("affine", 8000000, CHECKSUM,
+			     test_composite, argc, argv);
+}
diff --git a/src/pixman/test/alpha-loop.c b/src/pixman/test/alpha-loop.c
new file mode 100644
index 0000000..4d4384d
--- /dev/null
+++ b/src/pixman/test/alpha-loop.c
@@ -0,0 +1,35 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+#define WIDTH 400
+#define HEIGHT 200
+
+int
+main (int argc, char **argv)
+{
+    pixman_image_t *a, *d, *s;
+    uint8_t *alpha;
+    uint32_t *src, *dest;
+
+    prng_srand (0);
+
+    alpha = make_random_bytes (WIDTH * HEIGHT);
+    src = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * 4);
+    dest = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * 4);
+
+    a = pixman_image_create_bits (PIXMAN_a8, WIDTH, HEIGHT, (uint32_t *)alpha, WIDTH);
+    d = pixman_image_create_bits (PIXMAN_a8r8g8b8, WIDTH, HEIGHT, dest, WIDTH * 4);
+    s = pixman_image_create_bits (PIXMAN_a2r10g10b10, WIDTH, HEIGHT, src, WIDTH * 4);
+
+    fail_after (5, "Infinite loop detected: 5 seconds without progress\n");
+
+    pixman_image_set_alpha_map (s, a, 0, 0);
+    pixman_image_set_alpha_map (a, s, 0, 0);
+
+    pixman_image_composite (PIXMAN_OP_SRC, s, NULL, d, 0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    pixman_image_unref (s);
+
+    return 0;
+}
diff --git a/src/pixman/test/alphamap.c b/src/pixman/test/alphamap.c
new file mode 100644
index 0000000..4d09076
--- /dev/null
+++ b/src/pixman/test/alphamap.c
@@ -0,0 +1,315 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+#define WIDTH 48
+#define HEIGHT 48
+
+static const pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_a8
+};
+
+static const pixman_format_code_t alpha_formats[] =
+{
+    PIXMAN_null,
+    PIXMAN_a8,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_a4r4g4b4
+};
+
+static const int origins[] =
+{
+    0, 10, -100
+};
+
+static void
+on_destroy (pixman_image_t *image, void *data)
+{
+    uint32_t *bits = pixman_image_get_data (image);
+
+    fence_free (bits);
+}
+
+static pixman_image_t *
+make_image (pixman_format_code_t format)
+{
+    uint32_t *bits;
+    uint8_t bpp = PIXMAN_FORMAT_BPP (format) / 8;
+    pixman_image_t *image;
+
+    bits = (uint32_t *)make_random_bytes (WIDTH * HEIGHT * bpp);
+
+    image = pixman_image_create_bits (format, WIDTH, HEIGHT, bits, WIDTH * bpp);
+
+    if (image && bits)
+	pixman_image_set_destroy_function (image, on_destroy, NULL);
+
+    return image;
+}
+
+static uint8_t
+get_alpha (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
+{
+    uint8_t *bits;
+    uint8_t r;
+
+    if (image->common.alpha_map)
+    {
+	if (x - orig_x >= 0 && x - orig_x < WIDTH &&
+	    y - orig_y >= 0 && y - orig_y < HEIGHT)
+	{
+	    image = (pixman_image_t *)image->common.alpha_map;
+
+	    x -= orig_x;
+	    y -= orig_y;
+	}
+	else
+	{
+	    return 0;
+	}
+    }
+
+    bits = (uint8_t *)image->bits.bits;
+
+    if (image->bits.format == PIXMAN_a8)
+    {
+	r = bits[y * WIDTH + x];
+    }
+    else if (image->bits.format == PIXMAN_a2r10g10b10)
+    {
+	r = ((uint32_t *)bits)[y * WIDTH + x] >> 30;
+	r |= r << 2;
+	r |= r << 4;
+    }
+    else if (image->bits.format == PIXMAN_a8r8g8b8)
+    {
+	r = ((uint32_t *)bits)[y * WIDTH + x] >> 24;
+    }
+    else if (image->bits.format == PIXMAN_a4r4g4b4)
+    {
+	r = ((uint16_t *)bits)[y * WIDTH + x] >> 12;
+	r |= r << 4;
+    }
+    else
+    {
+	assert (0);
+    }
+
+    return r;
+}
+
+static uint16_t
+get_red (pixman_image_t *image, int x, int y, int orig_x, int orig_y)
+{
+    uint8_t *bits;
+    uint16_t r;
+
+    bits = (uint8_t *)image->bits.bits;
+
+    if (image->bits.format == PIXMAN_a8)
+    {
+	r = 0x00;
+    }
+    else if (image->bits.format == PIXMAN_a2r10g10b10)
+    {
+	r = ((uint32_t *)bits)[y * WIDTH + x] >> 14;
+	r &= 0xffc0;
+	r |= (r >> 10);
+    }
+    else if (image->bits.format == PIXMAN_a8r8g8b8)
+    {
+	r = ((uint32_t *)bits)[y * WIDTH + x] >> 16;
+	r &= 0xff;
+	r |= r << 8;
+    }
+    else if (image->bits.format == PIXMAN_a4r4g4b4)
+    {
+	r = ((uint16_t *)bits)[y * WIDTH + x] >> 8;
+	r &= 0xf;
+	r |= r << 4;
+	r |= r << 8;
+    }
+    else
+    {
+	assert (0);
+    }
+
+    return r;
+}
+
+static int
+run_test (int s, int d, int sa, int da, int soff, int doff)
+{
+    pixman_format_code_t sf = formats[s];
+    pixman_format_code_t df = formats[d];
+    pixman_format_code_t saf = alpha_formats[sa];
+    pixman_format_code_t daf = alpha_formats[da];
+    pixman_image_t *src, *dst, *orig_dst, *alpha, *orig_alpha;
+    pixman_transform_t t1;
+    int j, k;
+    int n_alpha_bits, n_red_bits;
+
+    soff = origins[soff];
+    doff = origins[doff];
+
+    n_alpha_bits = PIXMAN_FORMAT_A (df);
+    if (daf != PIXMAN_null)
+	n_alpha_bits = PIXMAN_FORMAT_A (daf);
+
+    n_red_bits = PIXMAN_FORMAT_R (df);
+
+    /* Source */
+    src = make_image (sf);
+    if (saf != PIXMAN_null)
+    {
+	alpha = make_image (saf);
+	pixman_image_set_alpha_map (src, alpha, soff, soff);
+	pixman_image_unref (alpha);
+    }
+
+    /* Destination */
+    orig_dst = make_image (df);
+    dst = make_image (df);
+    pixman_image_composite (PIXMAN_OP_SRC, orig_dst, NULL, dst,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    if (daf != PIXMAN_null)
+    {
+	orig_alpha = make_image (daf);
+	alpha = make_image (daf);
+
+	pixman_image_composite (PIXMAN_OP_SRC, orig_alpha, NULL, alpha,
+				0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+	pixman_image_set_alpha_map (orig_dst, orig_alpha, doff, doff);
+	pixman_image_set_alpha_map (dst, alpha, doff, doff);
+
+	pixman_image_unref (orig_alpha);
+	pixman_image_unref (alpha);
+    }
+
+    /* Transformations, repeats and filters on destinations should be ignored,
+     * so just set some random ones.
+     */
+    pixman_transform_init_identity (&t1);
+    pixman_transform_scale (&t1, NULL, pixman_int_to_fixed (100), pixman_int_to_fixed (11));
+    pixman_transform_rotate (&t1, NULL, pixman_double_to_fixed (0.5), pixman_double_to_fixed (0.11));
+    pixman_transform_translate (&t1, NULL, pixman_int_to_fixed (11), pixman_int_to_fixed (17));
+
+    pixman_image_set_transform (dst, &t1);
+    pixman_image_set_filter (dst, PIXMAN_FILTER_BILINEAR, NULL, 0);
+    pixman_image_set_repeat (dst, PIXMAN_REPEAT_REFLECT);
+
+    pixman_image_composite (PIXMAN_OP_ADD, src, NULL, dst,
+			    0, 0, 0, 0, 0, 0, WIDTH, HEIGHT);
+
+    for (j = MAX (doff, 0); j < MIN (HEIGHT, HEIGHT + doff); ++j)
+    {
+	for (k = MAX (doff, 0); k < MIN (WIDTH, WIDTH + doff); ++k)
+	{
+	    uint8_t sa, da, oda, refa;
+	    uint16_t sr, dr, odr, refr;
+
+	    sa = get_alpha (src, k, j, soff, soff);
+	    da = get_alpha (dst, k, j, doff, doff);
+	    oda = get_alpha (orig_dst, k, j, doff, doff);
+
+	    if (sa + oda > 255)
+		refa = 255;
+	    else
+		refa = sa + oda;
+
+	    if (da >> (8 - n_alpha_bits) != refa >> (8 - n_alpha_bits))
+	    {
+		printf ("\nWrong alpha value at (%d, %d). Should be 0x%x; got 0x%x. Source was 0x%x, original dest was 0x%x\n",
+			k, j, refa, da, sa, oda);
+
+		printf ("src: %s, alpha: %s, origin %d %d\ndst: %s, alpha: %s, origin: %d %d\n\n",
+			format_name (sf),
+			format_name (saf),
+			soff, soff,
+			format_name (df),
+			format_name (daf),
+			doff, doff);
+		return 1;
+	    }
+
+	    /* There are cases where we go through the 8 bit compositing
+	     * path even with 10bpc formats. This results in incorrect
+	     * results here, so only do the red check for narrow formats
+	     */
+	    if (n_red_bits <= 8)
+	    {
+		sr = get_red (src, k, j, soff, soff);
+		dr = get_red (dst, k, j, doff, doff);
+		odr = get_red (orig_dst, k, j, doff, doff);
+
+		if (sr + odr > 0xffff)
+		    refr = 0xffff;
+		else
+		    refr = sr + odr;
+
+		if (abs ((dr >> (16 - n_red_bits)) - (refr >> (16 - n_red_bits))) > 1)
+		{
+		    printf ("%d red bits\n", n_red_bits);
+		    printf ("\nWrong red value at (%d, %d). Should be 0x%x; got 0x%x. Source was 0x%x, original dest was 0x%x\n",
+			    k, j, refr, dr, sr, odr);
+
+		    printf ("src: %s, alpha: %s, origin %d %d\ndst: %s, alpha: %s, origin: %d %d\n\n",
+			    format_name (sf),
+			    format_name (saf),
+			    soff, soff,
+			    format_name (df),
+			    format_name (daf),
+			    doff, doff);
+		    return 1;
+		}
+	    }
+	}
+    }
+
+    pixman_image_set_alpha_map (src, NULL, 0, 0);
+    pixman_image_set_alpha_map (dst, NULL, 0, 0);
+    pixman_image_set_alpha_map (orig_dst, NULL, 0, 0);
+
+    pixman_image_unref (src);
+    pixman_image_unref (dst);
+    pixman_image_unref (orig_dst);
+
+    return 0;
+}
+
+int
+main (int argc, char **argv)
+{
+    int i, j, a, b, x, y;
+
+    prng_srand (0);
+
+    for (i = 0; i < ARRAY_LENGTH (formats); ++i)
+    {
+	for (j = 0; j < ARRAY_LENGTH (formats); ++j)
+	{
+	    for (a = 0; a < ARRAY_LENGTH (alpha_formats); ++a)
+	    {
+		for (b = 0; b < ARRAY_LENGTH (alpha_formats); ++b)
+		{
+		    for (x = 0; x < ARRAY_LENGTH (origins); ++x)
+		    {
+			for (y = 0; y < ARRAY_LENGTH (origins); ++y)
+			{
+			    if (run_test (i, j, a, b, x, y) != 0)
+				return 1;
+			}
+		    }
+		}
+	    }
+	}
+    }
+
+    return 0;
+}
diff --git a/src/pixman/test/blitters-test.c b/src/pixman/test/blitters-test.c
new file mode 100644
index 0000000..ea03f47
--- /dev/null
+++ b/src/pixman/test/blitters-test.c
@@ -0,0 +1,399 @@
+/*
+ * Test program, which stresses the use of different color formats and
+ * compositing operations.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static pixman_indexed_t rgb_palette[9];
+static pixman_indexed_t y_palette[9];
+
+/* The first eight format in the list are by far the most widely
+ * used formats, so we test those more than the others
+ */
+#define N_MOST_LIKELY_FORMATS 8
+
+/* Create random image for testing purposes */
+static pixman_image_t *
+create_random_image (pixman_format_code_t *allowed_formats,
+		     int                   max_width,
+		     int                   max_height,
+		     int                   max_extra_stride,
+		     pixman_format_code_t *used_fmt)
+{
+    int n = 0, width, height, stride;
+    pixman_format_code_t fmt;
+    uint32_t *buf;
+    pixman_image_t *img;
+
+    while (allowed_formats[n] != PIXMAN_null)
+	n++;
+
+    if (n > N_MOST_LIKELY_FORMATS && prng_rand_n (4) != 0)
+	n = N_MOST_LIKELY_FORMATS;
+    fmt = allowed_formats[prng_rand_n (n)];
+
+    width = prng_rand_n (max_width) + 1;
+    height = prng_rand_n (max_height) + 1;
+    stride = (width * PIXMAN_FORMAT_BPP (fmt) + 7) / 8 +
+	prng_rand_n (max_extra_stride + 1);
+    stride = (stride + 3) & ~3;
+
+    /* do the allocation */
+    buf = aligned_malloc (64, stride * height);
+
+    if (prng_rand_n (4) == 0)
+    {
+	/* uniform distribution */
+	prng_randmemset (buf, stride * height, 0);
+    }
+    else
+    {
+	/* significantly increased probability for 0x00 and 0xFF */
+	prng_randmemset (buf, stride * height, RANDMEMSET_MORE_00_AND_FF);
+    }
+
+    /* test negative stride */
+    if (prng_rand_n (4) == 0)
+    {
+	buf += (stride / 4) * (height - 1);
+	stride = - stride;
+    }
+    
+    img = pixman_image_create_bits (fmt, width, height, buf, stride);
+
+    if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_COLOR)
+    {
+	pixman_image_set_indexed (img, &(rgb_palette[PIXMAN_FORMAT_BPP (fmt)]));
+    }
+    else if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_GRAY)
+    {
+	pixman_image_set_indexed (img, &(y_palette[PIXMAN_FORMAT_BPP (fmt)]));
+    }
+
+    if (prng_rand_n (16) == 0)
+	pixman_image_set_filter (img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    image_endian_swap (img);
+
+    if (used_fmt) *used_fmt = fmt;
+    return img;
+}
+
+/* Free random image, and optionally update crc32 based on its data */
+static uint32_t
+free_random_image (uint32_t initcrc,
+		   pixman_image_t *img,
+		   pixman_format_code_t fmt)
+{
+    uint32_t crc32 = 0;
+    uint32_t *data = pixman_image_get_data (img);
+
+    if (fmt != PIXMAN_null)
+	crc32 = compute_crc32_for_image (initcrc, img);
+
+    if (img->bits.rowstride < 0)
+	data += img->bits.rowstride * (img->bits.height - 1);
+
+    pixman_image_unref (img);
+    free (data);
+
+    return crc32;
+}
+
+static pixman_op_t op_list[] = {
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+#if 0 /* these use floating point math and are not always bitexact on different platforms */
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+#endif
+};
+
+static pixman_format_code_t img_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_a8,
+    PIXMAN_a1,
+    PIXMAN_r3g3b2,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+#if 0 /* These are going to use floating point in the near future */
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+#endif
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_null
+};
+
+static pixman_format_code_t mask_fmt_list[] = {
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    PIXMAN_null
+};
+
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int testnum, int verbose)
+{
+    pixman_image_t *src_img = NULL;
+    pixman_image_t *dst_img = NULL;
+    pixman_image_t *mask_img = NULL;
+    int src_width, src_height;
+    int dst_width, dst_height;
+    int src_stride, dst_stride;
+    int src_x, src_y;
+    int dst_x, dst_y;
+    int mask_x, mask_y;
+    int w, h;
+    pixman_op_t op;
+    pixman_format_code_t src_fmt, dst_fmt, mask_fmt;
+    uint32_t *srcbuf, *maskbuf;
+    uint32_t crc32;
+    int max_width, max_height, max_extra_stride;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    max_width = max_height = 24 + testnum / 10000;
+    max_extra_stride = 4 + testnum / 1000000;
+
+    if (max_width > 256)
+	max_width = 256;
+
+    if (max_height > 16)
+	max_height = 16;
+
+    if (max_extra_stride > 8)
+	max_extra_stride = 8;
+
+    prng_srand (testnum);
+
+    op = op_list[prng_rand_n (ARRAY_LENGTH (op_list))];
+
+    if (prng_rand_n (8))
+    {
+	/* normal image */
+	src_img = create_random_image (img_fmt_list, max_width, max_height,
+				       max_extra_stride, &src_fmt);
+    }
+    else
+    {
+	/* solid case */
+	src_img = create_random_image (img_fmt_list, 1, 1,
+				       max_extra_stride, &src_fmt);
+
+	pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+    }
+
+    dst_img = create_random_image (img_fmt_list, max_width, max_height,
+				   max_extra_stride, &dst_fmt);
+
+    src_width = pixman_image_get_width (src_img);
+    src_height = pixman_image_get_height (src_img);
+    src_stride = pixman_image_get_stride (src_img);
+
+    dst_width = pixman_image_get_width (dst_img);
+    dst_height = pixman_image_get_height (dst_img);
+    dst_stride = pixman_image_get_stride (dst_img);
+
+    srcbuf = pixman_image_get_data (src_img);
+
+    src_x = prng_rand_n (src_width);
+    src_y = prng_rand_n (src_height);
+    dst_x = prng_rand_n (dst_width);
+    dst_y = prng_rand_n (dst_height);
+
+    mask_img = NULL;
+    mask_fmt = PIXMAN_null;
+    mask_x = 0;
+    mask_y = 0;
+    maskbuf = NULL;
+
+    if ((src_fmt == PIXMAN_x8r8g8b8 || src_fmt == PIXMAN_x8b8g8r8) &&
+	(prng_rand_n (4) == 0))
+    {
+	/* PIXBUF */
+	mask_fmt = prng_rand_n (2) ? PIXMAN_a8r8g8b8 : PIXMAN_a8b8g8r8;
+	mask_img = pixman_image_create_bits (mask_fmt,
+	                                     src_width,
+	                                     src_height,
+	                                     srcbuf,
+	                                     src_stride);
+	mask_x = src_x;
+	mask_y = src_y;
+	maskbuf = srcbuf;
+    }
+    else if (prng_rand_n (2))
+    {
+	if (prng_rand_n (2))
+	{
+	    mask_img = create_random_image (mask_fmt_list, max_width, max_height,
+					   max_extra_stride, &mask_fmt);
+	}
+	else
+	{
+	    /* solid case */
+	    mask_img = create_random_image (mask_fmt_list, 1, 1,
+					   max_extra_stride, &mask_fmt);
+	    pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
+	}
+
+	if (prng_rand_n (2))
+	    pixman_image_set_component_alpha (mask_img, 1);
+
+	mask_x = prng_rand_n (pixman_image_get_width (mask_img));
+	mask_y = prng_rand_n (pixman_image_get_height (mask_img));
+    }
+
+
+    w = prng_rand_n (dst_width - dst_x + 1);
+    h = prng_rand_n (dst_height - dst_y + 1);
+
+    if (verbose)
+    {
+        printf ("op=%s\n", operator_name (op));
+	printf ("src_fmt=%s, dst_fmt=%s, mask_fmt=%s\n",
+	    format_name (src_fmt), format_name (dst_fmt),
+	    format_name (mask_fmt));
+	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+	    src_width, src_height, dst_width, dst_height);
+	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+	    src_x, src_y, dst_x, dst_y);
+	printf ("src_stride=%d, dst_stride=%d\n",
+	    src_stride, dst_stride);
+	printf ("w=%d, h=%d\n", w, h);
+    }
+
+    pixman_image_composite (op, src_img, mask_img, dst_img,
+			    src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
+
+    if (verbose)
+	print_image (dst_img);
+
+    free_random_image (0, src_img, PIXMAN_null);
+    crc32 = free_random_image (0, dst_img, dst_fmt);
+
+    if (mask_img)
+    {
+	if (srcbuf == maskbuf)
+	    pixman_image_unref(mask_img);
+	else
+	    free_random_image (0, mask_img, PIXMAN_null);
+    }
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    int i;
+
+    prng_srand (0);
+
+    for (i = 1; i <= 8; i++)
+    {
+	initialize_palette (&(rgb_palette[i]), i, TRUE);
+	initialize_palette (&(y_palette[i]), i, FALSE);
+    }
+
+    return fuzzer_test_main("blitters", 2000000,
+			    0xE0A07495,
+			    test_composite, argc, argv);
+}
diff --git a/src/pixman/test/check-formats.c b/src/pixman/test/check-formats.c
new file mode 100644
index 0000000..7edc198
--- /dev/null
+++ b/src/pixman/test/check-formats.c
@@ -0,0 +1,352 @@
+#include <ctype.h>
+#include "utils.h"
+
+static int
+check_op (pixman_op_t          op,
+          pixman_format_code_t src_format,
+          pixman_format_code_t dest_format)
+{
+    uint32_t src_alpha_mask, src_green_mask;
+    uint32_t dest_alpha_mask, dest_green_mask;
+    pixel_checker_t src_checker, dest_checker;
+    pixman_image_t *si, *di;
+    uint32_t sa, sg, da, dg;
+    uint32_t s, d;
+    int retval = 0;
+
+    pixel_checker_init (&src_checker, src_format);
+    pixel_checker_init (&dest_checker, dest_format);
+
+    pixel_checker_get_masks (
+        &src_checker, &src_alpha_mask, NULL, &src_green_mask, NULL);
+    pixel_checker_get_masks (
+        &dest_checker, &dest_alpha_mask, NULL, &dest_green_mask, NULL);
+
+    /* printf ("masks: %x %x %x %x\n", */
+    /* 	    src_alpha_mask, src_green_mask, */
+    /* 	    dest_alpha_mask, dest_green_mask); */
+
+    si = pixman_image_create_bits (src_format, 1, 1, &s, 4);
+    di = pixman_image_create_bits (dest_format, 1, 1, &d, 4);
+
+    sa = 0;
+    do
+    {
+        sg = 0;
+        do
+        {
+            da = 0;
+            do
+            {
+                dg = 0;
+                do
+                {
+                    color_t src_color, dest_color, result_color;
+                    uint32_t orig_d;
+
+                    s = sa | sg;
+                    d = da | dg;
+
+                    orig_d = d;
+
+		    pixel_checker_convert_pixel_to_color (&src_checker, s, &src_color);
+		    pixel_checker_convert_pixel_to_color (&dest_checker, d, &dest_color);
+
+		    do_composite (op, &src_color, NULL, &dest_color, &result_color, FALSE);
+
+
+		    if (!is_little_endian())
+                    {
+			s <<= 32 - PIXMAN_FORMAT_BPP (src_format);
+			d <<= 32 - PIXMAN_FORMAT_BPP (dest_format);
+                    }
+
+		    pixman_image_composite32 (op, si, NULL, di,
+					      0, 0, 0, 0, 0, 0, 1, 1);
+
+		    if (!is_little_endian())
+                        d >>= (32 - PIXMAN_FORMAT_BPP (dest_format));
+
+                    if (!pixel_checker_check (&dest_checker, d, &result_color))
+                    {
+                        printf ("---- test failed ----\n");
+                        printf ("operator: %-32s\n", operator_name (op));
+                        printf ("source:   %-12s pixel: %08x\n", format_name (src_format), s);
+                        printf ("dest:     %-12s pixel: %08x\n", format_name (dest_format), orig_d);
+                        printf ("got:      %-12s pixel: %08x\n", format_name (dest_format), d);
+
+                        retval = 1;
+                    }
+
+                    dg -= dest_green_mask;
+                    dg &= dest_green_mask;
+                }
+                while (dg != 0);
+
+                da -= dest_alpha_mask;
+                da &= dest_alpha_mask;
+            }
+            while (da != 0);
+
+            sg -= src_green_mask;
+            sg &= src_green_mask;
+        }
+        while (sg != 0);
+
+        sa -= src_alpha_mask;
+        sa &= src_alpha_mask;
+    }
+    while (sa != 0);
+
+    pixman_image_unref (si);
+    pixman_image_unref (di);
+
+    return retval;
+}
+
+static const pixman_op_t op_list[] =
+{
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+};
+
+static const pixman_format_code_t format_list[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a8r8g8b8_sRGB,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_a8,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_a1,
+};
+
+static pixman_format_code_t
+format_from_string (const char *s)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
+    {
+        if (strcasecmp (format_name (format_list[i]), s) == 0)
+            return format_list[i];
+    }
+
+    return PIXMAN_null;
+}
+
+static void
+emit (const char *s, int *n_chars)
+{
+    *n_chars += printf ("%s,", s);
+    if (*n_chars > 60)
+    {
+        printf ("\n    ");
+        *n_chars = 0;
+    }
+    else
+    {
+        printf (" ");
+        (*n_chars)++;
+    }
+}
+
+static void
+list_formats (void)
+{
+    int n_chars;
+    int i;
+
+    printf ("Formats:\n    ");
+
+    n_chars = 0;
+    for (i = 0; i < ARRAY_LENGTH (format_list); ++i)
+        emit (format_name (format_list[i]), &n_chars);
+
+    printf ("\n\n");
+}
+
+static void
+list_operators (void)
+{
+    char short_name [128] = { 0 };
+    int i, n_chars;
+
+    printf ("Operators:\n    ");
+
+    n_chars = 0;
+    for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+    {
+        pixman_op_t op = op_list[i];
+        int j;
+
+        snprintf (short_name, sizeof (short_name) - 1, "%s",
+                  operator_name (op) + strlen ("PIXMAN_OP_"));
+
+        for (j = 0; short_name[j] != '\0'; ++j)
+            short_name[j] = tolower (short_name[j]);
+
+        emit (short_name, &n_chars);
+    }
+
+    printf ("\n\n");
+}
+
+static pixman_op_t
+operator_from_string (const char *s)
+{
+    char full_name[128] = { 0 };
+    int i;
+
+    snprintf (full_name, (sizeof full_name) - 1, "PIXMAN_OP_%s", s);
+
+    for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+    {
+        pixman_op_t op = op_list[i];
+
+        if (strcasecmp (operator_name (op), full_name) == 0)
+            return op;
+    }
+
+    return PIXMAN_OP_NONE;
+}
+
+int
+main (int argc, char **argv)
+{
+    enum { OPTION_OP, OPTION_SRC, OPTION_DEST, LAST_OPTION } option;
+    pixman_format_code_t src_fmt, dest_fmt;
+    pixman_op_t op;
+
+    op = PIXMAN_OP_NONE;
+    src_fmt = PIXMAN_null;
+    dest_fmt = PIXMAN_null;
+
+    argc--;
+    argv++;
+
+    for (option = OPTION_OP; option < LAST_OPTION; ++option)
+    {
+        char *arg = NULL;
+
+        if (argc)
+        {
+            argc--;
+            arg = *argv++;
+        }
+
+        switch (option)
+        {
+        case OPTION_OP:
+            if (!arg)
+                printf ("  - missing operator\n");
+            else if ((op = operator_from_string (arg)) == PIXMAN_OP_NONE)
+                printf ("  - unknown operator %s\n", arg);
+            break;
+
+        case OPTION_SRC:
+            if (!arg)
+                printf ("  - missing source format\n");
+            else if ((src_fmt = format_from_string (arg)) == PIXMAN_null)
+                printf ("  - unknown source format %s\n", arg);
+            break;
+
+        case OPTION_DEST:
+            if (!arg)
+                printf ("  - missing destination format\n");
+            else if ((dest_fmt = format_from_string (arg)) == PIXMAN_null)
+                printf ("  - unknown destination format %s\n", arg);
+            break;
+
+        default:
+            assert (0);
+            break;
+        }
+    }
+
+    while (argc--)
+    {
+        op = PIXMAN_OP_NONE;
+        printf ("  - unexpected argument: %s\n", *argv++);
+    }
+
+    if (op == PIXMAN_OP_NONE || src_fmt == PIXMAN_null || dest_fmt == PIXMAN_null)
+    {
+        printf ("\nUsage:\n    check-formats <operator> <src-format> <dest-format>\n\n");
+        list_operators();
+        list_formats();
+
+        return -1;
+    }
+
+    return check_op (op, src_fmt, dest_fmt);
+}
diff --git a/src/pixman/test/combiner-test.c b/src/pixman/test/combiner-test.c
new file mode 100644
index 0000000..01f63a5
--- /dev/null
+++ b/src/pixman/test/combiner-test.c
@@ -0,0 +1,151 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+#include <sys/types.h>
+#include "pixman-private.h"
+
+static const pixman_op_t op_list[] =
+{
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+};
+
+static float
+rand_float (void)
+{
+    uint32_t u = prng_rand();
+
+    return *(float *)&u;
+}
+
+static void
+random_floats (argb_t *argb, int width)
+{
+    int i;
+
+    for (i = 0; i < width; ++i)
+    {
+	argb_t *p = argb + i;
+
+	p->a = rand_float();
+	p->r = rand_float();
+	p->g = rand_float();
+	p->b = rand_float();
+    }
+}
+
+#define WIDTH	512
+
+static pixman_combine_float_func_t
+lookup_combiner (pixman_implementation_t *imp, pixman_op_t op,
+		 pixman_bool_t component_alpha)
+{
+    pixman_combine_float_func_t f;
+
+    do
+    {
+	if (component_alpha)
+	    f = imp->combine_float_ca[op];
+	else
+	    f = imp->combine_float[op];
+	
+	imp = imp->fallback;
+    }
+    while (!f);
+
+    return f;
+}
+
+int
+main ()
+{
+    pixman_implementation_t *impl;
+    argb_t *src_bytes = malloc (WIDTH * sizeof (argb_t));
+    argb_t *mask_bytes = malloc (WIDTH * sizeof (argb_t));
+    argb_t *dest_bytes = malloc (WIDTH * sizeof (argb_t));
+    int i;
+
+    enable_divbyzero_exceptions();
+    
+    impl = _pixman_internal_only_get_implementation();
+    
+    prng_srand (0);
+
+    for (i = 0; i < ARRAY_LENGTH (op_list); ++i)
+    {
+	pixman_op_t op = op_list[i];
+	pixman_combine_float_func_t combiner;
+	int ca;
+
+	for (ca = 0; ca < 2; ++ca)
+	{
+	    combiner = lookup_combiner (impl, op, ca);
+
+	    random_floats (src_bytes, WIDTH);
+	    random_floats (mask_bytes, WIDTH);
+	    random_floats (dest_bytes, WIDTH);
+
+	    combiner (impl, op,
+		      (float *)dest_bytes,
+		      (float *)mask_bytes,
+		      (float *)src_bytes,
+		      WIDTH);
+	}
+    }	
+
+    return 0;
+}
diff --git a/src/pixman/test/composite-traps-test.c b/src/pixman/test/composite-traps-test.c
new file mode 100644
index 0000000..86a0355
--- /dev/null
+++ b/src/pixman/test/composite-traps-test.c
@@ -0,0 +1,252 @@
+/* Based loosely on scaling-test */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  48
+#define MAX_SRC_HEIGHT 48
+#define MAX_DST_WIDTH  48
+#define MAX_DST_HEIGHT 48
+#define MAX_STRIDE     4
+
+static pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_r5g6b5, PIXMAN_a1, PIXMAN_a4
+};
+
+static pixman_format_code_t mask_formats[] =
+{
+    PIXMAN_a1, PIXMAN_a4, PIXMAN_a8,
+};
+
+static pixman_op_t operators[] =
+{
+    PIXMAN_OP_OVER, PIXMAN_OP_ADD, PIXMAN_OP_SRC, PIXMAN_OP_IN
+};
+
+#define RANDOM_ELT(array)						\
+    ((array)[prng_rand_n(ARRAY_LENGTH((array)))])
+
+static void
+destroy_bits (pixman_image_t *image, void *data)
+{
+    fence_free (data);
+}
+
+static pixman_fixed_t
+random_fixed (int n)
+{
+    return prng_rand_n (n << 16);
+}
+
+/*
+ * Composite operation with pseudorandom images
+ */
+uint32_t
+test_composite (int      testnum,
+		int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_region16_t  clip;
+    int                dst_width, dst_height;
+    int                dst_stride;
+    int                dst_x, dst_y;
+    int                dst_bpp;
+    pixman_op_t        op;
+    uint32_t *         dst_bits;
+    uint32_t           crc32;
+    pixman_format_code_t mask_format, dst_format;
+    pixman_trapezoid_t *traps;
+    int src_x, src_y;
+    int n_traps;
+
+    static pixman_color_t colors[] =
+    {
+	{ 0xffff, 0xffff, 0xffff, 0xffff },
+	{ 0x0000, 0x0000, 0x0000, 0x0000 },
+	{ 0xabcd, 0xabcd, 0x0000, 0xabcd },
+	{ 0x0000, 0x0000, 0x0000, 0xffff },
+	{ 0x0101, 0x0101, 0x0101, 0x0101 },
+	{ 0x7777, 0x6666, 0x5555, 0x9999 },
+    };
+    
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    prng_srand (testnum);
+
+    op = RANDOM_ELT (operators);
+    mask_format = RANDOM_ELT (mask_formats);
+
+    /* Create source image */
+    
+    if (prng_rand_n (4) == 0)
+    {
+	src_img = pixman_image_create_solid_fill (
+	    &(colors[prng_rand_n (ARRAY_LENGTH (colors))]));
+
+	src_x = 10;
+	src_y = 234;
+    }
+    else
+    {
+	pixman_format_code_t src_format = RANDOM_ELT(formats);
+	int src_bpp = (PIXMAN_FORMAT_BPP (src_format) + 7) / 8;
+	int src_width = prng_rand_n (MAX_SRC_WIDTH) + 1;
+	int src_height = prng_rand_n (MAX_SRC_HEIGHT) + 1;
+	int src_stride = src_width * src_bpp + prng_rand_n (MAX_STRIDE) * src_bpp;
+	uint32_t *bits, *orig;
+
+	src_x = -(src_width / 4) + prng_rand_n (src_width * 3 / 2);
+	src_y = -(src_height / 4) + prng_rand_n (src_height * 3 / 2);
+
+	src_stride = (src_stride + 3) & ~3;
+	
+	orig = bits = (uint32_t *)make_random_bytes (src_stride * src_height);
+
+	if (prng_rand_n (2) == 0)
+	{
+	    bits += (src_stride / 4) * (src_height - 1);
+	    src_stride = - src_stride;
+	}
+	
+	src_img = pixman_image_create_bits (
+	    src_format, src_width, src_height, bits, src_stride);
+
+	pixman_image_set_destroy_function (src_img, destroy_bits, orig);
+
+	if (prng_rand_n (8) == 0)
+	{
+	    pixman_box16_t clip_boxes[2];
+	    int            n = prng_rand_n (2) + 1;
+	    
+	    for (i = 0; i < n; i++)
+	    {
+		clip_boxes[i].x1 = prng_rand_n (src_width);
+		clip_boxes[i].y1 = prng_rand_n (src_height);
+		clip_boxes[i].x2 =
+		    clip_boxes[i].x1 + prng_rand_n (src_width - clip_boxes[i].x1);
+		clip_boxes[i].y2 =
+		    clip_boxes[i].y1 + prng_rand_n (src_height - clip_boxes[i].y1);
+		
+		if (verbose)
+		{
+		    printf ("source clip box: [%d,%d-%d,%d]\n",
+			    clip_boxes[i].x1, clip_boxes[i].y1,
+			    clip_boxes[i].x2, clip_boxes[i].y2);
+		}
+	    }
+	    
+	    pixman_region_init_rects (&clip, clip_boxes, n);
+	    pixman_image_set_clip_region (src_img, &clip);
+	    pixman_image_set_source_clipping (src_img, 1);
+	    pixman_region_fini (&clip);
+	}
+
+	image_endian_swap (src_img);
+    }
+
+    /* Create destination image */
+    {
+	dst_format = RANDOM_ELT(formats);
+	dst_bpp = (PIXMAN_FORMAT_BPP (dst_format) + 7) / 8;
+	dst_width = prng_rand_n (MAX_DST_WIDTH) + 1;
+	dst_height = prng_rand_n (MAX_DST_HEIGHT) + 1;
+	dst_stride = dst_width * dst_bpp + prng_rand_n (MAX_STRIDE) * dst_bpp;
+	dst_stride = (dst_stride + 3) & ~3;
+	
+	dst_bits = (uint32_t *)make_random_bytes (dst_stride * dst_height);
+
+	if (prng_rand_n (2) == 0)
+	{
+	    dst_bits += (dst_stride / 4) * (dst_height - 1);
+	    dst_stride = - dst_stride;
+	}
+	
+	dst_x = -(dst_width / 4) + prng_rand_n (dst_width * 3 / 2);
+	dst_y = -(dst_height / 4) + prng_rand_n (dst_height * 3 / 2);
+	
+	dst_img = pixman_image_create_bits (
+	    dst_format, dst_width, dst_height, dst_bits, dst_stride);
+
+	image_endian_swap (dst_img);
+    }
+
+    /* Create traps */
+    {
+	int i;
+
+	n_traps = prng_rand_n (25);
+	traps = fence_malloc (n_traps * sizeof (pixman_trapezoid_t));
+
+	for (i = 0; i < n_traps; ++i)
+	{
+	    pixman_trapezoid_t *t = &(traps[i]);
+	    
+	    t->top = random_fixed (MAX_DST_HEIGHT) - MAX_DST_HEIGHT / 2;
+	    t->bottom = t->top + random_fixed (MAX_DST_HEIGHT);
+	    t->left.p1.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
+	    t->left.p1.y = t->top - random_fixed (50);
+	    t->left.p2.x = random_fixed (MAX_DST_WIDTH) - MAX_DST_WIDTH / 2;
+	    t->left.p2.y = t->bottom + random_fixed (50);
+	    t->right.p1.x = t->left.p1.x + random_fixed (MAX_DST_WIDTH);
+	    t->right.p1.y = t->top - random_fixed (50);
+	    t->right.p2.x = t->left.p2.x + random_fixed (MAX_DST_WIDTH);
+	    t->right.p2.y = t->bottom - random_fixed (50);
+	}
+    }
+    
+    if (prng_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = prng_rand_n (2) + 1;
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = prng_rand_n (dst_width);
+	    clip_boxes[i].y1 = prng_rand_n (dst_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + prng_rand_n (dst_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + prng_rand_n (dst_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("destination clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (dst_img, &clip);
+	pixman_region_fini (&clip);
+    }
+
+    pixman_composite_trapezoids (op, src_img, dst_img, mask_format,
+				 src_x, src_y, dst_x, dst_y, n_traps, traps);
+
+    crc32 = compute_crc32_for_image (0, dst_img);
+
+    if (verbose)
+	print_image (dst_img);
+
+    if (dst_stride < 0)
+	dst_bits += (dst_stride / 4) * (dst_height - 1);
+    
+    fence_free (dst_bits);
+    
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+    fence_free (traps);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main("composite traps", 40000, 0xAF41D210,
+			    test_composite, argc, argv);
+}
diff --git a/src/pixman/test/composite.c b/src/pixman/test/composite.c
new file mode 100644
index 0000000..9e51a8f
--- /dev/null
+++ b/src/pixman/test/composite.c
@@ -0,0 +1,536 @@
+/*
+ * Copyright © 2005 Eric Anholt
+ * Copyright © 2009 Chris Wilson
+ * Copyright © 2010 Soeren Sandmann
+ * Copyright © 2010 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Eric Anholt not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Eric Anholt makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * ERIC ANHOLT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL ERIC ANHOLT BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <stdio.h>
+#include <stdlib.h> /* abort() */
+#include <math.h>
+#include <time.h>
+#include "utils.h"
+
+typedef struct image_t image_t;
+
+static const color_t colors[] =
+{
+    { 1.0, 1.0, 1.0, 1.0 },
+    { 1.0, 1.0, 1.0, 0.0 },
+    { 0.0, 0.0, 0.0, 1.0 },
+    { 0.0, 0.0, 0.0, 0.0 },
+    { 1.0, 0.0, 0.0, 1.0 },
+    { 0.0, 1.0, 0.0, 1.0 },
+    { 0.0, 0.0, 1.0, 1.0 },
+    { 0.5, 0.0, 0.0, 0.5 },
+};
+
+static uint16_t
+_color_double_to_short (double d)
+{
+    uint32_t i;
+
+    i = (uint32_t) (d * 65536);
+    i -= (i >> 16);
+
+    return i;
+}
+
+static void
+compute_pixman_color (const color_t *color,
+		      pixman_color_t *out)
+{
+    out->red   = _color_double_to_short (color->r);
+    out->green = _color_double_to_short (color->g);
+    out->blue  = _color_double_to_short (color->b);
+    out->alpha = _color_double_to_short (color->a);
+}
+
+#define REPEAT 0x01000000
+#define FLAGS  0xff000000
+
+static const int sizes[] =
+{
+    0,
+    1,
+    1 | REPEAT,
+    10
+};
+
+static const pixman_format_code_t formats[] =
+{
+    /* 32 bpp formats */
+    PIXMAN_a8r8g8b8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_a2b10g10r10,
+    
+    /* sRGB formats */
+    PIXMAN_a8r8g8b8_sRGB,
+
+    /* 24 bpp formats */
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+
+    /* 16 bpp formats */
+    PIXMAN_x1r5g5b5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+
+    /* 8 bpp formats */
+    PIXMAN_a8,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_x4a4,
+
+    /* 4 bpp formats */
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+
+    /* 1 bpp formats */
+    PIXMAN_a1,
+};
+
+struct image_t
+{
+    pixman_image_t *image;
+    pixman_format_code_t format;
+    const color_t *color;
+    pixman_repeat_t repeat;
+    int size;
+};
+
+static const pixman_op_t operators[] =
+{
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+};
+
+static uint32_t
+get_value (pixman_image_t *image)
+{
+    uint32_t value = *(uint32_t *)pixman_image_get_data (image);
+
+#ifdef WORDS_BIGENDIAN
+    {
+	pixman_format_code_t format = pixman_image_get_format (image);
+	value >>= 8 * sizeof(value) - PIXMAN_FORMAT_BPP (format);
+    }
+#endif
+
+    return value;
+}
+
+static char *
+describe_image (image_t *info, char *buf)
+{
+    if (info->size)
+    {
+	sprintf (buf, "%s, %dx%d%s",
+		 format_name (info->format),
+		 info->size, info->size,
+		 info->repeat ? " R" :"");
+    }
+    else
+    {
+	sprintf (buf, "solid");
+    }
+
+    return buf;
+}
+
+static char *
+describe_color (const color_t *color, char *buf)
+{
+    sprintf (buf, "%.3f %.3f %.3f %.3f",
+	     color->r, color->g, color->b, color->a);
+
+    return buf;
+}
+
+static pixman_bool_t
+composite_test (image_t *dst,
+		pixman_op_t op,
+		image_t *src,
+		image_t *mask,
+		pixman_bool_t component_alpha,
+		int testno)
+{
+    color_t expected, tdst, tsrc, tmsk;
+    pixel_checker_t checker;
+
+    if (mask)
+    {
+	pixman_image_set_component_alpha (mask->image, component_alpha);
+
+	pixman_image_composite (op, src->image, mask->image, dst->image,
+				0, 0, 0, 0, 0, 0, dst->size, dst->size);
+    }
+    else
+    {
+	pixman_image_composite (op, src->image, NULL, dst->image,
+				0, 0,
+				0, 0,
+				0, 0,
+				dst->size, dst->size);
+    }
+
+    tdst = *dst->color;
+    tsrc = *src->color;
+
+    if (mask)
+    {
+	tmsk = *mask->color;
+    }
+
+    /* It turns out that by construction all source, mask etc. colors are
+     * linear because they are made from fills, and fills are always in linear
+     * color space.  However, if they have been converted to bitmaps, we need
+     * to simulate the sRGB approximation to pass the test cases.
+     */
+    if (src->size)
+    {
+	if (PIXMAN_FORMAT_TYPE (src->format) == PIXMAN_TYPE_ARGB_SRGB)
+        {
+	    tsrc.r = convert_linear_to_srgb (tsrc.r);
+	    tsrc.g = convert_linear_to_srgb (tsrc.g);
+	    tsrc.b = convert_linear_to_srgb (tsrc.b);
+	    round_color (src->format, &tsrc);
+	    tsrc.r = convert_srgb_to_linear (tsrc.r);
+	    tsrc.g = convert_srgb_to_linear (tsrc.g);
+	    tsrc.b = convert_srgb_to_linear (tsrc.b);
+	}
+        else
+        {
+	    round_color (src->format, &tsrc);
+	}
+    }
+
+    if (mask && mask->size)
+    {
+	if (PIXMAN_FORMAT_TYPE (mask->format) == PIXMAN_TYPE_ARGB_SRGB)
+	{
+	    tmsk.r = convert_linear_to_srgb (tmsk.r);
+	    tmsk.g = convert_linear_to_srgb (tmsk.g);
+	    tmsk.b = convert_linear_to_srgb (tmsk.b);
+	    round_color (mask->format, &tmsk);
+	    tmsk.r = convert_srgb_to_linear (tmsk.r);
+	    tmsk.g = convert_srgb_to_linear (tmsk.g);
+	    tmsk.b = convert_srgb_to_linear (tmsk.b);
+	}
+	else
+	{
+	    round_color (mask->format, &tmsk);
+	}
+    }
+
+    if (mask)
+    {
+	if (component_alpha && PIXMAN_FORMAT_R (mask->format) == 0)
+	{
+	    /* Ax component-alpha masks expand alpha into
+	     * all color channels.
+	     */
+	    tmsk.r = tmsk.g = tmsk.b = tmsk.a;
+	}
+    }
+
+    if (PIXMAN_FORMAT_TYPE (dst->format) == PIXMAN_TYPE_ARGB_SRGB)
+    {
+	tdst.r = convert_linear_to_srgb (tdst.r);
+	tdst.g = convert_linear_to_srgb (tdst.g);
+	tdst.b = convert_linear_to_srgb (tdst.b);
+    	round_color (dst->format, &tdst);
+	tdst.r = convert_srgb_to_linear (tdst.r);
+	tdst.g = convert_srgb_to_linear (tdst.g);
+	tdst.b = convert_srgb_to_linear (tdst.b);
+    }
+    else
+    {
+    	round_color (dst->format, &tdst);
+    }
+
+    do_composite (op,
+		  &tsrc,
+		  mask? &tmsk : NULL,
+		  &tdst,
+		  &expected,
+		  component_alpha);
+
+    pixel_checker_init (&checker, dst->format);
+
+    if (!pixel_checker_check (&checker, get_value (dst->image), &expected))
+    {
+	char buf[40], buf2[40];
+	int a, r, g, b;
+	uint32_t pixel;
+
+	printf ("---- Test %d failed ----\n", testno);
+	printf ("Operator:      %s %s\n",
+                operator_name (op), component_alpha ? "CA" : "");
+
+	printf ("Source:        %s\n", describe_image (src, buf));
+	if (mask != NULL)
+	    printf ("Mask:          %s\n", describe_image (mask, buf));
+
+	printf ("Destination:   %s\n\n", describe_image (dst, buf));
+	printf ("               R     G     B     A         Rounded\n");
+	printf ("Source color:  %s     %s\n",
+		describe_color (src->color, buf),
+		describe_color (&tsrc, buf2));
+	if (mask)
+	{
+	    printf ("Mask color:    %s     %s\n",
+		    describe_color (mask->color, buf),
+		    describe_color (&tmsk, buf2));
+	}
+	printf ("Dest. color:   %s     %s\n",
+		describe_color (dst->color, buf),
+		describe_color (&tdst, buf2));
+
+	pixel = get_value (dst->image);
+
+	printf ("Expected:      %s\n", describe_color (&expected, buf));
+
+	pixel_checker_split_pixel (&checker, pixel, &a, &r, &g, &b);
+
+	printf ("Got:           %5d %5d %5d %5d  [pixel: 0x%08x]\n", r, g, b, a, pixel);
+	pixel_checker_get_min (&checker, &expected, &a, &r, &g, &b);
+	printf ("Min accepted:  %5d %5d %5d %5d\n", r, g, b, a);
+	pixel_checker_get_max (&checker, &expected, &a, &r, &g, &b);
+	printf ("Max accepted:  %5d %5d %5d %5d\n", r, g, b, a);
+
+	return FALSE;
+    }
+    return TRUE;
+}
+
+static void
+image_init (image_t *info,
+	    int color,
+	    int format,
+	    int size)
+{
+    pixman_color_t fill;
+
+    info->color = &colors[color];
+    compute_pixman_color (info->color, &fill);
+
+    info->format = formats[format];
+    info->size = sizes[size] & ~FLAGS;
+    info->repeat = PIXMAN_REPEAT_NONE;
+
+    if (info->size)
+    {
+	pixman_image_t *solid;
+
+	info->image = pixman_image_create_bits (info->format,
+						info->size, info->size,
+						NULL, 0);
+
+	solid = pixman_image_create_solid_fill (&fill);
+	pixman_image_composite32 (PIXMAN_OP_SRC, solid, NULL, info->image,
+				  0, 0, 0, 0, 0, 0, info->size, info->size);
+	pixman_image_unref (solid);
+
+	if (sizes[size] & REPEAT)
+	{
+	    pixman_image_set_repeat (info->image, PIXMAN_REPEAT_NORMAL);
+	    info->repeat = PIXMAN_REPEAT_NORMAL;
+	}
+    }
+    else
+    {
+	info->image = pixman_image_create_solid_fill (&fill);
+    }
+}
+
+static void
+image_fini (image_t *info)
+{
+    pixman_image_unref (info->image);
+}
+
+static int
+random_size (void)
+{
+    return prng_rand_n (ARRAY_LENGTH (sizes));
+}
+
+static int
+random_color (void)
+{
+    return prng_rand_n (ARRAY_LENGTH (colors));
+}
+
+static int
+random_format (void)
+{
+    return prng_rand_n (ARRAY_LENGTH (formats));
+}
+
+static pixman_bool_t
+run_test (uint32_t seed)
+{
+    image_t src, mask, dst;
+    pixman_op_t op;
+    int ca;
+    int ok;
+
+    prng_srand (seed);
+
+    image_init (&dst, random_color(), random_format(), 1);
+    image_init (&src, random_color(), random_format(), random_size());
+    image_init (&mask, random_color(), random_format(), random_size());
+
+    op = operators [prng_rand_n (ARRAY_LENGTH (operators))];
+
+    ca = prng_rand_n (3);
+
+    switch (ca)
+    {
+    case 0:
+	ok = composite_test (&dst, op, &src, NULL, FALSE, seed);
+	break;
+    case 1:
+	ok = composite_test (&dst, op, &src, &mask, FALSE, seed);
+	break;
+    case 2:
+	ok = composite_test (&dst, op, &src, &mask,
+			     mask.size? TRUE : FALSE, seed);
+	break;
+    default:
+	ok = FALSE;
+	break;
+    }
+
+    image_fini (&src);
+    image_fini (&mask);
+    image_fini (&dst);
+
+    return ok;
+}
+
+int
+main (int argc, char **argv)
+{
+#define N_TESTS (8 * 1024 * 1024)
+    int result = 0;
+    uint32_t seed;
+    int32_t i;
+
+    if (argc > 1)
+    {
+	char *end;
+
+	i = strtol (argv[1], &end, 0);
+
+	if (end != argv[1])
+	{
+	    if (!run_test (i))
+		return 1;
+	    else
+		return 0;
+	}
+	else
+	{
+	    printf ("Usage:\n\n   %s <number>\n\n", argv[0]);
+	    return -1;
+	}
+    }
+
+    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
+	seed = get_random_seed();
+    else
+	seed = 1;
+
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(result, argv, seed)
+#endif
+    for (i = 0; i <= N_TESTS; ++i)
+    {
+	if (!result && !run_test (i + seed))
+	{
+	    printf ("Test 0x%08X failed.\n", seed + i);
+
+	    result = seed + i;
+	}
+    }
+
+    return result;
+}
diff --git a/src/pixman/test/fetch-test.c b/src/pixman/test/fetch-test.c
new file mode 100644
index 0000000..04e8cc5
--- /dev/null
+++ b/src/pixman/test/fetch-test.c
@@ -0,0 +1,205 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define SIZE 1024
+
+static pixman_indexed_t mono_palette =
+{
+    0, { 0x00000000, 0x00ffffff },
+};
+
+
+typedef struct {
+    pixman_format_code_t format;
+    int width, height;
+    int stride;
+    uint32_t src[SIZE];
+    uint32_t dst[SIZE];
+    pixman_indexed_t *indexed;
+} testcase_t;
+
+static testcase_t testcases[] =
+{
+    {
+	PIXMAN_a8r8g8b8,
+	2, 2,
+	8,
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	NULL,
+    },
+    {
+	PIXMAN_r8g8b8a8,
+	2, 2,
+	8,
+	{ 0x11223300, 0x55667744,
+	  0x99aabb88, 0xddeeffcc },
+	{ 0x00112233, 0x44556677,
+	  0x8899aabb, 0xccddeeff },
+	NULL,
+    },
+    {
+	PIXMAN_g1,
+	8, 2,
+	4,
+#ifdef WORDS_BIGENDIAN
+	{
+	    0xaa000000,
+	    0x55000000
+	},
+#else
+	{
+	    0x00000055,
+	    0x000000aa
+	},
+#endif
+	{
+	    0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000,
+	    0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff, 0x00000000, 0x00ffffff
+	},
+	&mono_palette,
+    },
+#if 0
+    {
+	PIXMAN_g8,
+	4, 2,
+	4,
+	{ 0x01234567,
+	  0x89abcdef },
+	{ 0x00010101, 0x00232323, 0x00454545, 0x00676767,
+	  0x00898989, 0x00ababab, 0x00cdcdcd, 0x00efefef, },
+    },
+#endif
+    /* FIXME: make this work on big endian */
+    {
+	PIXMAN_yv12,
+	8, 2,
+	8,
+#ifdef WORDS_BIGENDIAN
+	{
+	    0x00ff00ff, 0x00ff00ff,
+	    0xff00ff00, 0xff00ff00,
+	    0x80ff8000,
+	    0x800080ff
+	},
+#else
+	{
+	    0xff00ff00, 0xff00ff00,
+	    0x00ff00ff, 0x00ff00ff,
+	    0x0080ff80,
+	    0xff800080
+	},
+#endif
+	{
+	    0xff000000, 0xffffffff, 0xffb80000, 0xffffe113,
+	    0xff000000, 0xffffffff, 0xff0023ee, 0xff4affff,
+	    0xffffffff, 0xff000000, 0xffffe113, 0xffb80000,
+	    0xffffffff, 0xff000000, 0xff4affff, 0xff0023ee,
+	},
+    },
+};
+
+int n_test_cases = ARRAY_LENGTH (testcases);
+
+
+static uint32_t
+reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+	return *(uint8_t *)src;
+    case 2:
+	return *(uint16_t *)src;
+    case 4:
+	return *(uint32_t *)src;
+    default:
+	assert(0);
+	return 0; /* silence MSVC */
+    }
+}
+
+
+static void
+writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+	*(uint8_t *)src = value;
+	break;
+    case 2:
+	*(uint16_t *)src = value;
+	break;
+    case 4:
+	*(uint32_t *)src = value;
+	break;
+    default:
+	assert(0);
+    }
+}
+
+
+int
+main (int argc, char **argv)
+{
+    uint32_t dst[SIZE];
+    pixman_image_t *src_img;
+    pixman_image_t *dst_img;
+    int i, j, x, y;
+    int ret = 0;
+
+    for (i = 0; i < n_test_cases; ++i)
+    {
+	for (j = 0; j < 2; ++j)
+	{
+	    src_img = pixman_image_create_bits (testcases[i].format,
+						testcases[i].width,
+						testcases[i].height,
+						testcases[i].src,
+						testcases[i].stride);
+	    pixman_image_set_indexed(src_img, testcases[i].indexed);
+
+	    dst_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+						testcases[i].width,
+						testcases[i].height,
+						dst,
+						testcases[i].width*4);
+
+	    if (j)
+	    {
+		pixman_image_set_accessors (src_img, reader, writer);
+		pixman_image_set_accessors (dst_img, reader, writer);
+	    }
+
+	    pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
+				    0, 0, 0, 0, 0, 0, testcases[i].width, testcases[i].height);
+
+	    pixman_image_unref (src_img);
+	    pixman_image_unref (dst_img);
+
+	    for (y = 0; y < testcases[i].height; ++y)
+	    {
+		for (x = 0; x < testcases[i].width; ++x)
+		{
+		    int offset = y * testcases[i].width + x;
+
+		    if (dst[offset] != testcases[i].dst[offset])
+		    {
+			printf ("test %i%c: pixel mismatch at (x=%d,y=%d): %08x expected, %08x obtained\n",
+			        i + 1, 'a' + j,
+			        x, y,
+			        testcases[i].dst[offset], dst[offset]);
+			ret = 1;
+		    }
+		}
+	    }
+	}
+    }
+
+    return ret;
+}
diff --git a/src/pixman/test/fuzzer-find-diff.pl b/src/pixman/test/fuzzer-find-diff.pl
new file mode 100755
index 0000000..e1d67fb
--- /dev/null
+++ b/src/pixman/test/fuzzer-find-diff.pl
@@ -0,0 +1,75 @@
+#!/usr/bin/env perl
+
+$usage = "Usage:
+  fuzzer-find-diff.pl reference_binary new_binary [number_of_tests_to_run]
+
+The first two input arguments are the commands to run the test programs
+based on fuzzer_test_main() function from 'util.c' (preferably they should
+be statically compiled, this can be achieved via '--disable-shared' pixman
+configure option). The third optional argument is the number of test rounds
+to run (if not specified, then testing runs infinitely or until some problem
+is detected).
+
+Usage examples:
+  fuzzer-find-diff.pl ./blitters-test-with-sse-disabled ./blitters-test 9000000
+  fuzzer-find-diff.pl ./blitters-test \"ssh ppc64_host /path/to/blitters-test\"
+";
+
+$#ARGV >= 1 or die $usage;
+
+$batch_size = 10000;
+
+if ($#ARGV >= 2) {
+    $number_of_tests = int($ARGV[2]);
+} else {
+    $number_of_tests = -1
+}
+
+sub test_range {
+    my $min = shift;
+    my $max = shift;
+
+    # check that [$min, $max] range is "bad", otherwise return
+    if (`$ARGV[0] $min $max 2>/dev/null` eq `$ARGV[1] $min $max 2>/dev/null`) {
+        return;
+    }
+
+    # check that $min itself is "good", otherwise return
+    if (`$ARGV[0] $min 2>/dev/null` ne `$ARGV[1] $min 2>/dev/null`) {
+        return $min;
+    }
+
+    # start bisecting
+    while ($max != $min + 1) {
+        my $avg = int(($min + $max) / 2);
+        my $res1 = `$ARGV[0] $min $avg 2>/dev/null`;
+        my $res2 = `$ARGV[1] $min $avg 2>/dev/null`;
+        if ($res1 ne $res2) {
+            $max = $avg;
+        } else {
+            $min = $avg;
+        }
+    }
+    return $max;
+}
+
+$base = 1;
+while ($number_of_tests <= 0 || $base <= $number_of_tests) {
+    printf("testing %-12d\r", $base + $batch_size - 1);
+    my $res = test_range($base, $base + $batch_size - 1);
+    if ($res) {
+        printf("Failure: results are different for test %d:\n", $res);
+
+        printf("\n-- ref --\n");
+        print `$ARGV[0] $res`;
+        printf("-- new --\n");
+        print `$ARGV[1] $res`;
+
+        printf("The problematic conditions can be reproduced by running:\n");
+        printf("$ARGV[1] %d\n", $res);
+
+        exit(1);
+    }
+    $base += $batch_size;
+}
+printf("Success: %d tests finished\n", $base - 1);
diff --git a/src/pixman/test/glyph-test.c b/src/pixman/test/glyph-test.c
new file mode 100644
index 0000000..1811add
--- /dev/null
+++ b/src/pixman/test/glyph-test.c
@@ -0,0 +1,332 @@
+#include <stdlib.h>
+#include "utils.h"
+
+static const pixman_format_code_t glyph_formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_r3g3b2,
+    PIXMAN_null,
+};
+
+static const pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_a8,
+    PIXMAN_a1,
+    PIXMAN_r3g3b2,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+#if 0
+    /* These use floating point */
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+#endif
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_null,
+};
+
+static const pixman_op_t operators[] =
+{
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD
+};
+
+enum
+{
+    ALLOW_CLIPPED		= (1 << 0),
+    ALLOW_ALPHA_MAP		= (1 << 1),
+    ALLOW_SOURCE_CLIPPING	= (1 << 2),
+    ALLOW_REPEAT		= (1 << 3),
+    ALLOW_SOLID			= (1 << 4),
+    ALLOW_FENCED_MEMORY		= (1 << 5),
+};
+
+static void
+destroy_fenced (pixman_image_t *image, void *data)
+{
+    fence_free (data);
+}
+
+static void
+destroy_malloced (pixman_image_t *image, void *data)
+{
+    free (data);
+}
+
+static pixman_format_code_t
+random_format (const pixman_format_code_t *formats)
+{
+    int i;
+    i = 0;
+    while (formats[i] != PIXMAN_null)
+	++i;
+    return formats[prng_rand_n (i)];
+}
+
+static pixman_image_t *
+create_image (int max_size, const pixman_format_code_t *formats, uint32_t flags)
+{
+    int width, height;
+    pixman_image_t *image;
+    pixman_format_code_t format;
+    uint32_t *data;
+    int bpp;
+    int stride;
+    int i;
+    pixman_image_destroy_func_t destroy;
+
+    if ((flags & ALLOW_SOLID) && prng_rand_n (4) == 0)
+    {
+	pixman_color_t color;
+
+	color.alpha = prng_rand();
+	color.red = prng_rand();
+	color.green = prng_rand();
+	color.blue = prng_rand();
+
+	return pixman_image_create_solid_fill (&color);
+    }
+
+    width = prng_rand_n (max_size) + 1;
+    height = prng_rand_n (max_size) + 1;
+    format = random_format (formats);
+
+    bpp = PIXMAN_FORMAT_BPP (format);
+    stride = (width * bpp + 7) / 8 + prng_rand_n (17);
+    stride = (stride + 3) & ~3;
+
+    if (prng_rand_n (64) == 0)
+    {
+	if (!(data = (uint32_t *)make_random_bytes (stride * height)))
+	{
+	    fprintf (stderr, "Out of memory\n");
+	    abort ();
+	}
+	destroy = destroy_fenced;
+    }
+    else
+    {
+	data = malloc (stride * height);
+	prng_randmemset (data, height * stride, 0);
+	destroy = destroy_malloced;
+    }
+
+    image = pixman_image_create_bits (format, width, height, data, stride);
+    pixman_image_set_destroy_function (image, destroy, data);
+
+    if ((flags & ALLOW_CLIPPED) && prng_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[8];
+	pixman_region16_t clip;
+	int n = prng_rand_n (8) + 1;
+
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = prng_rand_n (width);
+	    clip_boxes[i].y1 = prng_rand_n (height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + prng_rand_n (width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + prng_rand_n (height - clip_boxes[i].y1);
+	}
+
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (image, &clip);
+	pixman_region_fini (&clip);
+    }
+
+    if ((flags & ALLOW_SOURCE_CLIPPING) && prng_rand_n (4) == 0)
+    {
+	pixman_image_set_source_clipping (image, TRUE);
+	pixman_image_set_has_client_clip (image, TRUE);
+    }
+
+    if ((flags & ALLOW_ALPHA_MAP) && prng_rand_n (16) == 0)
+    {
+	pixman_image_t *alpha_map;
+	int alpha_x, alpha_y;
+
+	alpha_x = prng_rand_n (width);
+	alpha_y = prng_rand_n (height);
+	alpha_map =
+	    create_image (max_size, formats, (flags & ~(ALLOW_ALPHA_MAP | ALLOW_SOLID)));
+	pixman_image_set_alpha_map (image, alpha_map, alpha_x, alpha_y);
+	pixman_image_unref (alpha_map);
+    }
+
+    if ((flags & ALLOW_REPEAT) && prng_rand_n (2) == 0)
+	pixman_image_set_repeat (image, prng_rand_n (4));
+
+    image_endian_swap (image);
+
+    return image;
+}
+
+#define KEY1(p) ((void *)(((uintptr_t)p) ^ (0xa7e23dfaUL)))
+#define KEY2(p) ((void *)(((uintptr_t)p) ^ (0xabcd9876UL)))
+
+#define MAX_GLYPHS 32
+
+uint32_t
+test_glyphs (int testnum, int verbose)
+{
+    pixman_image_t *glyph_images[MAX_GLYPHS];
+    pixman_glyph_t glyphs[4 * MAX_GLYPHS];
+    uint32_t crc32 = 0;
+    pixman_image_t *source, *dest;
+    int n_glyphs, i;
+    pixman_glyph_cache_t *cache;
+
+    prng_srand (testnum);
+
+    cache = pixman_glyph_cache_create ();
+
+    source = create_image (300, formats,
+			   ALLOW_CLIPPED | ALLOW_ALPHA_MAP |
+			   ALLOW_SOURCE_CLIPPING |
+			   ALLOW_REPEAT | ALLOW_SOLID);
+
+    dest = create_image (128, formats,
+			 ALLOW_CLIPPED | ALLOW_ALPHA_MAP |
+			 ALLOW_SOURCE_CLIPPING);
+
+    pixman_glyph_cache_freeze (cache);
+
+    n_glyphs = prng_rand_n (MAX_GLYPHS);
+    for (i = 0; i < n_glyphs; ++i)
+	glyph_images[i] = create_image (32, glyph_formats, 0);
+
+    for (i = 0; i < 4 * n_glyphs; ++i)
+    {
+	int g = prng_rand_n (n_glyphs);
+	pixman_image_t *glyph_img = glyph_images[g];
+	void *key1 = KEY1 (glyph_img);
+	void *key2 = KEY2 (glyph_img);
+	const void *glyph;
+
+	if (!(glyph = pixman_glyph_cache_lookup (cache, key1, key2)))
+	{
+	    glyph =
+		pixman_glyph_cache_insert (cache, key1, key2, 5, 8, glyph_img);
+	}
+
+	glyphs[i].glyph = glyph;
+	glyphs[i].x = prng_rand_n (128);
+	glyphs[i].y = prng_rand_n (128);
+    }
+
+    if (prng_rand_n (2) == 0)
+    {
+	int src_x = prng_rand_n (300) - 150;
+	int src_y = prng_rand_n (300) - 150;
+	int mask_x = prng_rand_n (64) - 32;
+	int mask_y = prng_rand_n (64) - 32;
+	int dest_x = prng_rand_n (64) - 32;
+	int dest_y = prng_rand_n (64) - 32;
+	int width = prng_rand_n (64);
+	int height = prng_rand_n (64);
+	pixman_op_t op = operators[prng_rand_n (ARRAY_LENGTH (operators))];
+	pixman_format_code_t format = random_format (glyph_formats);
+
+	pixman_composite_glyphs (
+	    op,
+	    source, dest, format,
+	    src_x, src_y,
+	    mask_x, mask_y,
+	    dest_x, dest_y,
+	    width, height,
+	    cache, 4 * n_glyphs, glyphs);
+    }
+    else
+    {
+	pixman_op_t op = operators[prng_rand_n (ARRAY_LENGTH (operators))];
+	int src_x = prng_rand_n (300) - 150;
+	int src_y = prng_rand_n (300) - 150;
+	int dest_x = prng_rand_n (64) - 32;
+	int dest_y = prng_rand_n (64) - 32;
+
+	pixman_composite_glyphs_no_mask (
+	    op, source, dest,
+	    src_x, src_y,
+	    dest_x, dest_y,
+	    cache, 4 * n_glyphs, glyphs);
+    }
+
+    pixman_glyph_cache_thaw (cache);
+
+    for (i = 0; i < n_glyphs; ++i)
+    {
+	pixman_image_t *img = glyph_images[i];
+	void *key1, *key2;
+
+	key1 = KEY1 (img);
+	key2 = KEY2 (img);
+
+	pixman_glyph_cache_remove (cache, key1, key2);
+	pixman_image_unref (glyph_images[i]);
+    }
+
+    crc32 = compute_crc32_for_image (0, dest);
+
+    pixman_image_unref (source);
+    pixman_image_unref (dest);
+
+    pixman_glyph_cache_destroy (cache);
+
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("glyph", 30000,	
+			     0xFA478A79,
+			     test_glyphs, argc, argv);
+}
diff --git a/src/pixman/test/gradient-crash-test.c b/src/pixman/test/gradient-crash-test.c
new file mode 100644
index 0000000..962d1cb
--- /dev/null
+++ b/src/pixman/test/gradient-crash-test.c
@@ -0,0 +1,158 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define WIDTH 400
+#define HEIGHT 200
+    
+    uint32_t *dest = malloc (WIDTH * HEIGHT * 4);
+    pixman_image_t *src_img;
+    pixman_image_t *dest_img;
+    int i, j, k, p;
+
+    typedef struct
+    {
+	pixman_point_fixed_t p0;
+	pixman_point_fixed_t p1;
+    } point_pair_t;
+    
+    pixman_gradient_stop_t onestop[1] =
+	{
+	    { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	};
+
+    pixman_gradient_stop_t subsetstops[2] =
+	{
+	    { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	};
+
+    pixman_gradient_stop_t stops01[2] =
+	{
+	    { pixman_int_to_fixed (0), { 0xffff, 0xeeee, 0xeeee, 0xeeee } },
+	    { pixman_int_to_fixed (1), { 0xffff, 0x1111, 0x1111, 0x1111 } }
+	};
+
+    point_pair_t point_pairs [] =
+	{ { { pixman_double_to_fixed (0), 0 },
+	    { pixman_double_to_fixed (WIDTH / 8.), pixman_int_to_fixed (0) } },
+	  { { pixman_double_to_fixed (WIDTH / 2.0), pixman_double_to_fixed (HEIGHT / 2.0) },
+	    { pixman_double_to_fixed (WIDTH / 2.0), pixman_double_to_fixed (HEIGHT / 2.0) } }
+	};
+    
+    pixman_transform_t transformations[] = {
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (0.5), pixman_double_to_fixed (-100), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (3), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (1), pixman_double_to_fixed (0), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (0.000), pixman_double_to_fixed (1.0) } 
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (2), pixman_double_to_fixed (1.000), pixman_double_to_fixed (1.0) } 
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (0), pixman_double_to_fixed (0), pixman_double_to_fixed (0) } 
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (2), pixman_double_to_fixed (-1), pixman_double_to_fixed (0) } 
+	    }
+	},
+	{
+	    { { pixman_double_to_fixed (2), pixman_double_to_fixed (1), pixman_double_to_fixed (3), },
+	      { pixman_double_to_fixed (1), pixman_double_to_fixed (1), pixman_double_to_fixed (0), },
+	      { pixman_double_to_fixed (2), pixman_double_to_fixed (-1), pixman_double_to_fixed (0) } 
+	    }
+	},
+    };
+    
+    pixman_fixed_t r_inner;
+    pixman_fixed_t r_outer;
+
+    enable_divbyzero_exceptions();
+    
+    for (i = 0; i < WIDTH * HEIGHT; ++i)
+	dest[i] = 0x4f00004f; /* pale blue */
+    
+    dest_img = pixman_image_create_bits (PIXMAN_a8r8g8b8,
+					 WIDTH, HEIGHT, 
+					 dest,
+					 WIDTH * 4);
+
+    r_inner = 0;
+    r_outer = pixman_double_to_fixed (50.0);
+    
+    for (i = 0; i < 3; ++i)
+    {
+	pixman_gradient_stop_t *stops;
+        int num_stops;
+
+	if (i == 0)
+	{
+	    stops = onestop;
+	    num_stops = ARRAY_LENGTH (onestop);
+	}
+	else if (i == 1)
+	{
+	    stops = subsetstops;
+	    num_stops = ARRAY_LENGTH (subsetstops);
+	}
+	else
+	{
+	    stops = stops01;
+	    num_stops = ARRAY_LENGTH (stops01);
+	}
+	
+	for (j = 0; j < 3; ++j)
+	{
+	    for (p = 0; p < ARRAY_LENGTH (point_pairs); ++p)
+	    {
+		point_pair_t *pair = &(point_pairs[p]);
+
+		if (j == 0)
+		    src_img = pixman_image_create_conical_gradient (&(pair->p0), r_inner,
+								    stops, num_stops);
+		else if (j == 1)
+		    src_img = pixman_image_create_radial_gradient  (&(pair->p0), &(pair->p1),
+								    r_inner, r_outer,
+								    stops, num_stops);
+		else
+		    src_img = pixman_image_create_linear_gradient  (&(pair->p0), &(pair->p1),
+								    stops, num_stops);
+		
+		for (k = 0; k < ARRAY_LENGTH (transformations); ++k)
+		{
+		    pixman_image_set_transform (src_img, &transformations[k]);
+		    
+		    pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NONE);
+		    pixman_image_composite (PIXMAN_OP_OVER, src_img, NULL, dest_img,
+					    0, 0, 0, 0, 0, 0, 10 * WIDTH, HEIGHT);
+		}
+
+		pixman_image_unref (src_img);
+	    }
+
+	}
+    }
+
+    pixman_image_unref (dest_img);
+    free (dest);
+    
+    return 0;
+}
diff --git a/src/pixman/test/infinite-loop.c b/src/pixman/test/infinite-loop.c
new file mode 100644
index 0000000..02addaa
--- /dev/null
+++ b/src/pixman/test/infinite-loop.c
@@ -0,0 +1,39 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "utils.h"
+
+int
+main (int argc, char **argv)
+{
+#define SRC_WIDTH 16
+#define SRC_HEIGHT 12
+#define DST_WIDTH 7
+#define DST_HEIGHT 2
+
+    static const pixman_transform_t transform = {
+	{ { 0x200017bd, 0x00000000, 0x000e6465 },
+	  { 0x00000000, 0x000a42fd, 0x000e6465 },
+	  { 0x00000000, 0x00000000, 0x00010000 },
+	}
+    };
+    pixman_image_t *src, *dest;
+
+    src = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, SRC_WIDTH, SRC_HEIGHT, NULL, -1);
+    dest = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, DST_WIDTH, DST_HEIGHT, NULL, -1);
+
+    pixman_image_set_transform (src, &transform);
+    pixman_image_set_repeat (src, PIXMAN_REPEAT_NORMAL);
+    pixman_image_set_filter (src, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (argc == 1 || strcmp (argv[1], "-nf") != 0)
+	fail_after (1, "infinite loop detected");
+
+    pixman_image_composite (
+	PIXMAN_OP_OVER, src, NULL, dest, -3, -3, 0, 0, 0, 0, 6, 2);
+
+    return 0;
+}
diff --git a/src/pixman/test/lowlevel-blt-bench.c b/src/pixman/test/lowlevel-blt-bench.c
new file mode 100644
index 0000000..1049e21
--- /dev/null
+++ b/src/pixman/test/lowlevel-blt-bench.c
@@ -0,0 +1,820 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ * Copyright © 2010 Movial Creative Technologies Oy
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "utils.h"
+
+#define SOLID_FLAG 1
+#define CA_FLAG    2
+
+#define L1CACHE_SIZE (8 * 1024)
+#define L2CACHE_SIZE (128 * 1024)
+
+/* This is applied to both L1 and L2 tests - alternatively, you could
+ * parameterise bench_L or split it into two functions. It could be
+ * read at runtime on some architectures, but it only really matters
+ * that it's a number that's an integer divisor of both cacheline
+ * lengths, and further, it only really matters for caches that don't
+ * do allocate0on-write. */
+#define CACHELINE_LENGTH (32) /* bytes */
+
+#define WIDTH  1920
+#define HEIGHT 1080
+#define BUFSIZE (WIDTH * HEIGHT * 4)
+#define XWIDTH 256
+#define XHEIGHT 256
+#define TILEWIDTH 32
+#define TINYWIDTH 8
+
+#define EXCLUDE_OVERHEAD 1
+
+uint32_t *dst;
+uint32_t *src;
+uint32_t *mask;
+
+double bandwidth = 0;
+
+double
+bench_memcpy ()
+{
+    int64_t n = 0, total;
+    double  t1, t2;
+    int     x = 0;
+
+    t1 = gettime ();
+    while (1)
+    {
+	memcpy (dst, src, BUFSIZE - 64);
+	memcpy (src, dst, BUFSIZE - 64);
+	n += 4 * (BUFSIZE - 64);
+	t2 = gettime ();
+	if (t2 - t1 > 0.5)
+	    break;
+    }
+    n = total = n * 5;
+    t1 = gettime ();
+    while (n > 0)
+    {
+	if (++x >= 64)
+	    x = 0;
+	memcpy ((char *)dst + 1, (char *)src + x, BUFSIZE - 64);
+	memcpy ((char *)src + 1, (char *)dst + x, BUFSIZE - 64);
+	n -= 4 * (BUFSIZE - 64);
+    }
+    t2 = gettime ();
+    return (double)total / (t2 - t1);
+}
+
+static pixman_bool_t use_scaling = FALSE;
+static pixman_filter_t filter = PIXMAN_FILTER_NEAREST;
+
+/* nearly 1x scale factor */
+static pixman_transform_t m =
+{
+    {
+        { pixman_fixed_1 + 1, 0,              0              },
+        { 0,                  pixman_fixed_1, 0              },
+        { 0,                  0,              pixman_fixed_1 }
+    }
+};
+
+static void
+pixman_image_composite_wrapper (pixman_implementation_t *impl,
+				pixman_composite_info_t *info)
+{
+    if (use_scaling)
+    {
+        pixman_image_set_filter (info->src_image, filter, NULL, 0);
+        pixman_image_set_transform(info->src_image, &m);
+    }
+    pixman_image_composite (info->op,
+			    info->src_image, info->mask_image, info->dest_image,
+			    info->src_x, info->src_y,
+			    info->mask_x, info->mask_y,
+			    info->dest_x, info->dest_y,
+			    info->width, info->height);
+}
+
+static void
+pixman_image_composite_empty (pixman_implementation_t *impl,
+			      pixman_composite_info_t *info)
+{
+    if (use_scaling)
+    {
+        pixman_image_set_filter (info->src_image, filter, NULL, 0);
+        pixman_image_set_transform(info->src_image, &m);
+    }
+    pixman_image_composite (info->op,
+			    info->src_image, info->mask_image, info->dest_image,
+			    0, 0, 0, 0, 0, 0, 1, 1);
+}
+
+static inline void
+call_func (pixman_composite_func_t func,
+	   pixman_op_t             op,
+	   pixman_image_t *        src_image,
+	   pixman_image_t *        mask_image,
+	   pixman_image_t *        dest_image,
+	   int32_t		   src_x,
+	   int32_t		   src_y,
+	   int32_t                 mask_x,
+	   int32_t                 mask_y,
+	   int32_t                 dest_x,
+	   int32_t                 dest_y,
+	   int32_t                 width,
+	   int32_t                 height)
+{
+    pixman_composite_info_t info;
+
+    info.op = op;
+    info.src_image = src_image;
+    info.mask_image = mask_image;
+    info.dest_image = dest_image;
+    info.src_x = src_x;
+    info.src_y = src_y;
+    info.mask_x = mask_x;
+    info.mask_y = mask_y;
+    info.dest_x = dest_x;
+    info.dest_y = dest_y;
+    info.width = width;
+    info.height = height;
+
+    func (0, &info);
+}
+
+void
+noinline
+bench_L  (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func,
+          int                      width,
+          int                      lines_count)
+{
+    int64_t      i, j, k;
+    int          x = 0;
+    int          q = 0;
+    volatile int qx;
+
+    for (i = 0; i < n; i++)
+    {
+        /* For caches without allocate-on-write, we need to force the
+         * destination buffer back into the cache on each iteration,
+         * otherwise if they are evicted during the test, they remain
+         * uncached. This doesn't matter for tests which read the
+         * destination buffer, or for caches that do allocate-on-write,
+         * but in those cases this loop just adds constant time, which
+         * should be successfully cancelled out.
+         */
+        for (j = 0; j < lines_count; j++)
+        {
+            for (k = 0; k < width + 62; k += CACHELINE_LENGTH / sizeof *dst)
+            {
+                q += dst[j * WIDTH + k];
+            }
+            q += dst[j * WIDTH + width + 62];
+        }
+	if (++x >= 64)
+	    x = 0;
+	call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 63 - x, 0, width, lines_count);
+    }
+    qx = q;
+}
+
+void
+noinline
+bench_M (pixman_op_t              op,
+         pixman_image_t *         src_img,
+         pixman_image_t *         mask_img,
+         pixman_image_t *         dst_img,
+         int64_t                  n,
+         pixman_composite_func_t  func)
+{
+    int64_t i;
+    int     x = 0;
+
+    for (i = 0; i < n; i++)
+    {
+	if (++x >= 64)
+	    x = 0;
+	call_func (func, op, src_img, mask_img, dst_img, x, 0, x, 0, 1, 0, WIDTH - 64, HEIGHT);
+    }
+}
+
+double
+noinline
+bench_HT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func)
+{
+    double  pix_cnt = 0;
+    int     x = 0;
+    int     y = 0;
+    int64_t i;
+
+    srand (0);
+    for (i = 0; i < n; i++)
+    {
+	int w = (rand () % (TILEWIDTH * 2)) + 1;
+	int h = (rand () % (TILEWIDTH * 2)) + 1;
+	if (x + w > WIDTH)
+	{
+	    x = 0;
+	    y += TILEWIDTH * 2;
+	}
+	if (y + h > HEIGHT)
+	{
+	    y = 0;
+	}
+	call_func (func, op, src_img, mask_img, dst_img, x, y, x, y, x, y, w, h);
+	x += w;
+	pix_cnt += w * h;
+    }
+    return pix_cnt;
+}
+
+double
+noinline
+bench_VT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func)
+{
+    double  pix_cnt = 0;
+    int     x = 0;
+    int     y = 0;
+    int64_t i;
+
+    srand (0);
+    for (i = 0; i < n; i++)
+    {
+	int w = (rand () % (TILEWIDTH * 2)) + 1;
+	int h = (rand () % (TILEWIDTH * 2)) + 1;
+	if (y + h > HEIGHT)
+	{
+	    y = 0;
+	    x += TILEWIDTH * 2;
+	}
+	if (x + w > WIDTH)
+	{
+	    x = 0;
+	}
+	call_func (func, op, src_img, mask_img, dst_img, x, y, x, y, x, y, w, h);
+	y += h;
+	pix_cnt += w * h;
+    }
+    return pix_cnt;
+}
+
+double
+noinline
+bench_R (pixman_op_t              op,
+         pixman_image_t *         src_img,
+         pixman_image_t *         mask_img,
+         pixman_image_t *         dst_img,
+         int64_t                  n,
+         pixman_composite_func_t  func,
+         int                      maxw,
+         int                      maxh)
+{
+    double  pix_cnt = 0;
+    int64_t i;
+
+    if (maxw <= TILEWIDTH * 2 || maxh <= TILEWIDTH * 2)
+    {
+	printf("error: maxw <= TILEWIDTH * 2 || maxh <= TILEWIDTH * 2\n");
+        return 0;
+    }
+
+    srand (0);
+    for (i = 0; i < n; i++)
+    {
+	int w = (rand () % (TILEWIDTH * 2)) + 1;
+	int h = (rand () % (TILEWIDTH * 2)) + 1;
+	int sx = rand () % (maxw - TILEWIDTH * 2);
+	int sy = rand () % (maxh - TILEWIDTH * 2);
+	int dx = rand () % (maxw - TILEWIDTH * 2);
+	int dy = rand () % (maxh - TILEWIDTH * 2);
+	call_func (func, op, src_img, mask_img, dst_img, sx, sy, sx, sy, dx, dy, w, h);
+	pix_cnt += w * h;
+    }
+    return pix_cnt;
+}
+
+double
+noinline
+bench_RT (pixman_op_t              op,
+          pixman_image_t *         src_img,
+          pixman_image_t *         mask_img,
+          pixman_image_t *         dst_img,
+          int64_t                  n,
+          pixman_composite_func_t  func,
+          int                      maxw,
+          int                      maxh)
+{
+    double  pix_cnt = 0;
+    int64_t i;
+
+    if (maxw <= TINYWIDTH * 2 || maxh <= TINYWIDTH * 2)
+    {
+	printf("error: maxw <= TINYWIDTH * 2 || maxh <= TINYWIDTH * 2\n");
+        return 0;
+    }
+
+    srand (0);
+    for (i = 0; i < n; i++)
+    {
+	int w = (rand () % (TINYWIDTH * 2)) + 1;
+	int h = (rand () % (TINYWIDTH * 2)) + 1;
+	int sx = rand () % (maxw - TINYWIDTH * 2);
+	int sy = rand () % (maxh - TINYWIDTH * 2);
+	int dx = rand () % (maxw - TINYWIDTH * 2);
+	int dy = rand () % (maxh - TINYWIDTH * 2);
+	call_func (func, op, src_img, mask_img, dst_img, sx, sy, sx, sy, dx, dy, w, h);
+	pix_cnt += w * h;
+    }
+    return pix_cnt;
+}
+
+void
+bench_composite (char * testname,
+                 int    src_fmt,
+                 int    src_flags,
+                 int    op,
+                 int    mask_fmt,
+                 int    mask_flags,
+                 int    dst_fmt,
+                 double npix)
+{
+    pixman_image_t *                src_img;
+    pixman_image_t *                dst_img;
+    pixman_image_t *                mask_img;
+    pixman_image_t *                xsrc_img;
+    pixman_image_t *                xdst_img;
+    pixman_image_t *                xmask_img;
+    double                          t1, t2, t3, pix_cnt;
+    int64_t                         n, l1test_width, nlines;
+    double                             bytes_per_pix = 0;
+    pixman_bool_t                   bench_pixbuf = FALSE;
+
+    pixman_composite_func_t func = pixman_image_composite_wrapper;
+
+    if (!(src_flags & SOLID_FLAG))
+    {
+        bytes_per_pix += (src_fmt >> 24) / 8.0;
+        src_img = pixman_image_create_bits (src_fmt,
+                                            WIDTH, HEIGHT,
+                                            src,
+                                            WIDTH * 4);
+        xsrc_img = pixman_image_create_bits (src_fmt,
+                                             XWIDTH, XHEIGHT,
+                                             src,
+                                             XWIDTH * 4);
+    }
+    else
+    {
+        src_img = pixman_image_create_bits (src_fmt,
+                                            1, 1,
+                                            src,
+                                            4);
+        xsrc_img = pixman_image_create_bits (src_fmt,
+                                             1, 1,
+                                             src,
+                                             4);
+        pixman_image_set_repeat (src_img, PIXMAN_REPEAT_NORMAL);
+        pixman_image_set_repeat (xsrc_img, PIXMAN_REPEAT_NORMAL);
+    }
+
+    bytes_per_pix += (dst_fmt >> 24) / 8.0;
+    dst_img = pixman_image_create_bits (dst_fmt,
+                                        WIDTH, HEIGHT,
+                                        dst,
+                                        WIDTH * 4);
+
+    mask_img = NULL;
+    xmask_img = NULL;
+    if (strcmp (testname, "pixbuf") == 0 || strcmp (testname, "rpixbuf") == 0)
+    {
+        bench_pixbuf = TRUE;
+    }
+    if (!(mask_flags & SOLID_FLAG) && mask_fmt != PIXMAN_null)
+    {
+        bytes_per_pix += (mask_fmt >> 24) / ((op == PIXMAN_OP_SRC) ? 8.0 : 4.0);
+        mask_img = pixman_image_create_bits (mask_fmt,
+                                             WIDTH, HEIGHT,
+                                             bench_pixbuf ? src : mask,
+                                             WIDTH * 4);
+        xmask_img = pixman_image_create_bits (mask_fmt,
+                                             XWIDTH, XHEIGHT,
+                                             bench_pixbuf ? src : mask,
+                                             XWIDTH * 4);
+    }
+    else if (mask_fmt != PIXMAN_null)
+    {
+        mask_img = pixman_image_create_bits (mask_fmt,
+                                             1, 1,
+                                             mask,
+                                             4);
+        xmask_img = pixman_image_create_bits (mask_fmt,
+                                             1, 1,
+                                             mask,
+                                             4 * 4);
+       pixman_image_set_repeat (mask_img, PIXMAN_REPEAT_NORMAL);
+       pixman_image_set_repeat (xmask_img, PIXMAN_REPEAT_NORMAL);
+    }
+    if ((mask_flags & CA_FLAG) && mask_fmt != PIXMAN_null)
+    {
+       pixman_image_set_component_alpha (mask_img, 1);
+    }
+    xdst_img = pixman_image_create_bits (dst_fmt,
+                                         XWIDTH, XHEIGHT,
+                                         dst,
+                                         XWIDTH * 4);
+
+
+    printf ("%24s %c", testname, func != pixman_image_composite_wrapper ?
+            '-' : '=');
+
+    memcpy (dst, src, BUFSIZE);
+    memcpy (src, dst, BUFSIZE);
+
+    l1test_width = L1CACHE_SIZE / 8 - 64;
+    if (l1test_width < 1)
+	l1test_width = 1;
+    if (l1test_width > WIDTH - 64)
+	l1test_width = WIDTH - 64;
+    n = 1 + npix / (l1test_width * 8);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, 1);
+#endif
+    t2 = gettime ();
+    bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, 1);
+    t3 = gettime ();
+    printf ("  L1:%7.2f", (double)n * l1test_width * 1 /
+            ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (dst, src, BUFSIZE);
+    memcpy (src, dst, BUFSIZE);
+
+    nlines = (L2CACHE_SIZE / l1test_width) /
+	((PIXMAN_FORMAT_BPP(src_fmt) + PIXMAN_FORMAT_BPP(dst_fmt)) / 8);
+    if (nlines < 1)
+	nlines = 1;
+    n = 1 + npix / (l1test_width * nlines);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_L (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, l1test_width, nlines);
+#endif
+    t2 = gettime ();
+    bench_L (op, src_img, mask_img, dst_img, n, func, l1test_width, nlines);
+    t3 = gettime ();
+    printf ("  L2:%7.2f", (double)n * l1test_width * nlines /
+            ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (dst, src, BUFSIZE);
+    memcpy (src, dst, BUFSIZE);
+
+    n = 1 + npix / (WIDTH * HEIGHT);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    bench_M (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    bench_M (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  M:%6.2f (%6.2f%%)",
+        ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1))) / 1000000.,
+        ((double)n * (WIDTH - 64) * HEIGHT / ((t3 - t2) - (t2 - t1)) * bytes_per_pix) * (100.0 / bandwidth) );
+    fflush (stdout);
+
+    memcpy (dst, src, BUFSIZE);
+    memcpy (src, dst, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_HT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_HT (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  HT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (dst, src, BUFSIZE);
+    memcpy (src, dst, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_VT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_VT (op, src_img, mask_img, dst_img, n, func);
+    t3 = gettime ();
+    printf ("  VT:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (dst, src, BUFSIZE);
+    memcpy (src, dst, BUFSIZE);
+
+    n = 1 + npix / (8 * TILEWIDTH * TILEWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_R (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, WIDTH, HEIGHT);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_R (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
+    t3 = gettime ();
+    printf ("  R:%6.2f", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000.);
+    fflush (stdout);
+
+    memcpy (dst, src, BUFSIZE);
+    memcpy (src, dst, BUFSIZE);
+
+    n = 1 + npix / (16 * TINYWIDTH * TINYWIDTH);
+    t1 = gettime ();
+#if EXCLUDE_OVERHEAD
+    pix_cnt = bench_RT (op, src_img, mask_img, dst_img, n, pixman_image_composite_empty, WIDTH, HEIGHT);
+#endif
+    t2 = gettime ();
+    pix_cnt = bench_RT (op, src_img, mask_img, dst_img, n, func, WIDTH, HEIGHT);
+    t3 = gettime ();
+    printf ("  RT:%6.2f (%4.0fKops/s)\n", (double)pix_cnt / ((t3 - t2) - (t2 - t1)) / 1000000., (double) n / ((t3 - t2) * 1000));
+
+    if (mask_img) {
+	pixman_image_unref (mask_img);
+	pixman_image_unref (xmask_img);
+    }
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+    pixman_image_unref (xsrc_img);
+    pixman_image_unref (xdst_img);
+}
+
+#define PIXMAN_OP_OUT_REV (PIXMAN_OP_OUT_REVERSE)
+
+struct
+{
+    char *testname;
+    int   src_fmt;
+    int   src_flags;
+    int   op;
+    int   mask_fmt;
+    int   mask_flags;
+    int   dst_fmt;
+}
+tests_tbl[] =
+{
+    { "add_8_8_8",             PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
+    { "add_n_8_8",             PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8 },
+    { "add_n_8_8888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "add_n_8_x888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "add_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "add_n_8_1555",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "add_n_8_4444",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "add_n_8_2222",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "add_n_8_2x10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "add_n_8_2a10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "add_n_8",               PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_n_8888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "add_n_x888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "add_n_0565",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_n_1555",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_n_4444",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "add_n_2222",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "add_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "add_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "add_8_8",               PIXMAN_a8,          0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "add_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "add_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "add_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_8888_1555",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_8888_4444",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "add_8888_2222",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "add_0565_0565",         PIXMAN_r5g6b5,      0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "add_1555_1555",         PIXMAN_a1r5g5b5,    0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "add_0565_2x10",         PIXMAN_r5g6b5,      0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "add_2a10_2a10",         PIXMAN_a2r10g10b10, 0, PIXMAN_OP_ADD,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "in_n_8_8",              PIXMAN_a8r8g8b8,    1, PIXMAN_OP_IN,      PIXMAN_a8,       0, PIXMAN_a8 },
+    { "in_8_8",                PIXMAN_a8,          0, PIXMAN_OP_IN,      PIXMAN_null,     0, PIXMAN_a8 },
+    { "src_n_2222",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "src_n_0565",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_n_1555",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "src_n_4444",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "src_n_x888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_n_8888",            PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_n_2x10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "src_n_2a10",            PIXMAN_a2r10g10b10, 1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "src_8888_0565",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_0565_8888",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_8888_4444",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a4r4g4b4 },
+    { "src_8888_2222",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r2g2b2 },
+    { "src_8888_2x10",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x2r10g10b10 },
+    { "src_8888_2a10",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a2r10g10b10 },
+    { "src_0888_0565",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_0888_8888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_0888_x888",         PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_0888_8888_rev",     PIXMAN_b8g8r8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_0888_0565_rev",     PIXMAN_b8g8r8,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_x888_x888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8888",         PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_8888_8888",         PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "src_0565_0565",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_1555_0565",         PIXMAN_a1r5g5b5,    0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "src_0565_1555",         PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "src_8_8",               PIXMAN_a8,          0, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "src_n_8",               PIXMAN_a8,          1, PIXMAN_OP_SRC,     PIXMAN_null,     0, PIXMAN_a8 },
+    { "src_n_8_0565",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_n_8_1555",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "src_n_8_4444",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "src_n_8_2222",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "src_n_8_x888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_n_8_8888",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_n_8_2x10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "src_n_8_2a10",          PIXMAN_a8r8g8b8,    1, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "src_8888_8_0565",       PIXMAN_a8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0888_8_0565",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0888_8_8888",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_0888_8_x888",       PIXMAN_r8g8b8,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8_x888",       PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "src_x888_8_8888",       PIXMAN_x8r8g8b8,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "src_0565_8_0565",       PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_1555_8_0565",       PIXMAN_a1r5g5b5,    0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "src_0565_8_1555",       PIXMAN_r5g6b5,      0, PIXMAN_OP_SRC,     PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "over_n_x888",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "over_n_8888",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "over_n_0565",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "over_n_1555",           PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a1r5g5b5 },
+    { "over_8888_0565",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_r5g6b5 },
+    { "over_8888_8888",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_a8r8g8b8 },
+    { "over_8888_x888",        PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_null,     0, PIXMAN_x8r8g8b8 },
+    { "over_x888_8_0565",      PIXMAN_x8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "over_x888_8_8888",      PIXMAN_x8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "over_n_8_0565",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "over_n_8_1555",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "over_n_8_4444",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a4r4g4b4 },
+    { "over_n_8_2222",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a2r2g2b2 },
+    { "over_n_8_x888",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "over_n_8_8888",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "over_n_8_2x10",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_x2r10g10b10 },
+    { "over_n_8_2a10",         PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8,       0, PIXMAN_a2r10g10b10 },
+    { "over_n_8888_8888_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
+    { "over_n_8888_x888_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
+    { "over_n_8888_0565_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_r5g6b5 },
+    { "over_n_8888_1555_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a1r5g5b5 },
+    { "over_n_8888_4444_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a4r4g4b4 },
+    { "over_n_8888_2222_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a2r2g2b2 },
+    { "over_n_8888_2x10_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_x2r10g10b10 },
+    { "over_n_8888_2a10_ca",   PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OVER,    PIXMAN_a8r8g8b8, 2, PIXMAN_a2r10g10b10 },
+    { "over_8888_n_8888",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_a8r8g8b8 },
+    { "over_8888_n_x888",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_x8r8g8b8 },
+    { "over_8888_n_0565",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_r5g6b5 },
+    { "over_8888_n_1555",      PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_a1r5g5b5 },
+    { "over_x888_n_8888",      PIXMAN_x8r8g8b8,    0, PIXMAN_OP_OVER,    PIXMAN_a8,       1, PIXMAN_a8r8g8b8 },
+    { "outrev_n_8_0565",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_r5g6b5 },
+    { "outrev_n_8_1555",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_a1r5g5b5 },
+    { "outrev_n_8_x888",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_x8r8g8b8 },
+    { "outrev_n_8_8888",       PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8,       0, PIXMAN_a8r8g8b8 },
+    { "outrev_n_8888_0565_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_r5g6b5 },
+    { "outrev_n_8888_1555_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a1r5g5b5 },
+    { "outrev_n_8888_x888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_x8r8g8b8 },
+    { "outrev_n_8888_8888_ca", PIXMAN_a8r8g8b8,    1, PIXMAN_OP_OUT_REV, PIXMAN_a8r8g8b8, 2, PIXMAN_a8r8g8b8 },
+    { "over_reverse_n_8888",   PIXMAN_a8r8g8b8,    0, PIXMAN_OP_OVER_REVERSE, PIXMAN_null, 0, PIXMAN_a8r8g8b8 },
+    { "pixbuf",                PIXMAN_x8b8g8r8,    0, PIXMAN_OP_SRC,     PIXMAN_a8b8g8r8, 0, PIXMAN_a8r8g8b8 },
+    { "rpixbuf",               PIXMAN_x8b8g8r8,    0, PIXMAN_OP_SRC,     PIXMAN_a8b8g8r8, 0, PIXMAN_a8b8g8r8 },
+};
+
+int
+main (int argc, char *argv[])
+{
+    double x;
+    int i;
+    const char *pattern = NULL;
+    for (i = 1; i < argc; i++)
+    {
+	if (argv[i][0] == '-')
+	{
+	    if (strchr (argv[i] + 1, 'b'))
+	    {
+		use_scaling = TRUE;
+		filter = PIXMAN_FILTER_BILINEAR;
+	    }
+	    else if (strchr (argv[i] + 1, 'n'))
+	    {
+		use_scaling = TRUE;
+		filter = PIXMAN_FILTER_NEAREST;
+	    }
+	}
+	else
+	{
+	    pattern = argv[i];
+	}
+    }
+
+    if (!pattern)
+    {
+	printf ("Usage: lowlevel-blt-bench [-b] [-n] pattern\n");
+	printf ("  -n : benchmark nearest scaling\n");
+	printf ("  -b : benchmark bilinear scaling\n");
+	return 1;
+    }
+
+    src = aligned_malloc (4096, BUFSIZE * 3);
+    memset (src, 0xCC, BUFSIZE * 3);
+    dst = src + (BUFSIZE / 4);
+    mask = dst + (BUFSIZE / 4);
+
+    printf ("Benchmark for a set of most commonly used functions\n");
+    printf ("---\n");
+    printf ("All results are presented in millions of pixels per second\n");
+    printf ("L1  - small Xx1 rectangle (fitting L1 cache), always blitted at the same\n");
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("L2  - small XxY rectangle (fitting L2 cache), always blitted at the same\n");
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("M   - large %dx%d rectangle, always blitted at the same\n",
+            WIDTH - 64, HEIGHT);
+    printf ("      memory location with small drift in horizontal direction\n");
+    printf ("HT  - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      one %dx%d buffer to another, traversing from left to right\n",
+            WIDTH, HEIGHT);
+    printf ("      and from top to bottom\n");
+    printf ("VT  - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      one %dx%d buffer to another, traversing from top to bottom\n",
+            WIDTH, HEIGHT);
+    printf ("      and from left to right\n");
+    printf ("R   - random rectangles with %dx%d average size are copied from\n",
+            TILEWIDTH, TILEWIDTH);
+    printf ("      random locations of one %dx%d buffer to another\n",
+            WIDTH, HEIGHT);
+    printf ("RT  - as R, but %dx%d average sized rectangles are copied\n",
+            TINYWIDTH, TINYWIDTH);
+    printf ("---\n");
+    bandwidth = x = bench_memcpy ();
+    printf ("reference memcpy speed = %.1fMB/s (%.1fMP/s for 32bpp fills)\n",
+            x / 1000000., x / 4000000);
+    if (use_scaling)
+    {
+	printf ("---\n");
+	if (filter == PIXMAN_FILTER_BILINEAR)
+	    printf ("BILINEAR scaling\n");
+	else if (filter == PIXMAN_FILTER_NEAREST)
+	    printf ("NEAREST scaling\n");
+	else
+	    printf ("UNKNOWN scaling\n");
+    }
+    printf ("---\n");
+
+    for (i = 0; i < ARRAY_LENGTH (tests_tbl); i++)
+    {
+	if (strcmp (pattern, "all") == 0 || strcmp (tests_tbl[i].testname, pattern) == 0)
+	{
+	    bench_composite (tests_tbl[i].testname,
+			     tests_tbl[i].src_fmt,
+			     tests_tbl[i].src_flags,
+			     tests_tbl[i].op,
+			     tests_tbl[i].mask_fmt,
+			     tests_tbl[i].mask_flags,
+			     tests_tbl[i].dst_fmt,
+			     bandwidth/8);
+	}
+    }
+
+    free (src);
+    return 0;
+}
diff --git a/src/pixman/test/matrix-test.c b/src/pixman/test/matrix-test.c
new file mode 100644
index 0000000..0a5f203
--- /dev/null
+++ b/src/pixman/test/matrix-test.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright © 2012 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "utils.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef HAVE_FLOAT128
+
+#define pixman_fixed_to_float128(x) (((__float128)(x)) / 65536.0Q)
+
+typedef struct { __float128 v[3]; } pixman_vector_f128_t;
+typedef struct { __float128 m[3][3]; } pixman_transform_f128_t;
+
+pixman_bool_t
+pixman_transform_point_f128 (const pixman_transform_f128_t *t,
+                             const pixman_vector_f128_t    *v,
+                             pixman_vector_f128_t          *result)
+{
+    int i;
+    for (i = 0; i < 3; i++)
+    {
+        result->v[i] = t->m[i][0] * v->v[0] +
+                       t->m[i][1] * v->v[1] +
+                       t->m[i][2] * v->v[2];
+    }
+    if (result->v[2] != 0)
+    {
+        result->v[0] /= result->v[2];
+        result->v[1] /= result->v[2];
+        result->v[2] = 1;
+        return TRUE;
+    }
+    else
+    {
+        return FALSE;
+    }
+}
+
+pixman_bool_t does_it_fit_fixed_48_16 (__float128 x)
+{
+    if (x >= 65536.0Q * 65536.0Q * 32768.0Q)
+        return FALSE;
+    if (x <= -65536.0Q * 65536.0Q * 32768.0Q)
+        return FALSE;
+    return TRUE;
+}
+
+#endif
+
+static inline uint32_t
+byteswap32 (uint32_t x)
+{
+    return ((x & ((uint32_t)0xFF << 24)) >> 24) |
+           ((x & ((uint32_t)0xFF << 16)) >>  8) |
+           ((x & ((uint32_t)0xFF <<  8)) <<  8) |
+           ((x & ((uint32_t)0xFF <<  0)) << 24);
+}
+
+static inline uint64_t
+byteswap64 (uint64_t x)
+{
+    return ((x & ((uint64_t)0xFF << 56)) >> 56) |
+           ((x & ((uint64_t)0xFF << 48)) >> 40) |
+           ((x & ((uint64_t)0xFF << 40)) >> 24) |
+           ((x & ((uint64_t)0xFF << 32)) >>  8) |
+           ((x & ((uint64_t)0xFF << 24)) <<  8) |
+           ((x & ((uint64_t)0xFF << 16)) << 24) |
+           ((x & ((uint64_t)0xFF <<  8)) << 40) |
+           ((x & ((uint64_t)0xFF <<  0)) << 56);
+}
+
+static void
+byteswap_transform (pixman_transform_t *t)
+{
+    int i, j;
+
+    if (is_little_endian ())
+        return;
+
+    for (i = 0; i < 3; i++)
+        for (j = 0; j < 3; j++)
+            t->matrix[i][j] = byteswap32 (t->matrix[i][j]);
+}
+
+static void
+byteswap_vector_48_16 (pixman_vector_48_16_t *v)
+{
+    int i;
+
+    if (is_little_endian ())
+        return;
+
+    for (i = 0; i < 3; i++)
+        v->v[i] = byteswap64 (v->v[i]);
+}
+
+uint32_t
+test_matrix (int testnum, int verbose)
+{
+    uint32_t crc32 = 0;
+    int i, j, k;
+    pixman_bool_t is_affine;
+
+    prng_srand (testnum);
+
+    for (i = 0; i < 100; i++)
+    {
+        pixman_bool_t           transform_ok;
+        pixman_transform_t      ti;
+        pixman_vector_48_16_t   vi, result_i;
+#ifdef HAVE_FLOAT128
+        pixman_transform_f128_t tf;
+        pixman_vector_f128_t    vf, result_f;
+#endif
+        prng_randmemset (&ti, sizeof(ti), 0);
+        prng_randmemset (&vi, sizeof(vi), 0);
+        byteswap_transform (&ti);
+        byteswap_vector_48_16 (&vi);
+
+        for (j = 0; j < 3; j++)
+        {
+            /* make sure that "vi" contains 31.16 fixed point data */
+            vi.v[j] >>= 17;
+            /* and apply random shift */
+            if (prng_rand_n (3) == 0)
+                vi.v[j] >>= prng_rand_n (46);
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* random shift for the matrix */
+            for (j = 0; j < 3; j++)
+                for (k = 0; k < 3; k++)
+                    ti.matrix[j][k] >>= prng_rand_n (30);
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* affine matrix */
+            ti.matrix[2][0] = 0;
+            ti.matrix[2][1] = 0;
+            ti.matrix[2][2] = pixman_fixed_1;
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* cartesian coordinates */
+            vi.v[2] = pixman_fixed_1;
+        }
+
+        is_affine = (ti.matrix[2][0] == 0 && ti.matrix[2][1] == 0 &&
+                     ti.matrix[2][2] == pixman_fixed_1 &&
+                     vi.v[2] == pixman_fixed_1);
+
+        transform_ok = TRUE;
+        if (is_affine && prng_rand_n (2))
+            pixman_transform_point_31_16_affine (&ti, &vi, &result_i);
+        else
+            transform_ok = pixman_transform_point_31_16 (&ti, &vi, &result_i);
+
+#ifdef HAVE_FLOAT128
+        /* compare with a reference 128-bit floating point implementation */
+        for (j = 0; j < 3; j++)
+        {
+            vf.v[j] = pixman_fixed_to_float128 (vi.v[j]);
+            for (k = 0; k < 3; k++)
+            {
+                tf.m[j][k] = pixman_fixed_to_float128 (ti.matrix[j][k]);
+            }
+        }
+
+        if (pixman_transform_point_f128 (&tf, &vf, &result_f))
+        {
+            if (transform_ok ||
+                (does_it_fit_fixed_48_16 (result_f.v[0]) &&
+                 does_it_fit_fixed_48_16 (result_f.v[1]) &&
+                 does_it_fit_fixed_48_16 (result_f.v[2])))
+            {
+                for (j = 0; j < 3; j++)
+                {
+                    double diff = fabs (result_f.v[j] -
+                                        pixman_fixed_to_float128 (result_i.v[j]));
+
+                    if (is_affine && diff > (0.51 / 65536.0))
+                    {
+                        printf ("%d:%d: bad precision for affine (%.12f)\n",
+                               testnum, i, diff);
+                        abort ();
+                    }
+                    else if (diff > (0.71 / 65536.0))
+                    {
+                        printf ("%d:%d: bad precision for projective (%.12f)\n",
+                               testnum, i, diff);
+                        abort ();
+                    }
+                }
+            }
+        }
+#endif
+        byteswap_vector_48_16 (&result_i);
+        crc32 = compute_crc32 (crc32, &result_i, sizeof (result_i));
+    }
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("matrix", 20000,
+			     0xBEBF98C3,
+			     test_matrix, argc, argv);
+}
diff --git a/src/pixman/test/oob-test.c b/src/pixman/test/oob-test.c
new file mode 100644
index 0000000..0d19b50
--- /dev/null
+++ b/src/pixman/test/oob-test.c
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+
+typedef struct
+{
+    int				width;
+    int				height;
+    int				stride;
+    pixman_format_code_t	format;
+    
+} image_info_t;
+
+typedef struct
+{
+    pixman_op_t		op;
+    
+    image_info_t	src;
+    image_info_t	dest;
+
+    int			src_x;
+    int			src_y;
+    int			dest_x;
+    int			dest_y;
+    int			width;
+    int			height;
+} composite_info_t;
+
+const composite_info_t info[] =
+{
+    {
+	PIXMAN_OP_SRC,
+	{  3, 6, 16, PIXMAN_a8r8g8b8 },
+	{  5, 7, 20, PIXMAN_x8r8g8b8 },
+	1, 8,
+	1, -1,
+	1, 8
+    },
+    {
+	PIXMAN_OP_SRC,
+	{ 7, 5, 36, PIXMAN_a8r8g8b8 },
+	{ 6, 5, 28, PIXMAN_x8r8g8b8 },
+	8, 5,
+	5, 3,
+	1, 2
+    },
+    {
+	PIXMAN_OP_OVER,
+	{ 10, 10, 40, PIXMAN_a2b10g10r10 },
+	{ 10, 10, 40, PIXMAN_a2b10g10r10 },
+	0, 0,
+	0, 0,
+	10, 10
+    },
+    {
+	PIXMAN_OP_OVER,
+	{ 10, 10, 40, PIXMAN_x2b10g10r10 },
+	{ 10, 10, 40, PIXMAN_x2b10g10r10 },
+	0, 0,
+	0, 0,
+	10, 10
+    },
+};
+
+static pixman_image_t *
+make_image (const image_info_t *info)
+{
+    char *data = malloc (info->stride * info->height);
+    int i;
+
+    for (i = 0; i < info->height * info->stride; ++i)
+	data[i] = (i % 255) ^ (((i % 16) << 4) | (i & 0xf0));
+
+    return pixman_image_create_bits (info->format, info->width, info->height, (uint32_t *)data, info->stride);
+}
+    
+static void
+test_composite (const composite_info_t *info)
+{
+    pixman_image_t *src = make_image (&info->src);
+    pixman_image_t *dest = make_image (&info->dest);
+
+    pixman_image_composite (PIXMAN_OP_SRC, src, NULL, dest,
+			    info->src_x, info->src_y,
+			    0, 0,
+			    info->dest_x, info->dest_y,
+			    info->width, info->height);
+}
+
+
+
+int
+main (int argc, char **argv)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_LENGTH (info); ++i)
+	test_composite (&info[i]);
+    
+    return 0;
+}
diff --git a/src/pixman/test/pdf-op-test.c b/src/pixman/test/pdf-op-test.c
new file mode 100644
index 0000000..dcb3a60
--- /dev/null
+++ b/src/pixman/test/pdf-op-test.c
@@ -0,0 +1,83 @@
+#include <stdlib.h>
+#include "utils.h"
+
+static const pixman_op_t pdf_ops[] =
+{
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY
+};
+
+static const uint32_t pixels[] =
+{
+    0x00808080,
+    0x80123456,
+    0x00000000,
+    0xffffffff,
+    0x00ffffff,
+    0x80808080,
+    0x00123456,
+};
+
+int
+main ()
+{
+    int o, s, m, d;
+
+    enable_divbyzero_exceptions();
+
+    for (o = 0; o < ARRAY_LENGTH (pdf_ops); ++o)
+    {
+	pixman_op_t op = pdf_ops[o];
+
+	for (s = 0; s < ARRAY_LENGTH (pixels); ++s)
+	{
+	    pixman_image_t *src;
+
+	    src = pixman_image_create_bits (
+		PIXMAN_a8r8g8b8, 1, 1, (uint32_t *)&(pixels[s]), 4);
+
+	    for (m = -1; m < ARRAY_LENGTH (pixels); ++m)
+	    {
+		pixman_image_t *msk = NULL;
+		if (m >= 0)
+		{
+		    msk = pixman_image_create_bits (
+			PIXMAN_a8r8g8b8, 1, 1, (uint32_t *)&(pixels[m]), 4);
+		}
+
+		for (d = 0; d < ARRAY_LENGTH (pixels); ++d)
+		{
+		    pixman_image_t *dst;
+		    uint32_t dp = pixels[d];
+
+		    dst = pixman_image_create_bits (
+			PIXMAN_a8r8g8b8, 1, 1, &dp, 4);
+
+		    pixman_image_composite (op, src, msk, dst,
+					    0, 0, 0, 0, 0, 0, 1, 1);
+
+		    pixman_image_unref (dst);
+		}
+		if (msk)
+		    pixman_image_unref (msk);
+	    }
+
+	    pixman_image_unref (src);
+	}
+    }
+
+    return 0;
+}
diff --git a/src/pixman/test/pixel-test.c b/src/pixman/test/pixel-test.c
new file mode 100644
index 0000000..8c525d2
--- /dev/null
+++ b/src/pixman/test/pixel-test.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright © 2013 Soeren Sandmann
+ * Copyright © 2013 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <stdio.h>
+#include <stdlib.h> /* abort() */
+#include <math.h>
+#include <time.h>
+#include "utils.h"
+
+typedef struct pixel_combination_t pixel_combination_t;
+struct pixel_combination_t
+{
+    pixman_op_t			op;
+    pixman_format_code_t	src_format;
+    uint32_t			src_pixel;
+    pixman_format_code_t	dest_format;
+    uint32_t			dest_pixel;
+};
+
+static const pixel_combination_t regressions[] =
+{
+    { PIXMAN_OP_OVER,
+      PIXMAN_a8r8g8b8,	0x0f00c300,
+      PIXMAN_x14r6g6b6,	0x003c0,
+    },
+    { PIXMAN_OP_DISJOINT_XOR,
+      PIXMAN_a4r4g4b4,	0xd0c0,
+      PIXMAN_a8r8g8b8,	0x5300ea00,
+    },
+    { PIXMAN_OP_OVER,
+      PIXMAN_a8r8g8b8,	0x20c6bf00,
+      PIXMAN_r5g6b5,	0xb9ff
+    },
+    { PIXMAN_OP_OVER,
+      PIXMAN_a8r8g8b8,	0x204ac7ff,
+      PIXMAN_r5g6b5,	0xc1ff
+    },
+    { PIXMAN_OP_OVER_REVERSE,
+      PIXMAN_r5g6b5,	0xffc3,
+      PIXMAN_a8r8g8b8,	0x102d00dd
+    },
+    { PIXMAN_OP_OVER_REVERSE,
+      PIXMAN_r5g6b5,	0x1f00,
+      PIXMAN_a8r8g8b8,	0x1bdf0c89
+    },
+    { PIXMAN_OP_OVER_REVERSE,
+      PIXMAN_r5g6b5,	0xf9d2,
+      PIXMAN_a8r8g8b8,	0x1076bcf7
+    },
+    { PIXMAN_OP_OVER_REVERSE,
+      PIXMAN_r5g6b5,	0x00c3,
+      PIXMAN_a8r8g8b8,	0x1bfe9ae5
+    },
+    { PIXMAN_OP_OVER_REVERSE,
+      PIXMAN_r5g6b5,	0x09ff,
+      PIXMAN_a8r8g8b8,	0x0b00c16c
+    },
+    { PIXMAN_OP_DISJOINT_ATOP,
+      PIXMAN_a2r2g2b2,	0xbc,
+      PIXMAN_a8r8g8b8,	0x9efff1ff
+    },
+    { PIXMAN_OP_DISJOINT_ATOP,
+      PIXMAN_a4r4g4b4,	0xae5f,
+      PIXMAN_a8r8g8b8,	0xf215b675
+    },
+    { PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+      PIXMAN_a8r8g8b8,	0xce007980,
+      PIXMAN_a8r8g8b8,	0x80ffe4ad
+    },
+    { PIXMAN_OP_DISJOINT_XOR,
+      PIXMAN_a8r8g8b8,	0xb8b07bea,
+      PIXMAN_a4r4g4b4,	0x939c
+    },
+    { PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+      PIXMAN_r5g6b5,	0x0063,
+      PIXMAN_a8r8g8b8,	0x10bb1ed7,
+    },
+};
+
+static void
+fill (pixman_image_t *image, uint32_t pixel)
+{
+    uint8_t *data = (uint8_t *)pixman_image_get_data (image);
+    int bytes_per_pixel = PIXMAN_FORMAT_BPP (pixman_image_get_format (image)) / 8;
+    int n_bytes = pixman_image_get_stride (image) * pixman_image_get_height (image);
+    int i;
+
+    switch (bytes_per_pixel)
+    {
+    case 4:
+	for (i = 0; i < n_bytes / 4; ++i)
+	    ((uint32_t *)data)[i] = pixel;
+	break;
+
+    case 2:
+	pixel &= 0xffff;
+	for (i = 0; i < n_bytes / 2; ++i)
+	    ((uint16_t *)data)[i] = pixel;
+	break;
+
+    case 1:
+	pixel &= 0xff;
+	for (i = 0; i < n_bytes; ++i)
+	    ((uint8_t *)data)[i] = pixel;
+	break;
+
+    default:
+	assert (0);
+	break;
+    }
+}
+
+static uint32_t
+access (pixman_image_t *image, int x, int y)
+{
+    int bytes_per_pixel;
+    int stride;
+    uint32_t result;
+    uint8_t *location;
+
+    if (x < 0 || x >= image->bits.width || y < 0 || y >= image->bits.height)
+        return 0;
+
+    bytes_per_pixel = PIXMAN_FORMAT_BPP (image->bits.format) / 8;
+    stride = image->bits.rowstride * 4;
+
+    location = (uint8_t *)image->bits.bits + y * stride + x * bytes_per_pixel;
+
+    if (bytes_per_pixel == 4)
+        result = *(uint32_t *)location;
+    else if (bytes_per_pixel == 2)
+        result = *(uint16_t *)location;
+    else if (bytes_per_pixel == 1)
+        result = *(uint8_t *)location;
+    else
+	assert (0);
+
+    return result;
+}
+
+static pixman_bool_t
+verify (int test_no, const pixel_combination_t *combination, int size)
+{
+    pixman_image_t *src, *dest;
+    pixel_checker_t src_checker, dest_checker;
+    color_t source_color, dest_color, reference_color;
+    pixman_bool_t result = TRUE;
+    int i, j;
+
+    /* Compute reference color */
+    pixel_checker_init (&src_checker, combination->src_format);
+    pixel_checker_init (&dest_checker, combination->dest_format);
+    pixel_checker_convert_pixel_to_color (
+	&src_checker, combination->src_pixel, &source_color);
+    pixel_checker_convert_pixel_to_color (
+	&dest_checker, combination->dest_pixel, &dest_color);
+    do_composite (combination->op,
+		  &source_color, NULL, &dest_color,
+		  &reference_color, FALSE);
+
+    src = pixman_image_create_bits (
+	combination->src_format, size, size, NULL, -1);
+    dest = pixman_image_create_bits (
+	combination->dest_format, size, size, NULL, -1);
+
+    fill (src, combination->src_pixel);
+    fill (dest, combination->dest_pixel);
+
+    pixman_image_composite32 (
+	combination->op, src, NULL, dest, 0, 0, 0, 0, 0, 0, size, size);
+
+    for (j = 0; j < size; ++j)
+    {
+	for (i = 0; i < size; ++i)
+	{
+	    uint32_t computed = access (dest, i, j);
+	    int32_t a, r, g, b;
+
+	    if (!pixel_checker_check (&dest_checker, computed, &reference_color))
+	    {
+		printf ("----------- Test %d failed ----------\n", test_no);
+
+		printf ("   operator:         %s\n", operator_name (combination->op));
+		printf ("   src format:       %s\n", format_name (combination->src_format));
+		printf ("   dest format:      %s\n", format_name (combination->dest_format));
+                printf (" - source ARGB:      %f  %f  %f  %f   (pixel: %8x)\n",
+                        source_color.a, source_color.r, source_color.g, source_color.b,
+                        combination->src_pixel);
+		pixel_checker_split_pixel (&src_checker, combination->src_pixel,
+					   &a, &r, &g, &b);
+                printf ("                     %8d  %8d  %8d  %8d\n", a, r, g, b);
+
+                printf (" - dest ARGB:        %f  %f  %f  %f   (pixel: %8x)\n",
+                        dest_color.a, dest_color.r, dest_color.g, dest_color.b,
+                        combination->dest_pixel);
+		pixel_checker_split_pixel (&dest_checker, combination->dest_pixel,
+					   &a, &r, &g, &b);
+                printf ("                     %8d  %8d  %8d  %8d\n", a, r, g, b);
+
+                pixel_checker_split_pixel (&dest_checker, computed, &a, &r, &g, &b);
+                printf (" - expected ARGB:    %f  %f  %f  %f\n",
+                        reference_color.a, reference_color.r, reference_color.g, reference_color.b);
+
+                pixel_checker_get_min (&dest_checker, &reference_color, &a, &r, &g, &b);
+                printf ("   min acceptable:   %8d  %8d  %8d  %8d\n", a, r, g, b);
+
+                pixel_checker_split_pixel (&dest_checker, computed, &a, &r, &g, &b);
+                printf ("   got:              %8d  %8d  %8d  %8d   (pixel: %8x)\n", a, r, g, b, computed);
+
+                pixel_checker_get_max (&dest_checker, &reference_color, &a, &r, &g, &b);
+                printf ("   max acceptable:   %8d  %8d  %8d  %8d\n", a, r, g, b);
+
+		result = FALSE;
+		goto done;
+	    }
+	}
+    }
+
+done:
+    pixman_image_unref (src);
+    pixman_image_unref (dest);
+
+    return result;
+}
+
+int
+main (int argc, char **argv)
+{
+    int result = 0;
+    int i, j;
+
+    for (i = 0; i < ARRAY_LENGTH (regressions); ++i)
+    {
+	const pixel_combination_t *combination = &(regressions[i]);
+
+	for (j = 1; j < 34; ++j)
+	{
+	    if (!verify (i, combination, j))
+	    {
+		result = 1;
+		break;
+	    }
+	}
+    }
+
+    return result;
+}
diff --git a/src/pixman/test/prng-test.c b/src/pixman/test/prng-test.c
new file mode 100644
index 0000000..c1d9320
--- /dev/null
+++ b/src/pixman/test/prng-test.c
@@ -0,0 +1,175 @@
+/*
+ * Copyright © 2012 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Based on the public domain implementation of small noncryptographic PRNG
+ * authored by Bob Jenkins: http://burtleburtle.net/bob/rand/smallprng.html
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include "utils-prng.h"
+#include "utils.h"
+
+/* The original code from http://www.burtleburtle.net/bob/rand/smallprng.html */
+
+typedef uint32_t u4;
+typedef struct ranctx { u4 a; u4 b; u4 c; u4 d; } ranctx;
+
+#define rot(x,k) (((x)<<(k))|((x)>>(32-(k))))
+u4 ranval( ranctx *x ) {
+    u4 e = x->a - rot(x->b, 27);
+    x->a = x->b ^ rot(x->c, 17);
+    x->b = x->c + x->d;
+    x->c = x->d + e;
+    x->d = e + x->a;
+    return x->d;
+}
+
+void raninit( ranctx *x, u4 seed ) {
+    u4 i;
+    x->a = 0xf1ea5eed, x->b = x->c = x->d = seed;
+    for (i=0; i<20; ++i) {
+        (void)ranval(x);
+    }
+}
+
+/*****************************************************************************/
+
+#define BUFSIZE (8 * 1024 * 1024)
+#define N 50
+
+void bench (void)
+{
+    double t1, t2;
+    int i;
+    prng_t prng;
+    uint8_t *buf = aligned_malloc (16, BUFSIZE + 1);
+
+    prng_srand_r (&prng, 1234);
+    t1 = gettime();
+    for (i = 0; i < N; i++)
+        prng_randmemset_r (&prng, buf, BUFSIZE, 0);
+    t2 = gettime();
+    printf ("aligned randmemset                    : %.2f MB/s\n",
+            (double)BUFSIZE * N / 1000000. / (t2 - t1));
+
+    t1 = gettime();
+    for (i = 0; i < N; i++)
+        prng_randmemset_r (&prng, buf + 1, BUFSIZE, 0);
+    t2 = gettime();
+    printf ("unaligned randmemset                  : %.2f MB/s\n",
+            (double)BUFSIZE * N / 1000000. / (t2 - t1));
+
+    t1 = gettime();
+    for (i = 0; i < N; i++)
+    {
+        prng_randmemset_r (&prng, buf, BUFSIZE, RANDMEMSET_MORE_00_AND_FF);
+    }
+    t2 = gettime ();
+    printf ("aligned randmemset (more 00 and FF)   : %.2f MB/s\n",
+            (double)BUFSIZE * N / 1000000. / (t2 - t1));
+
+    t1 = gettime();
+    for (i = 0; i < N; i++)
+    {
+        prng_randmemset_r (&prng, buf + 1, BUFSIZE, RANDMEMSET_MORE_00_AND_FF);
+    }
+    t2 = gettime ();
+    printf ("unaligned randmemset (more 00 and FF) : %.2f MB/s\n",
+            (double)BUFSIZE * N / 1000000. / (t2 - t1));
+
+    free (buf);
+}
+
+#define SMALLBUFSIZE 100
+
+int main (int argc, char *argv[])
+{
+    const uint32_t ref_crc[RANDMEMSET_MORE_00_AND_FF + 1] =
+    {
+        0xBA06763D, 0x103FC550, 0x8B59ABA5, 0xD82A0F39,
+        0xD2321099, 0xFD8C5420, 0xD3B7C42A, 0xFC098093,
+        0x85E01DE0, 0x6680F8F7, 0x4D32DD3C, 0xAE52382B,
+        0x149E6CB5, 0x8B336987, 0x15DCB2B3, 0x8A71B781
+    };
+    uint32_t crc1, crc2;
+    uint32_t ref, seed, seed0, seed1, seed2, seed3;
+    prng_rand_128_data_t buf;
+    uint8_t *bytebuf = aligned_malloc(16, SMALLBUFSIZE + 1);
+    ranctx x;
+    prng_t prng;
+    prng_randmemset_flags_t flags;
+
+    if (argc > 1 && strcmp(argv[1], "-bench") == 0)
+    {
+        bench ();
+        return 0;
+    }
+
+    /* basic test */
+    raninit (&x, 0);
+    prng_srand_r (&prng, 0);
+    assert (ranval (&x) == prng_rand_r (&prng));
+
+    /* test for simd code */
+    seed = 0;
+    prng_srand_r (&prng, seed);
+    seed0 = (seed = seed * 1103515245 + 12345);
+    seed1 = (seed = seed * 1103515245 + 12345);
+    seed2 = (seed = seed * 1103515245 + 12345);
+    seed3 = (seed = seed * 1103515245 + 12345);
+    prng_rand_128_r (&prng, &buf);
+
+    raninit (&x, seed0);
+    ref = ranval (&x);
+    assert (ref == buf.w[0]);
+
+    raninit (&x, seed1);
+    ref = ranval (&x);
+    assert (ref == buf.w[1]);
+
+    raninit (&x, seed2);
+    ref = ranval (&x);
+    assert (ref == buf.w[2]);
+
+    raninit (&x, seed3);
+    ref = ranval (&x);
+    assert (ref == buf.w[3]);
+
+    /* test for randmemset */
+    for (flags = 0; flags <= RANDMEMSET_MORE_00_AND_FF; flags++)
+    {
+        prng_srand_r (&prng, 1234);
+        prng_randmemset_r (&prng, bytebuf, 16, flags);
+        prng_randmemset_r (&prng, bytebuf + 16, SMALLBUFSIZE - 17, flags);
+        crc1 = compute_crc32 (0, bytebuf, SMALLBUFSIZE - 1);
+        prng_srand_r (&prng, 1234);
+        prng_randmemset_r (&prng, bytebuf + 1, SMALLBUFSIZE - 1, flags);
+        crc2 = compute_crc32 (0, bytebuf + 1, SMALLBUFSIZE - 1);
+        assert (ref_crc[flags] == crc1);
+        assert (ref_crc[flags] == crc2);
+    }
+
+    free (bytebuf);
+
+    return 0;
+}
diff --git a/src/pixman/test/radial-perf-test.c b/src/pixman/test/radial-perf-test.c
new file mode 100644
index 0000000..71092e2
--- /dev/null
+++ b/src/pixman/test/radial-perf-test.c
@@ -0,0 +1,58 @@
+#include "utils.h"
+#include <stdio.h>
+
+int
+main ()
+{
+    static const pixman_point_fixed_t inner = { 0x0000, 0x0000 };
+    static const pixman_point_fixed_t outer = { 0x0000, 0x0000 };
+    static const pixman_fixed_t r_inner = 0;
+    static const pixman_fixed_t r_outer = 64 << 16;
+    static const pixman_gradient_stop_t stops[] = {
+	{ 0x00000, { 0x6666, 0x6666, 0x6666, 0xffff } },
+	{ 0x10000, { 0x0000, 0x0000, 0x0000, 0xffff } }
+    };
+    static const pixman_transform_t transform = {
+	{ { 0x0,        0x26ee, 0x0}, 
+	  { 0xffffeeef, 0x0,    0x0}, 
+	  { 0x0,        0x0,    0x10000}
+	}
+    };
+    static const pixman_color_t z = { 0x0000, 0x0000, 0x0000, 0x0000 };
+    pixman_image_t *dest, *radial, *zero;
+    int i;
+    double before, after;
+
+    dest = pixman_image_create_bits (
+	PIXMAN_x8r8g8b8, 640, 429, NULL, -1);
+    zero = pixman_image_create_solid_fill (&z);
+    radial = pixman_image_create_radial_gradient (
+	&inner, &outer, r_inner, r_outer, stops, ARRAY_LENGTH (stops));
+    pixman_image_set_transform (radial, &transform);
+    pixman_image_set_repeat (radial, PIXMAN_REPEAT_PAD);
+
+#define N_COMPOSITE	500
+
+    before = gettime();
+    for (i = 0; i < N_COMPOSITE; ++i)
+    {
+	before -= gettime();
+
+	pixman_image_composite (
+	    PIXMAN_OP_SRC, zero, NULL, dest,
+	    0, 0, 0, 0, 0, 0, 640, 429);
+
+	before += gettime();
+
+	pixman_image_composite32 (
+	    PIXMAN_OP_OVER, radial, NULL, dest,
+	    - 150, -158, 0, 0, 0, 0, 640, 361);
+    }
+
+    after = gettime();
+
+    write_png (dest, "radial.png");
+
+    printf ("Average time to composite: %f\n", (after - before) / N_COMPOSITE);
+    return 0;
+}
diff --git a/src/pixman/test/region-contains-test.c b/src/pixman/test/region-contains-test.c
new file mode 100644
index 0000000..096e651
--- /dev/null
+++ b/src/pixman/test/region-contains-test.c
@@ -0,0 +1,169 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+static void
+make_random_region (pixman_region32_t *region)
+{
+    int n_boxes;
+
+    pixman_region32_init (region);
+
+    n_boxes = prng_rand_n (64);
+    while (n_boxes--)
+    {
+	int32_t x, y;
+	uint32_t w, h;
+
+	x = (int32_t)prng_rand() >> 2;
+	y = (int32_t)prng_rand() >> 2;
+	w = prng_rand() >> 2;
+	h = prng_rand() >> 2;
+
+	pixman_region32_union_rect (region, region, x, y, w, h);
+    }
+}
+
+static void
+print_box (pixman_box32_t *box)
+{
+    printf ("    %d %d %d %d\n", box->x1, box->y1, box->x2, box->y2);
+}
+
+static int32_t
+random_coord (pixman_region32_t *region, pixman_bool_t x)
+{
+    pixman_box32_t *b, *bb;
+    int n_boxes;
+    int begin, end;
+
+    if (prng_rand_n (14))
+    {
+	bb = pixman_region32_rectangles (region, &n_boxes);
+	if (n_boxes == 0)
+	    goto use_extent;
+	b = bb + prng_rand_n (n_boxes);
+    }
+    else
+    {
+    use_extent:
+	b = pixman_region32_extents (region);
+	n_boxes = 1;
+    }
+
+    if (x)
+    {
+	begin = b->x1;
+	end = b->x2;
+    }
+    else
+    {
+	begin = b->y1;
+	end = b->y2;
+    }
+
+    switch (prng_rand_n (5))
+    {
+    case 0:
+	return begin - prng_rand();
+    case 1:
+	return end + prng_rand ();
+    case 2:
+	return end;
+    case 3:
+	return begin;
+    default:
+	return (end - begin) / 2 + begin;
+    }
+    return 0;
+}
+
+static uint32_t
+compute_crc32_u32 (uint32_t crc32, uint32_t v)
+{
+    if (!is_little_endian())
+    {
+	v = ((v & 0xff000000) >> 24)	|
+	    ((v & 0x00ff0000) >> 8)	|
+	    ((v & 0x0000ff00) << 8)	|
+	    ((v & 0x000000ff) << 24);
+    }
+
+    return compute_crc32 (crc32, &v, sizeof (int32_t));
+}
+
+static uint32_t
+crc32_box32 (uint32_t crc32, pixman_box32_t *box)
+{
+    crc32 = compute_crc32_u32 (crc32, box->x1);
+    crc32 = compute_crc32_u32 (crc32, box->y1);
+    crc32 = compute_crc32_u32 (crc32, box->x2);
+    crc32 = compute_crc32_u32 (crc32, box->y2);
+
+    return crc32;
+}
+
+static uint32_t
+test_region_contains_rectangle (int i, int verbose)
+{
+    pixman_box32_t box;
+    pixman_box32_t rbox = { 0, 0, 0, 0 };
+    pixman_region32_t region;
+    uint32_t r, r1, r2, r3, r4, crc32;
+
+    prng_srand (i);
+
+    make_random_region (&region);
+
+    box.x1 = random_coord (&region, TRUE);
+    box.x2 = box.x1 + prng_rand ();
+    box.y1 = random_coord (&region, FALSE);
+    box.y2 = box.y1 + prng_rand ();
+
+    if (verbose)
+    {
+	int n_rects;
+	pixman_box32_t *boxes;
+
+	boxes = pixman_region32_rectangles (&region, &n_rects);
+
+	printf ("region:\n");
+	while (n_rects--)
+	    print_box (boxes++);
+	printf ("box:\n");
+	print_box (&box);
+    }
+
+    crc32 = 0;
+
+    r1 = pixman_region32_contains_point (&region, box.x1, box.y1, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r2 = pixman_region32_contains_point (&region, box.x1, box.y2, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r3 = pixman_region32_contains_point (&region, box.x2, box.y1, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+    r4 = pixman_region32_contains_point (&region, box.x2, box.y2, &rbox);
+    crc32 = crc32_box32 (crc32, &rbox);
+
+    r = pixman_region32_contains_rectangle (&region, &box);
+    r = (i << 8) | (r << 4) | (r1 << 3) | (r2 << 2) | (r3 << 1) | (r4 << 0);
+
+    crc32 = compute_crc32_u32 (crc32, r);
+
+    if (verbose)
+	printf ("results: %d %d %d %d %d\n", (r & 0xf0) >> 4, r1, r2, r3, r4);
+
+    pixman_region32_fini (&region);
+
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("region_contains",
+			     1000000,
+			     0x548E0F3F,
+			     test_region_contains_rectangle,
+			     argc, argv);
+}
diff --git a/src/pixman/test/region-test.c b/src/pixman/test/region-test.c
new file mode 100644
index 0000000..bfc219b
--- /dev/null
+++ b/src/pixman/test/region-test.c
@@ -0,0 +1,125 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+int
+main ()
+{
+    pixman_region32_t r1;
+    pixman_region32_t r2;
+    pixman_region32_t r3;
+    pixman_box32_t boxes[] = {
+	{ 10, 10, 20, 20 },
+	{ 30, 30, 30, 40 },
+	{ 50, 45, 60, 44 },
+    };
+    pixman_box32_t boxes2[] = {
+	{ 2, 6, 7, 6 },
+	{ 4, 1, 6, 7 },
+    };
+    pixman_box32_t boxes3[] = {
+	{ 2, 6, 7, 6 },
+	{ 4, 1, 6, 1 },
+    };
+    int i, j;
+    pixman_box32_t *b;
+    pixman_image_t *image, *fill;
+    pixman_color_t white = {
+	0xffff,
+	0xffff,
+	0xffff,
+	0xffff
+    };
+
+    prng_srand (0);
+
+    /* This used to go into an infinite loop before pixman-region.c
+     * was fixed to not use explict "short" variables
+     */
+    pixman_region32_init_rect (&r1, 0, 0, 20, 64000);
+    pixman_region32_init_rect (&r2, 0, 0, 20, 64000);
+    pixman_region32_init_rect (&r3, 0, 0, 20, 64000);
+
+    pixman_region32_subtract (&r1, &r2, &r3);
+
+
+    /* This would produce a region containing an empty
+     * rectangle in it. Such regions are considered malformed,
+     * but using an empty rectangle for initialization should
+     * work.
+     */
+    pixman_region32_init_rects (&r1, boxes, 3);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 1);
+    
+    while (i--)
+    {
+	assert (b[i].x1 < b[i].x2);
+	assert (b[i].y1 < b[i].y2);
+    }
+
+    /* This would produce a rectangle containing the bounding box
+     * of the two rectangles. The correct result is to eliminate
+     * the broken rectangle.
+     */
+    pixman_region32_init_rects (&r1, boxes2, 2);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 1);
+
+    assert (b[0].x1 == 4);
+    assert (b[0].y1 == 1);
+    assert (b[0].x2 == 6);
+    assert (b[0].y2 == 7);
+
+    /* This should produce an empty region */
+    pixman_region32_init_rects (&r1, boxes3, 2);
+
+    b = pixman_region32_rectangles (&r1, &i);
+
+    assert (i == 0);
+
+    fill = pixman_image_create_solid_fill (&white);
+    for (i = 0; i < 100; i++)
+    {
+	int image_size = 128;
+
+	pixman_region32_init (&r1);
+
+	/* Add some random rectangles */
+	for (j = 0; j < 64; j++)
+	    pixman_region32_union_rect (&r1, &r1,
+					prng_rand_n (image_size),
+					prng_rand_n (image_size),
+					prng_rand_n (25),
+					prng_rand_n (25));
+
+	/* Clip to image size */
+	pixman_region32_init_rect (&r2, 0, 0, image_size, image_size);
+	pixman_region32_intersect (&r1, &r1, &r2);
+	pixman_region32_fini (&r2);
+
+	/* render region to a1 mask */
+	image = pixman_image_create_bits (PIXMAN_a1, image_size, image_size, NULL, 0);
+	pixman_image_set_clip_region32 (image, &r1);
+	pixman_image_composite32 (PIXMAN_OP_SRC,
+				  fill, NULL, image,
+				  0, 0, 0, 0, 0, 0,
+				  image_size, image_size);
+	pixman_region32_init_from_image (&r2, image);
+
+	pixman_image_unref (image);
+
+	assert (pixman_region32_equal (&r1, &r2));
+	pixman_region32_fini (&r1);
+	pixman_region32_fini (&r2);
+
+    }
+    pixman_image_unref (fill);
+
+    return 0;
+}
diff --git a/src/pixman/test/region-translate-test.c b/src/pixman/test/region-translate-test.c
new file mode 100644
index 0000000..5a03027
--- /dev/null
+++ b/src/pixman/test/region-translate-test.c
@@ -0,0 +1,30 @@
+#include <assert.h>
+#include "utils.h"
+
+/* Pixman had a bug where 32bit regions where clipped to 16bit sizes when
+ * pixman_region32_translate() was called. This test exercises that bug.
+ */
+
+#define LARGE 32000
+
+int
+main (int argc, char **argv)
+{
+  pixman_box32_t rect = { -LARGE, -LARGE, LARGE, LARGE };
+  pixman_region32_t r1, r2;
+
+  pixman_region32_init_rects (&r1, &rect, 1);
+  pixman_region32_init_rect (&r2, rect.x1, rect.y1, rect.x2 - rect.x1, rect.y2 - rect.y1);
+
+  assert (pixman_region32_equal (&r1,  &r2));
+
+  pixman_region32_translate (&r1, -LARGE, LARGE);
+  pixman_region32_translate (&r1, LARGE, -LARGE);
+
+  assert (pixman_region32_equal (&r1,  &r2));
+
+  pixman_region32_fini (&r1);
+  pixman_region32_fini (&r2);
+
+  return 0;
+}
diff --git a/src/pixman/test/rotate-test.c b/src/pixman/test/rotate-test.c
new file mode 100644
index 0000000..18ca60d
--- /dev/null
+++ b/src/pixman/test/rotate-test.c
@@ -0,0 +1,120 @@
+#include <stdlib.h>
+#include "utils.h"
+
+#define WIDTH	32
+#define HEIGHT	32
+
+static const pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_a8,
+    PIXMAN_a1,
+};
+
+static const pixman_op_t ops[] =
+{
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_ADD,
+};
+
+#define TRANSFORM(v00, v01, v10, v11)					\
+    { { { v00, v01, WIDTH * pixman_fixed_1 / 2 },			\
+        { v10, v11, HEIGHT * pixman_fixed_1 / 2 },			\
+	{ 0, 0, pixman_fixed_1 } } }
+
+#define F1 pixman_fixed_1
+
+static const pixman_transform_t transforms[] =
+{
+    TRANSFORM (0, -1, 1, 0),		/* wrong 90 degree rotation */
+    TRANSFORM (0, 1, -1, 0),		/* wrong 270 degree rotation */
+    TRANSFORM (1, 0, 0, 1),		/* wrong identity */
+    TRANSFORM (-1, 0, 0, -1),		/* wrong 180 degree rotation */
+    TRANSFORM (0, -F1, F1, 0),		/* correct 90 degree rotation */
+    TRANSFORM (0, F1, -F1, 0),		/* correct 270 degree rotation */
+    TRANSFORM (F1, 0, 0, F1),		/* correct identity */
+    TRANSFORM (-F1, 0, 0, -F1),		/* correct 180 degree rotation */
+};
+
+#define RANDOM_FORMAT()							\
+    (formats[prng_rand_n (ARRAY_LENGTH (formats))])
+
+#define RANDOM_OP()							\
+    (ops[prng_rand_n (ARRAY_LENGTH (ops))])
+
+#define RANDOM_TRANSFORM()						\
+    (&(transforms[prng_rand_n (ARRAY_LENGTH (transforms))]))
+
+static void
+on_destroy (pixman_image_t *image, void *data)
+{
+    free (data);
+}
+
+static pixman_image_t *
+make_image (void)
+{
+    pixman_format_code_t format = RANDOM_FORMAT();
+    uint32_t *bytes, *orig;
+    pixman_image_t *image;
+    int stride;
+
+    orig = bytes = malloc (WIDTH * HEIGHT * 4);
+    prng_randmemset (bytes, WIDTH * HEIGHT * 4, 0);
+
+    stride = WIDTH * 4;
+    if (prng_rand_n (2) == 0)
+    {
+	bytes += (stride / 4) * (HEIGHT - 1);
+	stride = - stride;
+    }
+
+    image = pixman_image_create_bits (
+	format, WIDTH, HEIGHT, bytes, stride);
+
+    pixman_image_set_transform (image, RANDOM_TRANSFORM());
+    pixman_image_set_destroy_function (image, on_destroy, orig);
+    pixman_image_set_repeat (image, PIXMAN_REPEAT_NORMAL);
+
+    image_endian_swap (image);
+
+    return image;
+}
+
+static uint32_t
+test_transform (int testnum, int verbose)
+{
+    pixman_image_t *src, *dest;
+    uint32_t crc;
+
+    prng_srand (testnum);
+    
+    src = make_image ();
+    dest = make_image ();
+
+    pixman_image_composite (RANDOM_OP(),
+			    src, NULL, dest,
+			    0, 0, 0, 0, WIDTH / 2, HEIGHT / 2,
+			    WIDTH, HEIGHT);
+
+    crc = compute_crc32_for_image (0, dest);
+
+    pixman_image_unref (src);
+    pixman_image_unref (dest);
+
+    return crc;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("rotate", 15000,
+			     0x81E9EC2F,
+			     test_transform, argc, argv);
+}
diff --git a/src/pixman/test/scaling-bench.c b/src/pixman/test/scaling-bench.c
new file mode 100644
index 0000000..365e798
--- /dev/null
+++ b/src/pixman/test/scaling-bench.c
@@ -0,0 +1,80 @@
+#include <stdlib.h>
+#include "utils.h"
+
+#define SOURCE_WIDTH 320
+#define SOURCE_HEIGHT 240
+#define TEST_REPEATS 3
+
+static pixman_image_t *
+make_source (void)
+{
+    size_t n_bytes = (SOURCE_WIDTH + 2) * (SOURCE_HEIGHT + 2) * 4;
+    uint32_t *data = malloc (n_bytes);
+    pixman_image_t *source;
+
+    prng_randmemset (data, n_bytes, 0);
+    
+    source = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, SOURCE_WIDTH + 2, SOURCE_HEIGHT + 2,
+	data,
+	(SOURCE_WIDTH + 2) * 4);
+
+    pixman_image_set_filter (source, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    return source;
+}
+
+int
+main ()
+{
+    double scale;
+    pixman_image_t *src;
+
+    prng_srand (23874);
+    
+    src = make_source ();
+    printf ("# %-6s %-22s   %-14s %-12s\n",
+	    "ratio",
+	    "resolutions",
+	    "time / ms",
+	    "time per pixel / ns");
+    for (scale = 0.1; scale < 10.005; scale += 0.01)
+    {
+	int i;
+	int dest_width = SOURCE_WIDTH * scale + 0.5;
+	int dest_height = SOURCE_HEIGHT * scale + 0.5;
+	int dest_byte_stride = (dest_width * 4 + 15) & ~15;
+	pixman_fixed_t s = (1 / scale) * 65536.0 + 0.5;
+	pixman_transform_t transform;
+	pixman_image_t *dest;
+	double t1, t2, t = -1;
+	uint32_t *dest_buf = aligned_malloc (16, dest_byte_stride * dest_height);
+	memset (dest_buf, 0, dest_byte_stride * dest_height);
+
+	pixman_transform_init_scale (&transform, s, s);
+	pixman_image_set_transform (src, &transform);
+	
+	dest = pixman_image_create_bits (
+	    PIXMAN_a8r8g8b8, dest_width, dest_height, dest_buf, dest_byte_stride);
+
+	for (i = 0; i < TEST_REPEATS; i++)
+	{
+	    t1 = gettime();
+	    pixman_image_composite (
+		PIXMAN_OP_OVER, src, NULL, dest,
+		scale, scale, 0, 0, 0, 0, dest_width, dest_height);
+	    t2 = gettime();
+	    if (t < 0 || t2 - t1 < t)
+		t = t2 - t1;
+	}
+
+	printf ("%6.2f : %4dx%-4d => %4dx%-4d : %12.4f : %12.4f\n",
+		scale, SOURCE_WIDTH, SOURCE_HEIGHT, dest_width, dest_height,
+		t * 1000, (t / (dest_width * dest_height)) * 1000000000);
+
+	pixman_image_unref (dest);
+	free (dest_buf);
+    }
+
+    return 0;
+}
diff --git a/src/pixman/test/scaling-crash-test.c b/src/pixman/test/scaling-crash-test.c
new file mode 100644
index 0000000..0dac892
--- /dev/null
+++ b/src/pixman/test/scaling-crash-test.c
@@ -0,0 +1,219 @@
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "utils.h"
+
+/*
+ * We have a source image filled with solid color, set NORMAL or PAD repeat,
+ * and some transform which results in nearest neighbour scaling.
+ *
+ * The expected result is either that the destination image filled with this solid
+ * color or, if the transformation is such that we can't composite anything at
+ * all, that nothing has changed in the destination.
+ *
+ * The surrounding memory of the source image is a different solid color so that
+ * we are sure to get failures if we access it.
+ */
+static int
+run_test (int32_t		dst_width,
+	  int32_t		dst_height,
+	  int32_t		src_width,
+	  int32_t		src_height,
+	  int32_t		src_x,
+	  int32_t		src_y,
+	  int32_t		scale_x,
+	  int32_t		scale_y,
+	  pixman_filter_t	filter,
+	  pixman_repeat_t	repeat)
+{
+    pixman_image_t *   src_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    pixman_color_t     color_cc = { 0xcccc, 0xcccc, 0xcccc, 0xcccc };
+    pixman_image_t *   solid;
+    int result;
+    int i;
+
+    static const pixman_fixed_t kernel[] =
+    {
+#define D(f)	(pixman_double_to_fixed (f) + 0x0001)
+
+	pixman_int_to_fixed (5),
+	pixman_int_to_fixed (5),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0),
+	D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0), D(1/25.0)
+    };
+
+    result = 0;
+
+    srcbuf = (uint32_t *)malloc ((src_width + 10) * (src_height + 10) * 4);
+    dstbuf = (uint32_t *)malloc (dst_width * dst_height * 4);
+
+    memset (srcbuf, 0x88, src_width * src_height * 4);
+    memset (dstbuf, 0x33, dst_width * dst_height * 4);
+
+    src_img = pixman_image_create_bits (
+        PIXMAN_a8r8g8b8, src_width, src_height,
+	srcbuf + (src_width + 10) * 5 + 5, (src_width + 10) * 4);
+
+    solid = pixman_image_create_solid_fill (&color_cc);
+    pixman_image_composite32 (PIXMAN_OP_SRC, solid, NULL, src_img,
+			      0, 0, 0, 0, 0, 0, src_width, src_height);
+    pixman_image_unref (solid);
+
+    dst_img = pixman_image_create_bits (
+        PIXMAN_a8r8g8b8, dst_width, dst_height, dstbuf, dst_width * 4);
+
+    pixman_transform_init_scale (&transform, scale_x, scale_y);
+    pixman_image_set_transform (src_img, &transform);
+    pixman_image_set_repeat (src_img, repeat);
+    if (filter == PIXMAN_FILTER_CONVOLUTION)
+	pixman_image_set_filter (src_img, filter, kernel, 27);
+    else
+	pixman_image_set_filter (src_img, filter, NULL, 0);
+
+    pixman_image_composite (PIXMAN_OP_SRC, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, 0, 0, dst_width, dst_height);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (dst_img);
+
+    for (i = 0; i < dst_width * dst_height; i++)
+    {
+	if (dstbuf[i] != 0xCCCCCCCC && dstbuf[i] != 0x33333333)
+	{
+	    result = 1;
+	    break;
+	}
+    }
+
+    free (srcbuf);
+    free (dstbuf);
+    return result;
+}
+
+typedef struct filter_info_t filter_info_t;
+struct filter_info_t
+{
+    pixman_filter_t value;
+    char name[28];
+};
+
+static const filter_info_t filters[] =
+{
+    { PIXMAN_FILTER_NEAREST, "NEAREST" },
+    { PIXMAN_FILTER_BILINEAR, "BILINEAR" },
+    { PIXMAN_FILTER_CONVOLUTION, "CONVOLUTION" },
+};
+
+typedef struct repeat_info_t repeat_info_t;
+struct repeat_info_t
+{
+    pixman_repeat_t value;
+    char name[28];
+};
+
+
+static const repeat_info_t repeats[] =
+{
+    { PIXMAN_REPEAT_PAD, "PAD" },
+    { PIXMAN_REPEAT_REFLECT, "REFLECT" },
+    { PIXMAN_REPEAT_NORMAL, "NORMAL" }
+};
+
+static int
+do_test (int32_t		dst_size,
+	 int32_t		src_size,
+	 int32_t		src_offs,
+	 int32_t		scale_factor)
+{
+    int i, j;
+
+    for (i = 0; i < ARRAY_LENGTH (filters); ++i)
+    {
+	for (j = 0; j < ARRAY_LENGTH (repeats); ++j)
+	{
+	    /* horizontal test */
+	    if (run_test (dst_size, 1,
+			  src_size, 1,
+			  src_offs, 0,
+			  scale_factor, 65536,
+			  filters[i].value,
+			  repeats[j].value) != 0)
+	    {
+		printf ("Vertical test failed with %s filter and repeat mode %s\n",
+			filters[i].name, repeats[j].name);
+
+		return 1;
+	    }
+
+	    /* vertical test */
+	    if (run_test (1, dst_size,
+			  1, src_size,
+			  0, src_offs,
+			  65536, scale_factor,
+			  filters[i].value,
+			  repeats[j].value) != 0)
+	    {
+		printf ("Vertical test failed with %s filter and repeat mode %s\n",
+			filters[i].name, repeats[j].name);
+
+		return 1;
+	    }
+	}
+    }
+
+    return 0;
+}
+
+int
+main (int argc, char *argv[])
+{
+    int i;
+
+    pixman_disable_out_of_bounds_workaround ();
+
+    /* can potentially crash */
+    assert (do_test (
+		48000, 32767, 1, 65536 * 128) == 0);
+
+    /* can potentially get into a deadloop */
+    assert (do_test (
+		16384, 65536, 32, 32768) == 0);
+
+    /* can potentially access memory outside source image buffer */
+    assert (do_test (
+		10, 10, 0, 1) == 0);
+    assert (do_test (
+		10, 10, 0, 0) == 0);
+
+    for (i = 0; i < 100; ++i)
+    {
+	pixman_fixed_t one_seventh =
+	    (((pixman_fixed_48_16_t)pixman_fixed_1) << 16) / (7 << 16);
+
+	assert (do_test (
+		    1, 7, 3, one_seventh + i - 50) == 0);
+    }
+
+    for (i = 0; i < 100; ++i)
+    {
+	pixman_fixed_t scale =
+	    (((pixman_fixed_48_16_t)pixman_fixed_1) << 16) / (32767 << 16);
+
+	assert (do_test (
+		    1, 32767, 16383, scale + i - 50) == 0);
+    }
+
+    /* can potentially provide invalid results (out of range matrix stuff) */
+    assert (do_test (
+	48000, 32767, 16384, 65536 * 128) == 0);
+
+    return 0;
+}
diff --git a/src/pixman/test/scaling-helpers-test.c b/src/pixman/test/scaling-helpers-test.c
new file mode 100644
index 0000000..cd5ace0
--- /dev/null
+++ b/src/pixman/test/scaling-helpers-test.c
@@ -0,0 +1,92 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include "utils.h"
+#include "pixman-inlines.h"
+
+/* A trivial reference implementation for
+ * 'bilinear_pad_repeat_get_scanline_bounds'
+ */
+static void
+bilinear_pad_repeat_get_scanline_bounds_ref (int32_t        source_image_width,
+					     pixman_fixed_t vx_,
+					     pixman_fixed_t unit_x,
+					     int32_t *      left_pad,
+					     int32_t *      left_tz,
+					     int32_t *      width,
+					     int32_t *      right_tz,
+					     int32_t *      right_pad)
+{
+    int w = *width;
+    int64_t vx = vx_;
+    *left_pad = 0;
+    *left_tz = 0;
+    *width = 0;
+    *right_tz = 0;
+    *right_pad = 0;
+    while (--w >= 0)
+    {
+	if (vx < 0)
+	{
+	    if (vx + pixman_fixed_1 < 0)
+		*left_pad += 1;
+	    else
+		*left_tz += 1;
+	}
+	else if (vx + pixman_fixed_1 >= pixman_int_to_fixed (source_image_width))
+	{
+	    if (vx >= pixman_int_to_fixed (source_image_width))
+		*right_pad += 1;
+	    else
+		*right_tz += 1;
+	}
+	else
+	{
+	    *width += 1;
+	}
+	vx += unit_x;
+    }
+}
+
+int
+main (void)
+{
+    int i;
+    prng_srand (0);
+    for (i = 0; i < 10000; i++)
+    {
+	int32_t left_pad1, left_tz1, width1, right_tz1, right_pad1;
+	int32_t left_pad2, left_tz2, width2, right_tz2, right_pad2;
+	pixman_fixed_t vx = prng_rand_n(10000 << 16) - (3000 << 16);
+	int32_t width = prng_rand_n(10000);
+	int32_t source_image_width = prng_rand_n(10000) + 1;
+	pixman_fixed_t unit_x = prng_rand_n(10 << 16) + 1;
+	width1 = width2 = width;
+
+	bilinear_pad_repeat_get_scanline_bounds_ref (source_image_width,
+						     vx,
+						     unit_x,
+						     &left_pad1,
+						     &left_tz1,
+						     &width1,
+						     &right_tz1,
+						     &right_pad1);
+
+	bilinear_pad_repeat_get_scanline_bounds (source_image_width,
+						 vx,
+						 unit_x,
+						 &left_pad2,
+						 &left_tz2,
+						 &width2,
+						 &right_tz2,
+						 &right_pad2);
+
+	assert (left_pad1 == left_pad2);
+	assert (left_tz1 == left_tz2);
+	assert (width1 == width2);
+	assert (right_tz1 == right_tz2);
+	assert (right_pad1 == right_pad2);
+    }
+
+    return 0;
+}
diff --git a/src/pixman/test/scaling-test.c b/src/pixman/test/scaling-test.c
new file mode 100644
index 0000000..e2f7fa9
--- /dev/null
+++ b/src/pixman/test/scaling-test.c
@@ -0,0 +1,402 @@
+/*
+ * Test program, which can detect some problems with nearest neighbour
+ * and bilinear scaling in pixman. Testing is done by running lots
+ * of random SRC and OVER compositing operations a8r8g8b8, x8a8r8g8b8
+ * and r5g6b5 color formats.
+ *
+ * Script 'fuzzer-find-diff.pl' can be used to narrow down the problem in
+ * the case of test failure.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include "utils.h"
+
+#define MAX_SRC_WIDTH  48
+#define MAX_SRC_HEIGHT 8
+#define MAX_DST_WIDTH  48
+#define MAX_DST_HEIGHT 8
+#define MAX_STRIDE     4
+
+/*
+ * Composite operation with pseudorandom images
+ */
+
+static pixman_format_code_t
+get_format (int bpp)
+{
+    if (bpp == 4)
+    {
+	switch (prng_rand_n (4))
+	{
+	default:
+	case 0:
+	    return PIXMAN_a8r8g8b8;
+	case 1:
+	    return PIXMAN_x8r8g8b8;
+	case 2:
+	    return PIXMAN_a8b8g8r8;
+	case 3:
+	    return PIXMAN_x8b8g8r8;
+	}
+    }
+    else
+    {
+	return PIXMAN_r5g6b5;
+    }
+}
+
+uint32_t
+test_composite (int      testnum,
+		int      verbose)
+{
+    int                i;
+    pixman_image_t *   src_img;
+    pixman_image_t *   mask_img;
+    pixman_image_t *   dst_img;
+    pixman_transform_t transform;
+    pixman_region16_t  clip;
+    int                src_width, src_height;
+    int                mask_width, mask_height;
+    int                dst_width, dst_height;
+    int                src_stride, mask_stride, dst_stride;
+    int                src_x, src_y;
+    int                mask_x, mask_y;
+    int                dst_x, dst_y;
+    int                src_bpp;
+    int                mask_bpp = 1;
+    int                dst_bpp;
+    int                w, h;
+    pixman_fixed_t     scale_x = 65536, scale_y = 65536;
+    pixman_fixed_t     translate_x = 0, translate_y = 0;
+    pixman_fixed_t     mask_scale_x = 65536, mask_scale_y = 65536;
+    pixman_fixed_t     mask_translate_x = 0, mask_translate_y = 0;
+    pixman_op_t        op;
+    pixman_repeat_t    repeat = PIXMAN_REPEAT_NONE;
+    pixman_repeat_t    mask_repeat = PIXMAN_REPEAT_NONE;
+    pixman_format_code_t src_fmt, dst_fmt;
+    uint32_t *         srcbuf;
+    uint32_t *         dstbuf;
+    uint32_t *         maskbuf;
+    uint32_t           crc32;
+    FLOAT_REGS_CORRUPTION_DETECTOR_START ();
+
+    prng_srand (testnum);
+
+    src_bpp = (prng_rand_n (2) == 0) ? 2 : 4;
+    dst_bpp = (prng_rand_n (2) == 0) ? 2 : 4;
+    switch (prng_rand_n (3))
+    {
+    case 0:
+	op = PIXMAN_OP_SRC;
+	break;
+    case 1:
+	op = PIXMAN_OP_OVER;
+	break;
+    default:
+	op = PIXMAN_OP_ADD;
+	break;
+    }
+
+    src_width = prng_rand_n (MAX_SRC_WIDTH) + 1;
+    src_height = prng_rand_n (MAX_SRC_HEIGHT) + 1;
+
+    if (prng_rand_n (2))
+    {
+	mask_width = prng_rand_n (MAX_SRC_WIDTH) + 1;
+	mask_height = prng_rand_n (MAX_SRC_HEIGHT) + 1;
+    }
+    else
+    {
+	mask_width = mask_height = 1;
+    }
+
+    dst_width = prng_rand_n (MAX_DST_WIDTH) + 1;
+    dst_height = prng_rand_n (MAX_DST_HEIGHT) + 1;
+    src_stride = src_width * src_bpp + prng_rand_n (MAX_STRIDE) * src_bpp;
+    mask_stride = mask_width * mask_bpp + prng_rand_n (MAX_STRIDE) * mask_bpp;
+    dst_stride = dst_width * dst_bpp + prng_rand_n (MAX_STRIDE) * dst_bpp;
+
+    if (src_stride & 3)
+	src_stride += 2;
+
+    if (mask_stride & 1)
+	mask_stride += 1;
+    if (mask_stride & 2)
+	mask_stride += 2;
+
+    if (dst_stride & 3)
+	dst_stride += 2;
+
+    src_x = -(src_width / 4) + prng_rand_n (src_width * 3 / 2);
+    src_y = -(src_height / 4) + prng_rand_n (src_height * 3 / 2);
+    mask_x = -(mask_width / 4) + prng_rand_n (mask_width * 3 / 2);
+    mask_y = -(mask_height / 4) + prng_rand_n (mask_height * 3 / 2);
+    dst_x = -(dst_width / 4) + prng_rand_n (dst_width * 3 / 2);
+    dst_y = -(dst_height / 4) + prng_rand_n (dst_height * 3 / 2);
+    w = prng_rand_n (dst_width * 3 / 2 - dst_x);
+    h = prng_rand_n (dst_height * 3 / 2 - dst_y);
+
+    srcbuf = (uint32_t *)malloc (src_stride * src_height);
+    maskbuf = (uint32_t *)malloc (mask_stride * mask_height);
+    dstbuf = (uint32_t *)malloc (dst_stride * dst_height);
+
+    prng_randmemset (srcbuf, src_stride * src_height, 0);
+    prng_randmemset (maskbuf, mask_stride * mask_height, 0);
+    prng_randmemset (dstbuf, dst_stride * dst_height, 0);
+
+    src_fmt = get_format (src_bpp);
+    dst_fmt = get_format (dst_bpp);
+
+    if (prng_rand_n (2))
+    {
+	srcbuf += (src_stride / 4) * (src_height - 1);
+	src_stride = - src_stride;
+    }
+
+    if (prng_rand_n (2))
+    {
+	maskbuf += (mask_stride / 4) * (mask_height - 1);
+	mask_stride = - mask_stride;
+    }
+
+    if (prng_rand_n (2))
+    {
+	dstbuf += (dst_stride / 4) * (dst_height - 1);
+	dst_stride = - dst_stride;
+    }
+
+    src_img = pixman_image_create_bits (
+        src_fmt, src_width, src_height, srcbuf, src_stride);
+
+    mask_img = pixman_image_create_bits (
+        PIXMAN_a8, mask_width, mask_height, maskbuf, mask_stride);
+
+    dst_img = pixman_image_create_bits (
+        dst_fmt, dst_width, dst_height, dstbuf, dst_stride);
+
+    image_endian_swap (src_img);
+    image_endian_swap (dst_img);
+
+    if (prng_rand_n (4) > 0)
+    {
+	scale_x = -32768 * 3 + prng_rand_n (65536 * 5);
+	scale_y = -32768 * 3 + prng_rand_n (65536 * 5);
+	translate_x = prng_rand_n (65536);
+	translate_y = prng_rand_n (65536);
+	pixman_transform_init_scale (&transform, scale_x, scale_y);
+	pixman_transform_translate (&transform, NULL, translate_x, translate_y);
+	pixman_image_set_transform (src_img, &transform);
+    }
+
+    if (prng_rand_n (2) > 0)
+    {
+	mask_scale_x = -32768 * 3 + prng_rand_n (65536 * 5);
+	mask_scale_y = -32768 * 3 + prng_rand_n (65536 * 5);
+	mask_translate_x = prng_rand_n (65536);
+	mask_translate_y = prng_rand_n (65536);
+	pixman_transform_init_scale (&transform, mask_scale_x, mask_scale_y);
+	pixman_transform_translate (&transform, NULL, mask_translate_x, mask_translate_y);
+	pixman_image_set_transform (mask_img, &transform);
+    }
+
+    switch (prng_rand_n (4))
+    {
+    case 0:
+	mask_repeat = PIXMAN_REPEAT_NONE;
+	break;
+
+    case 1:
+	mask_repeat = PIXMAN_REPEAT_NORMAL;
+	break;
+
+    case 2:
+	mask_repeat = PIXMAN_REPEAT_PAD;
+	break;
+
+    case 3:
+	mask_repeat = PIXMAN_REPEAT_REFLECT;
+	break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (mask_img, mask_repeat);
+
+    switch (prng_rand_n (4))
+    {
+    case 0:
+	repeat = PIXMAN_REPEAT_NONE;
+	break;
+
+    case 1:
+	repeat = PIXMAN_REPEAT_NORMAL;
+	break;
+
+    case 2:
+	repeat = PIXMAN_REPEAT_PAD;
+	break;
+
+    case 3:
+	repeat = PIXMAN_REPEAT_REFLECT;
+	break;
+
+    default:
+        break;
+    }
+    pixman_image_set_repeat (src_img, repeat);
+
+    if (prng_rand_n (2))
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+	pixman_image_set_filter (src_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (prng_rand_n (2))
+	pixman_image_set_filter (mask_img, PIXMAN_FILTER_NEAREST, NULL, 0);
+    else
+	pixman_image_set_filter (mask_img, PIXMAN_FILTER_BILINEAR, NULL, 0);
+
+    if (verbose)
+    {
+	printf ("src_fmt=%s, dst_fmt=%s\n", 
+		format_name (src_fmt), format_name (dst_fmt));
+	printf ("op=%s, scale_x=%d, scale_y=%d, repeat=%d\n",
+	        operator_name (op), scale_x, scale_y, repeat);
+	printf ("translate_x=%d, translate_y=%d\n",
+	        translate_x, translate_y);
+	printf ("src_width=%d, src_height=%d, dst_width=%d, dst_height=%d\n",
+	        src_width, src_height, dst_width, dst_height);
+	printf ("src_x=%d, src_y=%d, dst_x=%d, dst_y=%d\n",
+	        src_x, src_y, dst_x, dst_y);
+	printf ("w=%d, h=%d\n", w, h);
+    }
+
+    if (prng_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = prng_rand_n (2) + 1;
+
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = prng_rand_n (src_width);
+	    clip_boxes[i].y1 = prng_rand_n (src_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + prng_rand_n (src_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + prng_rand_n (src_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("source clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (src_img, &clip);
+	pixman_image_set_source_clipping (src_img, 1);
+	pixman_region_fini (&clip);
+    }
+
+    if (prng_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = prng_rand_n (2) + 1;
+
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = prng_rand_n (mask_width);
+	    clip_boxes[i].y1 = prng_rand_n (mask_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + prng_rand_n (mask_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + prng_rand_n (mask_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("mask clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (mask_img, &clip);
+	pixman_image_set_source_clipping (mask_img, 1);
+	pixman_region_fini (&clip);
+    }
+
+    if (prng_rand_n (8) == 0)
+    {
+	pixman_box16_t clip_boxes[2];
+	int            n = prng_rand_n (2) + 1;
+	for (i = 0; i < n; i++)
+	{
+	    clip_boxes[i].x1 = prng_rand_n (dst_width);
+	    clip_boxes[i].y1 = prng_rand_n (dst_height);
+	    clip_boxes[i].x2 =
+		clip_boxes[i].x1 + prng_rand_n (dst_width - clip_boxes[i].x1);
+	    clip_boxes[i].y2 =
+		clip_boxes[i].y1 + prng_rand_n (dst_height - clip_boxes[i].y1);
+
+	    if (verbose)
+	    {
+		printf ("destination clip box: [%d,%d-%d,%d]\n",
+		        clip_boxes[i].x1, clip_boxes[i].y1,
+		        clip_boxes[i].x2, clip_boxes[i].y2);
+	    }
+	}
+	pixman_region_init_rects (&clip, clip_boxes, n);
+	pixman_image_set_clip_region (dst_img, &clip);
+	pixman_region_fini (&clip);
+    }
+
+    if (prng_rand_n (2) == 0)
+	pixman_image_composite (op, src_img, NULL, dst_img,
+                            src_x, src_y, 0, 0, dst_x, dst_y, w, h);
+    else
+	pixman_image_composite (op, src_img, mask_img, dst_img,
+                            src_x, src_y, mask_x, mask_y, dst_x, dst_y, w, h);
+
+    crc32 = compute_crc32_for_image (0, dst_img);
+    
+    if (verbose)
+	print_image (dst_img);
+
+    pixman_image_unref (src_img);
+    pixman_image_unref (mask_img);
+    pixman_image_unref (dst_img);
+
+    if (src_stride < 0)
+	srcbuf += (src_stride / 4) * (src_height - 1);
+
+    if (mask_stride < 0)
+	maskbuf += (mask_stride / 4) * (mask_height - 1);
+
+    if (dst_stride < 0)
+	dstbuf += (dst_stride / 4) * (dst_height - 1);
+
+    free (srcbuf);
+    free (maskbuf);
+    free (dstbuf);
+
+    FLOAT_REGS_CORRUPTION_DETECTOR_FINISH ();
+    return crc32;
+}
+
+#if BILINEAR_INTERPOLATION_BITS == 7
+#define CHECKSUM 0x92E0F068
+#elif BILINEAR_INTERPOLATION_BITS == 4
+#define CHECKSUM 0x8EFFA1E5
+#else
+#define CHECKSUM 0x00000000
+#endif
+
+int
+main (int argc, const char *argv[])
+{
+    pixman_disable_out_of_bounds_workaround ();
+
+    return fuzzer_test_main("scaling", 8000000, CHECKSUM,
+			    test_composite, argc, argv);
+}
diff --git a/src/pixman/test/stress-test.c b/src/pixman/test/stress-test.c
new file mode 100644
index 0000000..1f03c75
--- /dev/null
+++ b/src/pixman/test/stress-test.c
@@ -0,0 +1,1040 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+#include <sys/types.h>
+
+#if 0
+#define fence_malloc malloc
+#define fence_free free
+#define make_random_bytes malloc
+#endif
+
+static const pixman_format_code_t image_formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_x8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_r3g3b2,
+    PIXMAN_a8,
+    PIXMAN_a8b8g8r8,
+    PIXMAN_x8b8g8r8,
+    PIXMAN_b8g8r8a8,
+    PIXMAN_b8g8r8x8,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_r8g8b8x8,
+    PIXMAN_x14r6g6b6,
+    PIXMAN_r8g8b8,
+    PIXMAN_b8g8r8,
+    PIXMAN_a8r8g8b8_sRGB,
+    PIXMAN_r5g6b5,
+    PIXMAN_b5g6r5,
+    PIXMAN_x2r10g10b10,
+    PIXMAN_a2r10g10b10,
+    PIXMAN_x2b10g10r10,
+    PIXMAN_a2b10g10r10,
+    PIXMAN_a1r5g5b5,
+    PIXMAN_x1r5g5b5,
+    PIXMAN_a1b5g5r5,
+    PIXMAN_x1b5g5r5,
+    PIXMAN_a4r4g4b4,
+    PIXMAN_x4r4g4b4,
+    PIXMAN_a4b4g4r4,
+    PIXMAN_x4b4g4r4,
+    PIXMAN_a8,
+    PIXMAN_r3g3b2,
+    PIXMAN_b2g3r3,
+    PIXMAN_a2r2g2b2,
+    PIXMAN_a2b2g2r2,
+    PIXMAN_c8,
+    PIXMAN_g8,
+    PIXMAN_x4c4,
+    PIXMAN_x4g4,
+    PIXMAN_c4,
+    PIXMAN_g4,
+    PIXMAN_g1,
+    PIXMAN_x4a4,
+    PIXMAN_a4,
+    PIXMAN_r1g2b1,
+    PIXMAN_b1g2r1,
+    PIXMAN_a1r1g1b1,
+    PIXMAN_a1b1g1r1,
+    PIXMAN_a1
+};
+
+static pixman_filter_t filters[] =
+{
+    PIXMAN_FILTER_NEAREST,
+    PIXMAN_FILTER_BILINEAR,
+    PIXMAN_FILTER_FAST,
+    PIXMAN_FILTER_GOOD,
+    PIXMAN_FILTER_BEST,
+    PIXMAN_FILTER_CONVOLUTION
+};
+
+static int
+get_size (void)
+{
+    switch (prng_rand_n (28))
+    {
+    case 0:
+	return 1;
+
+    case 1:
+	return 2;
+
+    default:
+    case 2:
+	return prng_rand_n (100);
+
+    case 4:
+	return prng_rand_n (2000) + 1000;
+
+    case 5:
+	return 65535;
+
+    case 6:
+	return 65536;
+
+    case 7:
+	return prng_rand_n (64000) + 63000;
+    }
+}
+
+static void
+destroy (pixman_image_t *image, void *data)
+{
+    if (image->type == BITS && image->bits.free_me != image->bits.bits)
+    {
+	uint32_t *bits;
+
+	if (image->bits.bits != (void *)0x01)
+	{
+	    bits = image->bits.bits;
+
+	    if (image->bits.rowstride < 0)
+		bits -= (- image->bits.rowstride * (image->bits.height - 1));
+
+	    fence_free (bits);
+	}
+    }
+
+    free (data);
+}
+
+static uint32_t
+real_reader (const void *src, int size)
+{
+    switch (size)
+    {
+    case 1:
+	return *(uint8_t *)src;
+    case 2:
+	return *(uint16_t *)src;
+    case 4:
+	return *(uint32_t *)src;
+    default:
+	assert (0);
+	return 0; /* silence MSVC */
+    }
+}
+
+static void
+real_writer (void *src, uint32_t value, int size)
+{
+    switch (size)
+    {
+    case 1:
+	*(uint8_t *)src = value;
+	break;
+
+    case 2:
+	*(uint16_t *)src = value;
+	break;
+
+    case 4:
+	*(uint32_t *)src = value;
+	break;
+
+    default:
+	assert (0);
+	break;
+    }
+}
+
+static uint32_t
+fake_reader (const void *src, int size)
+{
+    uint32_t r = prng_rand ();
+
+    assert (size == 1 || size == 2 || size == 4);
+
+    return r >> (32 - (size * 8));
+}
+
+static void
+fake_writer (void *src, uint32_t value, int size)
+{
+    assert (size == 1 || size == 2 || size == 4);
+}
+
+static int32_t
+log_rand (void)
+{
+    uint32_t mask;
+
+    mask = (1 << prng_rand_n (10)) - 1;
+
+    return (prng_rand () & mask) - (mask >> 1);
+}
+
+static int32_t
+rand_x (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return prng_rand_n (image->bits.width);
+    else
+	return log_rand ();
+}
+
+static int32_t
+rand_y (pixman_image_t *image)
+{
+    if (image->type == BITS)
+	return prng_rand_n (image->bits.height);
+    else
+	return log_rand ();
+}
+
+typedef enum
+{
+    DONT_CARE,
+    PREFER_ALPHA,
+    REQUIRE_ALPHA
+} alpha_preference_t;
+
+static pixman_format_code_t
+random_format (alpha_preference_t alpha)
+{
+    pixman_format_code_t format;
+    int n = prng_rand_n (ARRAY_LENGTH (image_formats));
+
+    if (alpha >= PREFER_ALPHA &&
+	(alpha == REQUIRE_ALPHA || prng_rand_n (4) != 0))
+    {
+        do
+        {
+            format = image_formats[n++ % ARRAY_LENGTH (image_formats)];
+        } while (PIXMAN_FORMAT_TYPE (format) != PIXMAN_TYPE_A);
+    }
+    else
+    {
+        format = image_formats[n];
+    }
+
+    return format;
+}
+
+static pixman_image_t *
+create_random_bits_image (alpha_preference_t alpha_preference)
+{
+    pixman_format_code_t format;
+    pixman_indexed_t *indexed;
+    pixman_image_t *image;
+    int width, height, stride;
+    uint32_t *bits;
+    pixman_read_memory_func_t read_func = NULL;
+    pixman_write_memory_func_t write_func = NULL;
+    pixman_filter_t filter;
+    pixman_fixed_t *coefficients = NULL;
+    int n_coefficients = 0;
+
+    /* format */
+    format = random_format (alpha_preference);
+
+    indexed = NULL;
+    if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_COLOR)
+    {
+	indexed = malloc (sizeof (pixman_indexed_t));
+
+	initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), TRUE);
+    }
+    else if (PIXMAN_FORMAT_TYPE (format) == PIXMAN_TYPE_GRAY)
+    {
+	indexed = malloc (sizeof (pixman_indexed_t));
+
+	initialize_palette (indexed, PIXMAN_FORMAT_BPP (format), FALSE);
+    }
+    else
+    {
+	indexed = NULL;
+    }
+
+    /* size */
+    width = get_size ();
+    height = get_size ();
+
+    while ((uint64_t)width * height > 200000)
+    {
+	if (prng_rand_n(2) == 0)
+	    height = 200000 / width;
+	else
+	    width = 200000 / height;
+    }
+
+    if (height == 0)
+	height = 1;
+    if (width == 0)
+	width = 1;
+
+    /* bits */
+    switch (prng_rand_n (7))
+    {
+    default:
+    case 0:
+	stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = (uint32_t *)make_random_bytes (height * stride);
+	break;
+
+    case 1:
+	stride = 0;
+	bits = NULL;
+	break;
+
+    case 2: /* Zero-filled */
+	stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0, height * stride);
+	break;
+
+    case 3: /* Filled with 0xFF */
+	stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0xff, height * stride);
+	break;
+
+    case 4: /* bits is a bad pointer, has read/write functions */
+	stride = 232;
+	bits = (void *)0x01;
+	read_func = fake_reader;
+	write_func = fake_writer;
+	break;
+
+    case 5: /* bits is a real pointer, has read/write functions */
+	stride = width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17);
+	stride = (stride + 3) & (~3);
+	bits = fence_malloc (height * stride);
+	if (!bits)
+	    return NULL;
+	memset (bits, 0xff, height * stride);
+	read_func = real_reader;
+	write_func = real_writer;
+	break;
+
+    case 6: /* bits is a real pointer, stride is negative */
+	stride = (width * PIXMAN_FORMAT_BPP (format) + prng_rand_n (17));
+	stride = (stride + 3) & (~3);
+	bits = (uint32_t *)make_random_bytes (height * stride);
+	if (!bits)
+	    return NULL;
+	bits += ((height - 1) * stride) / 4;
+	stride = - stride;
+	break;
+    }
+
+    /* Filter */
+    filter = filters[prng_rand_n (ARRAY_LENGTH (filters))];
+    if (filter == PIXMAN_FILTER_CONVOLUTION)
+    {
+	int width = prng_rand_n (3);
+	int height = prng_rand_n (4);
+
+	n_coefficients = width * height + 2;
+	coefficients = malloc (n_coefficients * sizeof (pixman_fixed_t));
+
+	if (coefficients)
+	{
+	    int i;
+
+	    for (i = 0; i < width * height; ++i)
+		coefficients[i + 2] = prng_rand();
+
+	    coefficients[0] = width << 16;
+	    coefficients[1] = height << 16;
+	}
+	else
+	{
+	    filter = PIXMAN_FILTER_BEST;
+	}
+    }
+
+    /* Finally create the image */
+    image = pixman_image_create_bits (format, width, height, bits, stride);
+    if (!image)
+	return NULL;
+
+    pixman_image_set_indexed (image, indexed);
+    pixman_image_set_destroy_function (image, destroy, indexed);
+    pixman_image_set_accessors (image, read_func, write_func);
+    pixman_image_set_filter (image, filter, coefficients, n_coefficients);
+
+    return image;
+}
+
+static pixman_repeat_t repeats[] =
+{
+    PIXMAN_REPEAT_NONE,
+    PIXMAN_REPEAT_NORMAL,
+    PIXMAN_REPEAT_REFLECT,
+    PIXMAN_REPEAT_PAD
+};
+
+static uint32_t
+absolute (int32_t i)
+{
+    return i < 0? -i : i;
+}
+
+static void
+set_general_properties (pixman_image_t *image, pixman_bool_t allow_alpha_map)
+{
+    pixman_repeat_t repeat;
+
+    /* Set properties that are generic to all images */
+
+    /* Repeat */
+    repeat = repeats[prng_rand_n (ARRAY_LENGTH (repeats))];
+    pixman_image_set_repeat (image, repeat);
+
+    /* Alpha map */
+    if (allow_alpha_map && prng_rand_n (4) == 0)
+    {
+	pixman_image_t *alpha_map;
+	int16_t x, y;
+
+	alpha_map = create_random_bits_image (DONT_CARE);
+
+	if (alpha_map)
+	{
+	    set_general_properties (alpha_map, FALSE);
+
+	    x = rand_x (image) - image->bits.width / 2;
+	    y = rand_y (image) - image->bits.height / 2;
+
+	    pixman_image_set_alpha_map (image, alpha_map, x, y);
+
+	    pixman_image_unref (alpha_map);
+	}
+    }
+
+    /* Component alpha */
+    pixman_image_set_component_alpha (image, prng_rand_n (3) == 0);
+
+    /* Clip region */
+    if (prng_rand_n (8) < 2)
+    {
+	pixman_region32_t region;
+	int i, n_rects;
+
+	pixman_region32_init (&region);
+
+	switch (prng_rand_n (12))
+	{
+	case 0:
+	    n_rects = 0;
+	    break;
+
+	case 1: case 2: case 3:
+	    n_rects = 1;
+	    break;
+
+	case 4: case 5:
+	    n_rects = 2;
+	    break;
+
+	case 6: case 7:
+	    n_rects = 3;
+	    break;
+
+	default:
+	    n_rects = prng_rand_n (100);
+	    break;
+	}
+
+	for (i = 0; i < n_rects; ++i)
+	{
+	    uint32_t width, height;
+	    int x, y;
+
+	    x = log_rand();
+	    y = log_rand();
+	    width = absolute (log_rand ()) + 1;
+	    height = absolute (log_rand ()) + 1;
+
+	    pixman_region32_union_rect (
+		&region, &region, x, y, width, height);
+	}
+
+	if (image->type == BITS && prng_rand_n (8) != 0)
+	{
+	    uint32_t width, height;
+	    int x, y;
+	    int i;
+
+	    /* Also add a couple of clip rectangles inside the image
+	     * so that compositing will actually take place.
+	     */
+	    for (i = 0; i < 5; ++i)
+	    {
+		x = prng_rand_n (2 * image->bits.width) - image->bits.width;
+		y = prng_rand_n (2 * image->bits.height) - image->bits.height;
+		width = prng_rand_n (image->bits.width) - x + 10;
+		height = prng_rand_n (image->bits.height) - y + 10;
+
+		if (width + x < x)
+		    width = INT32_MAX - x;
+		if (height + y < y)
+		    height = INT32_MAX - y;
+
+		pixman_region32_union_rect (
+		    &region, &region, x, y, width, height);
+	    }
+	}
+
+	pixman_image_set_clip_region32 (image, &region);
+
+	pixman_region32_fini (&region);
+    }
+
+    /* Whether source clipping is enabled */
+    pixman_image_set_source_clipping (image, !!prng_rand_n (2));
+
+    /* Client clip */
+    pixman_image_set_has_client_clip (image, !!prng_rand_n (2));
+
+    /* Transform */
+    if (prng_rand_n (5) < 2)
+    {
+	pixman_transform_t xform;
+	int i, j, k;
+	uint32_t tx, ty, sx, sy;
+	uint32_t c, s;
+
+	memset (&xform, 0, sizeof xform);
+	xform.matrix[0][0] = pixman_fixed_1;
+	xform.matrix[1][1] = pixman_fixed_1;
+	xform.matrix[2][2] = pixman_fixed_1;
+
+	for (k = 0; k < 3; ++k)
+	{
+	    switch (prng_rand_n (4))
+	    {
+	    case 0:
+		/* rotation */
+		c = prng_rand_n (2 * 65536) - 65536;
+		s = prng_rand_n (2 * 65536) - 65536;
+		pixman_transform_rotate (&xform, NULL, c, s);
+		break;
+
+	    case 1:
+		/* translation */
+		tx = prng_rand();
+		ty = prng_rand();
+		pixman_transform_translate (&xform, NULL, tx, ty);
+		break;
+
+	    case 2:
+		/* scale */
+		sx = prng_rand();
+		sy = prng_rand();
+		pixman_transform_scale (&xform, NULL, sx, sy);
+		break;
+
+	    case 3:
+		if (prng_rand_n (16) == 0)
+		{
+		    /* random */
+		    for (i = 0; i < 3; ++i)
+			for (j = 0; j < 3; ++j)
+			    xform.matrix[i][j] = prng_rand();
+		    break;
+		}
+		else if (prng_rand_n (16) == 0)
+		{
+		    /* zero */
+		    memset (&xform, 0, sizeof xform);
+		}
+		break;
+	    }
+	}
+
+	pixman_image_set_transform (image, &xform);
+    }
+}
+
+static pixman_color_t
+random_color (void)
+{
+    pixman_color_t color =
+    {
+	prng_rand() & 0xffff,
+	prng_rand() & 0xffff,
+	prng_rand() & 0xffff,
+	prng_rand() & 0xffff,
+    };
+
+    return color;
+}
+
+
+static pixman_image_t *
+create_random_solid_image (void)
+{
+    pixman_color_t color = random_color();
+    pixman_image_t *image = pixman_image_create_solid_fill (&color);
+
+    return image;
+}
+
+static pixman_gradient_stop_t *
+create_random_stops (int *n_stops)
+{
+    pixman_fixed_t step;
+    pixman_fixed_t s;
+    int i;
+    pixman_gradient_stop_t *stops;
+
+    *n_stops = prng_rand_n (50) + 1;
+
+    step = pixman_fixed_1 / *n_stops;
+
+    stops = malloc (*n_stops * sizeof (pixman_gradient_stop_t));
+
+    s = 0;
+    for (i = 0; i < (*n_stops) - 1; ++i)
+    {
+	stops[i].x = s;
+	stops[i].color = random_color();
+
+	s += step;
+    }
+
+    stops[*n_stops - 1].x = pixman_fixed_1;
+    stops[*n_stops - 1].color = random_color();
+
+    return stops;
+}
+
+static pixman_point_fixed_t
+create_random_point (void)
+{
+    pixman_point_fixed_t p;
+
+    p.x = log_rand ();
+    p.y = log_rand ();
+
+    return p;
+}
+
+static pixman_image_t *
+create_random_linear_image (void)
+{
+    int n_stops;
+    pixman_gradient_stop_t *stops;
+    pixman_point_fixed_t p1, p2;
+    pixman_image_t *result;
+
+    stops = create_random_stops (&n_stops);
+    if (!stops)
+	return NULL;
+
+    p1 = create_random_point ();
+    p2 = create_random_point ();
+
+    result = pixman_image_create_linear_gradient (&p1, &p2, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_radial_image (void)
+{
+    int n_stops;
+    pixman_gradient_stop_t *stops;
+    pixman_point_fixed_t inner_c, outer_c;
+    pixman_fixed_t inner_r, outer_r;
+    pixman_image_t *result;
+
+    inner_c = create_random_point();
+    outer_c = create_random_point();
+    inner_r = prng_rand();
+    outer_r = prng_rand();
+
+    stops = create_random_stops (&n_stops);
+
+    if (!stops)
+	return NULL;
+
+    result = pixman_image_create_radial_gradient (
+	&inner_c, &outer_c, inner_r, outer_r, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_conical_image (void)
+{
+    pixman_gradient_stop_t *stops;
+    int n_stops;
+    pixman_point_fixed_t c;
+    pixman_fixed_t angle;
+    pixman_image_t *result;
+
+    c = create_random_point();
+    angle = prng_rand();
+
+    stops = create_random_stops (&n_stops);
+
+    if (!stops)
+	return NULL;
+
+    result = pixman_image_create_conical_gradient (&c, angle, stops, n_stops);
+
+    free (stops);
+
+    return result;
+}
+
+static pixman_image_t *
+create_random_image (void)
+{
+    pixman_image_t *result;
+
+    switch (prng_rand_n (5))
+    {
+    default:
+    case 0:
+	result = create_random_bits_image (DONT_CARE);
+	break;
+
+    case 1:
+	result = create_random_solid_image ();
+	break;
+
+    case 2:
+	result = create_random_linear_image ();
+	break;
+
+    case 3:
+	result = create_random_radial_image ();
+	break;
+
+    case 4:
+	result = create_random_conical_image ();
+	break;
+    }
+
+    if (result)
+	set_general_properties (result, TRUE);
+
+    return result;
+}
+
+static void
+random_line (pixman_line_fixed_t *line, int width, int height)
+{
+    line->p1.x = prng_rand_n (width) << 16;
+    line->p1.y = prng_rand_n (height) << 16;
+    line->p2.x = prng_rand_n (width) << 16;
+    line->p2.y = prng_rand_n (height) << 16;
+}
+
+static pixman_trapezoid_t *
+create_random_trapezoids (int *n_traps, int height, int width)
+{
+    pixman_trapezoid_t *trapezoids;
+    int i;
+
+    *n_traps = prng_rand_n (16) + 1;
+
+    trapezoids = malloc (sizeof (pixman_trapezoid_t) * *n_traps);
+
+    for (i = 0; i < *n_traps; ++i)
+    {
+        pixman_trapezoid_t *t = &(trapezoids[i]);
+
+        t->top = prng_rand_n (height) << 16;
+        t->bottom = prng_rand_n (height) << 16;
+
+        random_line (&t->left, height, width);
+        random_line (&t->right, height, width);
+    }
+
+    return trapezoids;
+}
+
+static const pixman_op_t op_list[] =
+{
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+    PIXMAN_OP_SOFT_LIGHT,
+    PIXMAN_OP_HSL_HUE,
+    PIXMAN_OP_HSL_SATURATION,
+    PIXMAN_OP_HSL_COLOR,
+    PIXMAN_OP_HSL_LUMINOSITY,
+};
+
+static void
+run_test (uint32_t seed, pixman_bool_t verbose, uint32_t mod)
+{
+    pixman_image_t *source, *mask, *dest;
+    pixman_op_t op;
+
+    if (verbose)
+    {
+	if (mod == 0 || (seed % mod) == 0)
+	    printf ("Seed 0x%08x\n", seed);
+    }
+
+    source = mask = dest = NULL;
+
+    prng_srand (seed);
+
+    if (prng_rand_n (8) == 0)
+    {
+        int n_traps;
+        pixman_trapezoid_t *trapezoids;
+	int p = prng_rand_n (3);
+
+	if (p == 0)
+	    dest = create_random_bits_image (DONT_CARE);
+	else
+	    dest = create_random_bits_image (REQUIRE_ALPHA);
+
+	if (!dest)
+	    goto out;
+
+	set_general_properties (dest, TRUE);
+
+	if (!(trapezoids = create_random_trapezoids (
+		  &n_traps, dest->bits.width, dest->bits.height)))
+	{
+	    goto out;
+	}
+
+	switch (p)
+	{
+	case 0:
+	    source = create_random_image ();
+
+	    if (source)
+	    {
+		op = op_list [prng_rand_n (ARRAY_LENGTH (op_list))];
+
+		pixman_composite_trapezoids (
+		    op, source, dest,
+		    random_format (REQUIRE_ALPHA),
+		    rand_x (source), rand_y (source),
+		    rand_x (dest), rand_y (dest),
+		    n_traps, trapezoids);
+	    }
+	    break;
+
+	case 1:
+	    pixman_rasterize_trapezoid (
+		dest, &trapezoids[prng_rand_n (n_traps)],
+		rand_x (dest), rand_y (dest));
+	    break;
+
+	case 2:
+	    pixman_add_trapezoids (
+		dest, rand_x (dest), rand_y (dest), n_traps, trapezoids);
+	    break;
+        }
+
+	free (trapezoids);
+    }
+    else
+    {
+        dest = create_random_bits_image (DONT_CARE);
+        source = create_random_image ();
+        mask = create_random_image ();
+
+        if (source && mask && dest)
+        {
+            set_general_properties (dest, TRUE);
+
+            op = op_list [prng_rand_n (ARRAY_LENGTH (op_list))];
+
+            pixman_image_composite32 (op,
+                                      source, mask, dest,
+                                      rand_x (source), rand_y (source),
+                                      rand_x (mask), rand_y (mask),
+                                      0, 0,
+                                      dest->bits.width,
+                                      dest->bits.height);
+        }
+    }
+
+out:
+    if (source)
+	pixman_image_unref (source);
+    if (mask)
+	pixman_image_unref (mask);
+    if (dest)
+	pixman_image_unref (dest);
+}
+
+static pixman_bool_t
+get_int (char *s, uint32_t *i)
+{
+    char *end;
+    int p;
+
+    p = strtol (s, &end, 0);
+
+    if (end != s && *end == 0)
+    {
+	*i = p;
+	return TRUE;
+    }
+
+    return FALSE;
+}
+
+int
+main (int argc, char **argv)
+{
+    int verbose = FALSE;
+    uint32_t seed = 1;
+    uint32_t n_tests = 8000;
+    uint32_t mod = 0;
+    pixman_bool_t use_threads = TRUE;
+    int32_t i;
+
+    pixman_disable_out_of_bounds_workaround ();
+
+    enable_divbyzero_exceptions();
+
+    if (getenv ("VERBOSE") != NULL)
+	verbose = TRUE;
+
+    for (i = 1; i < argc; ++i)
+    {
+	if (strcmp (argv[i], "-v") == 0)
+	{
+	    verbose = TRUE;
+
+	    if (i + 1 < argc)
+	    {
+		get_int (argv[i + 1], &mod);
+		i++;
+	    }
+	}
+	else if (strcmp (argv[i], "-s") == 0 && i + 1 < argc)
+	{
+	    get_int (argv[i + 1], &seed);
+	    use_threads = FALSE;
+	    i++;
+	}
+	else if (strcmp (argv[i], "-n") == 0 && i + 1 < argc)
+	{
+	    get_int (argv[i + 1], &n_tests);
+	    i++;
+	}
+	else
+	{
+	    if (strcmp (argv[i], "-h") != 0)
+		printf ("Unknown option '%s'\n\n", argv[i]);
+
+	    printf ("Options:\n\n"
+		    "-n <number>        Number of tests to run\n"
+		    "-s <seed> 	        Seed of first test (ignored if PIXMAN_RANDOMIZE_TESTS is set)\n"
+		    "-v                 Print out seeds\n"
+		    "-v <n>             Print out every n'th seed\n\n");
+
+	    exit (-1);
+	}
+    }
+
+    if (getenv ("PIXMAN_RANDOMIZE_TESTS"))
+    {
+	seed = get_random_seed();
+	printf ("First seed: 0x%08x\n", seed);
+    }
+
+    if (use_threads)
+    {
+#ifdef USE_OPENMP
+#   pragma omp parallel for default(none) shared(verbose, n_tests, mod, seed)
+#endif
+	for (i = 0; i < (int32_t)n_tests; ++i)
+	    run_test (seed + i, verbose, mod);
+    }
+    else
+    {
+	for (i = 0; i < (int32_t)n_tests; ++i)
+	    run_test (seed + i, verbose, mod);
+    }
+
+    return 0;
+}
diff --git a/src/pixman/test/thread-test.c b/src/pixman/test/thread-test.c
new file mode 100644
index 0000000..0b07b26
--- /dev/null
+++ b/src/pixman/test/thread-test.c
@@ -0,0 +1,199 @@
+#include "utils.h"
+
+#ifndef HAVE_PTHREADS
+
+int main ()
+{
+    printf ("Skipped thread-test - pthreads not supported\n");
+    return 0;
+}
+
+#else
+
+#include <stdlib.h>
+#include <pthread.h>
+
+typedef struct
+{
+    int       thread_no;
+    uint32_t *dst_buf;
+    prng_t    prng_state;
+} info_t;
+
+static const pixman_op_t operators[] = 
+{
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_CLEAR,
+    PIXMAN_OP_SRC,
+    PIXMAN_OP_DST,
+    PIXMAN_OP_OVER,
+    PIXMAN_OP_OVER_REVERSE,
+    PIXMAN_OP_IN,
+    PIXMAN_OP_IN_REVERSE,
+    PIXMAN_OP_OUT,
+    PIXMAN_OP_OUT_REVERSE,
+    PIXMAN_OP_ATOP,
+    PIXMAN_OP_ATOP_REVERSE,
+    PIXMAN_OP_XOR,
+    PIXMAN_OP_ADD,
+    PIXMAN_OP_SATURATE,
+    PIXMAN_OP_DISJOINT_CLEAR,
+    PIXMAN_OP_DISJOINT_SRC,
+    PIXMAN_OP_DISJOINT_DST,
+    PIXMAN_OP_DISJOINT_OVER,
+    PIXMAN_OP_DISJOINT_OVER_REVERSE,
+    PIXMAN_OP_DISJOINT_IN,
+    PIXMAN_OP_DISJOINT_IN_REVERSE,
+    PIXMAN_OP_DISJOINT_OUT,
+    PIXMAN_OP_DISJOINT_OUT_REVERSE,
+    PIXMAN_OP_DISJOINT_ATOP,
+    PIXMAN_OP_DISJOINT_ATOP_REVERSE,
+    PIXMAN_OP_DISJOINT_XOR,
+    PIXMAN_OP_CONJOINT_CLEAR,
+    PIXMAN_OP_CONJOINT_SRC,
+    PIXMAN_OP_CONJOINT_DST,
+    PIXMAN_OP_CONJOINT_OVER,
+    PIXMAN_OP_CONJOINT_OVER_REVERSE,
+    PIXMAN_OP_CONJOINT_IN,
+    PIXMAN_OP_CONJOINT_IN_REVERSE,
+    PIXMAN_OP_CONJOINT_OUT,
+    PIXMAN_OP_CONJOINT_OUT_REVERSE,
+    PIXMAN_OP_CONJOINT_ATOP,
+    PIXMAN_OP_CONJOINT_ATOP_REVERSE,
+    PIXMAN_OP_CONJOINT_XOR,
+    PIXMAN_OP_MULTIPLY,
+    PIXMAN_OP_SCREEN,
+    PIXMAN_OP_OVERLAY,
+    PIXMAN_OP_DARKEN,
+    PIXMAN_OP_LIGHTEN,
+    PIXMAN_OP_COLOR_DODGE,
+    PIXMAN_OP_COLOR_BURN,
+    PIXMAN_OP_HARD_LIGHT,
+    PIXMAN_OP_DIFFERENCE,
+    PIXMAN_OP_EXCLUSION,
+};
+
+static const pixman_format_code_t formats[] =
+{
+    PIXMAN_a8r8g8b8,
+    PIXMAN_r5g6b5,
+    PIXMAN_a8,
+    PIXMAN_a4,
+    PIXMAN_a1,
+    PIXMAN_b5g6r5,
+    PIXMAN_r8g8b8a8,
+    PIXMAN_a4r4g4b4
+};
+
+#define N_ROUNDS 8192
+
+#define RAND_ELT(arr)							\
+    arr[prng_rand_r(&info->prng_state) % ARRAY_LENGTH (arr)]
+
+#define DEST_WIDTH (7)
+
+static void *
+thread (void *data)
+{
+    info_t *info = data;
+    uint32_t crc32 = 0x0;
+    uint32_t src_buf[64];
+    pixman_image_t *dst_img, *src_img;
+    int i;
+
+    prng_srand_r (&info->prng_state, info->thread_no);
+
+    for (i = 0; i < N_ROUNDS; ++i)
+    {
+	pixman_op_t op;
+	int rand1, rand2;
+
+	prng_randmemset_r (&info->prng_state, info->dst_buf,
+			   DEST_WIDTH * sizeof (uint32_t), 0);
+	prng_randmemset_r (&info->prng_state, src_buf,
+			   sizeof (src_buf), 0);
+
+	src_img = pixman_image_create_bits (
+	    RAND_ELT (formats), 4, 4, src_buf, 16);
+	dst_img = pixman_image_create_bits (
+	    RAND_ELT (formats), DEST_WIDTH, 1, info->dst_buf,
+	    DEST_WIDTH * sizeof (uint32_t));
+
+	image_endian_swap (src_img);
+	image_endian_swap (dst_img);
+	
+	rand2 = prng_rand_r (&info->prng_state) % 4;
+	rand1 = prng_rand_r (&info->prng_state) % 4;
+	op = RAND_ELT (operators);
+
+	pixman_image_composite32 (
+	    op,
+	    src_img, NULL, dst_img,
+	    rand1, rand2, 0, 0, 0, 0, DEST_WIDTH, 1);
+
+	crc32 = compute_crc32_for_image (crc32, dst_img);
+
+	pixman_image_unref (src_img);
+	pixman_image_unref (dst_img);
+    }
+
+    return (void *)(uintptr_t)crc32;
+}
+
+static inline uint32_t
+byteswap32 (uint32_t x)
+{
+    return ((x & ((uint32_t)0xFF << 24)) >> 24) |
+           ((x & ((uint32_t)0xFF << 16)) >>  8) |
+           ((x & ((uint32_t)0xFF <<  8)) <<  8) |
+           ((x & ((uint32_t)0xFF <<  0)) << 24);
+}
+
+int
+main (void)
+{
+    uint32_t dest[16 * DEST_WIDTH];
+    info_t info[16] = { { 0 } };
+    pthread_t threads[16];
+    void *retvals[16];
+    uint32_t crc32s[16], crc32;
+    int i;
+
+    for (i = 0; i < 16; ++i)
+    {
+	info[i].thread_no = i;
+	info[i].dst_buf = &dest[i * DEST_WIDTH];
+    }
+
+    for (i = 0; i < 16; ++i)
+	pthread_create (&threads[i], NULL, thread, &info[i]);
+
+    for (i = 0; i < 16; ++i)
+	pthread_join (threads[i], &retvals[i]);
+
+    for (i = 0; i < 16; ++i)
+    {
+	crc32s[i] = (uintptr_t)retvals[i];
+
+	if (is_little_endian())
+	    crc32s[i] = byteswap32 (crc32s[i]);
+    }
+
+    crc32 = compute_crc32 (0, crc32s, sizeof crc32s);
+
+#define EXPECTED 0xE299B18E
+
+    if (crc32 != EXPECTED)
+    {
+	printf ("thread-test failed. Got checksum 0x%08X, expected 0x%08X\n",
+		crc32, EXPECTED);
+	return 1;
+    }
+
+    return 0;
+}
+
+#endif
+
diff --git a/src/pixman/test/trap-crasher.c b/src/pixman/test/trap-crasher.c
new file mode 100644
index 0000000..77be1c9
--- /dev/null
+++ b/src/pixman/test/trap-crasher.c
@@ -0,0 +1,39 @@
+#include <stdlib.h>
+#include "utils.h"
+
+int
+main()
+{
+    pixman_image_t *dst;
+    pixman_trapezoid_t traps[] = {
+	{
+	    2147483646,
+	    2147483647,
+	    {
+		{ 0, 0 },
+		{ 0, 2147483647 }
+	    },
+	    {
+		{ 65536, 0 },
+		{ 0, 2147483647 }
+	    }
+	},
+	{
+	    32768,
+	    - 2147483647,
+	    {
+		{ 0, 0 },
+		{ 0, 2147483647 }
+	    },
+	    {
+		{ 65536, 0 },
+		{ 0, 2147483647 }
+	    }
+	},
+    };
+
+    dst = pixman_image_create_bits (PIXMAN_a8, 1, 1, NULL, -1);
+
+    pixman_add_trapezoids (dst, 0, 0, ARRAY_LENGTH (traps), traps);
+    return (0);
+}
diff --git a/src/pixman/test/utils-prng.c b/src/pixman/test/utils-prng.c
new file mode 100644
index 0000000..c27b5be
--- /dev/null
+++ b/src/pixman/test/utils-prng.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright © 2012 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Based on the public domain implementation of small noncryptographic PRNG
+ * authored by Bob Jenkins: http://burtleburtle.net/bob/rand/smallprng.html
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "utils.h"
+#include "utils-prng.h"
+
+#if defined(HAVE_GCC_VECTOR_EXTENSIONS) && defined(__SSE2__)
+#include <xmmintrin.h>
+#endif
+
+void smallprng_srand_r (smallprng_t *x, uint32_t seed)
+{
+    uint32_t i;
+    x->a = 0xf1ea5eed, x->b = x->c = x->d = seed;
+    for (i = 0; i < 20; ++i)
+        smallprng_rand_r (x);
+}
+
+/*
+ * Set a 32-bit seed for PRNG
+ *
+ * LCG is used here for generating independent seeds for different
+ * smallprng instances (in the case if smallprng is also used for
+ * generating these seeds, "Big Crush" test from TestU01 detects
+ * some problems in the glued 'prng_rand_128_r' output data).
+ * Actually we might be even better using some cryptographic
+ * hash for this purpose, but LCG seems to be also enough for
+ * passing "Big Crush".
+ */
+void prng_srand_r (prng_t *x, uint32_t seed)
+{
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
+    int i;
+    prng_rand_128_data_t dummy;
+    smallprng_srand_r (&x->p0, seed);
+    x->a[0] = x->a[1] = x->a[2] = x->a[3] = 0xf1ea5eed;
+    x->b[0] = x->c[0] = x->d[0] = (seed = seed * 1103515245 + 12345);
+    x->b[1] = x->c[1] = x->d[1] = (seed = seed * 1103515245 + 12345);
+    x->b[2] = x->c[2] = x->d[2] = (seed = seed * 1103515245 + 12345);
+    x->b[3] = x->c[3] = x->d[3] = (seed = seed * 1103515245 + 12345);
+    for (i = 0; i < 20; ++i)
+        prng_rand_128_r (x, &dummy);
+#else
+    smallprng_srand_r (&x->p0, seed);
+    smallprng_srand_r (&x->p1, (seed = seed * 1103515245 + 12345));
+    smallprng_srand_r (&x->p2, (seed = seed * 1103515245 + 12345));
+    smallprng_srand_r (&x->p3, (seed = seed * 1103515245 + 12345));
+    smallprng_srand_r (&x->p4, (seed = seed * 1103515245 + 12345));
+#endif
+}
+
+static force_inline void
+store_rand_128_data (void *addr, prng_rand_128_data_t *d, int aligned)
+{
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
+    if (aligned)
+    {
+        *(uint8x16 *)addr = d->vb;
+        return;
+    }
+    else
+    {
+#ifdef __SSE2__
+        /* workaround for http://gcc.gnu.org/PR55614 */
+        _mm_storeu_si128 (addr, _mm_loadu_si128 ((__m128i *)d));
+        return;
+#endif
+    }
+#endif
+    /* we could try something better for unaligned writes (packed attribute),
+     * but GCC is not very reliable: http://gcc.gnu.org/PR55454 */
+    memcpy (addr, d, 16);
+}
+
+/*
+ * Helper function and the actual code for "prng_randmemset_r" function
+ */
+static force_inline void
+randmemset_internal (prng_t                  *prng,
+                     uint8_t                 *buf,
+                     size_t                   size,
+                     prng_randmemset_flags_t  flags,
+                     int                      aligned)
+{
+    prng_t local_prng = *prng;
+    prng_rand_128_data_t randdata;
+    size_t i;
+
+    while (size >= 16)
+    {
+        prng_rand_128_data_t t;
+        if (flags == 0)
+        {
+            prng_rand_128_r (&local_prng, &randdata);
+        }
+        else
+        {
+            prng_rand_128_r (&local_prng, &t);
+            prng_rand_128_r (&local_prng, &randdata);
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
+            if (flags & RANDMEMSET_MORE_FF)
+            {
+                const uint8x16 const_C0 =
+                {
+                    0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0,
+                    0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0, 0xC0
+                };
+                randdata.vb |= (t.vb >= const_C0);
+            }
+            if (flags & RANDMEMSET_MORE_00)
+            {
+                const uint8x16 const_40 =
+                {
+                    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
+                    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40
+                };
+                randdata.vb &= (t.vb >= const_40);
+            }
+            if (flags & RANDMEMSET_MORE_FFFFFFFF)
+            {
+                const uint32x4 const_C0000000 =
+                {
+                    0xC0000000, 0xC0000000, 0xC0000000, 0xC0000000
+                };
+                randdata.vw |= ((t.vw << 30) >= const_C0000000);
+            }
+            if (flags & RANDMEMSET_MORE_00000000)
+            {
+                const uint32x4 const_40000000 =
+                {
+                    0x40000000, 0x40000000, 0x40000000, 0x40000000
+                };
+                randdata.vw &= ((t.vw << 30) >= const_40000000);
+            }
+#else
+            #define PROCESS_ONE_LANE(i)                                       \
+                if (flags & RANDMEMSET_MORE_FF)                               \
+                {                                                             \
+                    uint32_t mask_ff = (t.w[i] & (t.w[i] << 1)) & 0x80808080; \
+                    mask_ff |= mask_ff >> 1;                                  \
+                    mask_ff |= mask_ff >> 2;                                  \
+                    mask_ff |= mask_ff >> 4;                                  \
+                    randdata.w[i] |= mask_ff;                                 \
+                }                                                             \
+                if (flags & RANDMEMSET_MORE_00)                               \
+                {                                                             \
+                    uint32_t mask_00 = (t.w[i] | (t.w[i] << 1)) & 0x80808080; \
+                    mask_00 |= mask_00 >> 1;                                  \
+                    mask_00 |= mask_00 >> 2;                                  \
+                    mask_00 |= mask_00 >> 4;                                  \
+                    randdata.w[i] &= mask_00;                                 \
+                }                                                             \
+                if (flags & RANDMEMSET_MORE_FFFFFFFF)                         \
+                {                                                             \
+                    int32_t mask_ff = ((t.w[i] << 30) & (t.w[i] << 31)) &     \
+                                       0x80000000;                            \
+                    randdata.w[i] |= mask_ff >> 31;                           \
+                }                                                             \
+                if (flags & RANDMEMSET_MORE_00000000)                         \
+                {                                                             \
+                    int32_t mask_00 = ((t.w[i] << 30) | (t.w[i] << 31)) &     \
+                                       0x80000000;                            \
+                    randdata.w[i] &= mask_00 >> 31;                           \
+                }
+
+            PROCESS_ONE_LANE (0)
+            PROCESS_ONE_LANE (1)
+            PROCESS_ONE_LANE (2)
+            PROCESS_ONE_LANE (3)
+#endif
+        }
+        if (is_little_endian ())
+        {
+            store_rand_128_data (buf, &randdata, aligned);
+            buf += 16;
+        }
+        else
+        {
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
+            const uint8x16 bswap_shufflemask =
+            {
+                3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+            };
+            randdata.vb = __builtin_shuffle (randdata.vb, bswap_shufflemask);
+            store_rand_128_data (buf, &randdata, aligned);
+            buf += 16;
+#else
+            uint8_t t1, t2, t3, t4;
+            #define STORE_ONE_LANE(i)                                         \
+                t1 = randdata.b[i * 4 + 3];                                   \
+                t2 = randdata.b[i * 4 + 2];                                   \
+                t3 = randdata.b[i * 4 + 1];                                   \
+                t4 = randdata.b[i * 4 + 0];                                   \
+                *buf++ = t1;                                                  \
+                *buf++ = t2;                                                  \
+                *buf++ = t3;                                                  \
+                *buf++ = t4;
+
+            STORE_ONE_LANE (0)
+            STORE_ONE_LANE (1)
+            STORE_ONE_LANE (2)
+            STORE_ONE_LANE (3)
+#endif
+        }
+        size -= 16;
+    }
+    i = 0;
+    while (i < size)
+    {
+        uint8_t randbyte = prng_rand_r (&local_prng) & 0xFF;
+        if (flags != 0)
+        {
+            uint8_t t = prng_rand_r (&local_prng) & 0xFF;
+            if ((flags & RANDMEMSET_MORE_FF) && (t >= 0xC0))
+                randbyte = 0xFF;
+            if ((flags & RANDMEMSET_MORE_00) && (t < 0x40))
+                randbyte = 0x00;
+            if (i % 4 == 0 && i + 4 <= size)
+            {
+                t = prng_rand_r (&local_prng) & 0xFF;
+                if ((flags & RANDMEMSET_MORE_FFFFFFFF) && (t >= 0xC0))
+                {
+                    memset(&buf[i], 0xFF, 4);
+                    i += 4;
+                    continue;
+                }
+                if ((flags & RANDMEMSET_MORE_00000000) && (t < 0x40))
+                {
+                    memset(&buf[i], 0x00, 4);
+                    i += 4;
+                    continue;
+                }
+            }
+        }
+        buf[i] = randbyte;
+        i++;
+    }
+    *prng = local_prng;
+}
+
+/*
+ * Fill memory buffer with random data. Flags argument may be used
+ * to tweak some statistics properties:
+ *    RANDMEMSET_MORE_00        - set ~25% of bytes to 0x00
+ *    RANDMEMSET_MORE_FF        - set ~25% of bytes to 0xFF
+ *    RANDMEMSET_MORE_00000000  - ~25% chance for 00000000 4-byte clusters
+ *    RANDMEMSET_MORE_FFFFFFFF  - ~25% chance for FFFFFFFF 4-byte clusters
+ */
+void prng_randmemset_r (prng_t                  *prng,
+                        void                    *voidbuf,
+                        size_t                   size,
+                        prng_randmemset_flags_t  flags)
+{
+    uint8_t *buf = (uint8_t *)voidbuf;
+    if ((uintptr_t)buf & 15)
+    {
+        /* unaligned buffer */
+        if (flags == 0)
+            randmemset_internal (prng, buf, size, 0, 0);
+        else if (flags == RANDMEMSET_MORE_00_AND_FF)
+            randmemset_internal (prng, buf, size, RANDMEMSET_MORE_00_AND_FF, 0);
+        else
+            randmemset_internal (prng, buf, size, flags, 0);
+    }
+    else
+    {
+        /* aligned buffer */
+        if (flags == 0)
+            randmemset_internal (prng, buf, size, 0, 1);
+        else if (flags == RANDMEMSET_MORE_00_AND_FF)
+            randmemset_internal (prng, buf, size, RANDMEMSET_MORE_00_AND_FF, 1);
+        else
+            randmemset_internal (prng, buf, size, flags, 1);
+    }
+}
diff --git a/src/pixman/test/utils-prng.h b/src/pixman/test/utils-prng.h
new file mode 100644
index 0000000..f9ae8dd
--- /dev/null
+++ b/src/pixman/test/utils-prng.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright © 2012 Siarhei Siamashka <siarhei.siamashka@gmail.com>
+ *
+ * Based on the public domain implementation of small noncryptographic PRNG
+ * authored by Bob Jenkins: http://burtleburtle.net/bob/rand/smallprng.html
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __UTILS_PRNG_H__
+#define __UTILS_PRNG_H__
+
+/*
+ * This file provides a fast SIMD-optimized noncryptographic PRNG (pseudorandom
+ * number generator), with the output good enough to pass "Big Crush" tests
+ * from TestU01 (http://en.wikipedia.org/wiki/TestU01).
+ *
+ * SIMD code uses http://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html
+ * which is a GCC specific extension. There is also a slower alternative
+ * code path, which should work with any C compiler.
+ *
+ * The "prng_t" structure keeps the internal state of the random number
+ * generator. It is possible to have multiple instances of the random number
+ * generator active at the same time, in this case each of them needs to have
+ * its own "prng_t". All the functions take a pointer to "prng_t"
+ * as the first argument.
+ *
+ * Functions:
+ *
+ * ----------------------------------------------------------------------------
+ * void prng_srand_r (prng_t *prng, uint32_t seed);
+ *
+ * Initialize the pseudorandom number generator. The sequence of preudorandom
+ * numbers is deterministic and only depends on "seed". Any two generators
+ * initialized with the same seed will produce exactly the same sequence.
+ *
+ * ----------------------------------------------------------------------------
+ * uint32_t prng_rand_r (prng_t *prng);
+ *
+ * Generate a single uniformly distributed 32-bit pseudorandom value.
+ *
+ * ----------------------------------------------------------------------------
+ * void prng_randmemset_r (prng_t                  *prng,
+ *                         void                    *buffer,
+ *                         size_t                   size,
+ *                         prng_randmemset_flags_t  flags);
+ *
+ * Fills the memory buffer "buffer" with "size" bytes of pseudorandom data.
+ * The "flags" argument may be used to tweak some statistics properties:
+ *    RANDMEMSET_MORE_00 - set ~25% of bytes to 0x00
+ *    RANDMEMSET_MORE_FF - set ~25% of bytes to 0xFF
+ * The flags can be combined. This allows a bit better simulation of typical
+ * pixel data, which normally contains a lot of fully transparent or fully
+ * opaque pixels.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "pixman-private.h"
+
+/*****************************************************************************/
+
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
+typedef uint32_t uint32x4 __attribute__ ((vector_size(16)));
+typedef uint8_t  uint8x16 __attribute__ ((vector_size(16)));
+#endif
+
+typedef struct
+{
+    uint32_t a, b, c, d;
+} smallprng_t;
+
+typedef struct
+{
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
+    uint32x4 a, b, c, d;
+#else
+    smallprng_t p1, p2, p3, p4;
+#endif
+    smallprng_t p0;
+} prng_t;
+
+typedef union
+{
+    uint8_t  b[16];
+    uint32_t w[4];
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
+    uint8x16 vb;
+    uint32x4 vw;
+#endif
+} prng_rand_128_data_t;
+
+/*****************************************************************************/
+
+static force_inline uint32_t
+smallprng_rand_r (smallprng_t *x)
+{
+    uint32_t e = x->a - ((x->b << 27) + (x->b >> (32 - 27)));
+    x->a = x->b ^ ((x->c << 17) ^ (x->c >> (32 - 17)));
+    x->b = x->c + x->d;
+    x->c = x->d + e;
+    x->d = e + x->a;
+    return x->d;
+}
+
+/* Generate 4 bytes (32-bits) of random data */
+static force_inline uint32_t
+prng_rand_r (prng_t *x)
+{
+    return smallprng_rand_r (&x->p0);
+}
+
+/* Generate 16 bytes (128-bits) of random data */
+static force_inline void
+prng_rand_128_r (prng_t *x, prng_rand_128_data_t *data)
+{
+#ifdef HAVE_GCC_VECTOR_EXTENSIONS
+    uint32x4 e = x->a - ((x->b << 27) + (x->b >> (32 - 27)));
+    x->a = x->b ^ ((x->c << 17) ^ (x->c >> (32 - 17)));
+    x->b = x->c + x->d;
+    x->c = x->d + e;
+    x->d = e + x->a;
+    data->vw = x->d;
+#else
+    data->w[0] = smallprng_rand_r (&x->p1);
+    data->w[1] = smallprng_rand_r (&x->p2);
+    data->w[2] = smallprng_rand_r (&x->p3);
+    data->w[3] = smallprng_rand_r (&x->p4);
+#endif
+}
+
+typedef enum
+{
+    RANDMEMSET_MORE_00        = 1, /* ~25% chance for 0x00 bytes */
+    RANDMEMSET_MORE_FF        = 2, /* ~25% chance for 0xFF bytes */
+    RANDMEMSET_MORE_00000000  = 4, /* ~25% chance for 0x00000000 clusters */
+    RANDMEMSET_MORE_FFFFFFFF  = 8, /* ~25% chance for 0xFFFFFFFF clusters */
+    RANDMEMSET_MORE_00_AND_FF = (RANDMEMSET_MORE_00 | RANDMEMSET_MORE_00000000 |
+                                 RANDMEMSET_MORE_FF | RANDMEMSET_MORE_FFFFFFFF)
+} prng_randmemset_flags_t;
+
+/* Set the 32-bit seed for PRNG */
+void prng_srand_r (prng_t *prng, uint32_t seed);
+
+/* Fill memory buffer with random data */
+void prng_randmemset_r (prng_t                  *prng,
+                        void                    *buffer,
+                        size_t                   size,
+                        prng_randmemset_flags_t  flags);
+
+#endif
diff --git a/src/pixman/test/utils.c b/src/pixman/test/utils.c
new file mode 100644
index 0000000..ebe0ccc
--- /dev/null
+++ b/src/pixman/test/utils.c
@@ -0,0 +1,1618 @@
+#define _GNU_SOURCE
+
+#include "utils.h"
+#include <math.h>
+#include <signal.h>
+#include <stdlib.h>
+
+#ifdef HAVE_GETTIMEOFDAY
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_FENV_H
+#include <fenv.h>
+#endif
+
+#ifdef HAVE_LIBPNG
+#include <png.h>
+#endif
+
+/* Random number generator state
+ */
+
+prng_t prng_state_data;
+prng_t *prng_state;
+
+/*----------------------------------------------------------------------------*\
+ *  CRC-32 version 2.0.0 by Craig Bruce, 2006-04-29.
+ *
+ *  This program generates the CRC-32 values for the files named in the
+ *  command-line arguments.  These are the same CRC-32 values used by GZIP,
+ *  PKZIP, and ZMODEM.  The Crc32_ComputeBuf () can also be detached and
+ *  used independently.
+ *
+ *  THIS PROGRAM IS PUBLIC-DOMAIN SOFTWARE.
+ *
+ *  Based on the byte-oriented implementation "File Verification Using CRC"
+ *  by Mark R. Nelson in Dr. Dobb's Journal, May 1992, pp. 64-67.
+ *
+ *  v1.0.0: original release.
+ *  v1.0.1: fixed printf formats.
+ *  v1.0.2: fixed something else.
+ *  v1.0.3: replaced CRC constant table by generator function.
+ *  v1.0.4: reformatted code, made ANSI C.  1994-12-05.
+ *  v2.0.0: rewrote to use memory buffer & static table, 2006-04-29.
+\*----------------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------------*\
+ *  NAME:
+ *     Crc32_ComputeBuf () - computes the CRC-32 value of a memory buffer
+ *  DESCRIPTION:
+ *     Computes or accumulates the CRC-32 value for a memory buffer.
+ *     The 'inCrc32' gives a previously accumulated CRC-32 value to allow
+ *     a CRC to be generated for multiple sequential buffer-fuls of data.
+ *     The 'inCrc32' for the first buffer must be zero.
+ *  ARGUMENTS:
+ *     inCrc32 - accumulated CRC-32 value, must be 0 on first call
+ *     buf     - buffer to compute CRC-32 value for
+ *     bufLen  - number of bytes in buffer
+ *  RETURNS:
+ *     crc32 - computed CRC-32 value
+ *  ERRORS:
+ *     (no errors are possible)
+\*----------------------------------------------------------------------------*/
+
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+	       const void *buf,
+	       size_t      buf_len)
+{
+    static const uint32_t crc_table[256] = {
+	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F,
+	0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+	0x09B64C2B, 0x7EB17CBD,	0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2,
+	0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,	0x14015C4F, 0x63066CD9,
+	0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C,
+	0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423,
+	0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106,
+	0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D,
+	0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950,
+	0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7,
+	0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA,
+	0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81,
+	0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84,
+	0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB,
+	0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E,
+	0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55,
+	0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28,
+	0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F,
+	0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242,
+	0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69,
+	0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC,
+	0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693,
+	0x54DE5729, 0x23D967BF, 0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+    };
+
+    uint32_t              crc32;
+    unsigned char *       byte_buf;
+    size_t                i;
+
+    /* accumulate crc32 for buffer */
+    crc32 = in_crc32 ^ 0xFFFFFFFF;
+    byte_buf = (unsigned char*) buf;
+
+    for (i = 0; i < buf_len; i++)
+	crc32 = (crc32 >> 8) ^ crc_table[(crc32 ^ byte_buf[i]) & 0xFF];
+
+    return (crc32 ^ 0xFFFFFFFF);
+}
+
+static uint32_t
+compute_crc32_for_image_internal (uint32_t        crc32,
+				  pixman_image_t *img,
+				  pixman_bool_t	  remove_alpha,
+				  pixman_bool_t	  remove_rgb)
+{
+    pixman_format_code_t fmt = pixman_image_get_format (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int stride = pixman_image_get_stride (img);
+    int height = pixman_image_get_height (img);
+    uint32_t mask = 0xffffffff;
+    int i;
+
+    if (stride < 0)
+    {
+	data += (stride / 4) * (height - 1);
+	stride = - stride;
+    }
+
+    /* mask unused 'x' part */
+    if (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt) &&
+	PIXMAN_FORMAT_DEPTH (fmt) != 0)
+    {
+	uint32_t m = (1 << PIXMAN_FORMAT_DEPTH (fmt)) - 1;
+
+	if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA ||
+	    PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_RGBA)
+	{
+	    m <<= (PIXMAN_FORMAT_BPP (fmt) - PIXMAN_FORMAT_DEPTH (fmt));
+	}
+
+	mask &= m;
+    }
+
+    /* mask alpha channel */
+    if (remove_alpha && PIXMAN_FORMAT_A (fmt))
+    {
+	uint32_t m;
+
+	if (PIXMAN_FORMAT_BPP (fmt) == 32)
+	    m = 0xffffffff;
+	else
+	    m = (1 << PIXMAN_FORMAT_BPP (fmt)) - 1;
+
+	m >>= PIXMAN_FORMAT_A (fmt);
+
+	if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA ||
+	    PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_RGBA ||
+	    PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_A)
+	{
+	    /* Alpha is at the bottom of the pixel */
+	    m <<= PIXMAN_FORMAT_A (fmt);
+	}
+
+	mask &= m;
+    }
+
+    /* mask rgb channels */
+    if (remove_rgb && PIXMAN_FORMAT_RGB (fmt))
+    {
+	uint32_t m = ((uint32_t)~0) >> (32 - PIXMAN_FORMAT_BPP (fmt));
+	uint32_t size = PIXMAN_FORMAT_R (fmt) + PIXMAN_FORMAT_G (fmt) + PIXMAN_FORMAT_B (fmt);
+
+	m &= ~((1 << size) - 1);
+
+	if (PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_BGRA ||
+	    PIXMAN_FORMAT_TYPE (fmt) == PIXMAN_TYPE_RGBA)
+	{
+	    /* RGB channels are at the top of the pixel */
+	    m >>= size;
+	}
+
+	mask &= m;
+    }
+
+    for (i = 0; i * PIXMAN_FORMAT_BPP (fmt) < 32; i++)
+	mask |= mask << (i * PIXMAN_FORMAT_BPP (fmt));
+
+    for (i = 0; i < stride * height / 4; i++)
+	data[i] &= mask;
+
+    /* swap endiannes in order to provide identical results on both big
+     * and litte endian systems
+     */
+    image_endian_swap (img);
+
+    return compute_crc32 (crc32, data, stride * height);
+}
+
+uint32_t
+compute_crc32_for_image (uint32_t        crc32,
+			 pixman_image_t *img)
+{
+    if (img->common.alpha_map)
+    {
+	crc32 = compute_crc32_for_image_internal (crc32, img, TRUE, FALSE);
+	crc32 = compute_crc32_for_image_internal (
+	    crc32, (pixman_image_t *)img->common.alpha_map, FALSE, TRUE);
+    }
+    else
+    {
+	crc32 = compute_crc32_for_image_internal (crc32, img, FALSE, FALSE);
+    }
+
+    return crc32;
+}
+
+void
+print_image (pixman_image_t *image)
+{
+    int i, j;
+    int width, height, stride;
+    pixman_format_code_t format;
+    uint8_t *buffer;
+    int s;
+
+    width = pixman_image_get_width (image);
+    height = pixman_image_get_height (image);
+    stride = pixman_image_get_stride (image);
+    format = pixman_image_get_format (image);
+    buffer = (uint8_t *)pixman_image_get_data (image);
+
+    s = (stride >= 0)? stride : - stride;
+    
+    printf ("---\n");
+    for (i = 0; i < height; i++)
+    {
+	for (j = 0; j < s; j++)
+	{
+	    if (j == (width * PIXMAN_FORMAT_BPP (format) + 7) / 8)
+		printf ("| ");
+
+	    printf ("%02X ", *((uint8_t *)buffer + i * stride + j));
+	}
+	printf ("\n");
+    }
+    printf ("---\n");
+}
+
+/* perform endian conversion of pixel data
+ */
+void
+image_endian_swap (pixman_image_t *img)
+{
+    int stride = pixman_image_get_stride (img);
+    uint32_t *data = pixman_image_get_data (img);
+    int height = pixman_image_get_height (img);
+    int bpp = PIXMAN_FORMAT_BPP (pixman_image_get_format (img));
+    int i, j;
+
+    /* swap bytes only on big endian systems */
+    if (is_little_endian())
+	return;
+
+    if (bpp == 8)
+	return;
+
+    for (i = 0; i < height; i++)
+    {
+	uint8_t *line_data = (uint8_t *)data + stride * i;
+	int s = (stride >= 0)? stride : - stride;
+    	
+	switch (bpp)
+	{
+	case 1:
+	    for (j = 0; j < s; j++)
+	    {
+		line_data[j] =
+		    ((line_data[j] & 0x80) >> 7) |
+		    ((line_data[j] & 0x40) >> 5) |
+		    ((line_data[j] & 0x20) >> 3) |
+		    ((line_data[j] & 0x10) >> 1) |
+		    ((line_data[j] & 0x08) << 1) |
+		    ((line_data[j] & 0x04) << 3) |
+		    ((line_data[j] & 0x02) << 5) |
+		    ((line_data[j] & 0x01) << 7);
+	    }
+	    break;
+	case 4:
+	    for (j = 0; j < s; j++)
+	    {
+		line_data[j] = (line_data[j] >> 4) | (line_data[j] << 4);
+	    }
+	    break;
+	case 16:
+	    for (j = 0; j + 2 <= s; j += 2)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+
+		line_data[j + 1] = t1;
+		line_data[j + 0] = t2;
+	    }
+	    break;
+	case 24:
+	    for (j = 0; j + 3 <= s; j += 3)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+		char t3 = line_data[j + 2];
+
+		line_data[j + 2] = t1;
+		line_data[j + 1] = t2;
+		line_data[j + 0] = t3;
+	    }
+	    break;
+	case 32:
+	    for (j = 0; j + 4 <= s; j += 4)
+	    {
+		char t1 = line_data[j + 0];
+		char t2 = line_data[j + 1];
+		char t3 = line_data[j + 2];
+		char t4 = line_data[j + 3];
+
+		line_data[j + 3] = t1;
+		line_data[j + 2] = t2;
+		line_data[j + 1] = t3;
+		line_data[j + 0] = t4;
+	    }
+	    break;
+	default:
+	    assert (FALSE);
+	    break;
+	}
+    }
+}
+
+#define N_LEADING_PROTECTED	10
+#define N_TRAILING_PROTECTED	10
+
+typedef struct
+{
+    void *addr;
+    uint32_t len;
+    uint8_t *trailing;
+    int n_bytes;
+} info_t;
+
+#if defined(HAVE_MPROTECT) && defined(HAVE_GETPAGESIZE) && defined(HAVE_SYS_MMAN_H) && defined(HAVE_MMAP)
+
+/* This is apparently necessary on at least OS X */
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+void *
+fence_malloc (int64_t len)
+{
+    unsigned long page_size = getpagesize();
+    unsigned long page_mask = page_size - 1;
+    uint32_t n_payload_bytes = (len + page_mask) & ~page_mask;
+    uint32_t n_bytes =
+	(page_size * (N_LEADING_PROTECTED + N_TRAILING_PROTECTED + 2) +
+	 n_payload_bytes) & ~page_mask;
+    uint8_t *initial_page;
+    uint8_t *leading_protected;
+    uint8_t *trailing_protected;
+    uint8_t *payload;
+    uint8_t *addr;
+
+    if (len < 0)
+	abort();
+    
+    addr = mmap (NULL, n_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
+		 -1, 0);
+
+    if (addr == MAP_FAILED)
+    {
+	printf ("mmap failed on %lld %u\n", (long long int)len, n_bytes);
+	return NULL;
+    }
+
+    initial_page = (uint8_t *)(((uintptr_t)addr + page_mask) & ~page_mask);
+    leading_protected = initial_page + page_size;
+    payload = leading_protected + N_LEADING_PROTECTED * page_size;
+    trailing_protected = payload + n_payload_bytes;
+
+    ((info_t *)initial_page)->addr = addr;
+    ((info_t *)initial_page)->len = len;
+    ((info_t *)initial_page)->trailing = trailing_protected;
+    ((info_t *)initial_page)->n_bytes = n_bytes;
+
+    if ((mprotect (leading_protected, N_LEADING_PROTECTED * page_size,
+		  PROT_NONE) == -1) ||
+	(mprotect (trailing_protected, N_TRAILING_PROTECTED * page_size,
+		  PROT_NONE) == -1))
+    {
+	munmap (addr, n_bytes);
+	return NULL;
+    }
+
+    return payload;
+}
+
+void
+fence_free (void *data)
+{
+    uint32_t page_size = getpagesize();
+    uint8_t *payload = data;
+    uint8_t *leading_protected = payload - N_LEADING_PROTECTED * page_size;
+    uint8_t *initial_page = leading_protected - page_size;
+    info_t *info = (info_t *)initial_page;
+
+    munmap (info->addr, info->n_bytes);
+}
+
+#else
+
+void *
+fence_malloc (int64_t len)
+{
+    return malloc (len);
+}
+
+void
+fence_free (void *data)
+{
+    free (data);
+}
+
+#endif
+
+uint8_t *
+make_random_bytes (int n_bytes)
+{
+    uint8_t *bytes = fence_malloc (n_bytes);
+
+    if (!bytes)
+	return NULL;
+
+    prng_randmemset (bytes, n_bytes, 0);
+
+    return bytes;
+}
+
+void
+a8r8g8b8_to_rgba_np (uint32_t *dst, uint32_t *src, int n_pixels)
+{
+    uint8_t *dst8 = (uint8_t *)dst;
+    int i;
+
+    for (i = 0; i < n_pixels; ++i)
+    {
+	uint32_t p = src[i];
+	uint8_t a, r, g, b;
+
+	a = (p & 0xff000000) >> 24;
+	r = (p & 0x00ff0000) >> 16;
+	g = (p & 0x0000ff00) >> 8;
+	b = (p & 0x000000ff) >> 0;
+
+	if (a != 0)
+	{
+#define DIVIDE(c, a)							\
+	    do								\
+	    {								\
+		int t = ((c) * 255) / a;				\
+		(c) = t < 0? 0 : t > 255? 255 : t;			\
+	    } while (0)
+
+	    DIVIDE (r, a);
+	    DIVIDE (g, a);
+	    DIVIDE (b, a);
+	}
+
+	*dst8++ = r;
+	*dst8++ = g;
+	*dst8++ = b;
+	*dst8++ = a;
+    }
+}
+
+#ifdef HAVE_LIBPNG
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename)
+{
+    int width = pixman_image_get_width (image);
+    int height = pixman_image_get_height (image);
+    int stride = width * 4;
+    uint32_t *data = malloc (height * stride);
+    pixman_image_t *copy;
+    png_struct *write_struct;
+    png_info *info_struct;
+    pixman_bool_t result = FALSE;
+    FILE *f = fopen (filename, "wb");
+    png_bytep *row_pointers;
+    int i;
+
+    if (!f)
+	return FALSE;
+
+    row_pointers = malloc (height * sizeof (png_bytep));
+
+    copy = pixman_image_create_bits (
+	PIXMAN_a8r8g8b8, width, height, data, stride);
+
+    pixman_image_composite32 (
+	PIXMAN_OP_SRC, image, NULL, copy, 0, 0, 0, 0, 0, 0, width, height);
+
+    a8r8g8b8_to_rgba_np (data, data, height * width);
+
+    for (i = 0; i < height; ++i)
+	row_pointers[i] = (png_bytep)(data + i * width);
+
+    if (!(write_struct = png_create_write_struct (
+	      PNG_LIBPNG_VER_STRING, NULL, NULL, NULL)))
+	goto out1;
+
+    if (!(info_struct = png_create_info_struct (write_struct)))
+	goto out2;
+
+    png_init_io (write_struct, f);
+
+    png_set_IHDR (write_struct, info_struct, width, height,
+		  8, PNG_COLOR_TYPE_RGB_ALPHA,
+		  PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
+		  PNG_FILTER_TYPE_BASE);
+
+    png_write_info (write_struct, info_struct);
+
+    png_write_image (write_struct, row_pointers);
+
+    png_write_end (write_struct, NULL);
+
+    result = TRUE;
+
+out2:
+    png_destroy_write_struct (&write_struct, &info_struct);
+
+out1:
+    if (fclose (f) != 0)
+	result = FALSE;
+
+    pixman_image_unref (copy);
+    free (row_pointers);
+    free (data);
+    return result;
+}
+
+#else /* no libpng */
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename)
+{
+    return FALSE;
+}
+
+#endif
+
+static void
+color8_to_color16 (uint32_t color8, pixman_color_t *color16)
+{
+    color16->alpha = ((color8 & 0xff000000) >> 24);
+    color16->red =   ((color8 & 0x00ff0000) >> 16);
+    color16->green = ((color8 & 0x0000ff00) >> 8);
+    color16->blue =  ((color8 & 0x000000ff) >> 0);
+
+    color16->alpha |= color16->alpha << 8;
+    color16->red   |= color16->red << 8;
+    color16->blue  |= color16->blue << 8;
+    color16->green |= color16->green << 8;
+}
+
+void
+draw_checkerboard (pixman_image_t *image,
+		   int check_size,
+		   uint32_t color1, uint32_t color2)
+{
+    pixman_color_t check1, check2;
+    pixman_image_t *c1, *c2;
+    int n_checks_x, n_checks_y;
+    int i, j;
+
+    color8_to_color16 (color1, &check1);
+    color8_to_color16 (color2, &check2);
+    
+    c1 = pixman_image_create_solid_fill (&check1);
+    c2 = pixman_image_create_solid_fill (&check2);
+
+    n_checks_x = (
+	pixman_image_get_width (image) + check_size - 1) / check_size;
+    n_checks_y = (
+	pixman_image_get_height (image) + check_size - 1) / check_size;
+
+    for (j = 0; j < n_checks_y; j++)
+    {
+	for (i = 0; i < n_checks_x; i++)
+	{
+	    pixman_image_t *src;
+
+	    if (((i ^ j) & 1))
+		src = c1;
+	    else
+		src = c2;
+
+	    pixman_image_composite32 (PIXMAN_OP_SRC, src, NULL, image,
+				      0, 0, 0, 0,
+				      i * check_size, j * check_size,
+				      check_size, check_size);
+	}
+    }
+}
+
+static uint32_t
+call_test_function (uint32_t    (*test_function)(int testnum, int verbose),
+		    int		testnum,
+		    int		verbose)
+{
+    uint32_t retval;
+
+#if defined (__GNUC__) && defined (_WIN32) && (defined (__i386) || defined (__i386__))
+    __asm__ (
+	/* Deliberately avoid aligning the stack to 16 bytes */
+	"pushl	%1\n\t"
+	"pushl	%2\n\t"
+	"call	*%3\n\t"
+	"addl	$8, %%esp\n\t"
+	: "=a" (retval)
+	: "r" (verbose),
+	  "r" (testnum),
+	  "r" (test_function)
+	: "edx", "ecx"); /* caller save registers */
+#else
+    retval = test_function (testnum, verbose);
+#endif
+
+    return retval;
+}
+
+/*
+ * A function, which can be used as a core part of the test programs,
+ * intended to detect various problems with the help of fuzzing input
+ * to pixman API (according to some templates, aka "smart" fuzzing).
+ * Some general information about such testing can be found here:
+ * http://en.wikipedia.org/wiki/Fuzz_testing
+ *
+ * It may help detecting:
+ *  - crashes on bad handling of valid or reasonably invalid input to
+ *    pixman API.
+ *  - deviations from the behavior of older pixman releases.
+ *  - deviations from the behavior of the same pixman release, but
+ *    configured in a different way (for example with SIMD optimizations
+ *    disabled), or running on a different OS or hardware.
+ *
+ * The test is performed by calling a callback function a huge number
+ * of times. The callback function is expected to run some snippet of
+ * pixman code with pseudorandom variations to the data feeded to
+ * pixman API. A result of running each callback function should be
+ * some deterministic value which depends on test number (test number
+ * can be used as a seed for PRNG). When 'verbose' argument is nonzero,
+ * callback function is expected to print to stdout some information
+ * about what it does.
+ *
+ * Return values from many small tests are accumulated together and
+ * used as final checksum, which can be compared to some expected
+ * value. Running the tests not individually, but in a batch helps
+ * to reduce process start overhead and also allows to parallelize
+ * testing and utilize multiple CPU cores.
+ *
+ * The resulting executable can be run without any arguments. In
+ * this case it runs a batch of tests starting from 1 and up to
+ * 'default_number_of_iterations'. The resulting checksum is
+ * compared with 'expected_checksum' and FAIL or PASS verdict
+ * depends on the result of this comparison.
+ *
+ * If the executable is run with 2 numbers provided as command line
+ * arguments, they specify the starting and ending numbers for a test
+ * batch.
+ *
+ * If the executable is run with only one number provided as a command
+ * line argument, then this number is used to call the callback function
+ * once, and also with verbose flag set.
+ */
+int
+fuzzer_test_main (const char *test_name,
+		  int         default_number_of_iterations,
+		  uint32_t    expected_checksum,
+		  uint32_t    (*test_function)(int testnum, int verbose),
+		  int         argc,
+		  const char *argv[])
+{
+    int i, n1 = 1, n2 = 0;
+    uint32_t checksum = 0;
+    int verbose = getenv ("VERBOSE") != NULL;
+
+    if (argc >= 3)
+    {
+	n1 = atoi (argv[1]);
+	n2 = atoi (argv[2]);
+	if (n2 < n1)
+	{
+	    printf ("invalid test range\n");
+	    return 1;
+	}
+    }
+    else if (argc >= 2)
+    {
+	n2 = atoi (argv[1]);
+
+	checksum = call_test_function (test_function, n2, 1);
+
+	printf ("%d: checksum=%08X\n", n2, checksum);
+	return 0;
+    }
+    else
+    {
+	n1 = 1;
+	n2 = default_number_of_iterations;
+    }
+
+#ifdef USE_OPENMP
+    #pragma omp parallel for reduction(+:checksum) default(none) \
+					shared(n1, n2, test_function, verbose)
+#endif
+    for (i = n1; i <= n2; i++)
+    {
+	uint32_t crc = call_test_function (test_function, i, 0);
+	if (verbose)
+	    printf ("%d: %08X\n", i, crc);
+	checksum += crc;
+    }
+
+    if (n1 == 1 && n2 == default_number_of_iterations)
+    {
+	if (checksum == expected_checksum)
+	{
+	    printf ("%s test passed (checksum=%08X)\n",
+		    test_name, checksum);
+	}
+	else
+	{
+	    printf ("%s test failed! (checksum=%08X, expected %08X)\n",
+		    test_name, checksum, expected_checksum);
+	    return 1;
+	}
+    }
+    else
+    {
+	printf ("%d-%d: checksum=%08X\n", n1, n2, checksum);
+    }
+
+    return 0;
+}
+
+/* Try to obtain current time in seconds */
+double
+gettime (void)
+{
+#ifdef HAVE_GETTIMEOFDAY
+    struct timeval tv;
+
+    gettimeofday (&tv, NULL);
+    return (double)((int64_t)tv.tv_sec * 1000000 + tv.tv_usec) / 1000000.;
+#else
+    return (double)clock() / (double)CLOCKS_PER_SEC;
+#endif
+}
+
+uint32_t
+get_random_seed (void)
+{
+    union { double d; uint32_t u32; } t;
+    t.d = gettime();
+    prng_srand (t.u32);
+
+    return prng_rand ();
+}
+
+#ifdef HAVE_SIGACTION
+#ifdef HAVE_ALARM
+static const char *global_msg;
+
+static void
+on_alarm (int signo)
+{
+    printf ("%s\n", global_msg);
+    exit (1);
+}
+#endif
+#endif
+
+void
+fail_after (int seconds, const char *msg)
+{
+#ifdef HAVE_SIGACTION
+#ifdef HAVE_ALARM
+    struct sigaction action;
+
+    global_msg = msg;
+
+    memset (&action, 0, sizeof (action));
+    action.sa_handler = on_alarm;
+
+    alarm (seconds);
+
+    sigaction (SIGALRM, &action, NULL);
+#endif
+#endif
+}
+
+void
+enable_divbyzero_exceptions (void)
+{
+#ifdef HAVE_FENV_H
+#ifdef HAVE_FEENABLEEXCEPT
+    feenableexcept (FE_DIVBYZERO);
+#endif
+#endif
+}
+
+void *
+aligned_malloc (size_t align, size_t size)
+{
+    void *result;
+
+#ifdef HAVE_POSIX_MEMALIGN
+    if (posix_memalign (&result, align, size) != 0)
+      result = NULL;
+#else
+    result = malloc (size);
+#endif
+
+    return result;
+}
+
+#define CONVERT_15(c, is_rgb)						\
+    (is_rgb?								\
+     ((((c) >> 3) & 0x001f) |						\
+      (((c) >> 6) & 0x03e0) |						\
+      (((c) >> 9) & 0x7c00)) :						\
+     (((((c) >> 16) & 0xff) * 153 +					\
+       (((c) >>  8) & 0xff) * 301 +					\
+       (((c)      ) & 0xff) * 58) >> 2))
+
+double
+convert_srgb_to_linear (double c)
+{
+    if (c <= 0.04045)
+        return c / 12.92;
+    else
+        return pow ((c + 0.055) / 1.055, 2.4);
+}
+
+double
+convert_linear_to_srgb (double c)
+{
+    if (c <= 0.0031308)
+        return c * 12.92;
+    else
+        return 1.055 * pow (c, 1.0/2.4) - 0.055;
+}
+
+void
+initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb)
+{
+    int i;
+    uint32_t mask = (1 << depth) - 1;
+
+    for (i = 0; i < 32768; ++i)
+	palette->ent[i] = prng_rand() & mask;
+
+    memset (palette->rgba, 0, sizeof (palette->rgba));
+
+    for (i = 0; i < mask + 1; ++i)
+    {
+	uint32_t rgba24;
+ 	pixman_bool_t retry;
+	uint32_t i15;
+
+	/* We filled the rgb->index map with random numbers, but we
+	 * do need the ability to round trip, that is if some indexed
+	 * color expands to an argb24, then the 15 bit version of that
+	 * color must map back to the index. Anything else, we don't
+	 * care about too much.
+	 */
+	do
+	{
+	    uint32_t old_idx;
+
+	    rgba24 = prng_rand();
+	    i15 = CONVERT_15 (rgba24, is_rgb);
+
+	    old_idx = palette->ent[i15];
+	    if (CONVERT_15 (palette->rgba[old_idx], is_rgb) == i15)
+		retry = 1;
+	    else
+		retry = 0;
+	} while (retry);
+
+	palette->rgba[i] = rgba24;
+	palette->ent[i15] = i;
+    }
+
+    for (i = 0; i < mask + 1; ++i)
+    {
+	assert (palette->ent[CONVERT_15 (palette->rgba[i], is_rgb)] == i);
+    }
+}
+
+const char *
+operator_name (pixman_op_t op)
+{
+    switch (op)
+    {
+    case PIXMAN_OP_CLEAR: return "PIXMAN_OP_CLEAR";
+    case PIXMAN_OP_SRC: return "PIXMAN_OP_SRC";
+    case PIXMAN_OP_DST: return "PIXMAN_OP_DST";
+    case PIXMAN_OP_OVER: return "PIXMAN_OP_OVER";
+    case PIXMAN_OP_OVER_REVERSE: return "PIXMAN_OP_OVER_REVERSE";
+    case PIXMAN_OP_IN: return "PIXMAN_OP_IN";
+    case PIXMAN_OP_IN_REVERSE: return "PIXMAN_OP_IN_REVERSE";
+    case PIXMAN_OP_OUT: return "PIXMAN_OP_OUT";
+    case PIXMAN_OP_OUT_REVERSE: return "PIXMAN_OP_OUT_REVERSE";
+    case PIXMAN_OP_ATOP: return "PIXMAN_OP_ATOP";
+    case PIXMAN_OP_ATOP_REVERSE: return "PIXMAN_OP_ATOP_REVERSE";
+    case PIXMAN_OP_XOR: return "PIXMAN_OP_XOR";
+    case PIXMAN_OP_ADD: return "PIXMAN_OP_ADD";
+    case PIXMAN_OP_SATURATE: return "PIXMAN_OP_SATURATE";
+
+    case PIXMAN_OP_DISJOINT_CLEAR: return "PIXMAN_OP_DISJOINT_CLEAR";
+    case PIXMAN_OP_DISJOINT_SRC: return "PIXMAN_OP_DISJOINT_SRC";
+    case PIXMAN_OP_DISJOINT_DST: return "PIXMAN_OP_DISJOINT_DST";
+    case PIXMAN_OP_DISJOINT_OVER: return "PIXMAN_OP_DISJOINT_OVER";
+    case PIXMAN_OP_DISJOINT_OVER_REVERSE: return "PIXMAN_OP_DISJOINT_OVER_REVERSE";
+    case PIXMAN_OP_DISJOINT_IN: return "PIXMAN_OP_DISJOINT_IN";
+    case PIXMAN_OP_DISJOINT_IN_REVERSE: return "PIXMAN_OP_DISJOINT_IN_REVERSE";
+    case PIXMAN_OP_DISJOINT_OUT: return "PIXMAN_OP_DISJOINT_OUT";
+    case PIXMAN_OP_DISJOINT_OUT_REVERSE: return "PIXMAN_OP_DISJOINT_OUT_REVERSE";
+    case PIXMAN_OP_DISJOINT_ATOP: return "PIXMAN_OP_DISJOINT_ATOP";
+    case PIXMAN_OP_DISJOINT_ATOP_REVERSE: return "PIXMAN_OP_DISJOINT_ATOP_REVERSE";
+    case PIXMAN_OP_DISJOINT_XOR: return "PIXMAN_OP_DISJOINT_XOR";
+
+    case PIXMAN_OP_CONJOINT_CLEAR: return "PIXMAN_OP_CONJOINT_CLEAR";
+    case PIXMAN_OP_CONJOINT_SRC: return "PIXMAN_OP_CONJOINT_SRC";
+    case PIXMAN_OP_CONJOINT_DST: return "PIXMAN_OP_CONJOINT_DST";
+    case PIXMAN_OP_CONJOINT_OVER: return "PIXMAN_OP_CONJOINT_OVER";
+    case PIXMAN_OP_CONJOINT_OVER_REVERSE: return "PIXMAN_OP_CONJOINT_OVER_REVERSE";
+    case PIXMAN_OP_CONJOINT_IN: return "PIXMAN_OP_CONJOINT_IN";
+    case PIXMAN_OP_CONJOINT_IN_REVERSE: return "PIXMAN_OP_CONJOINT_IN_REVERSE";
+    case PIXMAN_OP_CONJOINT_OUT: return "PIXMAN_OP_CONJOINT_OUT";
+    case PIXMAN_OP_CONJOINT_OUT_REVERSE: return "PIXMAN_OP_CONJOINT_OUT_REVERSE";
+    case PIXMAN_OP_CONJOINT_ATOP: return "PIXMAN_OP_CONJOINT_ATOP";
+    case PIXMAN_OP_CONJOINT_ATOP_REVERSE: return "PIXMAN_OP_CONJOINT_ATOP_REVERSE";
+    case PIXMAN_OP_CONJOINT_XOR: return "PIXMAN_OP_CONJOINT_XOR";
+
+    case PIXMAN_OP_MULTIPLY: return "PIXMAN_OP_MULTIPLY";
+    case PIXMAN_OP_SCREEN: return "PIXMAN_OP_SCREEN";
+    case PIXMAN_OP_OVERLAY: return "PIXMAN_OP_OVERLAY";
+    case PIXMAN_OP_DARKEN: return "PIXMAN_OP_DARKEN";
+    case PIXMAN_OP_LIGHTEN: return "PIXMAN_OP_LIGHTEN";
+    case PIXMAN_OP_COLOR_DODGE: return "PIXMAN_OP_COLOR_DODGE";
+    case PIXMAN_OP_COLOR_BURN: return "PIXMAN_OP_COLOR_BURN";
+    case PIXMAN_OP_HARD_LIGHT: return "PIXMAN_OP_HARD_LIGHT";
+    case PIXMAN_OP_SOFT_LIGHT: return "PIXMAN_OP_SOFT_LIGHT";
+    case PIXMAN_OP_DIFFERENCE: return "PIXMAN_OP_DIFFERENCE";
+    case PIXMAN_OP_EXCLUSION: return "PIXMAN_OP_EXCLUSION";
+    case PIXMAN_OP_HSL_HUE: return "PIXMAN_OP_HSL_HUE";
+    case PIXMAN_OP_HSL_SATURATION: return "PIXMAN_OP_HSL_SATURATION";
+    case PIXMAN_OP_HSL_COLOR: return "PIXMAN_OP_HSL_COLOR";
+    case PIXMAN_OP_HSL_LUMINOSITY: return "PIXMAN_OP_HSL_LUMINOSITY";
+
+    case PIXMAN_OP_NONE:
+	return "<invalid operator 'none'>";
+    };
+
+    return "<unknown operator>";
+}
+
+const char *
+format_name (pixman_format_code_t format)
+{
+    switch (format)
+    {
+/* 32bpp formats */
+    case PIXMAN_a8r8g8b8: return "a8r8g8b8";
+    case PIXMAN_x8r8g8b8: return "x8r8g8b8";
+    case PIXMAN_a8b8g8r8: return "a8b8g8r8";
+    case PIXMAN_x8b8g8r8: return "x8b8g8r8";
+    case PIXMAN_b8g8r8a8: return "b8g8r8a8";
+    case PIXMAN_b8g8r8x8: return "b8g8r8x8";
+    case PIXMAN_r8g8b8a8: return "r8g8b8a8";
+    case PIXMAN_r8g8b8x8: return "r8g8b8x8";
+    case PIXMAN_x14r6g6b6: return "x14r6g6b6";
+    case PIXMAN_x2r10g10b10: return "x2r10g10b10";
+    case PIXMAN_a2r10g10b10: return "a2r10g10b10";
+    case PIXMAN_x2b10g10r10: return "x2b10g10r10";
+    case PIXMAN_a2b10g10r10: return "a2b10g10r10";
+
+/* sRGB formats */
+    case PIXMAN_a8r8g8b8_sRGB: return "a8r8g8b8_sRGB";
+
+/* 24bpp formats */
+    case PIXMAN_r8g8b8: return "r8g8b8";
+    case PIXMAN_b8g8r8: return "b8g8r8";
+
+/* 16bpp formats */
+    case PIXMAN_r5g6b5: return "r5g6b5";
+    case PIXMAN_b5g6r5: return "b5g6r5";
+
+    case PIXMAN_a1r5g5b5: return "a1r5g5b5";
+    case PIXMAN_x1r5g5b5: return "x1r5g5b5";
+    case PIXMAN_a1b5g5r5: return "a1b5g5r5";
+    case PIXMAN_x1b5g5r5: return "x1b5g5r5";
+    case PIXMAN_a4r4g4b4: return "a4r4g4b4";
+    case PIXMAN_x4r4g4b4: return "x4r4g4b4";
+    case PIXMAN_a4b4g4r4: return "a4b4g4r4";
+    case PIXMAN_x4b4g4r4: return "x4b4g4r4";
+
+/* 8bpp formats */
+    case PIXMAN_a8: return "a8";
+    case PIXMAN_r3g3b2: return "r3g3b2";
+    case PIXMAN_b2g3r3: return "b2g3r3";
+    case PIXMAN_a2r2g2b2: return "a2r2g2b2";
+    case PIXMAN_a2b2g2r2: return "a2b2g2r2";
+
+#if 0
+    case PIXMAN_x4c4: return "x4c4";
+    case PIXMAN_g8: return "g8";
+#endif
+    case PIXMAN_c8: return "x4c4 / c8";
+    case PIXMAN_x4g4: return "x4g4 / g8";
+
+    case PIXMAN_x4a4: return "x4a4";
+
+/* 4bpp formats */
+    case PIXMAN_a4: return "a4";
+    case PIXMAN_r1g2b1: return "r1g2b1";
+    case PIXMAN_b1g2r1: return "b1g2r1";
+    case PIXMAN_a1r1g1b1: return "a1r1g1b1";
+    case PIXMAN_a1b1g1r1: return "a1b1g1r1";
+
+    case PIXMAN_c4: return "c4";
+    case PIXMAN_g4: return "g4";
+
+/* 1bpp formats */
+    case PIXMAN_a1: return "a1";
+
+    case PIXMAN_g1: return "g1";
+
+/* YUV formats */
+    case PIXMAN_yuy2: return "yuy2";
+    case PIXMAN_yv12: return "yv12";
+    };
+
+    /* Fake formats.
+     *
+     * This is separate switch to prevent GCC from complaining
+     * that the values are not in the pixman_format_code_t enum.
+     */
+    switch ((uint32_t)format)
+    {
+    case PIXMAN_null: return "null"; 
+    case PIXMAN_solid: return "solid"; 
+    case PIXMAN_pixbuf: return "pixbuf"; 
+    case PIXMAN_rpixbuf: return "rpixbuf"; 
+    case PIXMAN_unknown: return "unknown"; 
+    };
+
+    return "<unknown format>";
+};
+
+static double
+calc_op (pixman_op_t op, double src, double dst, double srca, double dsta)
+{
+#define mult_chan(src, dst, Fa, Fb) MIN ((src) * (Fa) + (dst) * (Fb), 1.0)
+
+    double Fa, Fb;
+
+    switch (op)
+    {
+    case PIXMAN_OP_CLEAR:
+    case PIXMAN_OP_DISJOINT_CLEAR:
+    case PIXMAN_OP_CONJOINT_CLEAR:
+	return mult_chan (src, dst, 0.0, 0.0);
+
+    case PIXMAN_OP_SRC:
+    case PIXMAN_OP_DISJOINT_SRC:
+    case PIXMAN_OP_CONJOINT_SRC:
+	return mult_chan (src, dst, 1.0, 0.0);
+
+    case PIXMAN_OP_DST:
+    case PIXMAN_OP_DISJOINT_DST:
+    case PIXMAN_OP_CONJOINT_DST:
+	return mult_chan (src, dst, 0.0, 1.0);
+
+    case PIXMAN_OP_OVER:
+	return mult_chan (src, dst, 1.0, 1.0 - srca);
+
+    case PIXMAN_OP_OVER_REVERSE:
+	return mult_chan (src, dst, 1.0 - dsta, 1.0);
+
+    case PIXMAN_OP_IN:
+	return mult_chan (src, dst, dsta, 0.0);
+
+    case PIXMAN_OP_IN_REVERSE:
+	return mult_chan (src, dst, 0.0, srca);
+
+    case PIXMAN_OP_OUT:
+	return mult_chan (src, dst, 1.0 - dsta, 0.0);
+
+    case PIXMAN_OP_OUT_REVERSE:
+	return mult_chan (src, dst, 0.0, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP:
+	return mult_chan (src, dst, dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ATOP_REVERSE:
+	return mult_chan (src, dst, 1.0 - dsta,  srca);
+
+    case PIXMAN_OP_XOR:
+	return mult_chan (src, dst, 1.0 - dsta, 1.0 - srca);
+
+    case PIXMAN_OP_ADD:
+	return mult_chan (src, dst, 1.0, 1.0);
+
+    case PIXMAN_OP_SATURATE:
+    case PIXMAN_OP_DISJOINT_OVER_REVERSE:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_DISJOINT_OVER:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_IN:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_IN_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_OUT:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_DISJOINT_OUT_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_ATOP_REVERSE:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_DISJOINT_XOR:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, (1.0 - dsta) / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, (1.0 - srca) / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, 1.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OVER_REVERSE:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	return mult_chan (src, dst, Fa, 1.0);
+
+    case PIXMAN_OP_CONJOINT_IN:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, dsta / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_IN_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, srca / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_OUT:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	return mult_chan (src, dst, Fa, 0.0);
+
+    case PIXMAN_OP_CONJOINT_OUT_REVERSE:
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, 0.0, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP:
+	if (srca == 0.0)
+	    Fa = 1.0;
+	else
+	    Fa = MIN (1.0, dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_ATOP_REVERSE:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 1.0;
+	else
+	    Fb = MIN (1.0, srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_CONJOINT_XOR:
+	if (srca == 0.0)
+	    Fa = 0.0;
+	else
+	    Fa = MAX (0.0, 1.0 - dsta / srca);
+	if (dsta == 0.0)
+	    Fb = 0.0;
+	else
+	    Fb = MAX (0.0, 1.0 - srca / dsta);
+	return mult_chan (src, dst, Fa, Fb);
+
+    case PIXMAN_OP_MULTIPLY:
+    case PIXMAN_OP_SCREEN:
+    case PIXMAN_OP_OVERLAY:
+    case PIXMAN_OP_DARKEN:
+    case PIXMAN_OP_LIGHTEN:
+    case PIXMAN_OP_COLOR_DODGE:
+    case PIXMAN_OP_COLOR_BURN:
+    case PIXMAN_OP_HARD_LIGHT:
+    case PIXMAN_OP_SOFT_LIGHT:
+    case PIXMAN_OP_DIFFERENCE:
+    case PIXMAN_OP_EXCLUSION:
+    case PIXMAN_OP_HSL_HUE:
+    case PIXMAN_OP_HSL_SATURATION:
+    case PIXMAN_OP_HSL_COLOR:
+    case PIXMAN_OP_HSL_LUMINOSITY:
+    default:
+	abort();
+	return 0; /* silence MSVC */
+    }
+#undef mult_chan
+}
+
+void
+do_composite (pixman_op_t op,
+	      const color_t *src,
+	      const color_t *mask,
+	      const color_t *dst,
+	      color_t *result,
+	      pixman_bool_t component_alpha)
+{
+    color_t srcval, srcalpha;
+
+    if (mask == NULL)
+    {
+	srcval = *src;
+
+	srcalpha.r = src->a;
+	srcalpha.g = src->a;
+	srcalpha.b = src->a;
+	srcalpha.a = src->a;
+    }
+    else if (component_alpha)
+    {
+	srcval.r = src->r * mask->r;
+	srcval.g = src->g * mask->g;
+	srcval.b = src->b * mask->b;
+	srcval.a = src->a * mask->a;
+
+	srcalpha.r = src->a * mask->r;
+	srcalpha.g = src->a * mask->g;
+	srcalpha.b = src->a * mask->b;
+	srcalpha.a = src->a * mask->a;
+    }
+    else
+    {
+	srcval.r = src->r * mask->a;
+	srcval.g = src->g * mask->a;
+	srcval.b = src->b * mask->a;
+	srcval.a = src->a * mask->a;
+
+	srcalpha.r = src->a * mask->a;
+	srcalpha.g = src->a * mask->a;
+	srcalpha.b = src->a * mask->a;
+	srcalpha.a = src->a * mask->a;
+    }
+
+    result->r = calc_op (op, srcval.r, dst->r, srcalpha.r, dst->a);
+    result->g = calc_op (op, srcval.g, dst->g, srcalpha.g, dst->a);
+    result->b = calc_op (op, srcval.b, dst->b, srcalpha.b, dst->a);
+    result->a = calc_op (op, srcval.a, dst->a, srcalpha.a, dst->a);
+}
+
+static double
+round_channel (double p, int m)
+{
+    int t;
+    double r;
+
+    t = p * ((1 << m));
+    t -= t >> m;
+
+    r = t / (double)((1 << m) - 1);
+
+    return r;
+}
+
+void
+round_color (pixman_format_code_t format, color_t *color)
+{
+    if (PIXMAN_FORMAT_R (format) == 0)
+    {
+	color->r = 0.0;
+	color->g = 0.0;
+	color->b = 0.0;
+    }
+    else
+    {
+	color->r = round_channel (color->r, PIXMAN_FORMAT_R (format));
+	color->g = round_channel (color->g, PIXMAN_FORMAT_G (format));
+	color->b = round_channel (color->b, PIXMAN_FORMAT_B (format));
+    }
+
+    if (PIXMAN_FORMAT_A (format) == 0)
+	color->a = 1;
+    else
+	color->a = round_channel (color->a, PIXMAN_FORMAT_A (format));
+}
+
+/* Check whether @pixel is a valid quantization of the a, r, g, b
+ * parameters. Some slack is permitted.
+ */
+void
+pixel_checker_init (pixel_checker_t *checker, pixman_format_code_t format)
+{
+    assert (PIXMAN_FORMAT_VIS (format));
+
+    checker->format = format;
+
+    switch (PIXMAN_FORMAT_TYPE (format))
+    {
+    case PIXMAN_TYPE_A:
+	checker->bs = 0;
+	checker->gs = 0;
+	checker->rs = 0;
+	checker->as = 0;
+	break;
+
+    case PIXMAN_TYPE_ARGB:
+    case PIXMAN_TYPE_ARGB_SRGB:
+	checker->bs = 0;
+	checker->gs = checker->bs + PIXMAN_FORMAT_B (format);
+	checker->rs = checker->gs + PIXMAN_FORMAT_G (format);
+	checker->as = checker->rs + PIXMAN_FORMAT_R (format);
+	break;
+
+    case PIXMAN_TYPE_ABGR:
+	checker->rs = 0;
+	checker->gs = checker->rs + PIXMAN_FORMAT_R (format);
+	checker->bs = checker->gs + PIXMAN_FORMAT_G (format);
+	checker->as = checker->bs + PIXMAN_FORMAT_B (format);
+	break;
+
+    case PIXMAN_TYPE_BGRA:
+	/* With BGRA formats we start counting at the high end of the pixel */
+	checker->bs = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_B (format);
+	checker->gs = checker->bs - PIXMAN_FORMAT_B (format);
+	checker->rs = checker->gs - PIXMAN_FORMAT_G (format);
+	checker->as = checker->rs - PIXMAN_FORMAT_R (format);
+	break;
+
+    case PIXMAN_TYPE_RGBA:
+	/* With BGRA formats we start counting at the high end of the pixel */
+	checker->rs = PIXMAN_FORMAT_BPP (format) - PIXMAN_FORMAT_R (format);
+	checker->gs = checker->rs - PIXMAN_FORMAT_R (format);
+	checker->bs = checker->gs - PIXMAN_FORMAT_G (format);
+	checker->as = checker->bs - PIXMAN_FORMAT_B (format);
+	break;
+
+    default:
+	assert (0);
+	break;
+    }
+
+    checker->am = ((1 << PIXMAN_FORMAT_A (format)) - 1) << checker->as;
+    checker->rm = ((1 << PIXMAN_FORMAT_R (format)) - 1) << checker->rs;
+    checker->gm = ((1 << PIXMAN_FORMAT_G (format)) - 1) << checker->gs;
+    checker->bm = ((1 << PIXMAN_FORMAT_B (format)) - 1) << checker->bs;
+
+    checker->aw = PIXMAN_FORMAT_A (format);
+    checker->rw = PIXMAN_FORMAT_R (format);
+    checker->gw = PIXMAN_FORMAT_G (format);
+    checker->bw = PIXMAN_FORMAT_B (format);
+}
+
+void
+pixel_checker_split_pixel (const pixel_checker_t *checker, uint32_t pixel,
+			   int *a, int *r, int *g, int *b)
+{
+    *a = (pixel & checker->am) >> checker->as;
+    *r = (pixel & checker->rm) >> checker->rs;
+    *g = (pixel & checker->gm) >> checker->gs;
+    *b = (pixel & checker->bm) >> checker->bs;
+}
+
+void
+pixel_checker_get_masks (const pixel_checker_t *checker,
+                         uint32_t              *am,
+                         uint32_t              *rm,
+                         uint32_t              *gm,
+                         uint32_t              *bm)
+{
+    if (am)
+        *am = checker->am;
+    if (rm)
+        *rm = checker->rm;
+    if (gm)
+        *gm = checker->gm;
+    if (bm)
+        *bm = checker->bm;
+}
+
+void
+pixel_checker_convert_pixel_to_color (const pixel_checker_t *checker,
+                                      uint32_t pixel, color_t *color)
+{
+    int a, r, g, b;
+
+    pixel_checker_split_pixel (checker, pixel, &a, &r, &g, &b);
+
+    if (checker->am == 0)
+        color->a = 1.0;
+    else
+        color->a = a / (double)(checker->am >> checker->as);
+
+    if (checker->rm == 0)
+        color->r = 0.0;
+    else
+        color->r = r / (double)(checker->rm >> checker->rs);
+
+    if (checker->gm == 0)
+        color->g = 0.0;
+    else
+        color->g = g / (double)(checker->gm >> checker->gs);
+
+    if (checker->bm == 0)
+        color->b = 0.0;
+    else
+        color->b = b / (double)(checker->bm >> checker->bs);
+
+    if (PIXMAN_FORMAT_TYPE (checker->format) == PIXMAN_TYPE_ARGB_SRGB)
+    {
+	color->r = convert_srgb_to_linear (color->r);
+	color->g = convert_srgb_to_linear (color->g);
+	color->b = convert_srgb_to_linear (color->b);
+    }
+}
+
+static int32_t
+convert (double v, uint32_t width, uint32_t mask, uint32_t shift, double def)
+{
+    int32_t r;
+
+    if (!mask)
+	v = def;
+
+    r = (v * ((mask >> shift) + 1));
+    r -= r >> width;
+
+    return r;
+}
+
+static void
+get_limits (const pixel_checker_t *checker, double limit,
+	    color_t *color,
+	    int *ao, int *ro, int *go, int *bo)
+{
+    color_t tmp;
+
+    if (PIXMAN_FORMAT_TYPE (checker->format) == PIXMAN_TYPE_ARGB_SRGB)
+    {
+	tmp.a = color->a;
+	tmp.r = convert_linear_to_srgb (color->r);
+	tmp.g = convert_linear_to_srgb (color->g);
+	tmp.b = convert_linear_to_srgb (color->b);
+
+	color = &tmp;
+    }
+    
+    *ao = convert (color->a + limit, checker->aw, checker->am, checker->as, 1.0);
+    *ro = convert (color->r + limit, checker->rw, checker->rm, checker->rs, 0.0);
+    *go = convert (color->g + limit, checker->gw, checker->gm, checker->gs, 0.0);
+    *bo = convert (color->b + limit, checker->bw, checker->bm, checker->bs, 0.0);
+}
+
+/* The acceptable deviation in units of [0.0, 1.0]
+ */
+#define DEVIATION (0.0064)
+
+void
+pixel_checker_get_max (const pixel_checker_t *checker, color_t *color,
+		       int *am, int *rm, int *gm, int *bm)
+{
+    get_limits (checker, DEVIATION, color, am, rm, gm, bm);
+}
+
+void
+pixel_checker_get_min (const pixel_checker_t *checker, color_t *color,
+		       int *am, int *rm, int *gm, int *bm)
+{
+    get_limits (checker, - DEVIATION, color, am, rm, gm, bm);
+}
+
+pixman_bool_t
+pixel_checker_check (const pixel_checker_t *checker, uint32_t pixel,
+		     color_t *color)
+{
+    int32_t a_lo, a_hi, r_lo, r_hi, g_lo, g_hi, b_lo, b_hi;
+    int32_t ai, ri, gi, bi;
+    pixman_bool_t result;
+
+    pixel_checker_get_min (checker, color, &a_lo, &r_lo, &g_lo, &b_lo);
+    pixel_checker_get_max (checker, color, &a_hi, &r_hi, &g_hi, &b_hi);
+    pixel_checker_split_pixel (checker, pixel, &ai, &ri, &gi, &bi);
+
+    result =
+	a_lo <= ai && ai <= a_hi	&&
+	r_lo <= ri && ri <= r_hi	&&
+	g_lo <= gi && gi <= g_hi	&&
+	b_lo <= bi && bi <= b_hi;
+
+    return result;
+}
diff --git a/src/pixman/test/utils.h b/src/pixman/test/utils.h
new file mode 100644
index 0000000..ebb14d9
--- /dev/null
+++ b/src/pixman/test/utils.h
@@ -0,0 +1,247 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <assert.h>
+#include "pixman-private.h" /* For 'inline' definition */
+#include "utils-prng.h"
+
+#if defined(_MSC_VER)
+#define snprintf _snprintf
+#define strcasecmp _stricmp
+#endif
+
+#define ARRAY_LENGTH(A) ((int) (sizeof (A) / sizeof ((A) [0])))
+
+/* A primitive pseudorandom number generator,
+ * taken from POSIX.1-2001 example
+ */
+
+extern prng_t prng_state_data;
+extern prng_t *prng_state;
+#ifdef USE_OPENMP
+#pragma omp threadprivate(prng_state_data)
+#pragma omp threadprivate(prng_state)
+#endif
+
+static inline uint32_t
+prng_rand (void)
+{
+    return prng_rand_r (prng_state);
+}
+
+static inline void
+prng_srand (uint32_t seed)
+{
+    if (!prng_state)
+    {
+        /* Without setting a seed, PRNG does not work properly (is just
+         * returning zeros). So we only initialize the pointer here to
+         * make sure that 'prng_srand' is always called before any
+         * other 'prng_*' function. The wrongdoers violating this order
+         * will get a segfault. */
+        prng_state = &prng_state_data;
+    }
+    prng_srand_r (prng_state, seed);
+}
+
+static inline uint32_t
+prng_rand_n (int max)
+{
+    return prng_rand () % max;
+}
+
+static inline void
+prng_randmemset (void *buffer, size_t size, prng_randmemset_flags_t flags)
+{
+    prng_randmemset_r (prng_state, buffer, size, flags);
+}
+
+/* CRC 32 computation
+ */
+uint32_t
+compute_crc32 (uint32_t    in_crc32,
+	       const void *buf,
+	       size_t      buf_len);
+
+uint32_t
+compute_crc32_for_image (uint32_t        in_crc32,
+			 pixman_image_t *image);
+
+/* Print the image in hexadecimal */
+void
+print_image (pixman_image_t *image);
+
+/* Returns TRUE if running on a little endian system
+ */
+static force_inline pixman_bool_t
+is_little_endian (void)
+{
+    unsigned long endian_check_var = 1;
+    return *(unsigned char *)&endian_check_var == 1;
+}
+
+/* perform endian conversion of pixel data
+ */
+void
+image_endian_swap (pixman_image_t *img);
+
+/* Allocate memory that is bounded by protected pages,
+ * so that out-of-bounds access will cause segfaults
+ */
+void *
+fence_malloc (int64_t len);
+
+void
+fence_free (void *data);
+
+/* Generate n_bytes random bytes in fence_malloced memory */
+uint8_t *
+make_random_bytes (int n_bytes);
+
+/* Return current time in seconds */
+double
+gettime (void);
+
+uint32_t
+get_random_seed (void);
+
+/* main body of the fuzzer test */
+int
+fuzzer_test_main (const char *test_name,
+		  int         default_number_of_iterations,
+		  uint32_t    expected_checksum,
+		  uint32_t    (*test_function)(int testnum, int verbose),
+		  int         argc,
+		  const char *argv[]);
+
+void
+fail_after (int seconds, const char *msg);
+
+/* If possible, enable traps for floating point exceptions */
+void enable_divbyzero_exceptions(void);
+
+/* Converts a8r8g8b8 pixels to pixels that
+ *  - are not premultiplied,
+ *  - are stored in this order in memory: R, G, B, A, regardless of
+ *    the endianness of the computer.
+ * It is allowed for @src and @dst to point to the same memory buffer.
+ */
+void
+a8r8g8b8_to_rgba_np (uint32_t *dst, uint32_t *src, int n_pixels);
+
+pixman_bool_t
+write_png (pixman_image_t *image, const char *filename);
+
+void
+draw_checkerboard (pixman_image_t *image,
+		   int check_size,
+		   uint32_t color1, uint32_t color2);
+
+/* A pair of macros which can help to detect corruption of
+ * floating point registers after a function call. This may
+ * happen if _mm_empty() call is forgotten in MMX/SSE2 fast
+ * path code, or ARM NEON assembly optimized function forgets
+ * to save/restore d8-d15 registers before use.
+ */
+
+#define FLOAT_REGS_CORRUPTION_DETECTOR_START()                 \
+    static volatile double frcd_volatile_constant1 = 123451;   \
+    static volatile double frcd_volatile_constant2 = 123452;   \
+    static volatile double frcd_volatile_constant3 = 123453;   \
+    static volatile double frcd_volatile_constant4 = 123454;   \
+    static volatile double frcd_volatile_constant5 = 123455;   \
+    static volatile double frcd_volatile_constant6 = 123456;   \
+    static volatile double frcd_volatile_constant7 = 123457;   \
+    static volatile double frcd_volatile_constant8 = 123458;   \
+    double frcd_canary_variable1 = frcd_volatile_constant1;    \
+    double frcd_canary_variable2 = frcd_volatile_constant2;    \
+    double frcd_canary_variable3 = frcd_volatile_constant3;    \
+    double frcd_canary_variable4 = frcd_volatile_constant4;    \
+    double frcd_canary_variable5 = frcd_volatile_constant5;    \
+    double frcd_canary_variable6 = frcd_volatile_constant6;    \
+    double frcd_canary_variable7 = frcd_volatile_constant7;    \
+    double frcd_canary_variable8 = frcd_volatile_constant8;
+
+#define FLOAT_REGS_CORRUPTION_DETECTOR_FINISH()                \
+    assert (frcd_canary_variable1 == frcd_volatile_constant1); \
+    assert (frcd_canary_variable2 == frcd_volatile_constant2); \
+    assert (frcd_canary_variable3 == frcd_volatile_constant3); \
+    assert (frcd_canary_variable4 == frcd_volatile_constant4); \
+    assert (frcd_canary_variable5 == frcd_volatile_constant5); \
+    assert (frcd_canary_variable6 == frcd_volatile_constant6); \
+    assert (frcd_canary_variable7 == frcd_volatile_constant7); \
+    assert (frcd_canary_variable8 == frcd_volatile_constant8);
+
+/* Try to get an aligned memory chunk */
+void *
+aligned_malloc (size_t align, size_t size);
+
+double
+convert_srgb_to_linear (double component);
+
+double
+convert_linear_to_srgb (double component);
+
+void
+initialize_palette (pixman_indexed_t *palette, uint32_t depth, int is_rgb);
+
+const char *
+operator_name (pixman_op_t op);
+
+const char *
+format_name (pixman_format_code_t format);
+
+typedef struct
+{
+    double r, g, b, a;
+} color_t;
+
+void
+do_composite (pixman_op_t op,
+	      const color_t *src,
+	      const color_t *mask,
+	      const color_t *dst,
+	      color_t *result,
+	      pixman_bool_t component_alpha);
+
+void
+round_color (pixman_format_code_t format, color_t *color);
+
+typedef struct
+{
+    pixman_format_code_t format;
+    uint32_t am, rm, gm, bm;
+    uint32_t as, rs, gs, bs;
+    uint32_t aw, rw, gw, bw;
+} pixel_checker_t;
+
+void
+pixel_checker_init (pixel_checker_t *checker, pixman_format_code_t format);
+
+void
+pixel_checker_split_pixel (const pixel_checker_t *checker, uint32_t pixel,
+			   int *a, int *r, int *g, int *b);
+
+void
+pixel_checker_get_max (const pixel_checker_t *checker, color_t *color,
+		       int *a, int *r, int *g, int *b);
+
+void
+pixel_checker_get_min (const pixel_checker_t *checker, color_t *color,
+		       int *a, int *r, int *g, int *b);
+
+pixman_bool_t
+pixel_checker_check (const pixel_checker_t *checker,
+		     uint32_t pixel, color_t *color);
+
+void
+pixel_checker_convert_pixel_to_color (const pixel_checker_t *checker,
+                                      uint32_t pixel, color_t *color);
+
+void
+pixel_checker_get_masks (const pixel_checker_t *checker,
+                         uint32_t              *am,
+                         uint32_t              *rm,
+                         uint32_t              *gm,
+                         uint32_t              *bm);