diff options
Diffstat (limited to 'lib/mesa/src/gallium/drivers/llvmpipe')
82 files changed, 27704 insertions, 0 deletions
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/Automake.inc b/lib/mesa/src/gallium/drivers/llvmpipe/Automake.inc new file mode 100644 index 000000000..0a0aa34e7 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/Automake.inc @@ -0,0 +1,7 @@ +if HAVE_GALLIUM_LLVMPIPE + +TARGET_CPPFLAGS += -DGALLIUM_LLVMPIPE +TARGET_LIB_DEPS += \ + $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la + +endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.am b/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.am new file mode 100644 index 000000000..1d3853e41 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.am @@ -0,0 +1,79 @@ +# Copyright © 2012 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +include Makefile.sources +include $(top_srcdir)/src/gallium/Automake.inc + +AM_CFLAGS = \ + $(GALLIUM_DRIVER_CFLAGS) \ + $(LLVM_CFLAGS) \ + $(MSVC2008_COMPAT_CFLAGS) +AM_CXXFLAGS= \ + $(GALLIUM_DRIVER_CXXFLAGS) \ + $(LLVM_CXXFLAGS) \ + $(MSVC2008_COMPAT_CXXFLAGS) + +noinst_LTLIBRARIES = libllvmpipe.la + +libllvmpipe_la_SOURCES = $(C_SOURCES) + +libllvmpipe_la_LDFLAGS = $(LLVM_LDFLAGS) + +noinst_HEADERS = lp_test.h + +check_PROGRAMS = \ + lp_test_format \ + lp_test_arit \ + lp_test_blend \ + lp_test_conv \ + lp_test_printf +TESTS = $(check_PROGRAMS) + +TEST_LIBS = \ + libllvmpipe.la \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ + $(top_builddir)/src/util/libmesautil.la \ + $(LLVM_LIBS) \ + $(DLOPEN_LIBS) \ + $(PTHREAD_LIBS) + +lp_test_format_SOURCES = lp_test_format.c lp_test_main.c +lp_test_format_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_format_SOURCES = dummy.cpp + +lp_test_arit_SOURCES = lp_test_arit.c lp_test_main.c +lp_test_arit_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_arit_SOURCES = dummy.cpp + +lp_test_blend_SOURCES = lp_test_blend.c lp_test_main.c +lp_test_blend_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_blend_SOURCES = dummy.cpp + +lp_test_conv_SOURCES = lp_test_conv.c lp_test_main.c +lp_test_conv_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_conv_SOURCES = dummy.cpp + +lp_test_printf_SOURCES = lp_test_printf.c lp_test_main.c +lp_test_printf_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_printf_SOURCES = dummy.cpp + +EXTRA_DIST = SConscript diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.in b/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.in new file mode 100644 index 000000000..0274f7e87 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.in @@ -0,0 +1,1529 @@ +# Makefile.in generated by automake 1.15 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994-2014 Free Software Foundation, Inc. 
+ +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# Copyright © 2012 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + +VPATH = @srcdir@ +am__is_gnu_make = { \ + if test -z '$(MAKELEVEL)'; then \ + false; \ + elif test -n '$(MAKE_HOST)'; then \ + true; \ + elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \ + true; \ + else \ + false; \ + fi; \ +} +am__make_running_with_option = \ + case $${target_option-} in \ + ?) 
;; \ + *) echo "am__make_running_with_option: internal error: invalid" \ + "target option '$${target_option-}' specified" >&2; \ + exit 1;; \ + esac; \ + has_opt=no; \ + sane_makeflags=$$MAKEFLAGS; \ + if $(am__is_gnu_make); then \ + sane_makeflags=$$MFLAGS; \ + else \ + case $$MAKEFLAGS in \ + *\\[\ \ ]*) \ + bs=\\; \ + sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \ + | sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \ + esac; \ + fi; \ + skip_next=no; \ + strip_trailopt () \ + { \ + flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \ + }; \ + for flg in $$sane_makeflags; do \ + test $$skip_next = yes && { skip_next=no; continue; }; \ + case $$flg in \ + *=*|--*) continue;; \ + -*I) strip_trailopt 'I'; skip_next=yes;; \ + -*I?*) strip_trailopt 'I';; \ + -*O) strip_trailopt 'O'; skip_next=yes;; \ + -*O?*) strip_trailopt 'O';; \ + -*l) strip_trailopt 'l'; skip_next=yes;; \ + -*l?*) strip_trailopt 'l';; \ + -[dEDm]) skip_next=yes;; \ + -[JT]) skip_next=yes;; \ + esac; \ + case $$flg in \ + *$$target_option*) has_opt=yes; break;; \ + esac; \ + done; \ + test $$has_opt = yes +am__make_dryrun = (target_option=n; $(am__make_running_with_option)) +am__make_keepgoing = (target_option=k; $(am__make_running_with_option)) +pkgdatadir = $(datadir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkglibexecdir = $(libexecdir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +build_triplet = @build@ +host_triplet = @host@ +target_triplet = @target@ +@HAVE_DRISW_TRUE@am__append_1 = \ +@HAVE_DRISW_TRUE@ $(top_builddir)/src/gallium/winsys/sw/dri/libswdri.la + +check_PROGRAMS = lp_test_format$(EXEEXT) lp_test_arit$(EXEEXT) \ + 
lp_test_blend$(EXEEXT) lp_test_conv$(EXEEXT) \ + lp_test_printf$(EXEEXT) +subdir = src/gallium/drivers/llvmpipe +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_gnu_make.m4 \ + $(top_srcdir)/m4/ax_check_python_mako_module.m4 \ + $(top_srcdir)/m4/ax_gcc_builtin.m4 \ + $(top_srcdir)/m4/ax_gcc_func_attribute.m4 \ + $(top_srcdir)/m4/ax_prog_bison.m4 \ + $(top_srcdir)/m4/ax_prog_flex.m4 \ + $(top_srcdir)/m4/ax_pthread.m4 $(top_srcdir)/m4/libtool.m4 \ + $(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \ + $(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \ + $(top_srcdir)/VERSION $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +DIST_COMMON = $(srcdir)/Makefile.am $(noinst_HEADERS) \ + $(am__DIST_COMMON) +mkinstalldirs = $(install_sh) -d +CONFIG_CLEAN_FILES = +CONFIG_CLEAN_VPATH_FILES = +LTLIBRARIES = $(noinst_LTLIBRARIES) +libllvmpipe_la_LIBADD = +am__objects_1 = lp_bld_alpha.lo lp_bld_blend_aos.lo lp_bld_blend.lo \ + lp_bld_blend_logicop.lo lp_bld_depth.lo lp_bld_interp.lo \ + lp_clear.lo lp_context.lo lp_draw_arrays.lo lp_fence.lo \ + lp_flush.lo lp_jit.lo lp_memory.lo lp_perf.lo lp_query.lo \ + lp_rast.lo lp_rast_debug.lo lp_rast_tri.lo lp_scene.lo \ + lp_scene_queue.lo lp_screen.lo lp_setup.lo lp_setup_line.lo \ + lp_setup_point.lo lp_setup_tri.lo lp_setup_vbuf.lo \ + lp_state_blend.lo lp_state_clip.lo lp_state_derived.lo \ + lp_state_fs.lo lp_state_gs.lo lp_state_rasterizer.lo \ + lp_state_sampler.lo lp_state_setup.lo lp_state_so.lo \ + lp_state_surface.lo lp_state_vertex.lo lp_state_vs.lo \ + lp_surface.lo lp_tex_sample.lo lp_texture.lo +am_libllvmpipe_la_OBJECTS = $(am__objects_1) +libllvmpipe_la_OBJECTS = $(am_libllvmpipe_la_OBJECTS) +AM_V_lt = $(am__v_lt_@AM_V@) +am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) +am__v_lt_0 = --silent +am__v_lt_1 = +libllvmpipe_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \ + $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) 
--mode=link $(CCLD) \ + $(AM_CFLAGS) $(CFLAGS) $(libllvmpipe_la_LDFLAGS) $(LDFLAGS) -o \ + $@ +am_lp_test_arit_OBJECTS = lp_test_arit.$(OBJEXT) \ + lp_test_main.$(OBJEXT) +lp_test_arit_OBJECTS = $(am_lp_test_arit_OBJECTS) +am__DEPENDENCIES_1 = +am__DEPENDENCIES_2 = libllvmpipe.la \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ + $(top_builddir)/src/util/libmesautil.la $(am__DEPENDENCIES_1) \ + $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_1) +lp_test_arit_DEPENDENCIES = $(am__DEPENDENCIES_2) +am_lp_test_blend_OBJECTS = lp_test_blend.$(OBJEXT) \ + lp_test_main.$(OBJEXT) +lp_test_blend_OBJECTS = $(am_lp_test_blend_OBJECTS) +lp_test_blend_DEPENDENCIES = $(am__DEPENDENCIES_2) +am_lp_test_conv_OBJECTS = lp_test_conv.$(OBJEXT) \ + lp_test_main.$(OBJEXT) +lp_test_conv_OBJECTS = $(am_lp_test_conv_OBJECTS) +lp_test_conv_DEPENDENCIES = $(am__DEPENDENCIES_2) +am_lp_test_format_OBJECTS = lp_test_format.$(OBJEXT) \ + lp_test_main.$(OBJEXT) +lp_test_format_OBJECTS = $(am_lp_test_format_OBJECTS) +lp_test_format_DEPENDENCIES = $(am__DEPENDENCIES_2) +am_lp_test_printf_OBJECTS = lp_test_printf.$(OBJEXT) \ + lp_test_main.$(OBJEXT) +lp_test_printf_OBJECTS = $(am_lp_test_printf_OBJECTS) +lp_test_printf_DEPENDENCIES = $(am__DEPENDENCIES_2) +AM_V_P = $(am__v_P_@AM_V@) +am__v_P_ = $(am__v_P_@AM_DEFAULT_V@) +am__v_P_0 = false +am__v_P_1 = : +AM_V_GEN = $(am__v_GEN_@AM_V@) +am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@) +am__v_GEN_0 = @echo " GEN " $@; +am__v_GEN_1 = +AM_V_at = $(am__v_at_@AM_V@) +am__v_at_ = $(am__v_at_@AM_DEFAULT_V@) +am__v_at_0 = @ +am__v_at_1 = +DEFAULT_INCLUDES = -I.@am__isrc@ +depcomp = $(SHELL) $(top_srcdir)/bin/depcomp +am__depfiles_maybe = depfiles +am__mv = mv -f +COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ + $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) +LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CFLAGS) 
$(CFLAGS) +AM_V_CC = $(am__v_CC_@AM_V@) +am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@) +am__v_CC_0 = @echo " CC " $@; +am__v_CC_1 = +CCLD = $(CC) +LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ + $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CCLD = $(am__v_CCLD_@AM_V@) +am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@) +am__v_CCLD_0 = @echo " CCLD " $@; +am__v_CCLD_1 = +CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ + $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) +LTCXXCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=compile $(CXX) $(DEFS) \ + $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \ + $(AM_CXXFLAGS) $(CXXFLAGS) +AM_V_CXX = $(am__v_CXX_@AM_V@) +am__v_CXX_ = $(am__v_CXX_@AM_DEFAULT_V@) +am__v_CXX_0 = @echo " CXX " $@; +am__v_CXX_1 = +CXXLD = $(CXX) +CXXLINK = $(LIBTOOL) $(AM_V_lt) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ +AM_V_CXXLD = $(am__v_CXXLD_@AM_V@) +am__v_CXXLD_ = $(am__v_CXXLD_@AM_DEFAULT_V@) +am__v_CXXLD_0 = @echo " CXXLD " $@; +am__v_CXXLD_1 = +SOURCES = $(libllvmpipe_la_SOURCES) $(lp_test_arit_SOURCES) \ + $(nodist_EXTRA_lp_test_arit_SOURCES) $(lp_test_blend_SOURCES) \ + $(nodist_EXTRA_lp_test_blend_SOURCES) $(lp_test_conv_SOURCES) \ + $(nodist_EXTRA_lp_test_conv_SOURCES) $(lp_test_format_SOURCES) \ + $(nodist_EXTRA_lp_test_format_SOURCES) \ + $(lp_test_printf_SOURCES) \ + $(nodist_EXTRA_lp_test_printf_SOURCES) +DIST_SOURCES = $(libllvmpipe_la_SOURCES) $(lp_test_arit_SOURCES) \ + $(lp_test_blend_SOURCES) $(lp_test_conv_SOURCES) \ + $(lp_test_format_SOURCES) $(lp_test_printf_SOURCES) +am__can_run_installinfo = \ + case $$AM_UPDATE_INFO_DIR in \ + n|no|NO) false;; \ + *) (install-info --version) >/dev/null 2>&1;; \ + esac +HEADERS = $(noinst_HEADERS) +am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP) +# Read a list of 
newline-separated strings from the standard input, +# and print each of them once, without duplicates. Input order is +# *not* preserved. +am__uniquify_input = $(AWK) '\ + BEGIN { nonempty = 0; } \ + { items[$$0] = 1; nonempty = 1; } \ + END { if (nonempty) { for (i in items) print i; }; } \ +' +# Make sure the list of sources is unique. This is necessary because, +# e.g., the same source file might be shared among _SOURCES variables +# for different programs/libraries. +am__define_uniq_tagged_files = \ + list='$(am__tagged_files)'; \ + unique=`for i in $$list; do \ + if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ + done | $(am__uniquify_input)` +ETAGS = etags +CTAGS = ctags +am__tty_colors_dummy = \ + mgn= red= grn= lgn= blu= brg= std=; \ + am__color_tests=no +am__tty_colors = { \ + $(am__tty_colors_dummy); \ + if test "X$(AM_COLOR_TESTS)" = Xno; then \ + am__color_tests=no; \ + elif test "X$(AM_COLOR_TESTS)" = Xalways; then \ + am__color_tests=yes; \ + elif test "X$$TERM" != Xdumb && { test -t 1; } 2>/dev/null; then \ + am__color_tests=yes; \ + fi; \ + if test $$am__color_tests = yes; then \ + red='[0;31m'; \ + grn='[0;32m'; \ + lgn='[1;32m'; \ + blu='[1;34m'; \ + mgn='[0;35m'; \ + brg='[1m'; \ + std='[m'; \ + fi; \ +} +am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; +am__vpath_adj = case $$p in \ + $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ + *) f=$$p;; \ + esac; +am__strip_dir = f=`echo $$p | sed -e 's|^.*/||'`; +am__install_max = 40 +am__nobase_strip_setup = \ + srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*|]/\\\\&/g'` +am__nobase_strip = \ + for p in $$list; do echo "$$p"; done | sed -e "s|$$srcdirstrip/||" +am__nobase_list = $(am__nobase_strip_setup); \ + for p in $$list; do echo "$$p $$p"; done | \ + sed "s| $$srcdirstrip/| |;"' / .*\//!s/ .*/ ./; s,\( .*\)/[^/]*$$,\1,' | \ + $(AWK) 'BEGIN { files["."] = "" } { files[$$2] = files[$$2] " " $$1; \ + if (++n[$$2] == $(am__install_max)) \ + { print 
$$2, files[$$2]; n[$$2] = 0; files[$$2] = "" } } \ + END { for (dir in files) print dir, files[dir] }' +am__base_list = \ + sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ + sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' +am__uninstall_files_from_dir = { \ + test -z "$$files" \ + || { test ! -d "$$dir" && test ! -f "$$dir" && test ! -r "$$dir"; } \ + || { echo " ( cd '$$dir' && rm -f" $$files ")"; \ + $(am__cd) "$$dir" && rm -f $$files; }; \ + } +am__recheck_rx = ^[ ]*:recheck:[ ]* +am__global_test_result_rx = ^[ ]*:global-test-result:[ ]* +am__copy_in_global_log_rx = ^[ ]*:copy-in-global-log:[ ]* +# A command that, given a newline-separated list of test names on the +# standard input, print the name of the tests that are to be re-run +# upon "make recheck". +am__list_recheck_tests = $(AWK) '{ \ + recheck = 1; \ + while ((rc = (getline line < ($$0 ".trs"))) != 0) \ + { \ + if (rc < 0) \ + { \ + if ((getline line2 < ($$0 ".log")) < 0) \ + recheck = 0; \ + break; \ + } \ + else if (line ~ /$(am__recheck_rx)[nN][Oo]/) \ + { \ + recheck = 0; \ + break; \ + } \ + else if (line ~ /$(am__recheck_rx)[yY][eE][sS]/) \ + { \ + break; \ + } \ + }; \ + if (recheck) \ + print $$0; \ + close ($$0 ".trs"); \ + close ($$0 ".log"); \ +}' +# A command that, given a newline-separated list of test names on the +# standard input, create the global log from their .trs and .log files. 
+am__create_global_log = $(AWK) ' \ +function fatal(msg) \ +{ \ + print "fatal: making $@: " msg | "cat >&2"; \ + exit 1; \ +} \ +function rst_section(header) \ +{ \ + print header; \ + len = length(header); \ + for (i = 1; i <= len; i = i + 1) \ + printf "="; \ + printf "\n\n"; \ +} \ +{ \ + copy_in_global_log = 1; \ + global_test_result = "RUN"; \ + while ((rc = (getline line < ($$0 ".trs"))) != 0) \ + { \ + if (rc < 0) \ + fatal("failed to read from " $$0 ".trs"); \ + if (line ~ /$(am__global_test_result_rx)/) \ + { \ + sub("$(am__global_test_result_rx)", "", line); \ + sub("[ ]*$$", "", line); \ + global_test_result = line; \ + } \ + else if (line ~ /$(am__copy_in_global_log_rx)[nN][oO]/) \ + copy_in_global_log = 0; \ + }; \ + if (copy_in_global_log) \ + { \ + rst_section(global_test_result ": " $$0); \ + while ((rc = (getline line < ($$0 ".log"))) != 0) \ + { \ + if (rc < 0) \ + fatal("failed to read from " $$0 ".log"); \ + print line; \ + }; \ + printf "\n"; \ + }; \ + close ($$0 ".trs"); \ + close ($$0 ".log"); \ +}' +# Restructured Text title. +am__rst_title = { sed 's/.*/ & /;h;s/./=/g;p;x;s/ *$$//;p;g' && echo; } +# Solaris 10 'make', and several other traditional 'make' implementations, +# pass "-e" to $(SHELL), and POSIX 2008 even requires this. Work around it +# by disabling -e (using the XSI extension "set +e") if it's set. +am__sh_e_setup = case $$- in *e*) set +e;; esac +# Default flags passed to test drivers. +am__common_driver_flags = \ + --color-tests "$$am__color_tests" \ + --enable-hard-errors "$$am__enable_hard_errors" \ + --expect-failure "$$am__expect_failure" +# To be inserted before the command running the test. Creates the +# directory for the log if needed. Stores in $dir the directory +# containing $f, in $tst the test, in $log the log. Executes the +# developer- defined test setup AM_TESTS_ENVIRONMENT (if any), and +# passes TESTS_ENVIRONMENT. 
Set up options for the wrapper that +# will run the test scripts (or their associated LOG_COMPILER, if +# thy have one). +am__check_pre = \ +$(am__sh_e_setup); \ +$(am__vpath_adj_setup) $(am__vpath_adj) \ +$(am__tty_colors); \ +srcdir=$(srcdir); export srcdir; \ +case "$@" in \ + */*) am__odir=`echo "./$@" | sed 's|/[^/]*$$||'`;; \ + *) am__odir=.;; \ +esac; \ +test "x$$am__odir" = x"." || test -d "$$am__odir" \ + || $(MKDIR_P) "$$am__odir" || exit $$?; \ +if test -f "./$$f"; then dir=./; \ +elif test -f "$$f"; then dir=; \ +else dir="$(srcdir)/"; fi; \ +tst=$$dir$$f; log='$@'; \ +if test -n '$(DISABLE_HARD_ERRORS)'; then \ + am__enable_hard_errors=no; \ +else \ + am__enable_hard_errors=yes; \ +fi; \ +case " $(XFAIL_TESTS) " in \ + *[\ \ ]$$f[\ \ ]* | *[\ \ ]$$dir$$f[\ \ ]*) \ + am__expect_failure=yes;; \ + *) \ + am__expect_failure=no;; \ +esac; \ +$(AM_TESTS_ENVIRONMENT) $(TESTS_ENVIRONMENT) +# A shell command to get the names of the tests scripts with any registered +# extension removed (i.e., equivalently, the names of the test logs, with +# the '.log' extension removed). The result is saved in the shell variable +# '$bases'. This honors runtime overriding of TESTS and TEST_LOGS. Sadly, +# we cannot use something simpler, involving e.g., "$(TEST_LOGS:.log=)", +# since that might cause problem with VPATH rewrites for suffix-less tests. +# See also 'test-harness-vpath-rewrite.sh' and 'test-trs-basic.sh'. 
+am__set_TESTS_bases = \ + bases='$(TEST_LOGS)'; \ + bases=`for i in $$bases; do echo $$i; done | sed 's/\.log$$//'`; \ + bases=`echo $$bases` +RECHECK_LOGS = $(TEST_LOGS) +AM_RECURSIVE_TARGETS = check recheck +TEST_SUITE_LOG = test-suite.log +TEST_EXTENSIONS = @EXEEXT@ .test +LOG_DRIVER = $(SHELL) $(top_srcdir)/bin/test-driver +LOG_COMPILE = $(LOG_COMPILER) $(AM_LOG_FLAGS) $(LOG_FLAGS) +am__set_b = \ + case '$@' in \ + */*) \ + case '$*' in \ + */*) b='$*';; \ + *) b=`echo '$@' | sed 's/\.log$$//'`; \ + esac;; \ + *) \ + b='$*';; \ + esac +am__test_logs1 = $(TESTS:=.log) +am__test_logs2 = $(am__test_logs1:@EXEEXT@.log=.log) +TEST_LOGS = $(am__test_logs2:.test.log=.log) +TEST_LOG_DRIVER = $(SHELL) $(top_srcdir)/bin/test-driver +TEST_LOG_COMPILE = $(TEST_LOG_COMPILER) $(AM_TEST_LOG_FLAGS) \ + $(TEST_LOG_FLAGS) +am__DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.sources \ + $(top_srcdir)/bin/depcomp $(top_srcdir)/bin/test-driver \ + $(top_srcdir)/src/gallium/Automake.inc +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMDGPU_CFLAGS = @AMDGPU_CFLAGS@ +AMDGPU_LIBS = @AMDGPU_LIBS@ +AMTAR = @AMTAR@ +AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@ +AR = @AR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +BSYMBOLIC = @BSYMBOLIC@ +CC = @CC@ +CCAS = @CCAS@ +CCASDEPMODE = @CCASDEPMODE@ +CCASFLAGS = @CCASFLAGS@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CLANG_RESOURCE_DIR = @CLANG_RESOURCE_DIR@ +CLOCK_LIB = @CLOCK_LIB@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXCPP = @CXXCPP@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +D3D_DRIVER_INSTALL_DIR = @D3D_DRIVER_INSTALL_DIR@ +DEFINES = @DEFINES@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +DLLTOOL = @DLLTOOL@ +DLOPEN_LIBS = @DLOPEN_LIBS@ +DRI2PROTO_CFLAGS = @DRI2PROTO_CFLAGS@ +DRI2PROTO_LIBS = @DRI2PROTO_LIBS@ +DRI3PROTO_CFLAGS = @DRI3PROTO_CFLAGS@ +DRI3PROTO_LIBS = @DRI3PROTO_LIBS@ +DRIGL_CFLAGS = 
@DRIGL_CFLAGS@ +DRIGL_LIBS = @DRIGL_LIBS@ +DRI_DRIVER_INSTALL_DIR = @DRI_DRIVER_INSTALL_DIR@ +DRI_DRIVER_SEARCH_DIR = @DRI_DRIVER_SEARCH_DIR@ +DRI_LIB_DEPS = @DRI_LIB_DEPS@ +DRI_PC_REQ_PRIV = @DRI_PC_REQ_PRIV@ +DSYMUTIL = @DSYMUTIL@ +DUMPBIN = @DUMPBIN@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGL_CFLAGS = @EGL_CFLAGS@ +EGL_CLIENT_APIS = @EGL_CLIENT_APIS@ +EGL_LIB_DEPS = @EGL_LIB_DEPS@ +EGL_NATIVE_PLATFORM = @EGL_NATIVE_PLATFORM@ +EGREP = @EGREP@ +ELF_LIB = @ELF_LIB@ +EXEEXT = @EXEEXT@ +EXPAT_CFLAGS = @EXPAT_CFLAGS@ +EXPAT_LIBS = @EXPAT_LIBS@ +FGREP = @FGREP@ +FREEDRENO_CFLAGS = @FREEDRENO_CFLAGS@ +FREEDRENO_LIBS = @FREEDRENO_LIBS@ +GALLIUM_PIPE_LOADER_DEFINES = @GALLIUM_PIPE_LOADER_DEFINES@ +GBM_PC_LIB_PRIV = @GBM_PC_LIB_PRIV@ +GBM_PC_REQ_PRIV = @GBM_PC_REQ_PRIV@ +GC_SECTIONS = @GC_SECTIONS@ +GLESv1_CM_LIB_DEPS = @GLESv1_CM_LIB_DEPS@ +GLESv1_CM_PC_LIB_PRIV = @GLESv1_CM_PC_LIB_PRIV@ +GLESv2_LIB_DEPS = @GLESv2_LIB_DEPS@ +GLESv2_PC_LIB_PRIV = @GLESv2_PC_LIB_PRIV@ +GLPROTO_CFLAGS = @GLPROTO_CFLAGS@ +GLPROTO_LIBS = @GLPROTO_LIBS@ +GLX_TLS = @GLX_TLS@ +GL_LIB = @GL_LIB@ +GL_LIB_DEPS = @GL_LIB_DEPS@ +GL_PC_CFLAGS = @GL_PC_CFLAGS@ +GL_PC_LIB_PRIV = @GL_PC_LIB_PRIV@ +GL_PC_REQ_PRIV = @GL_PC_REQ_PRIV@ +GREP = @GREP@ +HAVE_XF86VIDMODE = @HAVE_XF86VIDMODE@ +INDENT = @INDENT@ +INDENT_FLAGS = @INDENT_FLAGS@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +INTEL_CFLAGS = @INTEL_CFLAGS@ +INTEL_LIBS = @INTEL_LIBS@ +LD = @LD@ +LDFLAGS = @LDFLAGS@ +LD_NO_UNDEFINED = @LD_NO_UNDEFINED@ +LEX = @LEX@ +LEXLIB = @LEXLIB@ +LEX_OUTPUT_ROOT = @LEX_OUTPUT_ROOT@ +LIBCLC_INCLUDEDIR = @LIBCLC_INCLUDEDIR@ +LIBCLC_LIBEXECDIR = @LIBCLC_LIBEXECDIR@ +LIBDRM_CFLAGS = @LIBDRM_CFLAGS@ +LIBDRM_LIBS = @LIBDRM_LIBS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LIBTOOL = @LIBTOOL@ +LIBUDEV_CFLAGS = @LIBUDEV_CFLAGS@ +LIBUDEV_LIBS = @LIBUDEV_LIBS@ +LIB_DIR = @LIB_DIR@ +LIB_EXT 
= @LIB_EXT@ +LIPO = @LIPO@ +LLVM_BINDIR = @LLVM_BINDIR@ +LLVM_CFLAGS = @LLVM_CFLAGS@ +LLVM_CONFIG = @LLVM_CONFIG@ +LLVM_CPPFLAGS = @LLVM_CPPFLAGS@ +LLVM_CXXFLAGS = @LLVM_CXXFLAGS@ +LLVM_INCLUDEDIR = @LLVM_INCLUDEDIR@ +LLVM_LDFLAGS = @LLVM_LDFLAGS@ +LLVM_LIBDIR = @LLVM_LIBDIR@ +LLVM_LIBS = @LLVM_LIBS@ +LLVM_VERSION = @LLVM_VERSION@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@ +MAKEINFO = @MAKEINFO@ +MANIFEST_TOOL = @MANIFEST_TOOL@ +MESA_LLVM = @MESA_LLVM@ +MKDIR_P = @MKDIR_P@ +MSVC2008_COMPAT_CFLAGS = @MSVC2008_COMPAT_CFLAGS@ +MSVC2008_COMPAT_CXXFLAGS = @MSVC2008_COMPAT_CXXFLAGS@ +MSVC2013_COMPAT_CFLAGS = @MSVC2013_COMPAT_CFLAGS@ +MSVC2013_COMPAT_CXXFLAGS = @MSVC2013_COMPAT_CXXFLAGS@ +NINE_MAJOR = @NINE_MAJOR@ +NINE_MINOR = @NINE_MINOR@ +NINE_TINY = @NINE_TINY@ +NINE_VERSION = @NINE_VERSION@ +NM = @NM@ +NMEDIT = @NMEDIT@ +NOUVEAU_CFLAGS = @NOUVEAU_CFLAGS@ +NOUVEAU_LIBS = @NOUVEAU_LIBS@ +NVVIEUX_CFLAGS = @NVVIEUX_CFLAGS@ +NVVIEUX_LIBS = @NVVIEUX_LIBS@ +OBJDUMP = @OBJDUMP@ +OBJEXT = @OBJEXT@ +OMX_CFLAGS = @OMX_CFLAGS@ +OMX_LIBS = @OMX_LIBS@ +OMX_LIB_INSTALL_DIR = @OMX_LIB_INSTALL_DIR@ +OPENCL_LIBNAME = @OPENCL_LIBNAME@ +OPENCL_VERSION = @OPENCL_VERSION@ +OSMESA_LIB = @OSMESA_LIB@ +OSMESA_LIB_DEPS = @OSMESA_LIB_DEPS@ +OSMESA_PC_LIB_PRIV = @OSMESA_PC_LIB_PRIV@ +OSMESA_PC_REQ = @OSMESA_PC_REQ@ +OSMESA_VERSION = @OSMESA_VERSION@ +OTOOL = @OTOOL@ +OTOOL64 = @OTOOL64@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_URL = @PACKAGE_URL@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PKG_CONFIG = @PKG_CONFIG@ +PKG_CONFIG_LIBDIR = @PKG_CONFIG_LIBDIR@ +PKG_CONFIG_PATH = @PKG_CONFIG_PATH@ +POSIX_SHELL = @POSIX_SHELL@ +PRESENTPROTO_CFLAGS = @PRESENTPROTO_CFLAGS@ +PRESENTPROTO_LIBS = @PRESENTPROTO_LIBS@ +PTHREAD_CC = @PTHREAD_CC@ +PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ +PTHREAD_LIBS = 
@PTHREAD_LIBS@ +PYTHON2 = @PYTHON2@ +RADEON_CFLAGS = @RADEON_CFLAGS@ +RADEON_LIBS = @RADEON_LIBS@ +RANLIB = @RANLIB@ +RM = @RM@ +SED = @SED@ +SELINUX_CFLAGS = @SELINUX_CFLAGS@ +SELINUX_LIBS = @SELINUX_LIBS@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +SSE41_CFLAGS = @SSE41_CFLAGS@ +STRIP = @STRIP@ +VA_CFLAGS = @VA_CFLAGS@ +VA_LIBS = @VA_LIBS@ +VA_LIB_INSTALL_DIR = @VA_LIB_INSTALL_DIR@ +VA_MAJOR = @VA_MAJOR@ +VA_MINOR = @VA_MINOR@ +VDPAU_CFLAGS = @VDPAU_CFLAGS@ +VDPAU_LIBS = @VDPAU_LIBS@ +VDPAU_LIB_INSTALL_DIR = @VDPAU_LIB_INSTALL_DIR@ +VDPAU_MAJOR = @VDPAU_MAJOR@ +VDPAU_MINOR = @VDPAU_MINOR@ +VERSION = @VERSION@ +VG_LIB_DEPS = @VG_LIB_DEPS@ +VISIBILITY_CFLAGS = @VISIBILITY_CFLAGS@ +VISIBILITY_CXXFLAGS = @VISIBILITY_CXXFLAGS@ +VL_CFLAGS = @VL_CFLAGS@ +VL_LIBS = @VL_LIBS@ +WAYLAND_CFLAGS = @WAYLAND_CFLAGS@ +WAYLAND_LIBS = @WAYLAND_LIBS@ +WAYLAND_SCANNER = @WAYLAND_SCANNER@ +WAYLAND_SCANNER_CFLAGS = @WAYLAND_SCANNER_CFLAGS@ +WAYLAND_SCANNER_LIBS = @WAYLAND_SCANNER_LIBS@ +X11_INCLUDES = @X11_INCLUDES@ +XA_MAJOR = @XA_MAJOR@ +XA_MINOR = @XA_MINOR@ +XA_TINY = @XA_TINY@ +XA_VERSION = @XA_VERSION@ +XCB_DRI2_CFLAGS = @XCB_DRI2_CFLAGS@ +XCB_DRI2_LIBS = @XCB_DRI2_LIBS@ +XF86VIDMODE_CFLAGS = @XF86VIDMODE_CFLAGS@ +XF86VIDMODE_LIBS = @XF86VIDMODE_LIBS@ +XLIBGL_CFLAGS = @XLIBGL_CFLAGS@ +XLIBGL_LIBS = @XLIBGL_LIBS@ +XVMC_CFLAGS = @XVMC_CFLAGS@ +XVMC_LIBS = @XVMC_LIBS@ +XVMC_LIB_INSTALL_DIR = @XVMC_LIB_INSTALL_DIR@ +XVMC_MAJOR = @XVMC_MAJOR@ +XVMC_MINOR = @XVMC_MINOR@ +YACC = @YACC@ +YFLAGS = @YFLAGS@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_AR = @ac_ct_AR@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +ac_ct_DUMPBIN = @ac_ct_DUMPBIN@ +acv_mako_found = @acv_mako_found@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +ax_pthread_config = @ax_pthread_config@ +bindir = @bindir@ +build = @build@ 
+build_alias = @build_alias@ +build_cpu = @build_cpu@ +build_os = @build_os@ +build_vendor = @build_vendor@ +builddir = @builddir@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +exec_prefix = @exec_prefix@ +host = @host@ +host_alias = @host_alias@ +host_cpu = @host_cpu@ +host_os = @host_os@ +host_vendor = @host_vendor@ +htmldir = @htmldir@ +ifGNUmake = @ifGNUmake@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +srcdir = @srcdir@ +sysconfdir = @sysconfdir@ +target = @target@ +target_alias = @target_alias@ +target_cpu = @target_cpu@ +target_os = @target_os@ +target_vendor = @target_vendor@ +top_build_prefix = @top_build_prefix@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +C_SOURCES := \ + lp_bld_alpha.c \ + lp_bld_alpha.h \ + lp_bld_blend_aos.c \ + lp_bld_blend.c \ + lp_bld_blend.h \ + lp_bld_blend_logicop.c \ + lp_bld_depth.c \ + lp_bld_depth.h \ + lp_bld_interp.c \ + lp_bld_interp.h \ + lp_clear.c \ + lp_clear.h \ + lp_context.c \ + lp_context.h \ + lp_debug.h \ + lp_draw_arrays.c \ + lp_fence.c \ + lp_fence.h \ + lp_flush.c \ + lp_flush.h \ + lp_jit.c \ + lp_jit.h \ + lp_limits.h \ + lp_memory.c \ + lp_memory.h \ + lp_perf.c \ + lp_perf.h \ + lp_public.h \ + lp_query.c \ + lp_query.h \ + lp_rast.c \ + lp_rast_debug.c \ + lp_rast.h \ + lp_rast_priv.h \ + lp_rast_tri.c \ + lp_rast_tri_tmp.h \ + lp_scene.c \ + lp_scene.h \ + lp_scene_queue.c \ + lp_scene_queue.h \ + lp_screen.c \ + lp_screen.h \ + lp_setup.c \ + lp_setup_context.h \ + lp_setup.h \ + lp_setup_line.c \ + lp_setup_point.c \ + lp_setup_tri.c \ + lp_setup_vbuf.c \ + lp_state_blend.c \ + 
lp_state_clip.c \ + lp_state_derived.c \ + lp_state_fs.c \ + lp_state_fs.h \ + lp_state_gs.c \ + lp_state.h \ + lp_state_rasterizer.c \ + lp_state_sampler.c \ + lp_state_setup.c \ + lp_state_setup.h \ + lp_state_so.c \ + lp_state_surface.c \ + lp_state_vertex.c \ + lp_state_vs.c \ + lp_surface.c \ + lp_surface.h \ + lp_tex_sample.c \ + lp_tex_sample.h \ + lp_texture.c \ + lp_texture.h + +GALLIUM_CFLAGS = \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/gallium/include \ + -I$(top_srcdir)/src/gallium/auxiliary \ + $(DEFINES) + + +# src/gallium/auxiliary must appear before src/gallium/drivers +# because there are stupidly two rbug_context.h files in +# different directories, and which one is included by the +# preprocessor is determined by the ordering of the -I flags. +GALLIUM_DRIVER_CFLAGS = \ + -I$(srcdir)/include \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/src/gallium/include \ + -I$(top_srcdir)/src/gallium/auxiliary \ + -I$(top_srcdir)/src/gallium/drivers \ + -I$(top_srcdir)/src/gallium/winsys \ + $(DEFINES) \ + $(VISIBILITY_CFLAGS) + +GALLIUM_DRIVER_CXXFLAGS = \ + -I$(srcdir)/include \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/src/gallium/include \ + -I$(top_srcdir)/src/gallium/auxiliary \ + -I$(top_srcdir)/src/gallium/drivers \ + -I$(top_srcdir)/src/gallium/winsys \ + $(DEFINES) \ + $(VISIBILITY_CXXFLAGS) + +GALLIUM_TARGET_CFLAGS = \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/src/loader \ + -I$(top_srcdir)/src/gallium/include \ + -I$(top_srcdir)/src/gallium/auxiliary \ + -I$(top_srcdir)/src/gallium/drivers \ + -I$(top_srcdir)/src/gallium/winsys \ + $(DEFINES) \ + $(PTHREAD_CFLAGS) \ + $(LIBDRM_CFLAGS) \ + $(VISIBILITY_CFLAGS) + +GALLIUM_COMMON_LIB_DEPS = \ + -lm \ + $(CLOCK_LIB) \ + $(PTHREAD_LIBS) \ + $(DLOPEN_LIBS) + +GALLIUM_WINSYS_CFLAGS = \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/src/gallium/include \ + 
-I$(top_srcdir)/src/gallium/auxiliary \ + $(DEFINES) \ + $(VISIBILITY_CFLAGS) + +GALLIUM_PIPE_LOADER_WINSYS_LIBS = \ + $(top_builddir)/src/gallium/winsys/sw/null/libws_null.la \ + $(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \ + $(am__append_1) +AM_CFLAGS = \ + $(GALLIUM_DRIVER_CFLAGS) \ + $(LLVM_CFLAGS) \ + $(MSVC2008_COMPAT_CFLAGS) + +AM_CXXFLAGS = \ + $(GALLIUM_DRIVER_CXXFLAGS) \ + $(LLVM_CXXFLAGS) \ + $(MSVC2008_COMPAT_CXXFLAGS) + +noinst_LTLIBRARIES = libllvmpipe.la +libllvmpipe_la_SOURCES = $(C_SOURCES) +libllvmpipe_la_LDFLAGS = $(LLVM_LDFLAGS) +noinst_HEADERS = lp_test.h +TESTS = $(check_PROGRAMS) +TEST_LIBS = \ + libllvmpipe.la \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ + $(top_builddir)/src/util/libmesautil.la \ + $(LLVM_LIBS) \ + $(DLOPEN_LIBS) \ + $(PTHREAD_LIBS) + +lp_test_format_SOURCES = lp_test_format.c lp_test_main.c +lp_test_format_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_format_SOURCES = dummy.cpp +lp_test_arit_SOURCES = lp_test_arit.c lp_test_main.c +lp_test_arit_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_arit_SOURCES = dummy.cpp +lp_test_blend_SOURCES = lp_test_blend.c lp_test_main.c +lp_test_blend_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_blend_SOURCES = dummy.cpp +lp_test_conv_SOURCES = lp_test_conv.c lp_test_main.c +lp_test_conv_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_conv_SOURCES = dummy.cpp +lp_test_printf_SOURCES = lp_test_printf.c lp_test_main.c +lp_test_printf_LDADD = $(TEST_LIBS) +nodist_EXTRA_lp_test_printf_SOURCES = dummy.cpp +EXTRA_DIST = SConscript +all: all-am + +.SUFFIXES: +.SUFFIXES: .c .cpp .lo .log .o .obj .test .test$(EXEEXT) .trs +$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(srcdir)/Makefile.sources $(top_srcdir)/src/gallium/Automake.inc $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \ + && { if test -f $@; then exit 0; else break; fi; }; \ + exit 1;; \ + esac; \ + done; \ + echo ' 
cd $(top_srcdir) && $(AUTOMAKE) --foreign src/gallium/drivers/llvmpipe/Makefile'; \ + $(am__cd) $(top_srcdir) && \ + $(AUTOMAKE) --foreign src/gallium/drivers/llvmpipe/Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; +$(srcdir)/Makefile.sources $(top_srcdir)/src/gallium/Automake.inc $(am__empty): + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(am__aclocal_m4_deps): + +clean-noinstLTLIBRARIES: + -test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES) + @list='$(noinst_LTLIBRARIES)'; \ + locs=`for p in $$list; do echo $$p; done | \ + sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \ + sort -u`; \ + test -z "$$locs" || { \ + echo rm -f $${locs}; \ + rm -f $${locs}; \ + } + +libllvmpipe.la: $(libllvmpipe_la_OBJECTS) $(libllvmpipe_la_DEPENDENCIES) $(EXTRA_libllvmpipe_la_DEPENDENCIES) + $(AM_V_CCLD)$(libllvmpipe_la_LINK) $(libllvmpipe_la_OBJECTS) $(libllvmpipe_la_LIBADD) $(LIBS) + +clean-checkPROGRAMS: + @list='$(check_PROGRAMS)'; test -n "$$list" || exit 0; \ + echo " rm -f" $$list; \ + rm -f $$list || exit $$?; \ + test -n "$(EXEEXT)" || exit 0; \ + list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \ + echo " rm -f" $$list; \ + rm -f $$list + +lp_test_arit$(EXEEXT): $(lp_test_arit_OBJECTS) $(lp_test_arit_DEPENDENCIES) $(EXTRA_lp_test_arit_DEPENDENCIES) + @rm -f lp_test_arit$(EXEEXT) + $(AM_V_CXXLD)$(CXXLINK) 
$(lp_test_arit_OBJECTS) $(lp_test_arit_LDADD) $(LIBS) + +lp_test_blend$(EXEEXT): $(lp_test_blend_OBJECTS) $(lp_test_blend_DEPENDENCIES) $(EXTRA_lp_test_blend_DEPENDENCIES) + @rm -f lp_test_blend$(EXEEXT) + $(AM_V_CXXLD)$(CXXLINK) $(lp_test_blend_OBJECTS) $(lp_test_blend_LDADD) $(LIBS) + +lp_test_conv$(EXEEXT): $(lp_test_conv_OBJECTS) $(lp_test_conv_DEPENDENCIES) $(EXTRA_lp_test_conv_DEPENDENCIES) + @rm -f lp_test_conv$(EXEEXT) + $(AM_V_CXXLD)$(CXXLINK) $(lp_test_conv_OBJECTS) $(lp_test_conv_LDADD) $(LIBS) + +lp_test_format$(EXEEXT): $(lp_test_format_OBJECTS) $(lp_test_format_DEPENDENCIES) $(EXTRA_lp_test_format_DEPENDENCIES) + @rm -f lp_test_format$(EXEEXT) + $(AM_V_CXXLD)$(CXXLINK) $(lp_test_format_OBJECTS) $(lp_test_format_LDADD) $(LIBS) + +lp_test_printf$(EXEEXT): $(lp_test_printf_OBJECTS) $(lp_test_printf_DEPENDENCIES) $(EXTRA_lp_test_printf_DEPENDENCIES) + @rm -f lp_test_printf$(EXEEXT) + $(AM_V_CXXLD)$(CXXLINK) $(lp_test_printf_OBJECTS) $(lp_test_printf_LDADD) $(LIBS) + +mostlyclean-compile: + -rm -f *.$(OBJEXT) + +distclean-compile: + -rm -f *.tab.c + +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dummy.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_bld_alpha.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_bld_blend.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_bld_blend_aos.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_bld_blend_logicop.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_bld_depth.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_bld_interp.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_clear.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_context.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_draw_arrays.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_fence.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/lp_flush.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_jit.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_memory.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_perf.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_query.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_rast.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_rast_debug.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_rast_tri.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_scene.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_scene_queue.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_screen.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_setup.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_setup_line.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_setup_point.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_setup_tri.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_setup_vbuf.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_blend.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_clip.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_derived.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_fs.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_gs.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_rasterizer.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_sampler.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_setup.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_so.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_surface.Plo@am__quote@ 
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_vertex.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_state_vs.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_surface.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_test_arit.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_test_blend.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_test_conv.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_test_format.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_test_main.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_test_printf.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_tex_sample.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lp_texture.Plo@am__quote@ + +.c.o: +@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ +@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $< + +.c.obj: +@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ +@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ +@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.c.lo: +@am__fastdepCC_TRUE@ $(AM_V_CC)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ 
+@am__fastdepCC_TRUE@ $(LTCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ +@am__fastdepCC_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $< + +.cpp.o: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ +@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ +@am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ $< + +.cpp.obj: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.obj$$||'`;\ +@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ `$(CYGPATH_W) '$<'` &&\ +@am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` + +.cpp.lo: +@am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.lo$$||'`;\ +@am__fastdepCXX_TRUE@ $(LTCXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ +@am__fastdepCXX_TRUE@ $(am__mv) $$depbase.Tpo $$depbase.Plo +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(LTCXXCOMPILE) -c -o 
$@ $< + +mostlyclean-libtool: + -rm -f *.lo + +clean-libtool: + -rm -rf .libs _libs + +ID: $(am__tagged_files) + $(am__define_uniq_tagged_files); mkid -fID $$unique +tags: tags-am +TAGS: tags + +tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + set x; \ + here=`pwd`; \ + $(am__define_uniq_tagged_files); \ + shift; \ + if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \ + test -n "$$unique" || unique=$$empty_fix; \ + if test $$# -gt 0; then \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + "$$@" $$unique; \ + else \ + $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ + $$unique; \ + fi; \ + fi +ctags: ctags-am + +CTAGS: ctags +ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files) + $(am__define_uniq_tagged_files); \ + test -z "$(CTAGS_ARGS)$$unique" \ + || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ + $$unique + +GTAGS: + here=`$(am__cd) $(top_builddir) && pwd` \ + && $(am__cd) $(top_srcdir) \ + && gtags -i $(GTAGS_ARGS) "$$here" +cscopelist: cscopelist-am + +cscopelist-am: $(am__tagged_files) + list='$(am__tagged_files)'; \ + case "$(srcdir)" in \ + [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \ + *) sdir=$(subdir)/$(srcdir) ;; \ + esac; \ + for i in $$list; do \ + if test -f "$$i"; then \ + echo "$(subdir)/$$i"; \ + else \ + echo "$$sdir/$$i"; \ + fi; \ + done >> $(top_builddir)/cscope.files + +distclean-tags: + -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags + +# Recover from deleted '.trs' file; this should ensure that +# "rm -f foo.log; make foo.trs" re-run 'foo.test', and re-create +# both 'foo.log' and 'foo.trs'. Break the recipe in two subshells +# to avoid problems with "make -n". +.log.trs: + rm -f $< $@ + $(MAKE) $(AM_MAKEFLAGS) $< + +# Leading 'am--fnord' is there to ensure the list of targets does not +# expand to empty, as could happen e.g. with make check TESTS=''. 
+am--fnord $(TEST_LOGS) $(TEST_LOGS:.log=.trs): $(am__force_recheck) +am--force-recheck: + @: + +$(TEST_SUITE_LOG): $(TEST_LOGS) + @$(am__set_TESTS_bases); \ + am__f_ok () { test -f "$$1" && test -r "$$1"; }; \ + redo_bases=`for i in $$bases; do \ + am__f_ok $$i.trs && am__f_ok $$i.log || echo $$i; \ + done`; \ + if test -n "$$redo_bases"; then \ + redo_logs=`for i in $$redo_bases; do echo $$i.log; done`; \ + redo_results=`for i in $$redo_bases; do echo $$i.trs; done`; \ + if $(am__make_dryrun); then :; else \ + rm -f $$redo_logs && rm -f $$redo_results || exit 1; \ + fi; \ + fi; \ + if test -n "$$am__remaking_logs"; then \ + echo "fatal: making $(TEST_SUITE_LOG): possible infinite" \ + "recursion detected" >&2; \ + elif test -n "$$redo_logs"; then \ + am__remaking_logs=yes $(MAKE) $(AM_MAKEFLAGS) $$redo_logs; \ + fi; \ + if $(am__make_dryrun); then :; else \ + st=0; \ + errmsg="fatal: making $(TEST_SUITE_LOG): failed to create"; \ + for i in $$redo_bases; do \ + test -f $$i.trs && test -r $$i.trs \ + || { echo "$$errmsg $$i.trs" >&2; st=1; }; \ + test -f $$i.log && test -r $$i.log \ + || { echo "$$errmsg $$i.log" >&2; st=1; }; \ + done; \ + test $$st -eq 0 || exit 1; \ + fi + @$(am__sh_e_setup); $(am__tty_colors); $(am__set_TESTS_bases); \ + ws='[ ]'; \ + results=`for b in $$bases; do echo $$b.trs; done`; \ + test -n "$$results" || results=/dev/null; \ + all=` grep "^$$ws*:test-result:" $$results | wc -l`; \ + pass=` grep "^$$ws*:test-result:$$ws*PASS" $$results | wc -l`; \ + fail=` grep "^$$ws*:test-result:$$ws*FAIL" $$results | wc -l`; \ + skip=` grep "^$$ws*:test-result:$$ws*SKIP" $$results | wc -l`; \ + xfail=`grep "^$$ws*:test-result:$$ws*XFAIL" $$results | wc -l`; \ + xpass=`grep "^$$ws*:test-result:$$ws*XPASS" $$results | wc -l`; \ + error=`grep "^$$ws*:test-result:$$ws*ERROR" $$results | wc -l`; \ + if test `expr $$fail + $$xpass + $$error` -eq 0; then \ + success=true; \ + else \ + success=false; \ + fi; \ + br='==================='; br=$$br$$br$$br$$br; 
\ + result_count () \ + { \ + if test x"$$1" = x"--maybe-color"; then \ + maybe_colorize=yes; \ + elif test x"$$1" = x"--no-color"; then \ + maybe_colorize=no; \ + else \ + echo "$@: invalid 'result_count' usage" >&2; exit 4; \ + fi; \ + shift; \ + desc=$$1 count=$$2; \ + if test $$maybe_colorize = yes && test $$count -gt 0; then \ + color_start=$$3 color_end=$$std; \ + else \ + color_start= color_end=; \ + fi; \ + echo "$${color_start}# $$desc $$count$${color_end}"; \ + }; \ + create_testsuite_report () \ + { \ + result_count $$1 "TOTAL:" $$all "$$brg"; \ + result_count $$1 "PASS: " $$pass "$$grn"; \ + result_count $$1 "SKIP: " $$skip "$$blu"; \ + result_count $$1 "XFAIL:" $$xfail "$$lgn"; \ + result_count $$1 "FAIL: " $$fail "$$red"; \ + result_count $$1 "XPASS:" $$xpass "$$red"; \ + result_count $$1 "ERROR:" $$error "$$mgn"; \ + }; \ + { \ + echo "$(PACKAGE_STRING): $(subdir)/$(TEST_SUITE_LOG)" | \ + $(am__rst_title); \ + create_testsuite_report --no-color; \ + echo; \ + echo ".. contents:: :depth: 2"; \ + echo; \ + for b in $$bases; do echo $$b; done \ + | $(am__create_global_log); \ + } >$(TEST_SUITE_LOG).tmp || exit 1; \ + mv $(TEST_SUITE_LOG).tmp $(TEST_SUITE_LOG); \ + if $$success; then \ + col="$$grn"; \ + else \ + col="$$red"; \ + test x"$$VERBOSE" = x || cat $(TEST_SUITE_LOG); \ + fi; \ + echo "$${col}$$br$${std}"; \ + echo "$${col}Testsuite summary for $(PACKAGE_STRING)$${std}"; \ + echo "$${col}$$br$${std}"; \ + create_testsuite_report --maybe-color; \ + echo "$$col$$br$$std"; \ + if $$success; then :; else \ + echo "$${col}See $(subdir)/$(TEST_SUITE_LOG)$${std}"; \ + if test -n "$(PACKAGE_BUGREPORT)"; then \ + echo "$${col}Please report to $(PACKAGE_BUGREPORT)$${std}"; \ + fi; \ + echo "$$col$$br$$std"; \ + fi; \ + $$success || exit 1 + +check-TESTS: + @list='$(RECHECK_LOGS)'; test -z "$$list" || rm -f $$list + @list='$(RECHECK_LOGS:.log=.trs)'; test -z "$$list" || rm -f $$list + @test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) + @set +e; 
$(am__set_TESTS_bases); \ + log_list=`for i in $$bases; do echo $$i.log; done`; \ + trs_list=`for i in $$bases; do echo $$i.trs; done`; \ + log_list=`echo $$log_list`; trs_list=`echo $$trs_list`; \ + $(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) TEST_LOGS="$$log_list"; \ + exit $$?; +recheck: all $(check_PROGRAMS) + @test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) + @set +e; $(am__set_TESTS_bases); \ + bases=`for i in $$bases; do echo $$i; done \ + | $(am__list_recheck_tests)` || exit 1; \ + log_list=`for i in $$bases; do echo $$i.log; done`; \ + log_list=`echo $$log_list`; \ + $(MAKE) $(AM_MAKEFLAGS) $(TEST_SUITE_LOG) \ + am__force_recheck=am--force-recheck \ + TEST_LOGS="$$log_list"; \ + exit $$? +lp_test_format.log: lp_test_format$(EXEEXT) + @p='lp_test_format$(EXEEXT)'; \ + b='lp_test_format'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +lp_test_arit.log: lp_test_arit$(EXEEXT) + @p='lp_test_arit$(EXEEXT)'; \ + b='lp_test_arit'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +lp_test_blend.log: lp_test_blend$(EXEEXT) + @p='lp_test_blend$(EXEEXT)'; \ + b='lp_test_blend'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +lp_test_conv.log: lp_test_conv$(EXEEXT) + @p='lp_test_conv$(EXEEXT)'; \ + b='lp_test_conv'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) 
+lp_test_printf.log: lp_test_printf$(EXEEXT) + @p='lp_test_printf$(EXEEXT)'; \ + b='lp_test_printf'; \ + $(am__check_pre) $(LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_LOG_DRIVER_FLAGS) $(LOG_DRIVER_FLAGS) -- $(LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +.test.log: + @p='$<'; \ + $(am__set_b); \ + $(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \ + --log-file $$b.log --trs-file $$b.trs \ + $(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \ + "$$tst" $(AM_TESTS_FD_REDIRECT) +@am__EXEEXT_TRUE@.test$(EXEEXT).log: +@am__EXEEXT_TRUE@ @p='$<'; \ +@am__EXEEXT_TRUE@ $(am__set_b); \ +@am__EXEEXT_TRUE@ $(am__check_pre) $(TEST_LOG_DRIVER) --test-name "$$f" \ +@am__EXEEXT_TRUE@ --log-file $$b.log --trs-file $$b.trs \ +@am__EXEEXT_TRUE@ $(am__common_driver_flags) $(AM_TEST_LOG_DRIVER_FLAGS) $(TEST_LOG_DRIVER_FLAGS) -- $(TEST_LOG_COMPILE) \ +@am__EXEEXT_TRUE@ "$$tst" $(AM_TESTS_FD_REDIRECT) + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d "$(distdir)/$$file"; then \ + find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \ + find "$(distdir)/$$file" -type d ! 
-perm -700 -exec chmod u+rwx {} \;; \ + fi; \ + cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \ + else \ + test -f "$(distdir)/$$file" \ + || cp -p $$d/$$file "$(distdir)/$$file" \ + || exit 1; \ + fi; \ + done +check-am: all-am + $(MAKE) $(AM_MAKEFLAGS) $(check_PROGRAMS) + $(MAKE) $(AM_MAKEFLAGS) check-TESTS +check: check-am +all-am: Makefile $(LTLIBRARIES) $(HEADERS) +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + if test -z '$(STRIP)'; then \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + install; \ + else \ + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \ + fi +mostlyclean-generic: + -test -z "$(TEST_LOGS)" || rm -f $(TEST_LOGS) + -test -z "$(TEST_LOGS:.log=.trs)" || rm -f $(TEST_LOGS:.log=.trs) + -test -z "$(TEST_SUITE_LOG)" || rm -f $(TEST_SUITE_LOG) + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + -test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+clean: clean-am + +clean-am: clean-checkPROGRAMS clean-generic clean-libtool \ + clean-noinstLTLIBRARIES mostlyclean-am + +distclean: distclean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +distclean-am: clean-am distclean-compile distclean-generic \ + distclean-tags + +dvi: dvi-am + +dvi-am: + +html: html-am + +html-am: + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-dvi-am: + +install-exec-am: + +install-html: install-html-am + +install-html-am: + +install-info: install-info-am + +install-info-am: + +install-man: + +install-pdf: install-pdf-am + +install-pdf-am: + +install-ps: install-ps-am + +install-ps-am: + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -rf ./$(DEPDIR) + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-compile mostlyclean-generic \ + mostlyclean-libtool + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: check-am install-am install-strip + +.PHONY: CTAGS GTAGS TAGS all all-am check check-TESTS check-am clean \ + clean-checkPROGRAMS clean-generic clean-libtool \ + clean-noinstLTLIBRARIES cscopelist-am ctags ctags-am distclean \ + distclean-compile distclean-generic distclean-libtool \ + distclean-tags distdir dvi dvi-am html html-am info info-am \ + install install-am install-data install-data-am install-dvi \ + install-dvi-am install-exec install-exec-am install-html \ + install-html-am install-info install-info-am install-man \ + install-pdf install-pdf-am install-ps install-ps-am \ + install-strip installcheck installcheck-am installdirs \ + maintainer-clean maintainer-clean-generic mostlyclean \ + mostlyclean-compile mostlyclean-generic mostlyclean-libtool \ + pdf pdf-am ps ps-am recheck tags tags-am uninstall \ + uninstall-am + +.PRECIOUS: Makefile + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. 
+# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.sources b/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.sources new file mode 100644 index 000000000..d928ccba4 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/Makefile.sources @@ -0,0 +1,71 @@ +C_SOURCES := \ + lp_bld_alpha.c \ + lp_bld_alpha.h \ + lp_bld_blend_aos.c \ + lp_bld_blend.c \ + lp_bld_blend.h \ + lp_bld_blend_logicop.c \ + lp_bld_depth.c \ + lp_bld_depth.h \ + lp_bld_interp.c \ + lp_bld_interp.h \ + lp_clear.c \ + lp_clear.h \ + lp_context.c \ + lp_context.h \ + lp_debug.h \ + lp_draw_arrays.c \ + lp_fence.c \ + lp_fence.h \ + lp_flush.c \ + lp_flush.h \ + lp_jit.c \ + lp_jit.h \ + lp_limits.h \ + lp_memory.c \ + lp_memory.h \ + lp_perf.c \ + lp_perf.h \ + lp_public.h \ + lp_query.c \ + lp_query.h \ + lp_rast.c \ + lp_rast_debug.c \ + lp_rast.h \ + lp_rast_priv.h \ + lp_rast_tri.c \ + lp_rast_tri_tmp.h \ + lp_scene.c \ + lp_scene.h \ + lp_scene_queue.c \ + lp_scene_queue.h \ + lp_screen.c \ + lp_screen.h \ + lp_setup.c \ + lp_setup_context.h \ + lp_setup.h \ + lp_setup_line.c \ + lp_setup_point.c \ + lp_setup_tri.c \ + lp_setup_vbuf.c \ + lp_state_blend.c \ + lp_state_clip.c \ + lp_state_derived.c \ + lp_state_fs.c \ + lp_state_fs.h \ + lp_state_gs.c \ + lp_state.h \ + lp_state_rasterizer.c \ + lp_state_sampler.c \ + lp_state_setup.c \ + lp_state_setup.h \ + lp_state_so.c \ + lp_state_surface.c \ + lp_state_vertex.c \ + lp_state_vs.c \ + lp_surface.c \ + lp_surface.h \ + lp_tex_sample.c \ + lp_tex_sample.h \ + lp_texture.c \ + lp_texture.h diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/SConscript b/lib/mesa/src/gallium/drivers/llvmpipe/SConscript new file mode 100644 index 000000000..3a51efcd5 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/SConscript @@ -0,0 +1,49 @@ +from sys import executable as python_cmd +import distutils.version + +Import('*') + +if not env['llvm']: + print 'warning: 
LLVM disabled: not building llvmpipe' + Return() + +env = env.Clone() + +env.MSVC2008Compat() + +llvmpipe = env.ConvenienceLibrary( + target = 'llvmpipe', + source = env.ParseSourceList('Makefile.sources', 'C_SOURCES') + ) + +env.Alias('llvmpipe', llvmpipe) + + +if not env['embedded']: + env = env.Clone() + + env.Prepend(LIBS = [llvmpipe, gallium, mesautil]) + + tests = [ + 'format', + 'blend', + 'conv', + 'printf', + ] + + if not env['msvc']: + tests.append('arit') + + for test in tests: + testname = 'lp_test_' + test + target = env.Program( + target = testname, + source = [testname + '.c', 'lp_test_main.c'], + ) + env.InstallProgram(target) + + # http://www.scons.org/wiki/UnitTests + alias = env.Alias(testname, [target], target[0].abspath) + AlwaysBuild(alias) + +Export('llvmpipe') diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_alpha.c new file mode 100644 index 000000000..6e2d0376d --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_alpha.c @@ -0,0 +1,95 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Alpha testing to LLVM IR translation. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + +#include "pipe/p_state.h" +#include "util/u_format.h" + +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_conv.h" +#include "gallivm/lp_bld_logic.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_debug.h" + +#include "lp_bld_alpha.h" + + +void +lp_build_alpha_test(struct gallivm_state *gallivm, + unsigned func, + struct lp_type type, + const struct util_format_description *cbuf_format_desc, + struct lp_build_mask_context *mask, + LLVMValueRef alpha, + LLVMValueRef ref, + boolean do_branch) +{ + struct lp_build_context bld; + LLVMValueRef test; + + lp_build_context_init(&bld, gallivm, type); + + /* + * Alpha testing needs to be done in the color buffer precision. + * + * TODO: Ideally, instead of duplicating the color conversion code, we would do + * alpha testing after converting the output colors, but that's not very + * convenient, because it needs to be done before depth testing. Hopefully + * LLVM will detect and remove the duplicate expression. + * + * FIXME: This should be generalized to formats other than rgba8 variants. 
+ */ + if (type.floating && + util_format_is_rgba8_variant(cbuf_format_desc)) { + const unsigned dst_width = 8; + + alpha = lp_build_clamp(&bld, alpha, bld.zero, bld.one); + ref = lp_build_clamp(&bld, ref, bld.zero, bld.one); + + alpha = lp_build_clamped_float_to_unsigned_norm(gallivm, type, dst_width, alpha); + ref = lp_build_clamped_float_to_unsigned_norm(gallivm, type, dst_width, ref); + + type.floating = 0; + lp_build_context_init(&bld, gallivm, type); + } + + test = lp_build_cmp(&bld, func, alpha, ref); + + lp_build_name(test, "alpha_mask"); + + lp_build_mask_update(mask, test); + + if (do_branch) + lp_build_mask_check(mask); +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_alpha.h new file mode 100644 index 000000000..15f1284c5 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_alpha.h @@ -0,0 +1,59 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Alpha testing to LLVM IR translation. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + +#ifndef LP_BLD_ALPHA_H +#define LP_BLD_ALPHA_H + +#include "pipe/p_compiler.h" + +#include "gallivm/lp_bld.h" + +struct pipe_alpha_state; +struct util_format_description; +struct gallivm_state; +struct lp_type; +struct lp_build_mask_context; + + +void +lp_build_alpha_test(struct gallivm_state *gallivm, + unsigned func, + struct lp_type type, + const struct util_format_description *cbuf_format_desc, + struct lp_build_mask_context *mask, + LLVMValueRef alpha, + LLVMValueRef ref, + boolean do_branch); + + +#endif /* !LP_BLD_ALPHA_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend.c new file mode 100644 index 000000000..1feb415c9 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend.c @@ -0,0 +1,223 @@ +/************************************************************************** + * + * Copyright 2012 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_state.h" +#include "util/u_debug.h" + +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_logic.h" +#include "gallivm/lp_bld_swizzle.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_debug.h" + +#include "lp_bld_blend.h" + +/** + * Is (a OP b) == (b OP a)? + */ +boolean +lp_build_blend_func_commutative(unsigned func) +{ + switch (func) { + case PIPE_BLEND_ADD: + case PIPE_BLEND_MIN: + case PIPE_BLEND_MAX: + return TRUE; + case PIPE_BLEND_SUBTRACT: + case PIPE_BLEND_REVERSE_SUBTRACT: + return FALSE; + default: + assert(0); + return TRUE; + } +} + + +/** + * Whether the blending functions are the reverse of each other. 
+ */ +boolean +lp_build_blend_func_reverse(unsigned rgb_func, unsigned alpha_func) +{ + if(rgb_func == alpha_func) + return FALSE; + if(rgb_func == PIPE_BLEND_SUBTRACT && alpha_func == PIPE_BLEND_REVERSE_SUBTRACT) + return TRUE; + if(rgb_func == PIPE_BLEND_REVERSE_SUBTRACT && alpha_func == PIPE_BLEND_SUBTRACT) + return TRUE; + return FALSE; +} + + +/** + * Whether the blending factors are complementary of each other. + */ +static inline boolean +lp_build_blend_factor_complementary(unsigned src_factor, unsigned dst_factor) +{ + return dst_factor == (src_factor ^ 0x10); +} + + +/** + * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendEquationSeparate.xml + */ +LLVMValueRef +lp_build_blend_func(struct lp_build_context *bld, + unsigned func, + LLVMValueRef term1, + LLVMValueRef term2) +{ + switch (func) { + case PIPE_BLEND_ADD: + return lp_build_add(bld, term1, term2); + case PIPE_BLEND_SUBTRACT: + return lp_build_sub(bld, term1, term2); + case PIPE_BLEND_REVERSE_SUBTRACT: + return lp_build_sub(bld, term2, term1); + case PIPE_BLEND_MIN: + return lp_build_min(bld, term1, term2); + case PIPE_BLEND_MAX: + return lp_build_max(bld, term1, term2); + default: + assert(0); + return bld->zero; + } +} + + +/** + * Performs optimisations and blending independent of SoA/AoS + * + * @param func the blend function + * @param factor_src PIPE_BLENDFACTOR_xxx + * @param factor_dst PIPE_BLENDFACTOR_xxx + * @param src source rgba + * @param dst dest rgba + * @param src_factor src factor computed value + * @param dst_factor dst factor computed value + * @param not_alpha_dependent same factors accross all channels of src/dst + * + * not_alpha_dependent should be: + * SoA: always true as it is only one channel at a time + * AoS: rgb_src_factor == alpha_src_factor && rgb_dst_factor == alpha_dst_factor + * + * Note that pretty much every possible optimisation can only be done on non-unorm targets + * due to unorm values not going above 1.0 meaning factorisation can change results. + * e.g. 
(0.9 * 0.9) + (0.9 * 0.9) != 0.9 * (0.9 + 0.9) as result of + is always <= 1. + */ +LLVMValueRef +lp_build_blend(struct lp_build_context *bld, + unsigned func, + unsigned factor_src, + unsigned factor_dst, + LLVMValueRef src, + LLVMValueRef dst, + LLVMValueRef src_factor, + LLVMValueRef dst_factor, + boolean not_alpha_dependent, + boolean optimise_only) +{ + LLVMValueRef result, src_term, dst_term; + + /* If we are not alpha dependent we can mess with the src/dst factors */ + if (not_alpha_dependent) { + if (lp_build_blend_factor_complementary(factor_src, factor_dst)) { + if (func == PIPE_BLEND_ADD) { + if (factor_src < factor_dst) { + return lp_build_lerp(bld, src_factor, dst, src, 0); + } else { + return lp_build_lerp(bld, dst_factor, src, dst, 0); + } + } else if(bld->type.floating && func == PIPE_BLEND_SUBTRACT) { + result = lp_build_add(bld, src, dst); + + if (factor_src < factor_dst) { + result = lp_build_mul(bld, result, src_factor); + return lp_build_sub(bld, result, dst); + } else { + result = lp_build_mul(bld, result, dst_factor); + return lp_build_sub(bld, src, result); + } + } else if(bld->type.floating && func == PIPE_BLEND_REVERSE_SUBTRACT) { + result = lp_build_add(bld, src, dst); + + if (factor_src < factor_dst) { + result = lp_build_mul(bld, result, src_factor); + return lp_build_sub(bld, dst, result); + } else { + result = lp_build_mul(bld, result, dst_factor); + return lp_build_sub(bld, result, src); + } + } + } + + if (bld->type.floating && factor_src == factor_dst) { + if (func == PIPE_BLEND_ADD || + func == PIPE_BLEND_SUBTRACT || + func == PIPE_BLEND_REVERSE_SUBTRACT) { + LLVMValueRef result; + result = lp_build_blend_func(bld, func, src, dst); + return lp_build_mul(bld, result, src_factor); + } + } + } + + if (optimise_only) + return NULL; + + src_term = lp_build_mul(bld, src, src_factor); + dst_term = lp_build_mul(bld, dst, dst_factor); + return lp_build_blend_func(bld, func, src_term, dst_term); +} + +void +lp_build_alpha_to_coverage(struct 
gallivm_state *gallivm, + struct lp_type type, + struct lp_build_mask_context *mask, + LLVMValueRef alpha, + boolean do_branch) +{ + struct lp_build_context bld; + LLVMValueRef test; + LLVMValueRef alpha_ref_value; + + lp_build_context_init(&bld, gallivm, type); + + alpha_ref_value = lp_build_const_vec(gallivm, type, 0.5); + + test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value); + + lp_build_name(test, "alpha_to_coverage"); + + lp_build_mask_update(mask, test); + + if (do_branch) + lp_build_mask_check(mask); +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend.h new file mode 100644 index 000000000..adfab85dc --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend.h @@ -0,0 +1,110 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_BLD_BLEND_H +#define LP_BLD_BLEND_H + + +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_init.h" + +#include "pipe/p_format.h" + + +struct pipe_blend_state; +struct lp_type; +struct lp_build_context; +struct lp_build_mask_context; + + +LLVMValueRef +lp_build_blend(struct lp_build_context *bld, + unsigned func, + unsigned factor_src, + unsigned factor_dst, + LLVMValueRef src, + LLVMValueRef dst, + LLVMValueRef src_factor, + LLVMValueRef dst_factor, + boolean not_alpha_dependent, + boolean optimise_only); + + +LLVMValueRef +lp_build_blend_aos(struct gallivm_state *gallivm, + const struct pipe_blend_state *blend, + enum pipe_format cbuf_format, + struct lp_type type, + unsigned rt, + LLVMValueRef src, + LLVMValueRef src_alpha, + LLVMValueRef src1, + LLVMValueRef src1_alpha, + LLVMValueRef dst, + LLVMValueRef mask, + LLVMValueRef const_, + LLVMValueRef const_alpha, + const unsigned char swizzle[4], + int nr_channels); + + +/** + * Apply a logic op. + * + * src/dst parameters are packed values. It should work regardless the inputs + * are scalars, or a vector. 
+ */ +LLVMValueRef +lp_build_logicop(LLVMBuilderRef builder, + unsigned logicop_func, + LLVMValueRef src, + LLVMValueRef dst); + + +LLVMValueRef +lp_build_blend_func(struct lp_build_context *bld, + unsigned func, + LLVMValueRef term1, + LLVMValueRef term2); + + +boolean +lp_build_blend_func_reverse(unsigned rgb_func, + unsigned alpha_func); + + +boolean +lp_build_blend_func_commutative(unsigned func); + +void +lp_build_alpha_to_coverage(struct gallivm_state *gallivm, + struct lp_type type, + struct lp_build_mask_context *mask, + LLVMValueRef alpha, + boolean do_branch); + +#endif /* !LP_BLD_BLEND_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c new file mode 100644 index 000000000..564e19a15 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c @@ -0,0 +1,423 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * Blend LLVM IR generation -- AoS layout. + * + * AoS blending is in general much slower than SoA, but there are some cases + * where it might be faster. In particular, if a pixel is rendered only once + * then the overhead of tiling and untiling will dominate over the speedup that + * SoA gives. So we might want to detect such cases and fallback to AoS in the + * future, but for now this function is here for historical/benchmarking + * purposes. + * + * Run lp_blend_test after any change to this file. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + + +#include "pipe/p_state.h" +#include "util/u_debug.h" +#include "util/u_format.h" + +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_logic.h" +#include "gallivm/lp_bld_swizzle.h" +#include "gallivm/lp_bld_bitarit.h" +#include "gallivm/lp_bld_debug.h" + +#include "lp_bld_blend.h" + + +/** + * We may the same values several times, so we keep them here to avoid + * recomputing them. Also reusing the values allows us to do simplifications + * that LLVM optimization passes wouldn't normally be able to do. 
+ */ +struct lp_build_blend_aos_context +{ + struct lp_build_context base; + + LLVMValueRef src; + LLVMValueRef src_alpha; + LLVMValueRef src1; + LLVMValueRef src1_alpha; + LLVMValueRef dst; + LLVMValueRef const_; + LLVMValueRef const_alpha; + + LLVMValueRef inv_src; + LLVMValueRef inv_src_alpha; + LLVMValueRef inv_dst; + LLVMValueRef inv_const; + LLVMValueRef inv_const_alpha; + LLVMValueRef saturate; + + LLVMValueRef rgb_src_factor; + LLVMValueRef alpha_src_factor; + LLVMValueRef rgb_dst_factor; + LLVMValueRef alpha_dst_factor; +}; + + +static LLVMValueRef +lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld, + unsigned factor, + boolean alpha) +{ + LLVMValueRef src_alpha = bld->src_alpha ? bld->src_alpha : bld->src; + LLVMValueRef src1_alpha = bld->src1_alpha ? bld->src1_alpha : bld->src1; + LLVMValueRef const_alpha = bld->const_alpha ? bld->const_alpha : bld->const_; + + switch (factor) { + case PIPE_BLENDFACTOR_ZERO: + return bld->base.zero; + case PIPE_BLENDFACTOR_ONE: + return bld->base.one; + case PIPE_BLENDFACTOR_SRC_COLOR: + return bld->src; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return src_alpha; + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_DST_ALPHA: + return bld->dst; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + if(alpha) + return bld->base.one; + else { + /* + * if there's separate src_alpha there's no dst alpha hence the complement + * is zero but for unclamped float inputs min can be non-zero (negative). 
+ */ + if (bld->src_alpha) { + if (!bld->saturate) + bld->saturate = lp_build_min(&bld->base, src_alpha, bld->base.zero); + } + else { + if(!bld->inv_dst) + bld->inv_dst = lp_build_comp(&bld->base, bld->dst); + if(!bld->saturate) + bld->saturate = lp_build_min(&bld->base, src_alpha, bld->inv_dst); + } + return bld->saturate; + } + case PIPE_BLENDFACTOR_CONST_COLOR: + return bld->const_; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return const_alpha; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return bld->src1; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return src1_alpha; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + if(!bld->inv_src) + bld->inv_src = lp_build_comp(&bld->base, bld->src); + return bld->inv_src; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + if(!bld->inv_src_alpha) + bld->inv_src_alpha = lp_build_comp(&bld->base, src_alpha); + return bld->inv_src_alpha; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + if(!bld->inv_dst) + bld->inv_dst = lp_build_comp(&bld->base, bld->dst); + return bld->inv_dst; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + if(!bld->inv_const) + bld->inv_const = lp_build_comp(&bld->base, bld->const_); + return bld->inv_const; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + if(!bld->inv_const_alpha) + bld->inv_const_alpha = lp_build_comp(&bld->base, const_alpha); + return bld->inv_const_alpha; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return lp_build_comp(&bld->base, bld->src1); + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return lp_build_comp(&bld->base, src1_alpha); + default: + assert(0); + return bld->base.zero; + } +} + + +enum lp_build_blend_swizzle { + LP_BUILD_BLEND_SWIZZLE_RGBA = 0, + LP_BUILD_BLEND_SWIZZLE_AAAA = 1 +}; + + +/** + * How should we shuffle the base factor. 
+ */ +static enum lp_build_blend_swizzle +lp_build_blend_factor_swizzle(unsigned factor) +{ + switch (factor) { + case PIPE_BLENDFACTOR_ONE: + case PIPE_BLENDFACTOR_ZERO: + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + case PIPE_BLENDFACTOR_INV_DST_COLOR: + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return LP_BUILD_BLEND_SWIZZLE_RGBA; + case PIPE_BLENDFACTOR_SRC_ALPHA: + case PIPE_BLENDFACTOR_DST_ALPHA: + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_CONST_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return LP_BUILD_BLEND_SWIZZLE_AAAA; + default: + assert(0); + return LP_BUILD_BLEND_SWIZZLE_RGBA; + } +} + + +static LLVMValueRef +lp_build_blend_swizzle(struct lp_build_blend_aos_context *bld, + LLVMValueRef rgb, + LLVMValueRef alpha, + enum lp_build_blend_swizzle rgb_swizzle, + unsigned alpha_swizzle, + unsigned num_channels) +{ + LLVMValueRef swizzled_rgb; + + switch (rgb_swizzle) { + case LP_BUILD_BLEND_SWIZZLE_RGBA: + swizzled_rgb = rgb; + break; + case LP_BUILD_BLEND_SWIZZLE_AAAA: + swizzled_rgb = lp_build_swizzle_scalar_aos(&bld->base, rgb, alpha_swizzle, num_channels); + break; + default: + assert(0); + swizzled_rgb = bld->base.undef; + } + + if (rgb != alpha) { + swizzled_rgb = lp_build_select_aos(&bld->base, 1 << alpha_swizzle, + alpha, swizzled_rgb, + num_channels); + } + + return swizzled_rgb; +} + +/** + * @sa http://www.opengl.org/sdk/docs/man/xhtml/glBlendFuncSeparate.xml + */ +static LLVMValueRef +lp_build_blend_factor(struct lp_build_blend_aos_context *bld, + unsigned rgb_factor, + unsigned alpha_factor, + unsigned alpha_swizzle, + unsigned num_channels) +{ + LLVMValueRef rgb_factor_, alpha_factor_; + enum 
lp_build_blend_swizzle rgb_swizzle; + + if (alpha_swizzle == UTIL_FORMAT_SWIZZLE_X && num_channels == 1) { + return lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE); + } + + rgb_factor_ = lp_build_blend_factor_unswizzled(bld, rgb_factor, FALSE); + + if (alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) { + rgb_swizzle = lp_build_blend_factor_swizzle(rgb_factor); + alpha_factor_ = lp_build_blend_factor_unswizzled(bld, alpha_factor, TRUE); + return lp_build_blend_swizzle(bld, rgb_factor_, alpha_factor_, rgb_swizzle, alpha_swizzle, num_channels); + } else { + return rgb_factor_; + } +} + + +/** + * Performs blending of src and dst pixels + * + * @param blend the blend state of the shader variant + * @param cbuf_format format of the colour buffer + * @param type data type of the pixel vector + * @param rt render target index + * @param src blend src + * @param src_alpha blend src alpha (if not included in src) + * @param src1 second blend src (for dual source blend) + * @param src1_alpha second blend src alpha (if not included in src1) + * @param dst blend dst + * @param mask optional mask to apply to the blending result + * @param const_ const blend color + * @param const_alpha const blend color alpha (if not included in const_) + * @param swizzle swizzle values for RGBA + * + * @return the result of blending src and dst + */ +LLVMValueRef +lp_build_blend_aos(struct gallivm_state *gallivm, + const struct pipe_blend_state *blend, + enum pipe_format cbuf_format, + struct lp_type type, + unsigned rt, + LLVMValueRef src, + LLVMValueRef src_alpha, + LLVMValueRef src1, + LLVMValueRef src1_alpha, + LLVMValueRef dst, + LLVMValueRef mask, + LLVMValueRef const_, + LLVMValueRef const_alpha, + const unsigned char swizzle[4], + int nr_channels) +{ + const struct pipe_rt_blend_state * state = &blend->rt[rt]; + const struct util_format_description * desc; + struct lp_build_blend_aos_context bld; + LLVMValueRef src_factor, dst_factor; + LLVMValueRef result; + unsigned alpha_swizzle 
= UTIL_FORMAT_SWIZZLE_NONE; + unsigned i; + + desc = util_format_description(cbuf_format); + + /* Setup build context */ + memset(&bld, 0, sizeof bld); + lp_build_context_init(&bld.base, gallivm, type); + bld.src = src; + bld.src1 = src1; + bld.dst = dst; + bld.const_ = const_; + bld.src_alpha = src_alpha; + bld.src1_alpha = src1_alpha; + bld.const_alpha = const_alpha; + + /* Find the alpha channel if not provided seperately */ + if (!src_alpha) { + for (i = 0; i < 4; ++i) { + if (swizzle[i] == 3) { + alpha_swizzle = i; + } + } + } + + if (blend->logicop_enable) { + if(!type.floating) { + result = lp_build_logicop(gallivm->builder, blend->logicop_func, src, dst); + } + else { + result = src; + } + } else if (!state->blend_enable) { + result = src; + } else { + boolean rgb_alpha_same = (state->rgb_src_factor == state->rgb_dst_factor && state->alpha_src_factor == state->alpha_dst_factor) || nr_channels == 1; + + src_factor = lp_build_blend_factor(&bld, state->rgb_src_factor, + state->alpha_src_factor, + alpha_swizzle, + nr_channels); + + dst_factor = lp_build_blend_factor(&bld, state->rgb_dst_factor, + state->alpha_dst_factor, + alpha_swizzle, + nr_channels); + + result = lp_build_blend(&bld.base, + state->rgb_func, + state->rgb_src_factor, + state->rgb_dst_factor, + src, + dst, + src_factor, + dst_factor, + rgb_alpha_same, + false); + + if(state->rgb_func != state->alpha_func && nr_channels > 1 && alpha_swizzle != UTIL_FORMAT_SWIZZLE_NONE) { + LLVMValueRef alpha; + + alpha = lp_build_blend(&bld.base, + state->alpha_func, + state->alpha_src_factor, + state->alpha_dst_factor, + src, + dst, + src_factor, + dst_factor, + rgb_alpha_same, + false); + + result = lp_build_blend_swizzle(&bld, + result, + alpha, + LP_BUILD_BLEND_SWIZZLE_RGBA, + alpha_swizzle, + nr_channels); + } + } + + /* Check if color mask is necessary */ + if (!util_format_colormask_full(desc, state->colormask)) { + LLVMValueRef color_mask; + + color_mask = lp_build_const_mask_aos_swizzled(gallivm, 
bld.base.type, state->colormask, nr_channels, swizzle); + lp_build_name(color_mask, "color_mask"); + + /* Combine with input mask if necessary */ + if (mask) { + /* We can be blending floating values but masks are always integer... */ + unsigned floating = bld.base.type.floating; + bld.base.type.floating = 0; + + mask = lp_build_and(&bld.base, color_mask, mask); + + bld.base.type.floating = floating; + } else { + mask = color_mask; + } + } + + /* Apply mask, if one exists */ + if (mask) { + result = lp_build_select(&bld.base, mask, result, dst); + } + + return result; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c new file mode 100644 index 000000000..1eac0a5c8 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_blend_logicop.c @@ -0,0 +1,109 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * Blend LLVM IR generation -- logic ops. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + + +#include "pipe/p_state.h" +#include "util/u_debug.h" + +#include "lp_bld_blend.h" + + +LLVMValueRef +lp_build_logicop(LLVMBuilderRef builder, + unsigned logicop_func, + LLVMValueRef src, + LLVMValueRef dst) +{ + LLVMTypeRef type; + LLVMValueRef res; + + type = LLVMTypeOf(src); + + switch (logicop_func) { + case PIPE_LOGICOP_CLEAR: + res = LLVMConstNull(type); + break; + case PIPE_LOGICOP_NOR: + res = LLVMBuildNot(builder, LLVMBuildOr(builder, src, dst, ""), ""); + break; + case PIPE_LOGICOP_AND_INVERTED: + res = LLVMBuildAnd(builder, LLVMBuildNot(builder, src, ""), dst, ""); + break; + case PIPE_LOGICOP_COPY_INVERTED: + res = LLVMBuildNot(builder, src, ""); + break; + case PIPE_LOGICOP_AND_REVERSE: + res = LLVMBuildAnd(builder, src, LLVMBuildNot(builder, dst, ""), ""); + break; + case PIPE_LOGICOP_INVERT: + res = LLVMBuildNot(builder, dst, ""); + break; + case PIPE_LOGICOP_XOR: + res = LLVMBuildXor(builder, src, dst, ""); + break; + case PIPE_LOGICOP_NAND: + res = LLVMBuildNot(builder, LLVMBuildAnd(builder, src, dst, ""), ""); + break; + case PIPE_LOGICOP_AND: + res = LLVMBuildAnd(builder, src, dst, ""); + break; + case PIPE_LOGICOP_EQUIV: + res = LLVMBuildNot(builder, LLVMBuildXor(builder, src, dst, ""), ""); + break; + case PIPE_LOGICOP_NOOP: + res = dst; + break; + case PIPE_LOGICOP_OR_INVERTED: + res = LLVMBuildOr(builder, LLVMBuildNot(builder, src, ""), dst, ""); + break; + case PIPE_LOGICOP_COPY: + res = src; + break; + case PIPE_LOGICOP_OR_REVERSE: + res = LLVMBuildOr(builder, src, 
LLVMBuildNot(builder, dst, ""), ""); + break; + case PIPE_LOGICOP_OR: + res = LLVMBuildOr(builder, src, dst, ""); + break; + case PIPE_LOGICOP_SET: + res = LLVMConstAllOnes(type); + break; + default: + assert(0); + res = src; + } + + return res; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_depth.c new file mode 100644 index 000000000..b25e04137 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_depth.c @@ -0,0 +1,1118 @@ +/************************************************************************** + * + * Copyright 2009-2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Depth/stencil testing to LLVM IR translation. 
+ * + * To be done accurately/efficiently the depth/stencil test must be done with + * the same type/format of the depth/stencil buffer, which implies massaging + * the incoming depths to fit into place. Using a more straightforward + * type/format for depth/stencil values internally and only convert when + * flushing would avoid this, but it would most likely result in depth fighting + * artifacts. + * + * Since we're using linear layout for everything, but we need to deal with + * 2x2 quads, we need to load/store multiple values and swizzle them into + * place (we could avoid this by doing depth/stencil testing in linear format, + * which would be easy for late depth/stencil test as we could do that after + * the fragment shader loop just as we do for color buffers, but more tricky + * for early depth test as we'd need both masks and interpolated depth in + * linear format). + * + * + * @author Jose Fonseca <jfonseca@vmware.com> + * @author Brian Paul <jfonseca@vmware.com> + */ + +#include "pipe/p_state.h" +#include "util/u_format.h" +#include "util/u_cpu_detect.h" + +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_bitarit.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_conv.h" +#include "gallivm/lp_bld_logic.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_intr.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_swizzle.h" +#include "gallivm/lp_bld_pack.h" + +#include "lp_bld_depth.h" + + +/** Used to select fields from pipe_stencil_state */ +enum stencil_op { + S_FAIL_OP, + Z_FAIL_OP, + Z_PASS_OP +}; + + + +/** + * Do the stencil test comparison (compare FB stencil values against ref value). + * This will be used twice when generating two-sided stencil code. 
+ * \param stencil the front/back stencil state + * \param stencilRef the stencil reference value, replicated as a vector + * \param stencilVals vector of stencil values from framebuffer + * \return vector mask of pass/fail values (~0 or 0) + */ +static LLVMValueRef +lp_build_stencil_test_single(struct lp_build_context *bld, + const struct pipe_stencil_state *stencil, + LLVMValueRef stencilRef, + LLVMValueRef stencilVals) +{ + LLVMBuilderRef builder = bld->gallivm->builder; + const unsigned stencilMax = 255; /* XXX fix */ + struct lp_type type = bld->type; + LLVMValueRef res; + + /* + * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values + * are between 0..255 so ensure we generate the fastest comparisons for + * wider elements. + */ + if (type.width <= 8) { + assert(!type.sign); + } else { + assert(type.sign); + } + + assert(stencil->enabled); + + if (stencil->valuemask != stencilMax) { + /* compute stencilRef = stencilRef & valuemask */ + LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask); + stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, ""); + /* compute stencilVals = stencilVals & valuemask */ + stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, ""); + } + + res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals); + + return res; +} + + +/** + * Do the one or two-sided stencil test comparison. + * \sa lp_build_stencil_test_single + * \param front_facing an integer vector mask, indicating front (~0) or back + * (0) facing polygon. If NULL, assume front-facing. 
+ */ +static LLVMValueRef +lp_build_stencil_test(struct lp_build_context *bld, + const struct pipe_stencil_state stencil[2], + LLVMValueRef stencilRefs[2], + LLVMValueRef stencilVals, + LLVMValueRef front_facing) +{ + LLVMValueRef res; + + assert(stencil[0].enabled); + + /* do front face test */ + res = lp_build_stencil_test_single(bld, &stencil[0], + stencilRefs[0], stencilVals); + + if (stencil[1].enabled && front_facing != NULL) { + /* do back face test */ + LLVMValueRef back_res; + + back_res = lp_build_stencil_test_single(bld, &stencil[1], + stencilRefs[1], stencilVals); + + res = lp_build_select(bld, front_facing, res, back_res); + } + + return res; +} + + +/** + * Apply the stencil operator (add/sub/keep/etc) to the given vector + * of stencil values. + * \return new stencil values vector + */ +static LLVMValueRef +lp_build_stencil_op_single(struct lp_build_context *bld, + const struct pipe_stencil_state *stencil, + enum stencil_op op, + LLVMValueRef stencilRef, + LLVMValueRef stencilVals) + +{ + LLVMBuilderRef builder = bld->gallivm->builder; + struct lp_type type = bld->type; + LLVMValueRef res; + LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff); + unsigned stencil_op; + + assert(type.sign); + + switch (op) { + case S_FAIL_OP: + stencil_op = stencil->fail_op; + break; + case Z_FAIL_OP: + stencil_op = stencil->zfail_op; + break; + case Z_PASS_OP: + stencil_op = stencil->zpass_op; + break; + default: + assert(0 && "Invalid stencil_op mode"); + stencil_op = PIPE_STENCIL_OP_KEEP; + } + + switch (stencil_op) { + case PIPE_STENCIL_OP_KEEP: + res = stencilVals; + /* we can return early for this case */ + return res; + case PIPE_STENCIL_OP_ZERO: + res = bld->zero; + break; + case PIPE_STENCIL_OP_REPLACE: + res = stencilRef; + break; + case PIPE_STENCIL_OP_INCR: + res = lp_build_add(bld, stencilVals, bld->one); + res = lp_build_min(bld, res, max); + break; + case PIPE_STENCIL_OP_DECR: + res = lp_build_sub(bld, stencilVals, bld->one); + res = 
lp_build_max(bld, res, bld->zero); + break; + case PIPE_STENCIL_OP_INCR_WRAP: + res = lp_build_add(bld, stencilVals, bld->one); + res = LLVMBuildAnd(builder, res, max, ""); + break; + case PIPE_STENCIL_OP_DECR_WRAP: + res = lp_build_sub(bld, stencilVals, bld->one); + res = LLVMBuildAnd(builder, res, max, ""); + break; + case PIPE_STENCIL_OP_INVERT: + res = LLVMBuildNot(builder, stencilVals, ""); + res = LLVMBuildAnd(builder, res, max, ""); + break; + default: + assert(0 && "bad stencil op mode"); + res = bld->undef; + } + + return res; +} + + +/** + * Do the one or two-sided stencil test op/update. + */ +static LLVMValueRef +lp_build_stencil_op(struct lp_build_context *bld, + const struct pipe_stencil_state stencil[2], + enum stencil_op op, + LLVMValueRef stencilRefs[2], + LLVMValueRef stencilVals, + LLVMValueRef mask, + LLVMValueRef front_facing) + +{ + LLVMBuilderRef builder = bld->gallivm->builder; + LLVMValueRef res; + + assert(stencil[0].enabled); + + /* do front face op */ + res = lp_build_stencil_op_single(bld, &stencil[0], op, + stencilRefs[0], stencilVals); + + if (stencil[1].enabled && front_facing != NULL) { + /* do back face op */ + LLVMValueRef back_res; + + back_res = lp_build_stencil_op_single(bld, &stencil[1], op, + stencilRefs[1], stencilVals); + + res = lp_build_select(bld, front_facing, res, back_res); + } + + if (stencil[0].writemask != 0xff || + (stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) { + /* mask &= stencil[0].writemask */ + LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type, + stencil[0].writemask); + if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) { + LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type, + stencil[1].writemask); + writemask = lp_build_select(bld, front_facing, writemask, back_writemask); + } + + mask = LLVMBuildAnd(builder, mask, writemask, ""); + /* res = (res & mask) | (stencilVals & ~mask) */ + 
res = lp_build_select_bitwise(bld, mask, res, stencilVals); + } + else { + /* res = mask ? res : stencilVals */ + res = lp_build_select(bld, mask, res, stencilVals); + } + + return res; +} + + + +/** + * Return a type that matches the depth/stencil format. + */ +struct lp_type +lp_depth_type(const struct util_format_description *format_desc, + unsigned length) +{ + struct lp_type type; + unsigned z_swizzle; + + assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); + assert(format_desc->block.width == 1); + assert(format_desc->block.height == 1); + + memset(&type, 0, sizeof type); + type.width = format_desc->block.bits; + + z_swizzle = format_desc->swizzle[0]; + if (z_swizzle < 4) { + if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) { + type.floating = TRUE; + assert(z_swizzle == 0); + assert(format_desc->channel[z_swizzle].size == 32); + } + else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) { + assert(format_desc->block.bits <= 32); + assert(format_desc->channel[z_swizzle].normalized); + if (format_desc->channel[z_swizzle].size < format_desc->block.bits) { + /* Prefer signed integers when possible, as SSE has less support + * for unsigned comparison; + */ + type.sign = TRUE; + } + } + else + assert(0); + } + + type.length = length; + + return type; +} + + +/** + * Compute bitmask and bit shift to apply to the incoming fragment Z values + * and the Z buffer values needed before doing the Z comparison. + * + * Note that we leave the Z bits in the position that we find them + * in the Z buffer (typically 0xffffff00 or 0x00ffffff). That lets us + * get by with fewer bit twiddling steps. 
+ */ +static boolean +get_z_shift_and_mask(const struct util_format_description *format_desc, + unsigned *shift, unsigned *width, unsigned *mask) +{ + unsigned total_bits; + unsigned z_swizzle; + + assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); + assert(format_desc->block.width == 1); + assert(format_desc->block.height == 1); + + /* 64bit d/s format is special already extracted 32 bits */ + total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits; + + z_swizzle = format_desc->swizzle[0]; + + if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE) + return FALSE; + + *width = format_desc->channel[z_swizzle].size; + /* & 31 is for the same reason as the 32-bit limit above */ + *shift = format_desc->channel[z_swizzle].shift & 31; + + if (*width == total_bits) { + *mask = 0xffffffff; + } else { + *mask = ((1 << *width) - 1) << *shift; + } + + return TRUE; +} + + +/** + * Compute bitmask and bit shift to apply to the framebuffer pixel values + * to put the stencil bits in the least significant position. + * (i.e. 0x000000ff) + */ +static boolean +get_s_shift_and_mask(const struct util_format_description *format_desc, + unsigned *shift, unsigned *mask) +{ + unsigned s_swizzle; + unsigned sz; + + s_swizzle = format_desc->swizzle[1]; + + if (s_swizzle == UTIL_FORMAT_SWIZZLE_NONE) + return FALSE; + + /* just special case 64bit d/s format */ + if (format_desc->block.bits > 32) { + /* XXX big-endian? */ + assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); + *shift = 0; + *mask = 0xff; + return TRUE; + } + + *shift = format_desc->channel[s_swizzle].shift; + sz = format_desc->channel[s_swizzle].size; + *mask = (1U << sz) - 1U; + + return TRUE; +} + + +/** + * Perform the occlusion test and increase the counter. + * Test the depth mask. Add the number of channel which has none zero mask + * into the occlusion counter. e.g. maskvalue is {-1, -1, -1, -1}. + * The counter will add 4. + * TODO: could get that out of the fs loop. 
+ * + * \param type holds element type of the mask vector. + * \param maskvalue is the depth test mask. + * \param counter is a pointer of the uint32 counter. + */ +void +lp_build_occlusion_count(struct gallivm_state *gallivm, + struct lp_type type, + LLVMValueRef maskvalue, + LLVMValueRef counter) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMContextRef context = gallivm->context; + LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1); + LLVMValueRef count, newcount; + + assert(type.length <= 16); + assert(type.floating); + + if(util_cpu_caps.has_sse && type.length == 4) { + const char *movmskintr = "llvm.x86.sse.movmsk.ps"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, + lp_build_vec_type(gallivm, type), ""); + bits = lp_build_intrinsic_unary(builder, movmskintr, + LLVMInt32TypeInContext(context), bits); + count = lp_build_intrinsic_unary(builder, popcntintr, + LLVMInt32TypeInContext(context), bits); + count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); + } + else if(util_cpu_caps.has_avx && type.length == 8) { + const char *movmskintr = "llvm.x86.avx.movmsk.ps.256"; + const char *popcntintr = "llvm.ctpop.i32"; + LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue, + lp_build_vec_type(gallivm, type), ""); + bits = lp_build_intrinsic_unary(builder, movmskintr, + LLVMInt32TypeInContext(context), bits); + count = lp_build_intrinsic_unary(builder, popcntintr, + LLVMInt32TypeInContext(context), bits); + count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); + } + else { + unsigned i; + LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv"); + LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8); + LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4); + LLVMValueRef shufflev, countd; + LLVMValueRef shuffles[16]; + const char *popcntintr = NULL; + + countv = 
LLVMBuildBitCast(builder, countv, i8vntype, ""); + + for (i = 0; i < type.length; i++) { + shuffles[i] = lp_build_const_int32(gallivm, 4*i); + } + + shufflev = LLVMConstVector(shuffles, type.length); + countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, ""); + countd = LLVMBuildBitCast(builder, countd, counttype, "countd"); + + /* + * XXX FIXME + * this is bad on cpus without popcount (on x86 supported by intel + * nehalem, amd barcelona, and up - not tied to sse42). + * Would be much faster to just sum the 4 elements of the vector with + * some horizontal add (shuffle/add/shuffle/add after the initial and). + */ + switch (type.length) { + case 4: + popcntintr = "llvm.ctpop.i32"; + break; + case 8: + popcntintr = "llvm.ctpop.i64"; + break; + case 16: + popcntintr = "llvm.ctpop.i128"; + break; + default: + assert(0); + } + count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd); + + if (type.length > 8) { + count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), ""); + } + else if (type.length < 8) { + count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), ""); + } + } + newcount = LLVMBuildLoad(builder, counter, "origcount"); + newcount = LLVMBuildAdd(builder, newcount, count, "newcount"); + LLVMBuildStore(builder, newcount, counter); +} + + +/** + * Load depth/stencil values. + * The stored values are linear, swizzle them. 
+ * + * \param type the data type of the fragment depth/stencil values + * \param format_desc description of the depth/stencil surface + * \param is_1d whether this resource has only one dimension + * \param loop_counter the current loop iteration + * \param depth_ptr pointer to the depth/stencil values of this 4x4 block + * \param depth_stride stride of the depth/stencil buffer + * \param z_fb contains z values loaded from fb (may include padding) + * \param s_fb contains s values loaded from fb (may include padding) + */ +void +lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm, + struct lp_type z_src_type, + const struct util_format_description *format_desc, + boolean is_1d, + LLVMValueRef depth_ptr, + LLVMValueRef depth_stride, + LLVMValueRef *z_fb, + LLVMValueRef *s_fb, + LLVMValueRef loop_counter) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; + LLVMValueRef zs_dst1, zs_dst2; + LLVMValueRef zs_dst_ptr; + LLVMValueRef depth_offset1, depth_offset2; + LLVMTypeRef load_ptr_type; + unsigned depth_bytes = format_desc->block.bits / 8; + struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); + struct lp_type zs_load_type = zs_type; + + zs_load_type.length = zs_load_type.length / 2; + load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0); + + if (z_src_type.length == 4) { + unsigned i; + LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, + lp_build_const_int32(gallivm, 1), ""); + LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, + lp_build_const_int32(gallivm, 2), ""); + LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, + depth_stride, ""); + depth_offset1 = LLVMBuildMul(builder, looplsb, + lp_build_const_int32(gallivm, depth_bytes * 2), ""); + depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); + + /* just concatenate the loaded 2x2 values into 4-wide vector */ + for (i = 0; i < 4; i++) { + shuffles[i] = 
lp_build_const_int32(gallivm, i); + } + } + else { + unsigned i; + LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, + lp_build_const_int32(gallivm, 1), ""); + assert(z_src_type.length == 8); + depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); + /* + * We load 2x4 values, and need to swizzle them (order + * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. + */ + for (i = 0; i < 8; i++) { + shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); + } + } + + depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); + + /* Load current z/stencil values from z/stencil buffer */ + zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); + zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); + zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, ""); + if (is_1d) { + zs_dst2 = lp_build_undef(gallivm, zs_load_type); + } + else { + zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); + zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, ""); + zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, ""); + } + + *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2, + LLVMConstVector(shuffles, zs_type.length), ""); + *s_fb = *z_fb; + + if (format_desc->block.bits < z_src_type.width) { + /* Extend destination ZS values (e.g., when reading from Z16_UNORM) */ + *z_fb = LLVMBuildZExt(builder, *z_fb, + lp_build_int_vec_type(gallivm, z_src_type), ""); + } + + else if (format_desc->block.bits > 32) { + /* rely on llvm to handle too wide vector we have here nicely */ + unsigned i; + struct lp_type typex2 = zs_type; + struct lp_type s_type = zs_type; + LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4]; + LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4]; + LLVMValueRef tmp; + + typex2.width = typex2.width / 2; + typex2.length = typex2.length * 2; + s_type.width = s_type.width / 2; + s_type.floating = 0; + + tmp = LLVMBuildBitCast(builder, *z_fb, + lp_build_vec_type(gallivm, 
typex2), ""); + + for (i = 0; i < zs_type.length; i++) { + shuffles1[i] = lp_build_const_int32(gallivm, i * 2); + shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1); + } + *z_fb = LLVMBuildShuffleVector(builder, tmp, tmp, + LLVMConstVector(shuffles1, zs_type.length), ""); + *s_fb = LLVMBuildShuffleVector(builder, tmp, tmp, + LLVMConstVector(shuffles2, zs_type.length), ""); + *s_fb = LLVMBuildBitCast(builder, *s_fb, + lp_build_vec_type(gallivm, s_type), ""); + lp_build_name(*s_fb, "s_dst"); + } + + lp_build_name(*z_fb, "z_dst"); + lp_build_name(*s_fb, "s_dst"); + lp_build_name(*z_fb, "z_dst"); +} + +/** + * Store depth/stencil values. + * Incoming values are swizzled (typically n 2x2 quads), stored linear. + * If there's a mask it will do select/store otherwise just store. + * + * \param type the data type of the fragment depth/stencil values + * \param format_desc description of the depth/stencil surface + * \param is_1d whether this resource has only one dimension + * \param mask the alive/dead pixel mask for the quad (vector) + * \param z_fb z values read from fb (with padding) + * \param s_fb s values read from fb (with padding) + * \param loop_counter the current loop iteration + * \param depth_ptr pointer to the depth/stencil values of this 4x4 block + * \param depth_stride stride of the depth/stencil buffer + * \param z_value the depth values to store (with padding) + * \param s_value the stencil values to store (with padding) + */ +void +lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm, + struct lp_type z_src_type, + const struct util_format_description *format_desc, + boolean is_1d, + struct lp_build_mask_context *mask, + LLVMValueRef z_fb, + LLVMValueRef s_fb, + LLVMValueRef loop_counter, + LLVMValueRef depth_ptr, + LLVMValueRef depth_stride, + LLVMValueRef z_value, + LLVMValueRef s_value) +{ + struct lp_build_context z_bld; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4]; + LLVMBuilderRef builder = gallivm->builder; + 
LLVMValueRef mask_value = NULL; + LLVMValueRef zs_dst1, zs_dst2; + LLVMValueRef zs_dst_ptr1, zs_dst_ptr2; + LLVMValueRef depth_offset1, depth_offset2; + LLVMTypeRef load_ptr_type; + unsigned depth_bytes = format_desc->block.bits / 8; + struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length); + struct lp_type z_type = zs_type; + struct lp_type zs_load_type = zs_type; + + zs_load_type.length = zs_load_type.length / 2; + load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0); + + z_type.width = z_src_type.width; + + lp_build_context_init(&z_bld, gallivm, z_type); + + /* + * This is far from ideal, at least for late depth write we should do this + * outside the fs loop to avoid all the swizzle stuff. + */ + if (z_src_type.length == 4) { + LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter, + lp_build_const_int32(gallivm, 1), ""); + LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter, + lp_build_const_int32(gallivm, 2), ""); + LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb, + depth_stride, ""); + depth_offset1 = LLVMBuildMul(builder, looplsb, + lp_build_const_int32(gallivm, depth_bytes * 2), ""); + depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, ""); + } + else { + unsigned i; + LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter, + lp_build_const_int32(gallivm, 1), ""); + assert(z_src_type.length == 8); + depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, ""); + /* + * We load 2x4 values, and need to swizzle them (order + * 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately. 
+ */ + for (i = 0; i < 8; i++) { + shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2); + } + } + + depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, ""); + + zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, ""); + zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, ""); + zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, ""); + zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, ""); + + if (format_desc->block.bits > 32) { + s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, ""); + } + + if (mask) { + mask_value = lp_build_mask_value(mask); + z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb); + if (format_desc->block.bits > 32) { + s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, ""); + s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb); + } + } + + if (zs_type.width < z_src_type.width) { + /* Truncate ZS values (e.g., when writing to Z16_UNORM) */ + z_value = LLVMBuildTrunc(builder, z_value, + lp_build_int_vec_type(gallivm, zs_type), ""); + } + + if (format_desc->block.bits <= 32) { + if (z_src_type.length == 4) { + zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2); + zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2); + } + else { + assert(z_src_type.length == 8); + zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value, + LLVMConstVector(&shuffles[0], + zs_load_type.length), ""); + zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value, + LLVMConstVector(&shuffles[4], + zs_load_type.length), ""); + } + } + else { + if (z_src_type.length == 4) { + zs_dst1 = lp_build_interleave2(gallivm, z_type, + z_value, s_value, 0); + zs_dst2 = lp_build_interleave2(gallivm, z_type, + z_value, s_value, 1); + } + else { + unsigned i; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2]; + assert(z_src_type.length == 8); + for (i = 0; i < 8; i++) { + shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 
2); + shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 + + z_src_type.length); + } + zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value, + LLVMConstVector(&shuffles[0], + z_src_type.length), ""); + zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value, + LLVMConstVector(&shuffles[8], + z_src_type.length), ""); + } + zs_dst1 = LLVMBuildBitCast(builder, zs_dst1, + lp_build_vec_type(gallivm, zs_load_type), ""); + zs_dst2 = LLVMBuildBitCast(builder, zs_dst2, + lp_build_vec_type(gallivm, zs_load_type), ""); + } + + LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1); + if (!is_1d) { + LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2); + } +} + +/** + * Generate code for performing depth and/or stencil tests. + * We operate on a vector of values (typically n 2x2 quads). + * + * \param depth the depth test state + * \param stencil the front/back stencil state + * \param type the data type of the fragment depth/stencil values + * \param format_desc description of the depth/stencil surface + * \param mask the alive/dead pixel mask for the quad (vector) + * \param stencil_refs the front/back stencil ref values (scalar) + * \param z_src the incoming depth/stencil values (n 2x2 quad values, float32) + * \param zs_dst the depth/stencil values in framebuffer + * \param face contains boolean value indicating front/back facing polygon + */ +void +lp_build_depth_stencil_test(struct gallivm_state *gallivm, + const struct pipe_depth_state *depth, + const struct pipe_stencil_state stencil[2], + struct lp_type z_src_type, + const struct util_format_description *format_desc, + struct lp_build_mask_context *mask, + LLVMValueRef stencil_refs[2], + LLVMValueRef z_src, + LLVMValueRef z_fb, + LLVMValueRef s_fb, + LLVMValueRef face, + LLVMValueRef *z_value, + LLVMValueRef *s_value, + boolean do_branch) +{ + LLVMBuilderRef builder = gallivm->builder; + struct lp_type z_type; + struct lp_build_context z_bld; + struct lp_build_context s_bld; + struct lp_type 
s_type; + unsigned z_shift = 0, z_width = 0, z_mask = 0; + LLVMValueRef z_dst = NULL; + LLVMValueRef stencil_vals = NULL; + LLVMValueRef z_bitmask = NULL, stencil_shift = NULL; + LLVMValueRef z_pass = NULL, s_pass_mask = NULL; + LLVMValueRef current_mask = lp_build_mask_value(mask); + LLVMValueRef front_facing = NULL; + boolean have_z, have_s; + + /* + * Depths are expected to be between 0 and 1, even if they are stored in + * floats. Setting these bits here will ensure that the lp_build_conv() call + * below won't try to unnecessarily clamp the incoming values. + */ + if(z_src_type.floating) { + z_src_type.sign = FALSE; + z_src_type.norm = TRUE; + } + else { + assert(!z_src_type.sign); + assert(z_src_type.norm); + } + + /* Pick the type matching the depth-stencil format. */ + z_type = lp_depth_type(format_desc, z_src_type.length); + + /* Pick the intermediate type for depth operations. */ + z_type.width = z_src_type.width; + assert(z_type.length == z_src_type.length); + + /* FIXME: for non-float depth/stencil might generate better code + * if we'd always split it up to use 128bit operations. + * For stencil we'd almost certainly want to pack to 8xi16 values, + * for z just run twice. 
+ */ + + /* Sanity checking */ + { + const unsigned z_swizzle = format_desc->swizzle[0]; + const unsigned s_swizzle = format_desc->swizzle[1]; + + assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE || + s_swizzle != UTIL_FORMAT_SWIZZLE_NONE); + + assert(depth->enabled || stencil[0].enabled); + + assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS); + assert(format_desc->block.width == 1); + assert(format_desc->block.height == 1); + + if (stencil[0].enabled) { + assert(s_swizzle < 4); + assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED); + assert(format_desc->channel[s_swizzle].pure_integer); + assert(!format_desc->channel[s_swizzle].normalized); + assert(format_desc->channel[s_swizzle].size == 8); + } + + if (depth->enabled) { + assert(z_swizzle < 4); + if (z_type.floating) { + assert(z_swizzle == 0); + assert(format_desc->channel[z_swizzle].type == + UTIL_FORMAT_TYPE_FLOAT); + assert(format_desc->channel[z_swizzle].size == 32); + } + else { + assert(format_desc->channel[z_swizzle].type == + UTIL_FORMAT_TYPE_UNSIGNED); + assert(format_desc->channel[z_swizzle].normalized); + assert(!z_type.fixed); + } + } + } + + + /* Setup build context for Z vals */ + lp_build_context_init(&z_bld, gallivm, z_type); + + /* Setup build context for stencil vals */ + s_type = lp_int_type(z_type); + lp_build_context_init(&s_bld, gallivm, s_type); + + /* Compute and apply the Z/stencil bitmasks and shifts. + */ + { + unsigned s_shift, s_mask; + + z_dst = z_fb; + stencil_vals = s_fb; + + have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask); + have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask); + + if (have_z) { + if (z_mask != 0xffffffff) { + z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask); + } + + /* + * Align the framebuffer Z 's LSB to the right. 
+ */ + if (z_shift) { + LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift); + z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst"); + } else if (z_bitmask) { + z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst"); + } else { + lp_build_name(z_dst, "z_dst"); + } + } + + if (have_s) { + if (s_shift) { + LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift); + stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, ""); + stencil_shift = shift; /* used below */ + } + + if (s_mask != 0xffffffff) { + LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask); + stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, ""); + } + + lp_build_name(stencil_vals, "s_dst"); + } + } + + if (stencil[0].enabled) { + + if (face) { + LLVMValueRef zero = lp_build_const_int32(gallivm, 0); + + /* front_facing = face != 0 ? ~0 : 0 */ + front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, ""); + front_facing = LLVMBuildSExt(builder, front_facing, + LLVMIntTypeInContext(gallivm->context, + s_bld.type.length*s_bld.type.width), + ""); + front_facing = LLVMBuildBitCast(builder, front_facing, + s_bld.int_vec_type, ""); + } + + s_pass_mask = lp_build_stencil_test(&s_bld, stencil, + stencil_refs, stencil_vals, + front_facing); + + /* apply stencil-fail operator */ + { + LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP, + stencil_refs, stencil_vals, + s_fail_mask, front_facing); + } + } + + if (depth->enabled) { + /* + * Convert fragment Z to the desired type, aligning the LSB to the right. 
+ */ + + assert(z_type.width == z_src_type.width); + assert(z_type.length == z_src_type.length); + assert(lp_check_value(z_src_type, z_src)); + if (z_src_type.floating) { + /* + * Convert from floating point values + */ + + if (!z_type.floating) { + z_src = lp_build_clamped_float_to_unsigned_norm(gallivm, + z_src_type, + z_width, + z_src); + } + } else { + /* + * Convert from unsigned normalized values. + */ + + assert(!z_src_type.sign); + assert(!z_src_type.fixed); + assert(z_src_type.norm); + assert(!z_type.floating); + if (z_src_type.width > z_width) { + LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type, + z_src_type.width - z_width); + z_src = LLVMBuildLShr(builder, z_src, shift, ""); + } + } + assert(lp_check_value(z_type, z_src)); + + lp_build_name(z_src, "z_src"); + + /* compare src Z to dst Z, returning 'pass' mask */ + z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst); + + /* mask off bits that failed stencil test */ + if (s_pass_mask) { + current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, ""); + } + + if (!stencil[0].enabled) { + /* We can potentially skip all remaining operations here, but only + * if stencil is disabled because we still need to update the stencil + * buffer values. Don't need to update Z buffer values. + */ + lp_build_mask_update(mask, z_pass); + + if (do_branch) { + lp_build_mask_check(mask); + } + } + + if (depth->writemask) { + LLVMValueRef z_pass_mask; + + /* mask off bits that failed Z test */ + z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, ""); + + /* Mix the old and new Z buffer values. + * z_dst[i] = zselectmask[i] ? 
z_src[i] : z_dst[i] + */ + z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst); + } + + if (stencil[0].enabled) { + /* update stencil buffer values according to z pass/fail result */ + LLVMValueRef z_fail_mask, z_pass_mask; + + /* apply Z-fail operator */ + z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP, + stencil_refs, stencil_vals, + z_fail_mask, front_facing); + + /* apply Z-pass operator */ + z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, ""); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP, + stencil_refs, stencil_vals, + z_pass_mask, front_facing); + } + } + else { + /* No depth test: apply Z-pass operator to stencil buffer values which + * passed the stencil test. + */ + s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, ""); + stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP, + stencil_refs, stencil_vals, + s_pass_mask, front_facing); + } + + /* Put Z and stencil bits in the right place */ + if (have_z && z_shift) { + LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift); + z_dst = LLVMBuildShl(builder, z_dst, shift, ""); + } + if (stencil_vals && stencil_shift) + stencil_vals = LLVMBuildShl(builder, stencil_vals, + stencil_shift, ""); + + /* Finally, merge the z/stencil values */ + if (format_desc->block.bits <= 32) { + if (have_z && have_s) + *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, ""); + else if (have_z) + *z_value = z_dst; + else + *z_value = stencil_vals; + *s_value = *z_value; + } + else { + *z_value = z_dst; + *s_value = stencil_vals; + } + + if (s_pass_mask) + lp_build_mask_update(mask, s_pass_mask); + + if (depth->enabled && stencil[0].enabled) + lp_build_mask_update(mask, z_pass); +} + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_depth.h new file mode 100644 index 000000000..d169c8967 --- /dev/null +++ 
b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -0,0 +1,105 @@
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * Depth/stencil testing to LLVM IR translation.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#ifndef LP_BLD_DEPTH_H
#define LP_BLD_DEPTH_H


#include "pipe/p_compiler.h"
#include "pipe/p_state.h"

#include "gallivm/lp_bld.h"


struct pipe_depth_state;
struct gallivm_state;
struct util_format_description;
struct lp_type;
struct lp_build_mask_context;


/*
 * Return the lp_type used to hold depth values for the given Z/S format
 * and vector length.
 * NOTE(review): semantics inferred from the prototype; confirm against the
 * definition in lp_bld_depth.c.
 */
struct lp_type
lp_depth_type(const struct util_format_description *format_desc,
              unsigned length);


/*
 * Generate code that performs the depth and/or stencil tests.
 * Compares the incoming fragment Z (z_src) and framebuffer Z/stencil
 * (z_fb/s_fb), updates the fragment mask accordingly, and returns the
 * merged values to be written back through *z_value and *s_value.
 * 'face' selects front/back stencil state; do_branch allows an early
 * mask-check branch when all fragments are killed.
 */
void
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                            const struct pipe_depth_state *depth,
                            const struct pipe_stencil_state stencil[2],
                            struct lp_type z_src_type,
                            const struct util_format_description *format_desc,
                            struct lp_build_mask_context *mask,
                            LLVMValueRef stencil_refs[2],
                            LLVMValueRef z_src,
                            LLVMValueRef z_fb,
                            LLVMValueRef s_fb,
                            LLVMValueRef face,
                            LLVMValueRef *z_value,
                            LLVMValueRef *s_value,
                            boolean do_branch);

/*
 * Load swizzled Z/stencil values from the depth buffer into *z_fb / *s_fb.
 * NOTE(review): "swizzled" refers to llvmpipe's tiled layout — confirm with
 * the definition; is_1d selects the 1D resource path.
 */
void
lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
                                     struct lp_type z_src_type,
                                     const struct util_format_description *format_desc,
                                     boolean is_1d,
                                     LLVMValueRef depth_ptr,
                                     LLVMValueRef depth_stride,
                                     LLVMValueRef *z_fb,
                                     LLVMValueRef *s_fb,
                                     LLVMValueRef loop_counter);

/*
 * Store the (masked) Z/stencil results produced by
 * lp_build_depth_stencil_test() back to the depth buffer.
 */
void
lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
                                      struct lp_type z_src_type,
                                      const struct util_format_description *format_desc,
                                      boolean is_1d,
                                      struct lp_build_mask_context *mask,
                                      LLVMValueRef z_fb,
                                      LLVMValueRef s_fb,
                                      LLVMValueRef loop_counter,
                                      LLVMValueRef depth_ptr,
                                      LLVMValueRef depth_stride,
                                      LLVMValueRef z_value,
                                      LLVMValueRef s_value);


/*
 * Accumulate the number of live fragments in maskvalue into *counter
 * (occlusion query support). NOTE(review): inferred from the name —
 * confirm against the definition.
 */
void
lp_build_occlusion_count(struct gallivm_state *gallivm,
                         struct lp_type type,
                         LLVMValueRef maskvalue,
                         LLVMValueRef counter);

#endif /* !LP_BLD_DEPTH_H */
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_interp.c
new file mode 100644
index 000000000..ceac86abe
--- /dev/null
+++
b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_interp.c @@ -0,0 +1,819 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * Copyright 2007-2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Position and shader input interpolation. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + +#include "pipe/p_shader_tokens.h" +#include "util/u_debug.h" +#include "util/u_memory.h" +#include "util/u_math.h" +#include "tgsi/tgsi_scan.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_swizzle.h" +#include "gallivm/lp_bld_flow.h" +#include "lp_bld_interp.h" + + +/* + * The shader JIT function operates on blocks of quads. 
+ * Each block has 2x2 quads and each quad has 2x2 pixels. + * + * We iterate over the quads in order 0, 1, 2, 3: + * + * ################# + * # | # | # + * #---0---#---1---# + * # | # | # + * ################# + * # | # | # + * #---2---#---3---# + * # | # | # + * ################# + * + * If we iterate over multiple quads at once, quads 01 and 23 are processed + * together. + * + * Within each quad, we have four pixels which are represented in SOA + * order: + * + * ######### + * # 0 | 1 # + * #---+---# + * # 2 | 3 # + * ######### + * + * So the green channel (for example) of the four pixels is stored in + * a single vector register: {g0, g1, g2, g3}. + * The order stays the same even with multiple quads: + * 0 1 4 5 + * 2 3 6 7 + * is stored as g0..g7 + */ + + +/** + * Do one perspective divide per quad. + * + * For perspective interpolation, the final attribute value is given + * + * a' = a/w = a * oow + * + * where + * + * a = a0 + dadx*x + dady*y + * w = w0 + dwdx*x + dwdy*y + * oow = 1/w = 1/(w0 + dwdx*x + dwdy*y) + * + * Instead of computing the division per pixel, with this macro we compute the + * division on the upper left pixel of each quad, and use a linear + * approximation in the remaining pixels, given by: + * + * da'dx = (dadx - dwdx*a)*oow + * da'dy = (dady - dwdy*a)*oow + * + * Ironically, this actually makes things slower -- probably because the + * divide hardware unit is rarely used, whereas the multiply unit is typically + * already saturated. 
+ */ +#define PERSPECTIVE_DIVIDE_PER_QUAD 0 + + +static const unsigned char quad_offset_x[16] = {0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3}; +static const unsigned char quad_offset_y[16] = {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3}; + + +static void +attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix) +{ + if(attrib == 0) + lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix); + else + lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix); +} + +static void +calc_offsets(struct lp_build_context *coeff_bld, + unsigned quad_start_index, + LLVMValueRef *pixoffx, + LLVMValueRef *pixoffy) +{ + unsigned i; + unsigned num_pix = coeff_bld->type.length; + struct gallivm_state *gallivm = coeff_bld->gallivm; + LLVMBuilderRef builder = coeff_bld->gallivm->builder; + LLVMValueRef nr, pixxf, pixyf; + + *pixoffx = coeff_bld->undef; + *pixoffy = coeff_bld->undef; + + for (i = 0; i < num_pix; i++) { + nr = lp_build_const_int32(gallivm, i); + pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] + + (quad_start_index & 1) * 2); + pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] + + (quad_start_index & 2)); + *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, ""); + *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, ""); + } +} + + +/* Much easier, and significantly less instructions in the per-stamp + * part (less than half) but overall more instructions so a loss if + * most quads are active. Might be a win though with larger vectors. + * No ability to do per-quad divide (doable but not implemented) + * Could be made to work with passed in pixel offsets (i.e. active quad merging). 
 */
/*
 * Load the interpolation coefficients (a0, dadx, dady) for every attribute
 * from the arrays passed into the JIT function ("simple" single-step path).
 * Results are kept in AoS form (one 4-channel vector per attribute) in
 * bld->a0aos / dadxaos / dadyaos.
 */
static void
coeffs_init_simple(struct lp_build_interp_soa_context *bld,
                   LLVMValueRef a0_ptr,
                   LLVMValueRef dadx_ptr,
                   LLVMValueRef dady_ptr)
{
   struct lp_build_context *coeff_bld = &bld->coeff_bld;
   struct lp_build_context *setup_bld = &bld->setup_bld;
   struct gallivm_state *gallivm = coeff_bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   unsigned attrib;

   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
      /*
       * always fetch all 4 values for performance/simplicity
       * Note: we do that here because it seems to generate better
       * code. It generates a lot of moves initially but less
       * moves later. As far as I can tell this looks like a
       * llvm issue, instead of simply reloading the values from
       * the passed in pointers it if it runs out of registers
       * it spills/reloads them. Maybe some optimization passes
       * would help.
       * Might want to investigate this again later.
       */
      const unsigned interp = bld->interp[attrib];
      /* attributes are stored as consecutive float[TGSI_NUM_CHANNELS] */
      LLVMValueRef index = lp_build_const_int32(gallivm,
                                attrib * TGSI_NUM_CHANNELS);
      LLVMValueRef ptr;
      LLVMValueRef dadxaos = setup_bld->zero;
      LLVMValueRef dadyaos = setup_bld->zero;
      LLVMValueRef a0aos = setup_bld->zero;

      switch (interp) {
      case LP_INTERP_PERSPECTIVE:
         /* fall-through */

      case LP_INTERP_LINEAR:
         /* linear/perspective attributes need the x/y derivatives too */
         ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
                                LLVMPointerType(setup_bld->vec_type, 0), "");
         dadxaos = LLVMBuildLoad(builder, ptr, "");

         ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
                                LLVMPointerType(setup_bld->vec_type, 0), "");
         dadyaos = LLVMBuildLoad(builder, ptr, "");

         attrib_name(dadxaos, attrib, 0, ".dadxaos");
         attrib_name(dadyaos, attrib, 0, ".dadyaos");
         /* fall-through */

      case LP_INTERP_CONSTANT:
      case LP_INTERP_FACING:
         /* all modes except POSITION need the base value a0 */
         ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
                                LLVMPointerType(setup_bld->vec_type, 0), "");
         a0aos = LLVMBuildLoad(builder, ptr, "");
         attrib_name(a0aos, attrib, 0, ".a0aos");
         break;

      case LP_INTERP_POSITION:
         /* Nothing to do as the position coeffs are already setup in slot 0 */
         continue;

      default:
         assert(0);
         break;
      }
      bld->a0aos[attrib] = a0aos;
      bld->dadxaos[attrib] = dadxaos;
      bld->dadyaos[attrib] = dadyaos;
   }
}


/**
 * Interpolate the shader input attribute values.
 * This is called for each (group of) quad(s).
 * Evaluates a = a0 + x*dadx + y*dady per lane using the per-lane pixel
 * offsets stored by lp_build_interp_soa_init(), with an optional
 * perspective divide; attribs [start, end) are updated in bld->attribs.
 */
static void
attribs_update_simple(struct lp_build_interp_soa_context *bld,
                      struct gallivm_state *gallivm,
                      LLVMValueRef loop_iter,
                      int start,
                      int end)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context *coeff_bld = &bld->coeff_bld;
   struct lp_build_context *setup_bld = &bld->setup_bld;
   LLVMValueRef oow = NULL;   /* 1/w, computed lazily, shared by all attribs */
   unsigned attrib;
   LLVMValueRef pixoffx;
   LLVMValueRef pixoffy;
   LLVMValueRef ptr;

   /* could do this with code-generated passed in pixel offsets too */

   assert(loop_iter);
   ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
   pixoffx = LLVMBuildLoad(builder, ptr, "");
   ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
   pixoffy = LLVMBuildLoad(builder, ptr, "");

   /* absolute pixel coords = block origin + per-lane offset */
   pixoffx = LLVMBuildFAdd(builder, pixoffx,
                           lp_build_broadcast_scalar(coeff_bld, bld->x), "");
   pixoffy = LLVMBuildFAdd(builder, pixoffy,
                           lp_build_broadcast_scalar(coeff_bld, bld->y), "");

   for (attrib = start; attrib < end; attrib++) {
      const unsigned mask = bld->mask[attrib];
      const unsigned interp = bld->interp[attrib];
      unsigned chan;

      for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
         if (mask & (1 << chan)) {
            LLVMValueRef index;
            LLVMValueRef dadx = coeff_bld->zero;
            LLVMValueRef dady = coeff_bld->zero;
            LLVMValueRef a = coeff_bld->zero;

            index = lp_build_const_int32(gallivm, chan);
            switch (interp) {
            case LP_INTERP_PERSPECTIVE:
               /* fall-through */

            case LP_INTERP_LINEAR:
               /* position x/y have implicit coefficients (dadx=1 resp. dady=1) */
               if (attrib == 0 && chan == 0) {
                  dadx = coeff_bld->one;
                  if (bld->pos_offset) {
                     a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
                  }
               }
               else if (attrib == 0 && chan == 1) {
                  dady = coeff_bld->one;
                  if (bld->pos_offset) {
                     a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
                  }
               }
               else {
                  dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                                    coeff_bld->type, bld->dadxaos[attrib],
                                                    index);
                  dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                                    coeff_bld->type, bld->dadyaos[attrib],
                                                    index);
                  a = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                                 coeff_bld->type, bld->a0aos[attrib],
                                                 index);
               }
               /*
                * a = a0 + (x * dadx + y * dady)
                */
               dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
               dady = LLVMBuildFMul(builder, dady, pixoffy, "");
               a = LLVMBuildFAdd(builder, a, dadx, "");
               a = LLVMBuildFAdd(builder, a, dady, "");

               if (interp == LP_INTERP_PERSPECTIVE) {
                  if (oow == NULL) {
                     /* w is interpolated first (attrib 0), so reuse it here */
                     LLVMValueRef w = bld->attribs[0][3];
                     assert(attrib != 0);
                     assert(bld->mask[0] & TGSI_WRITEMASK_W);
                     oow = lp_build_rcp(coeff_bld, w);
                  }
                  a = lp_build_mul(coeff_bld, a, oow);
               }
               break;

            case LP_INTERP_CONSTANT:
            case LP_INTERP_FACING:
               a = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                              coeff_bld->type, bld->a0aos[attrib],
                                              index);
               break;

            case LP_INTERP_POSITION:
               assert(attrib > 0);
               a = bld->attribs[0][chan];
               break;

            default:
               assert(0);
               break;
            }

            if ((attrib == 0) && (chan == 2)){
               /* FIXME: Depth values can exceed 1.0, due to the fact that
                * setup interpolation coefficients refer to (0,0) which causes
                * precision loss. So we must clamp to 1.0 here to avoid artifacts
                */
               a = lp_build_min(coeff_bld, a, coeff_bld->one);
            }
            bld->attribs[attrib][chan] = a;
         }
      }
   }
}

/**
 * Initialize the bld->a, dadq fields. This involves fetching
 * those values from the arrays which are passed into the JIT function.
 * Used by the (non-simple) stepped interpolation path: per-attribute base
 * values are stored in allocas (bld->a) and per-element deltas in bld->dadq.
 */
static void
coeffs_init(struct lp_build_interp_soa_context *bld,
            LLVMValueRef a0_ptr,
            LLVMValueRef dadx_ptr,
            LLVMValueRef dady_ptr)
{
   struct lp_build_context *coeff_bld = &bld->coeff_bld;
   struct lp_build_context *setup_bld = &bld->setup_bld;
   struct gallivm_state *gallivm = coeff_bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef pixoffx, pixoffy;
   unsigned attrib;
   unsigned chan;
   unsigned i;

   /* per-lane pixel offsets within the block (from the static tables) */
   pixoffx = coeff_bld->undef;
   pixoffy = coeff_bld->undef;
   for (i = 0; i < coeff_bld->type.length; i++) {
      LLVMValueRef nr = lp_build_const_int32(gallivm, i);
      LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
      LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
      pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
      pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
   }


   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
      const unsigned mask = bld->mask[attrib];
      const unsigned interp = bld->interp[attrib];
      LLVMValueRef index = lp_build_const_int32(gallivm,
                                attrib * TGSI_NUM_CHANNELS);
      LLVMValueRef ptr;
      LLVMValueRef dadxaos = setup_bld->zero;
      LLVMValueRef dadyaos = setup_bld->zero;
      LLVMValueRef a0aos = setup_bld->zero;

      /* always fetch all 4 values for performance/simplicity */
      switch (interp) {
      case LP_INTERP_PERSPECTIVE:
         /* fall-through */

      case LP_INTERP_LINEAR:
         ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
                                LLVMPointerType(setup_bld->vec_type, 0), "");
         dadxaos = LLVMBuildLoad(builder, ptr, "");

         ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
                                LLVMPointerType(setup_bld->vec_type, 0), "");
         dadyaos = LLVMBuildLoad(builder, ptr, "");

         attrib_name(dadxaos, attrib, 0, ".dadxaos");
         attrib_name(dadyaos, attrib, 0, ".dadyaos");
         /* fall-through */

      case LP_INTERP_CONSTANT:
      case LP_INTERP_FACING:
         ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
         ptr = LLVMBuildBitCast(builder, ptr,
                                LLVMPointerType(setup_bld->vec_type, 0), "");
         a0aos = LLVMBuildLoad(builder, ptr, "");
         attrib_name(a0aos, attrib, 0, ".a0aos");
         break;

      case LP_INTERP_POSITION:
         /* Nothing to do as the position coeffs are already setup in slot 0 */
         continue;

      default:
         assert(0);
         break;
      }

      /*
       * a = a0 + (x * dadx + y * dady)
       * a0aos is the attrib value at top left corner of stamp
       */
      if (interp != LP_INTERP_CONSTANT &&
          interp != LP_INTERP_FACING) {
         LLVMValueRef axaos, ayaos;
         axaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->x),
                               dadxaos, "");
         ayaos = LLVMBuildFMul(builder, lp_build_broadcast_scalar(setup_bld, bld->y),
                               dadyaos, "");
         a0aos = LLVMBuildFAdd(builder, a0aos, ayaos, "");
         a0aos = LLVMBuildFAdd(builder, a0aos, axaos, "");
      }

      /*
       * dadq = {0, dadx, dady, dadx + dady}
       * for two quads (side by side) this is (per the quad_offset tables):
       * {0, dadx, dady, dadx+dady, 2*dadx, 3*dadx, 2*dadx+dady, 3*dadx+dady}
       */
      for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         /* this generates a CRAPLOAD of shuffles... */
         if (mask & (1 << chan)) {
            LLVMValueRef dadx, dady;
            LLVMValueRef dadq, dadq2;
            LLVMValueRef a;
            LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);

            if (attrib == 0 && chan == 0) {
               /* position x: implicit dadx = 1 */
               a = bld->x;
               if (bld->pos_offset) {
                  a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
               }
               a = lp_build_broadcast_scalar(coeff_bld, a);
               dadx = coeff_bld->one;
               dady = coeff_bld->zero;
            }
            else if (attrib == 0 && chan == 1) {
               /* position y: implicit dady = 1 */
               a = bld->y;
               if (bld->pos_offset) {
                  a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
               }
               a = lp_build_broadcast_scalar(coeff_bld, a);
               dady = coeff_bld->one;
               dadx = coeff_bld->zero;
            }
            else {
               dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                                 coeff_bld->type, dadxaos, chan_index);
               dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                                 coeff_bld->type, dadyaos, chan_index);

               /*
                * a = {a, a, a, a}
                */
               a = lp_build_extract_broadcast(gallivm, setup_bld->type,
                                              coeff_bld->type, a0aos, chan_index);
            }

            dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
            dady = LLVMBuildFMul(builder, dady, pixoffy, "");
            dadq = LLVMBuildFAdd(builder, dadx, dady, "");

            /*
             * Compute the attrib values on the upper-left corner of each
             * group of quads.
             * Note that if we process 2 quads at once this doesn't
             * really exactly to what we want.
             * We need to access elem 0 and 2 respectively later if we process
             * 2 quads at once.
             */

            if (interp != LP_INTERP_CONSTANT &&
                interp != LP_INTERP_FACING) {
               dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
               a = LLVMBuildFAdd(builder, a, dadq2, "");
            }

#if PERSPECTIVE_DIVIDE_PER_QUAD
            /*
             * a *= 1 / w
             */

            /*
             * XXX since we're only going to access elements 0,2 out of 8
             * if we have 8-wide vectors we should do the division only 4-wide.
             * a is really a 2-elements in a 4-wide vector disguised as 8-wide
             * in this case.
             * NOTE(review): this branch is compiled out
             * (PERSPECTIVE_DIVIDE_PER_QUAD is 0) and references bld->oow —
             * verify that field exists before ever enabling it.
             */
            if (interp == LP_INTERP_PERSPECTIVE) {
               LLVMValueRef w = bld->a[0][3];
               assert(attrib != 0);
               assert(bld->mask[0] & TGSI_WRITEMASK_W);
               if (!bld->oow) {
                  bld->oow = lp_build_rcp(coeff_bld, w);
                  lp_build_name(bld->oow, "oow");
               }
               a = lp_build_mul(coeff_bld, a, bld->oow);
            }
#endif

            attrib_name(a, attrib, chan, ".a");
            attrib_name(dadq, attrib, chan, ".dadq");

            /* base value lives in memory so it can be indexed per quad later */
            bld->a[attrib][chan] = lp_build_alloca(gallivm,
                                                   LLVMTypeOf(a), "");
            LLVMBuildStore(builder, a, bld->a[attrib][chan]);
            bld->dadq[attrib][chan] = dadq;
         }
      }
   }
}


/**
 * Increment the shader input attribute values.
 * This is called when we move from one quad to the next.
 * Reloads the per-quad base value from bld->a (indexed by loop_iter),
 * adds the precomputed per-element deltas, and applies the perspective
 * divide for LP_INTERP_PERSPECTIVE attributes.
 */
static void
attribs_update(struct lp_build_interp_soa_context *bld,
               struct gallivm_state *gallivm,
               LLVMValueRef loop_iter,
               int start,
               int end)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context *coeff_bld = &bld->coeff_bld;
   LLVMValueRef oow = NULL;   /* 1/w, computed lazily, shared by all attribs */
   unsigned attrib;
   unsigned chan;

   for(attrib = start; attrib < end; ++attrib) {
      const unsigned mask = bld->mask[attrib];
      const unsigned interp = bld->interp[attrib];
      for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         if(mask & (1 << chan)) {
            LLVMValueRef a;
            if (interp == LP_INTERP_CONSTANT ||
                interp == LP_INTERP_FACING) {
               a = LLVMBuildLoad(builder, bld->a[attrib][chan], "");
            }
            else if (interp == LP_INTERP_POSITION) {
               assert(attrib > 0);
               a = bld->attribs[0][chan];
            }
            else {
               LLVMValueRef dadq;

               a = bld->a[attrib][chan];

               /*
                * Broadcast the attribute value for this quad into all elements
                */

               {
                  /* stored as vector load as float */
                  LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
                        gallivm->context), 0);
                  LLVMValueRef ptr;
                  a = LLVMBuildBitCast(builder, a, ptr_type, "");
                  ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
                  a = LLVMBuildLoad(builder, ptr, "");
                  a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
               }

               /*
                * Get the derivatives.
                */

               dadq = bld->dadq[attrib][chan];

#if PERSPECTIVE_DIVIDE_PER_QUAD
               /*
                * NOTE(review): dead branch (macro is 0); it references an
                * undeclared 'shuffle' and bld->oow, so it would not compile
                * as-is if enabled.
                */
               if (interp == LP_INTERP_PERSPECTIVE) {
                  LLVMValueRef dwdq = bld->dadq[0][3];

                  if (oow == NULL) {
                     assert(bld->oow);
                     oow = LLVMBuildShuffleVector(coeff_bld->builder,
                                                  bld->oow, coeff_bld->undef,
                                                  shuffle, "");
                  }

                  dadq = lp_build_sub(coeff_bld,
                                      dadq,
                                      lp_build_mul(coeff_bld, a, dwdq));
                  dadq = lp_build_mul(coeff_bld, dadq, oow);
               }
#endif

               /*
                * Add the derivatives
                */

               a = lp_build_add(coeff_bld, a, dadq);

#if !PERSPECTIVE_DIVIDE_PER_QUAD
               if (interp == LP_INTERP_PERSPECTIVE) {
                  if (oow == NULL) {
                     /* w was interpolated first (attrib 0), so reuse it */
                     LLVMValueRef w = bld->attribs[0][3];
                     assert(attrib != 0);
                     assert(bld->mask[0] & TGSI_WRITEMASK_W);
                     oow = lp_build_rcp(coeff_bld, w);
                  }
                  a = lp_build_mul(coeff_bld, a, oow);
               }
#endif

               if (attrib == 0 && chan == 2) {
                  /* FIXME: Depth values can exceed 1.0, due to the fact that
                   * setup interpolation coefficients refer to (0,0) which causes
                   * precision loss. So we must clamp to 1.0 here to avoid artifacts
                   */
                  a = lp_build_min(coeff_bld, a, coeff_bld->one);
               }

               attrib_name(a, attrib, chan, "");
            }
            bld->attribs[attrib][chan] = a;
         }
      }
   }
}


/**
 * Generate the position vectors.
 *
 * Parameter x0, y0 are the integer values with upper left coordinates.
 * Converted to float and stored in bld->x / bld->y for later use.
 */
static void
pos_init(struct lp_build_interp_soa_context *bld,
         LLVMValueRef x0,
         LLVMValueRef y0)
{
   LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
   struct lp_build_context *coeff_bld = &bld->coeff_bld;

   bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
   bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
}


/**
 * Initialize fragment shader input attribute info.
 *
 * Zeroes *bld, sets up the coefficient/setup build contexts, records the
 * per-attribute interpolation modes and usage masks (slot 0 is the
 * position), precomputes per-loop pixel offset vectors, and loads the
 * interpolation coefficients via coeffs_init_simple()/coeffs_init().
 */
void
lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                         struct gallivm_state *gallivm,
                         unsigned num_inputs,
                         const struct lp_shader_input *inputs,
                         boolean pixel_center_integer,
                         LLVMBuilderRef builder,
                         struct lp_type type,
                         LLVMValueRef a0_ptr,
                         LLVMValueRef dadx_ptr,
                         LLVMValueRef dady_ptr,
                         LLVMValueRef x0,
                         LLVMValueRef y0)
{
   struct lp_type coeff_type;
   struct lp_type setup_type;
   unsigned attrib;
   unsigned chan;

   memset(bld, 0, sizeof *bld);

   /* per-lane interpolation type: float vector of the caller's length */
   memset(&coeff_type, 0, sizeof coeff_type);
   coeff_type.floating = TRUE;
   coeff_type.sign = TRUE;
   coeff_type.width = 32;
   coeff_type.length = type.length;

   /* setup type: one 4-channel (xyzw) float vector per attribute */
   memset(&setup_type, 0, sizeof setup_type);
   setup_type.floating = TRUE;
   setup_type.sign = TRUE;
   setup_type.width = 32;
   setup_type.length = TGSI_NUM_CHANNELS;


   /* XXX: we don't support interpolating into any other types */
   assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);

   lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
   lp_build_context_init(&bld->setup_bld, gallivm, setup_type);

   /* For convenience */
   bld->pos = bld->attribs[0];
   bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];

   /* Position */
   bld->mask[0] = TGSI_WRITEMASK_XYZW;
   bld->interp[0] = LP_INTERP_LINEAR;

   /* Inputs */
   for (attrib = 0; attrib < num_inputs; ++attrib) {
      bld->mask[1 + attrib] = inputs[attrib].usage_mask;
      bld->interp[1 + attrib] = inputs[attrib].interp;
   }
   bld->num_attribs = 1 + num_inputs;

   /* Ensure all masked out input channels have a valid value */
   for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
      for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         bld->attribs[attrib][chan] = bld->coeff_bld.undef;
      }
   }

   /* sample at pixel centers (0.5 offset) unless the center is integral */
   if (pixel_center_integer) {
      bld->pos_offset = 0.0;
   } else {
      bld->pos_offset = 0.5;
   }

   pos_init(bld, x0, y0);

   /*
    * Simple method (single step interpolation) may be slower if vector length
    * is just 4, but the results are different (generally less accurate) with
    * the other method, so always use more accurate version.
    */
   if (1) {
      bld->simple_interp = TRUE;
      {
         /* XXX this should use a global static table */
         unsigned i;
         /* one offset vector per loop over the 4x4 block (16 pixels) */
         unsigned num_loops = 16 / type.length;
         LLVMValueRef pixoffx, pixoffy, index;
         LLVMValueRef ptr;

         bld->xoffset_store = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm, type),
                                                    lp_build_const_int32(gallivm, num_loops),
                                                    "");
         bld->yoffset_store = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm, type),
                                                    lp_build_const_int32(gallivm, num_loops),
                                                    "");
         for (i = 0; i < num_loops; i++) {
            index = lp_build_const_int32(gallivm, i);
            calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
            ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
            LLVMBuildStore(builder, pixoffx, ptr);
            ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
            LLVMBuildStore(builder, pixoffy, ptr);
         }
      }
      coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
   }
   else {
      bld->simple_interp = FALSE;
      coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
   }

}


/*
 * Advance the position and inputs to the given quad within the block.
+ */ + +void +lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld, + struct gallivm_state *gallivm, + LLVMValueRef quad_start_index) +{ + if (bld->simple_interp) { + attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs); + } + else { + attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs); + } +} + +void +lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld, + struct gallivm_state *gallivm, + LLVMValueRef quad_start_index) +{ + if (bld->simple_interp) { + attribs_update_simple(bld, gallivm, quad_start_index, 0, 1); + } + else { + attribs_update(bld, gallivm, quad_start_index, 0, 1); + } +} + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_interp.h new file mode 100644 index 000000000..9029d2a41 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_bld_interp.h @@ -0,0 +1,137 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Position and shader input interpolation. + * + * Special attention is given to the interpolation of side by side quads. + * Multiplications are made only for the first quad. Interpolation of + * inputs for posterior quads are done exclusively with additions, and + * perspective divide if necessary. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + +#ifndef LP_BLD_INTERP_H +#define LP_BLD_INTERP_H + + +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_type.h" + +#include "tgsi/tgsi_exec.h" + +/** + * Describes how to compute the interpolation coefficients (a0, dadx, dady) + * from the vertices passed into our triangle/line/point functions by the + * draw module. + * + * Vertices are treated as an array of float[4] values, indexed by + * src_index. + * + * LP_INTERP_COLOR is translated to either LP_INTERP_CONSTANT or + * PERSPECTIVE depending on flatshade state. 
+ */ +enum lp_interp { + LP_INTERP_CONSTANT, + LP_INTERP_COLOR, + LP_INTERP_LINEAR, + LP_INTERP_PERSPECTIVE, + LP_INTERP_POSITION, + LP_INTERP_FACING +}; + +struct lp_shader_input { + uint interp:4; /* enum lp_interp */ + uint usage_mask:4; /* bitmask of TGSI_WRITEMASK_x flags */ + uint src_index:8; /* where to find values in incoming vertices */ + uint cyl_wrap:4; /* TGSI_CYLINDRICAL_WRAP_x flags */ + uint padding:12; +}; + + +struct lp_build_interp_soa_context +{ + /* TGSI_QUAD_SIZE x float */ + struct lp_build_context coeff_bld; + struct lp_build_context setup_bld; + + unsigned num_attribs; + unsigned mask[1 + PIPE_MAX_SHADER_INPUTS]; /**< TGSI_WRITE_MASK_x */ + enum lp_interp interp[1 + PIPE_MAX_SHADER_INPUTS]; + boolean simple_interp; + + double pos_offset; + + LLVMValueRef x; + LLVMValueRef y; + + LLVMValueRef a[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef dadq[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef a0aos[1 + PIPE_MAX_SHADER_INPUTS]; + LLVMValueRef dadxaos[1 + PIPE_MAX_SHADER_INPUTS]; + LLVMValueRef dadyaos[1 + PIPE_MAX_SHADER_INPUTS]; + + LLVMValueRef attribs[1 + PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + + LLVMValueRef xoffset_store; + LLVMValueRef yoffset_store; + + /* + * Convenience pointers. Callers may access this one. 
+ */ + const LLVMValueRef *pos; + const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS]; +}; + + +void +lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, + struct gallivm_state *gallivm, + unsigned num_inputs, + const struct lp_shader_input *inputs, + boolean pixel_center_integer, + LLVMBuilderRef builder, + struct lp_type type, + LLVMValueRef a0_ptr, + LLVMValueRef dadx_ptr, + LLVMValueRef dady_ptr, + LLVMValueRef x, + LLVMValueRef y); + +void +lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld, + struct gallivm_state *gallivm, + LLVMValueRef quad_start_index); + +void +lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld, + struct gallivm_state *gallivm, + LLVMValueRef quad_start_index); + +#endif /* LP_BLD_INTERP_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_clear.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_clear.c new file mode 100644 index 000000000..064206fc2 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_clear.c @@ -0,0 +1,63 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * Copyright 2009 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Brian Paul + * Michel Dänzer + */ + + +#include "pipe/p_defines.h" +#include "lp_clear.h" +#include "lp_context.h" +#include "lp_setup.h" +#include "lp_query.h" +#include "lp_debug.h" + + +/** + * Clear the given buffers to the specified values. + * No masking, no scissor (clear entire buffer). + */ +void +llvmpipe_clear(struct pipe_context *pipe, + unsigned buffers, + const union pipe_color_union *color, + double depth, + unsigned stencil) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + if (!llvmpipe_check_render_cond(llvmpipe)) + return; + + if (LP_PERF & PERF_NO_DEPTH) + buffers &= ~PIPE_CLEAR_DEPTHSTENCIL; + + lp_setup_clear( llvmpipe->setup, color, depth, stencil, buffers ); +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_clear.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_clear.h new file mode 100644 index 000000000..7249929cb --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_clear.h @@ -0,0 +1,44 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Brian Paul + */ + +#ifndef LP_CLEAR_H +#define LP_CLEAR_H + +#include "pipe/p_state.h" +struct pipe_context; + +extern void +llvmpipe_clear(struct pipe_context *pipe, unsigned buffers, + const union pipe_color_union *color, + double depth, unsigned stencil); + + +#endif /* LP_CLEAR_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_context.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_context.c new file mode 100644 index 000000000..80cb6578b --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_context.c @@ -0,0 +1,226 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * Copyright 2008 VMware, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Author: + * Keith Whitwell <keithw@vmware.com> + */ + +#include "draw/draw_context.h" +#include "draw/draw_vbuf.h" +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/simple_list.h" +#include "lp_clear.h" +#include "lp_context.h" +#include "lp_flush.h" +#include "lp_perf.h" +#include "lp_state.h" +#include "lp_surface.h" +#include "lp_query.h" +#include "lp_setup.h" + +/* This is only safe if there's just one concurrent context */ +#ifdef PIPE_SUBSYSTEM_EMBEDDED +#define USE_GLOBAL_LLVM_CONTEXT +#endif + +static void llvmpipe_destroy( struct pipe_context *pipe ) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe ); + uint i, j; + + lp_print_counters(); + + if (llvmpipe->blitter) { + util_blitter_destroy(llvmpipe->blitter); + } + + /* This will also destroy llvmpipe->setup: + */ + if (llvmpipe->draw) + draw_destroy( llvmpipe->draw ); + + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + pipe_surface_reference(&llvmpipe->framebuffer.cbufs[i], NULL); + } + + pipe_surface_reference(&llvmpipe->framebuffer.zsbuf, NULL); + + for (i = 0; i < Elements(llvmpipe->sampler_views[0]); i++) { + pipe_sampler_view_reference(&llvmpipe->sampler_views[PIPE_SHADER_FRAGMENT][i], NULL); + } + + for (i = 0; i < Elements(llvmpipe->sampler_views[0]); i++) { + pipe_sampler_view_reference(&llvmpipe->sampler_views[PIPE_SHADER_VERTEX][i], NULL); + } + + for (i = 0; i < Elements(llvmpipe->sampler_views[0]); i++) { + pipe_sampler_view_reference(&llvmpipe->sampler_views[PIPE_SHADER_GEOMETRY][i], NULL); + } + + for (i = 0; i < Elements(llvmpipe->constants); i++) { + for (j = 0; j < Elements(llvmpipe->constants[i]); j++) { + pipe_resource_reference(&llvmpipe->constants[i][j].buffer, NULL); + } + } + + for (i = 0; i < llvmpipe->num_vertex_buffers; i++) { + pipe_resource_reference(&llvmpipe->vertex_buffer[i].buffer, NULL); + } + + 
lp_delete_setup_variants(llvmpipe); + +#ifndef USE_GLOBAL_LLVM_CONTEXT + LLVMContextDispose(llvmpipe->context); +#endif + llvmpipe->context = NULL; + + align_free( llvmpipe ); +} + +static void +do_flush( struct pipe_context *pipe, + struct pipe_fence_handle **fence, + unsigned flags) +{ + llvmpipe_flush(pipe, fence, __FUNCTION__); +} + + +static void +llvmpipe_render_condition ( struct pipe_context *pipe, + struct pipe_query *query, + boolean condition, + uint mode ) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe ); + + llvmpipe->render_cond_query = query; + llvmpipe->render_cond_mode = mode; + llvmpipe->render_cond_cond = condition; +} + +struct pipe_context * +llvmpipe_create_context( struct pipe_screen *screen, void *priv ) +{ + struct llvmpipe_context *llvmpipe; + + llvmpipe = align_malloc(sizeof(struct llvmpipe_context), 16); + if (!llvmpipe) + return NULL; + + util_init_math(); + + memset(llvmpipe, 0, sizeof *llvmpipe); + + make_empty_list(&llvmpipe->fs_variants_list); + + make_empty_list(&llvmpipe->setup_variants_list); + + + llvmpipe->pipe.screen = screen; + llvmpipe->pipe.priv = priv; + + /* Init the pipe context methods */ + llvmpipe->pipe.destroy = llvmpipe_destroy; + llvmpipe->pipe.set_framebuffer_state = llvmpipe_set_framebuffer_state; + llvmpipe->pipe.clear = llvmpipe_clear; + llvmpipe->pipe.flush = do_flush; + + llvmpipe->pipe.render_condition = llvmpipe_render_condition; + + llvmpipe_init_blend_funcs(llvmpipe); + llvmpipe_init_clip_funcs(llvmpipe); + llvmpipe_init_draw_funcs(llvmpipe); + llvmpipe_init_sampler_funcs(llvmpipe); + llvmpipe_init_query_funcs( llvmpipe ); + llvmpipe_init_vertex_funcs(llvmpipe); + llvmpipe_init_so_funcs(llvmpipe); + llvmpipe_init_fs_funcs(llvmpipe); + llvmpipe_init_vs_funcs(llvmpipe); + llvmpipe_init_gs_funcs(llvmpipe); + llvmpipe_init_rasterizer_funcs(llvmpipe); + llvmpipe_init_context_resource_funcs( &llvmpipe->pipe ); + llvmpipe_init_surface_functions(llvmpipe); + +#ifdef USE_GLOBAL_LLVM_CONTEXT + 
llvmpipe->context = LLVMGetGlobalContext(); +#else + llvmpipe->context = LLVMContextCreate(); +#endif + + if (!llvmpipe->context) + goto fail; + + /* + * Create drawing context and plug our rendering stage into it. + */ + llvmpipe->draw = draw_create_with_llvm_context(&llvmpipe->pipe, + llvmpipe->context); + if (!llvmpipe->draw) + goto fail; + + /* FIXME: devise alternative to draw_texture_samplers */ + + llvmpipe->setup = lp_setup_create( &llvmpipe->pipe, + llvmpipe->draw ); + if (!llvmpipe->setup) + goto fail; + + llvmpipe->blitter = util_blitter_create(&llvmpipe->pipe); + if (!llvmpipe->blitter) { + goto fail; + } + + /* must be done before installing Draw stages */ + util_blitter_cache_all_shaders(llvmpipe->blitter); + + /* plug in AA line/point stages */ + draw_install_aaline_stage(llvmpipe->draw, &llvmpipe->pipe); + draw_install_aapoint_stage(llvmpipe->draw, &llvmpipe->pipe); + draw_install_pstipple_stage(llvmpipe->draw, &llvmpipe->pipe); + + /* convert points and lines into triangles: + * (otherwise, draw points and lines natively) + */ + draw_wide_point_sprites(llvmpipe->draw, FALSE); + draw_enable_point_sprites(llvmpipe->draw, FALSE); + draw_wide_point_threshold(llvmpipe->draw, 10000.0); + draw_wide_line_threshold(llvmpipe->draw, 10000.0); + + lp_reset_counters(); + + return &llvmpipe->pipe; + + fail: + llvmpipe_destroy(&llvmpipe->pipe); + return NULL; +} + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_context.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_context.h new file mode 100644 index 000000000..c273b25f0 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_context.h @@ -0,0 +1,179 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keithw@vmware.com> + */ + +#ifndef LP_CONTEXT_H +#define LP_CONTEXT_H + +#include "pipe/p_context.h" + +#include "draw/draw_vertex.h" +#include "util/u_blitter.h" + +#include "lp_tex_sample.h" +#include "lp_jit.h" +#include "lp_setup.h" +#include "lp_state_fs.h" +#include "lp_state_setup.h" + + +struct llvmpipe_vbuf_render; +struct draw_context; +struct draw_stage; +struct draw_vertex_shader; +struct lp_fragment_shader; +struct lp_blend_state; +struct lp_setup_context; +struct lp_setup_variant; +struct lp_velems_state; + +struct llvmpipe_context { + struct pipe_context pipe; /**< base class */ + + /** Constant state objects */ + const struct pipe_blend_state *blend; + struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; + + const struct pipe_depth_stencil_alpha_state *depth_stencil; + const struct pipe_rasterizer_state *rasterizer; + struct lp_fragment_shader *fs; + struct draw_vertex_shader *vs; + const struct lp_geometry_shader *gs; + const struct lp_velems_state *velems; + const struct lp_so_state *so; + + /** Other rendering state */ + unsigned sample_mask; + struct pipe_blend_color blend_color; + struct pipe_stencil_ref stencil_ref; + struct pipe_clip_state clip; + struct pipe_constant_buffer constants[PIPE_SHADER_TYPES][LP_MAX_TGSI_CONST_BUFFERS]; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS]; + struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; + + struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS]; + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + struct pipe_index_buffer index_buffer; + struct pipe_resource *mapped_vs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + struct pipe_resource *mapped_gs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + + unsigned num_samplers[PIPE_SHADER_TYPES]; + 
unsigned num_sampler_views[PIPE_SHADER_TYPES]; + + unsigned num_vertex_buffers; + + struct draw_so_target *so_targets[PIPE_MAX_SO_BUFFERS]; + int num_so_targets; + struct pipe_query_data_so_statistics so_stats; + + struct pipe_query_data_pipeline_statistics pipeline_statistics; + unsigned active_statistics_queries; + + unsigned active_occlusion_queries; + + unsigned dirty; /**< Mask of LP_NEW_x flags */ + + /** Mapped vertex buffers */ + ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS]; + + /** Vertex format */ + struct vertex_info vertex_info; + + /** Which vertex shader output slot contains color */ + int color_slot[2]; + + /** Which vertex shader output slot contains bcolor */ + int bcolor_slot[2]; + + /** Which vertex shader output slot contains point size */ + int psize_slot; + + /** Which vertex shader output slot contains viewport index */ + int viewport_index_slot; + + /** Which geometry shader output slot contains layer */ + int layer_slot; + + /** A fake frontface output for unfilled primitives */ + int face_slot; + + /** Depth format and bias settings. 
*/ + boolean floating_point_depth; + double mrd; /**< minimum resolvable depth value, for polygon offset */ + + /** The tiling engine */ + struct lp_setup_context *setup; + struct lp_setup_variant setup_variant; + + /** The primitive drawing context */ + struct draw_context *draw; + + struct blitter_context *blitter; + + unsigned tex_timestamp; + boolean no_rast; + + /** List of all fragment shader variants */ + struct lp_fs_variant_list_item fs_variants_list; + unsigned nr_fs_variants; + unsigned nr_fs_instrs; + + struct lp_setup_variant_list_item setup_variants_list; + unsigned nr_setup_variants; + + /** Conditional query object and mode */ + struct pipe_query *render_cond_query; + uint render_cond_mode; + boolean render_cond_cond; + + /** The LLVMContext to use for LLVM related work */ + LLVMContextRef context; +}; + + +struct pipe_context * +llvmpipe_create_context( struct pipe_screen *screen, void *priv ); + +struct pipe_resource * +llvmpipe_user_buffer_create(struct pipe_screen *screen, + void *ptr, + unsigned bytes, + unsigned bind_flags); + + +static inline struct llvmpipe_context * +llvmpipe_context( struct pipe_context *pipe ) +{ + return (struct llvmpipe_context *)pipe; +} + +#endif /* LP_CONTEXT_H */ + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_debug.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_debug.h new file mode 100644 index 000000000..1038c5fe1 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_debug.h @@ -0,0 +1,88 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef LP_DEBUG_H +#define LP_DEBUG_H + +#include "pipe/p_compiler.h" +#include "util/u_debug.h" + +extern void +st_print_current(void); + + +#define DEBUG_PIPE 0x1 +#define DEBUG_TGSI 0x2 +#define DEBUG_TEX 0x4 +#define DEBUG_SETUP 0x10 +#define DEBUG_RAST 0x20 +#define DEBUG_QUERY 0x40 +#define DEBUG_SCREEN 0x80 +#define DEBUG_COUNTERS 0x800 +#define DEBUG_SCENE 0x1000 +#define DEBUG_FENCE 0x2000 +#define DEBUG_MEM 0x4000 +#define DEBUG_FS 0x8000 + +/* Performance flags. These are active even on release builds. 
+ */ +#define PERF_TEX_MEM 0x1 /* minimize texture cache footprint */ +#define PERF_NO_MIP_LINEAR 0x2 /* MIP_FILTER_LINEAR ==> _NEAREST */ +#define PERF_NO_MIPMAPS 0x4 /* MIP_FILTER_NONE always */ +#define PERF_NO_LINEAR 0x8 /* FILTER_NEAREST always */ +#define PERF_NO_TEX 0x10 /* sample white always */ +#define PERF_NO_BLEND 0x20 /* disable blending */ +#define PERF_NO_DEPTH 0x40 /* disable depth buffering entirely */ +#define PERF_NO_ALPHATEST 0x80 /* disable alpha testing */ + + +extern int LP_PERF; + +#ifdef DEBUG +extern int LP_DEBUG; +#else +#define LP_DEBUG 0 +#endif + +void st_debug_init( void ); + +static inline void +LP_DBG( unsigned flag, const char *fmt, ... ) +{ + if (LP_DEBUG & flag) + { + va_list args; + + va_start( args, fmt ); + debug_vprintf( fmt, args ); + va_end( args ); + } +} + + +#endif /* LP_DEBUG_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_draw_arrays.c new file mode 100644 index 000000000..edfb20409 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_draw_arrays.c @@ -0,0 +1,169 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Brian Paul + * Keith Whitwell + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "util/u_draw.h" +#include "util/u_prim.h" + +#include "lp_context.h" +#include "lp_state.h" +#include "lp_query.h" + +#include "draw/draw_context.h" + + + +/** + * Draw vertex arrays, with optional indexing, optional instancing. + * All the other drawing functions are implemented in terms of this function. + * Basically, map the vertex buffers (and drawing surfaces), then hand off + * the drawing to the 'draw' module. 
+ */ +static void +llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +{ + struct llvmpipe_context *lp = llvmpipe_context(pipe); + struct draw_context *draw = lp->draw; + const void *mapped_indices = NULL; + unsigned i; + + if (!llvmpipe_check_render_cond(lp)) + return; + + if (info->indirect) { + util_draw_indirect(pipe, info); + return; + } + + if (lp->dirty) + llvmpipe_update_derived( lp ); + + /* + * Map vertex buffers + */ + for (i = 0; i < lp->num_vertex_buffers; i++) { + const void *buf = lp->vertex_buffer[i].user_buffer; + size_t size = ~0; + if (!buf) { + if (!lp->vertex_buffer[i].buffer) { + continue; + } + buf = llvmpipe_resource_data(lp->vertex_buffer[i].buffer); + size = lp->vertex_buffer[i].buffer->width0; + } + draw_set_mapped_vertex_buffer(draw, i, buf, size); + } + + /* Map index buffer, if present */ + if (info->indexed) { + unsigned available_space = ~0; + mapped_indices = lp->index_buffer.user_buffer; + if (!mapped_indices) { + mapped_indices = llvmpipe_resource_data(lp->index_buffer.buffer); + if (lp->index_buffer.buffer->width0 > lp->index_buffer.offset) + available_space = + (lp->index_buffer.buffer->width0 - lp->index_buffer.offset); + else + available_space = 0; + } + draw_set_indexes(draw, + (ubyte *) mapped_indices + lp->index_buffer.offset, + lp->index_buffer.index_size, available_space); + } + + for (i = 0; i < lp->num_so_targets; i++) { + void *buf = 0; + if (lp->so_targets[i]) { + buf = llvmpipe_resource(lp->so_targets[i]->target.buffer)->data; + lp->so_targets[i]->mapping = buf; + } + } + draw_set_mapped_so_targets(draw, lp->num_so_targets, + lp->so_targets); + + llvmpipe_prepare_vertex_sampling(lp, + lp->num_sampler_views[PIPE_SHADER_VERTEX], + lp->sampler_views[PIPE_SHADER_VERTEX]); + llvmpipe_prepare_geometry_sampling(lp, + lp->num_sampler_views[PIPE_SHADER_GEOMETRY], + lp->sampler_views[PIPE_SHADER_GEOMETRY]); + if (lp->gs && lp->gs->no_tokens) { + /* we have an empty geometry shader with stream 
output, so + attach the stream output info to the current vertex shader */ + if (lp->vs) { + draw_vs_attach_so(lp->vs, &lp->gs->stream_output); + } + } + draw_collect_pipeline_statistics(draw, + lp->active_statistics_queries > 0); + + /* draw! */ + draw_vbo(draw, info); + + /* + * unmap vertex/index buffers + */ + for (i = 0; i < lp->num_vertex_buffers; i++) { + draw_set_mapped_vertex_buffer(draw, i, NULL, 0); + } + if (mapped_indices) { + draw_set_indexes(draw, NULL, 0, 0); + } + draw_set_mapped_so_targets(draw, 0, NULL); + + if (lp->gs && lp->gs->no_tokens) { + /* we have attached stream output to the vs for rendering, + now lets reset it */ + if (lp->vs) { + draw_vs_reset_so(lp->vs); + } + } + + llvmpipe_cleanup_vertex_sampling(lp); + llvmpipe_cleanup_geometry_sampling(lp); + + /* + * TODO: Flush only when a user vertex/index buffer is present + * (or even better, modify draw module to do this + * internally when this condition is seen?) + */ + draw_flush(draw); +} + + +void +llvmpipe_init_draw_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.draw_vbo = llvmpipe_draw_vbo; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_fence.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_fence.c new file mode 100644 index 000000000..a21a3c744 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_fence.c @@ -0,0 +1,127 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_screen.h" +#include "util/u_memory.h" +#include "lp_debug.h" +#include "lp_fence.h" + + +/** + * Create a new fence object. + * + * The rank will be the number of bins in the scene. Whenever a rendering + * thread hits a fence command, it'll increment the fence counter. When + * the counter == the rank, the fence is finished. + * + * \param rank the expected finished value of the fence counter. 
+ */ +struct lp_fence * +lp_fence_create(unsigned rank) +{ + static int fence_id; + struct lp_fence *fence = CALLOC_STRUCT(lp_fence); + + if (!fence) + return NULL; + + pipe_reference_init(&fence->reference, 1); + + pipe_mutex_init(fence->mutex); + pipe_condvar_init(fence->signalled); + + fence->id = fence_id++; + fence->rank = rank; + + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s %d\n", __FUNCTION__, fence->id); + + return fence; +} + + +/** Destroy a fence. Called when refcount hits zero. */ +void +lp_fence_destroy(struct lp_fence *fence) +{ + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s %d\n", __FUNCTION__, fence->id); + + pipe_mutex_destroy(fence->mutex); + pipe_condvar_destroy(fence->signalled); + FREE(fence); +} + + +/** + * Called by the rendering threads to increment the fence counter. + * When the counter == the rank, the fence is finished. + */ +void +lp_fence_signal(struct lp_fence *fence) +{ + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s %d\n", __FUNCTION__, fence->id); + + pipe_mutex_lock(fence->mutex); + + fence->count++; + assert(fence->count <= fence->rank); + + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s count=%u rank=%u\n", __FUNCTION__, + fence->count, fence->rank); + + /* Wakeup all threads waiting on the mutex: + */ + pipe_condvar_broadcast(fence->signalled); + + pipe_mutex_unlock(fence->mutex); +} + +boolean +lp_fence_signalled(struct lp_fence *f) +{ + return f->count == f->rank; +} + +void +lp_fence_wait(struct lp_fence *f) +{ + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s %d\n", __FUNCTION__, f->id); + + pipe_mutex_lock(f->mutex); + assert(f->issued); + while (f->count < f->rank) { + pipe_condvar_wait(f->signalled, f->mutex); + } + pipe_mutex_unlock(f->mutex); +} + + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_fence.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_fence.h new file mode 100644 index 000000000..d7f0c153e --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_fence.h @@ -0,0 +1,95 @@ 
+/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#ifndef LP_FENCE_H +#define LP_FENCE_H + + +#include "os/os_thread.h" +#include "pipe/p_state.h" +#include "util/u_inlines.h" + + +struct pipe_screen; + + +struct lp_fence +{ + struct pipe_reference reference; + unsigned id; + + pipe_mutex mutex; + pipe_condvar signalled; + + boolean issued; + unsigned rank; + unsigned count; +}; + + +struct lp_fence * +lp_fence_create(unsigned rank); + + +void +lp_fence_signal(struct lp_fence *fence); + +boolean +lp_fence_signalled(struct lp_fence *fence); + +void +lp_fence_wait(struct lp_fence *fence); + +void +llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen); + + +void +lp_fence_destroy(struct lp_fence *fence); + +static inline void +lp_fence_reference(struct lp_fence **ptr, + struct lp_fence *f) +{ + struct lp_fence *old = *ptr; + + if (pipe_reference(&old->reference, &f->reference)) { + lp_fence_destroy(old); + } + + *ptr = f; +} + +static inline boolean +lp_fence_issued(const struct lp_fence *fence) +{ + return fence->issued; +} + + +#endif /* LP_FENCE_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_flush.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_flush.c new file mode 100644 index 000000000..268aab26c --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_flush.c @@ -0,0 +1,131 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Author: + * Keith Whitwell <keithw@vmware.com> + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" +#include "util/u_string.h" +#include "draw/draw_context.h" +#include "lp_flush.h" +#include "lp_context.h" +#include "lp_setup.h" + + +/** + * \param fence if non-null, returns pointer to a fence which can be waited on + */ +void +llvmpipe_flush( struct pipe_context *pipe, + struct pipe_fence_handle **fence, + const char *reason) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + draw_flush(llvmpipe->draw); + + /* ask the setup module to flush */ + lp_setup_flush(llvmpipe->setup, fence, reason); + + /* Enable to dump BMPs of the color/depth buffers each frame */ + if (0) { + static unsigned frame_no = 1; + char filename[256]; + unsigned i; + + for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) { + util_snprintf(filename, sizeof(filename), "cbuf%u_%u", i, frame_no); + debug_dump_surface_bmp(&llvmpipe->pipe, filename, llvmpipe->framebuffer.cbufs[i]); + } + + if (0) { + util_snprintf(filename, sizeof(filename), "zsbuf_%u", frame_no); + debug_dump_surface_bmp(&llvmpipe->pipe, filename, llvmpipe->framebuffer.zsbuf); + } + + ++frame_no; + } +} + +void +llvmpipe_finish( struct pipe_context *pipe, + const char *reason ) +{ + struct pipe_fence_handle *fence = NULL; + llvmpipe_flush(pipe, &fence, reason); + if (fence) { + pipe->screen->fence_finish(pipe->screen, fence, PIPE_TIMEOUT_INFINITE); + pipe->screen->fence_reference(pipe->screen, &fence, NULL); + } +} + +/** + * Flush context if necessary. + * + * Returns FALSE if it would have block, but do_not_block was set, TRUE + * otherwise. + * + * TODO: move this logic to an auxiliary library? 
+ */ +boolean +llvmpipe_flush_resource(struct pipe_context *pipe, + struct pipe_resource *resource, + unsigned level, + boolean read_only, + boolean cpu_access, + boolean do_not_block, + const char *reason) +{ + unsigned referenced; + + referenced = llvmpipe_is_resource_referenced(pipe, resource, level); + + if ((referenced & LP_REFERENCED_FOR_WRITE) || + ((referenced & LP_REFERENCED_FOR_READ) && !read_only)) { + + if (cpu_access) { + /* + * Flush and wait. + */ + if (do_not_block) + return FALSE; + + llvmpipe_finish(pipe, reason); + } else { + /* + * Just flush. + */ + + llvmpipe_flush(pipe, NULL, reason); + } + } + + return TRUE; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_flush.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_flush.h new file mode 100644 index 000000000..68f513028 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_flush.h @@ -0,0 +1,55 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_FLUSH_H +#define LP_FLUSH_H + +#include "pipe/p_compiler.h" + +struct pipe_context; +struct pipe_fence_handle; +struct pipe_resource; + +void +llvmpipe_flush(struct pipe_context *pipe, + struct pipe_fence_handle **fence, + const char *reason); + +void +llvmpipe_finish( struct pipe_context *pipe, + const char *reason ); + +boolean +llvmpipe_flush_resource(struct pipe_context *pipe, + struct pipe_resource *resource, + unsigned level, + boolean read_only, + boolean cpu_access, + boolean do_not_block, + const char *reason); + +#endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_jit.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_jit.c new file mode 100644 index 000000000..9acde4f1b --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_jit.c @@ -0,0 +1,246 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * C - JIT interfaces + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + + +#include "util/u_memory.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_debug.h" +#include "lp_context.h" +#include "lp_jit.h" + + +static void +lp_jit_create_types(struct lp_fragment_shader_variant *lp) +{ + struct gallivm_state *gallivm = lp->gallivm; + LLVMContextRef lc = gallivm->context; + LLVMTypeRef viewport_type, texture_type, sampler_type; + + /* struct lp_jit_viewport */ + { + LLVMTypeRef elem_types[LP_JIT_VIEWPORT_NUM_FIELDS]; + + elem_types[LP_JIT_VIEWPORT_MIN_DEPTH] = + elem_types[LP_JIT_VIEWPORT_MAX_DEPTH] = LLVMFloatTypeInContext(lc); + + viewport_type = LLVMStructTypeInContext(lc, elem_types, + Elements(elem_types), 0); + + LP_CHECK_MEMBER_OFFSET(struct lp_jit_viewport, min_depth, + gallivm->target, viewport_type, + LP_JIT_VIEWPORT_MIN_DEPTH); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_viewport, max_depth, + gallivm->target, viewport_type, + LP_JIT_VIEWPORT_MAX_DEPTH); + LP_CHECK_STRUCT_SIZE(struct lp_jit_viewport, + gallivm->target, viewport_type); + } + + /* struct lp_jit_texture */ + { + LLVMTypeRef elem_types[LP_JIT_TEXTURE_NUM_FIELDS]; + + elem_types[LP_JIT_TEXTURE_WIDTH] = + elem_types[LP_JIT_TEXTURE_HEIGHT] = + elem_types[LP_JIT_TEXTURE_DEPTH] = + elem_types[LP_JIT_TEXTURE_FIRST_LEVEL] = + elem_types[LP_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32TypeInContext(lc); + 
elem_types[LP_JIT_TEXTURE_BASE] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); + elem_types[LP_JIT_TEXTURE_ROW_STRIDE] = + elem_types[LP_JIT_TEXTURE_IMG_STRIDE] = + elem_types[LP_JIT_TEXTURE_MIP_OFFSETS] = + LLVMArrayType(LLVMInt32TypeInContext(lc), LP_MAX_TEXTURE_LEVELS); + + texture_type = LLVMStructTypeInContext(lc, elem_types, + Elements(elem_types), 0); + + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, width, + gallivm->target, texture_type, + LP_JIT_TEXTURE_WIDTH); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, height, + gallivm->target, texture_type, + LP_JIT_TEXTURE_HEIGHT); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, depth, + gallivm->target, texture_type, + LP_JIT_TEXTURE_DEPTH); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, first_level, + gallivm->target, texture_type, + LP_JIT_TEXTURE_FIRST_LEVEL); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, last_level, + gallivm->target, texture_type, + LP_JIT_TEXTURE_LAST_LEVEL); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, base, + gallivm->target, texture_type, + LP_JIT_TEXTURE_BASE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, row_stride, + gallivm->target, texture_type, + LP_JIT_TEXTURE_ROW_STRIDE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, img_stride, + gallivm->target, texture_type, + LP_JIT_TEXTURE_IMG_STRIDE); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_texture, mip_offsets, + gallivm->target, texture_type, + LP_JIT_TEXTURE_MIP_OFFSETS); + LP_CHECK_STRUCT_SIZE(struct lp_jit_texture, + gallivm->target, texture_type); + } + + /* struct lp_jit_sampler */ + { + LLVMTypeRef elem_types[LP_JIT_SAMPLER_NUM_FIELDS]; + elem_types[LP_JIT_SAMPLER_MIN_LOD] = + elem_types[LP_JIT_SAMPLER_MAX_LOD] = + elem_types[LP_JIT_SAMPLER_LOD_BIAS] = LLVMFloatTypeInContext(lc); + elem_types[LP_JIT_SAMPLER_BORDER_COLOR] = + LLVMArrayType(LLVMFloatTypeInContext(lc), 4); + + sampler_type = LLVMStructTypeInContext(lc, elem_types, + Elements(elem_types), 0); + + LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, min_lod, + 
gallivm->target, sampler_type, + LP_JIT_SAMPLER_MIN_LOD); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, max_lod, + gallivm->target, sampler_type, + LP_JIT_SAMPLER_MAX_LOD); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, lod_bias, + gallivm->target, sampler_type, + LP_JIT_SAMPLER_LOD_BIAS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_sampler, border_color, + gallivm->target, sampler_type, + LP_JIT_SAMPLER_BORDER_COLOR); + LP_CHECK_STRUCT_SIZE(struct lp_jit_sampler, + gallivm->target, sampler_type); + } + + /* struct lp_jit_context */ + { + LLVMTypeRef elem_types[LP_JIT_CTX_COUNT]; + LLVMTypeRef context_type; + + elem_types[LP_JIT_CTX_CONSTANTS] = + LLVMArrayType(LLVMPointerType(LLVMFloatTypeInContext(lc), 0), LP_MAX_TGSI_CONST_BUFFERS); + elem_types[LP_JIT_CTX_NUM_CONSTANTS] = + LLVMArrayType(LLVMInt32TypeInContext(lc), LP_MAX_TGSI_CONST_BUFFERS); + elem_types[LP_JIT_CTX_ALPHA_REF] = LLVMFloatTypeInContext(lc); + elem_types[LP_JIT_CTX_STENCIL_REF_FRONT] = + elem_types[LP_JIT_CTX_STENCIL_REF_BACK] = LLVMInt32TypeInContext(lc); + elem_types[LP_JIT_CTX_U8_BLEND_COLOR] = LLVMPointerType(LLVMInt8TypeInContext(lc), 0); + elem_types[LP_JIT_CTX_F_BLEND_COLOR] = LLVMPointerType(LLVMFloatTypeInContext(lc), 0); + elem_types[LP_JIT_CTX_VIEWPORTS] = LLVMPointerType(viewport_type, 0); + elem_types[LP_JIT_CTX_TEXTURES] = LLVMArrayType(texture_type, + PIPE_MAX_SHADER_SAMPLER_VIEWS); + elem_types[LP_JIT_CTX_SAMPLERS] = LLVMArrayType(sampler_type, + PIPE_MAX_SAMPLERS); + + context_type = LLVMStructTypeInContext(lc, elem_types, + Elements(elem_types), 0); + + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, constants, + gallivm->target, context_type, + LP_JIT_CTX_CONSTANTS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, num_constants, + gallivm->target, context_type, + LP_JIT_CTX_NUM_CONSTANTS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, alpha_ref_value, + gallivm->target, context_type, + LP_JIT_CTX_ALPHA_REF); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_front, + 
gallivm->target, context_type, + LP_JIT_CTX_STENCIL_REF_FRONT); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, stencil_ref_back, + gallivm->target, context_type, + LP_JIT_CTX_STENCIL_REF_BACK); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, u8_blend_color, + gallivm->target, context_type, + LP_JIT_CTX_U8_BLEND_COLOR); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, f_blend_color, + gallivm->target, context_type, + LP_JIT_CTX_F_BLEND_COLOR); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, viewports, + gallivm->target, context_type, + LP_JIT_CTX_VIEWPORTS); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, textures, + gallivm->target, context_type, + LP_JIT_CTX_TEXTURES); + LP_CHECK_MEMBER_OFFSET(struct lp_jit_context, samplers, + gallivm->target, context_type, + LP_JIT_CTX_SAMPLERS); + LP_CHECK_STRUCT_SIZE(struct lp_jit_context, + gallivm->target, context_type); + + lp->jit_context_ptr_type = LLVMPointerType(context_type, 0); + } + + /* struct lp_jit_thread_data */ + { + LLVMTypeRef elem_types[LP_JIT_THREAD_DATA_COUNT]; + LLVMTypeRef thread_data_type; + + elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc); + elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] = + LLVMInt32TypeInContext(lc); + + thread_data_type = LLVMStructTypeInContext(lc, elem_types, + Elements(elem_types), 0); + + lp->jit_thread_data_ptr_type = LLVMPointerType(thread_data_type, 0); + } + + if (gallivm_debug & GALLIVM_DEBUG_IR) { + LLVMDumpModule(gallivm->module); + } +} + + +void +lp_jit_screen_cleanup(struct llvmpipe_screen *screen) +{ + /* nothing */ +} + + +boolean +lp_jit_screen_init(struct llvmpipe_screen *screen) +{ + return lp_build_init(); +} + + +void +lp_jit_init_types(struct lp_fragment_shader_variant *lp) +{ + if (!lp->jit_context_ptr_type) + lp_jit_create_types(lp); +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_jit.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_jit.h new file mode 100644 index 000000000..097fa7dce --- /dev/null +++ 
b/lib/mesa/src/gallium/drivers/llvmpipe/lp_jit.h @@ -0,0 +1,263 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * @file + * C - JIT interfaces + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + +#ifndef LP_JIT_H +#define LP_JIT_H + + +#include "gallivm/lp_bld_struct.h" +#include "gallivm/lp_bld_limits.h" + +#include "pipe/p_state.h" +#include "lp_texture.h" + + +struct lp_fragment_shader_variant; +struct llvmpipe_screen; + + +struct lp_jit_texture +{ + uint32_t width; /* same as number of elements */ + uint32_t height; + uint32_t depth; /* doubles as array size */ + uint32_t first_level; + uint32_t last_level; + const void *base; + uint32_t row_stride[LP_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[LP_MAX_TEXTURE_LEVELS]; + uint32_t mip_offsets[LP_MAX_TEXTURE_LEVELS]; +}; + + +struct lp_jit_sampler +{ + float min_lod; + float max_lod; + float lod_bias; + float border_color[4]; +}; + + +struct lp_jit_viewport +{ + float min_depth; + float max_depth; +}; + + +enum { + LP_JIT_TEXTURE_WIDTH = 0, + LP_JIT_TEXTURE_HEIGHT, + LP_JIT_TEXTURE_DEPTH, + LP_JIT_TEXTURE_FIRST_LEVEL, + LP_JIT_TEXTURE_LAST_LEVEL, + LP_JIT_TEXTURE_BASE, + LP_JIT_TEXTURE_ROW_STRIDE, + LP_JIT_TEXTURE_IMG_STRIDE, + LP_JIT_TEXTURE_MIP_OFFSETS, + LP_JIT_TEXTURE_NUM_FIELDS /* number of fields above */ +}; + + +enum { + LP_JIT_SAMPLER_MIN_LOD, + LP_JIT_SAMPLER_MAX_LOD, + LP_JIT_SAMPLER_LOD_BIAS, + LP_JIT_SAMPLER_BORDER_COLOR, + LP_JIT_SAMPLER_NUM_FIELDS /* number of fields above */ +}; + + +enum { + LP_JIT_VIEWPORT_MIN_DEPTH, + LP_JIT_VIEWPORT_MAX_DEPTH, + LP_JIT_VIEWPORT_NUM_FIELDS /* number of fields above */ +}; + + +/** + * This structure is passed directly to the generated fragment shader. + * + * It contains the derived state. + * + * Changes here must be reflected in the lp_jit_context_* macros and + * lp_jit_init_types function. Changes to the ordering should be avoided. + * + * Only use types with a clear size and padding here, in particular prefer the + * stdint.h types to the basic integer types. 
+ */ +struct lp_jit_context +{ + const float *constants[LP_MAX_TGSI_CONST_BUFFERS]; + int num_constants[LP_MAX_TGSI_CONST_BUFFERS]; + + float alpha_ref_value; + + uint32_t stencil_ref_front, stencil_ref_back; + + uint8_t *u8_blend_color; + float *f_blend_color; + + struct lp_jit_viewport *viewports; + + struct lp_jit_texture textures[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + struct lp_jit_sampler samplers[PIPE_MAX_SAMPLERS]; +}; + + +/** + * These enum values must match the position of the fields in the + * lp_jit_context struct above. + */ +enum { + LP_JIT_CTX_CONSTANTS = 0, + LP_JIT_CTX_NUM_CONSTANTS, + LP_JIT_CTX_ALPHA_REF, + LP_JIT_CTX_STENCIL_REF_FRONT, + LP_JIT_CTX_STENCIL_REF_BACK, + LP_JIT_CTX_U8_BLEND_COLOR, + LP_JIT_CTX_F_BLEND_COLOR, + LP_JIT_CTX_VIEWPORTS, + LP_JIT_CTX_TEXTURES, + LP_JIT_CTX_SAMPLERS, + LP_JIT_CTX_COUNT +}; + + +#define lp_jit_context_constants(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_CONSTANTS, "constants") + +#define lp_jit_context_num_constants(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_NUM_CONSTANTS, "num_constants") + +#define lp_jit_context_alpha_ref_value(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_ALPHA_REF, "alpha_ref_value") + +#define lp_jit_context_stencil_ref_front_value(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_STENCIL_REF_FRONT, "stencil_ref_front") + +#define lp_jit_context_stencil_ref_back_value(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_STENCIL_REF_BACK, "stencil_ref_back") + +#define lp_jit_context_u8_blend_color(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_U8_BLEND_COLOR, "u8_blend_color") + +#define lp_jit_context_f_blend_color(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_F_BLEND_COLOR, "f_blend_color") + +#define lp_jit_context_viewports(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_CTX_VIEWPORTS, "viewports") + +#define 
lp_jit_context_textures(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_TEXTURES, "textures") + +#define lp_jit_context_samplers(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_CTX_SAMPLERS, "samplers") + + +struct lp_jit_thread_data +{ + uint64_t vis_counter; + + /* + * Non-interpolated rasterizer state passed through to the fragment shader. + */ + struct { + uint32_t viewport_index; + } raster_state; +}; + + +enum { + LP_JIT_THREAD_DATA_COUNTER = 0, + LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX, + LP_JIT_THREAD_DATA_COUNT +}; + + +#define lp_jit_thread_data_counter(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter") + +#define lp_jit_thread_data_raster_state_viewport_index(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, \ + LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX, \ + "raster_state.viewport_index") + +/** + * typedef for fragment shader function + * + * @param context jit context + * @param x block start x + * @param y block start y + * @param facing is front facing + * @param a0 shader input a0 + * @param dadx shader input dadx + * @param dady shader input dady + * @param color color buffer + * @param depth depth buffer + * @param mask mask of visible pixels in block + * @param thread_data task thread data + * @param stride color buffer row stride in bytes + * @param depth_stride depth buffer row stride in bytes + */ +typedef void +(*lp_jit_frag_func)(const struct lp_jit_context *context, + uint32_t x, + uint32_t y, + uint32_t facing, + const void *a0, + const void *dadx, + const void *dady, + uint8_t **color, + uint8_t *depth, + uint32_t mask, + struct lp_jit_thread_data *thread_data, + unsigned *stride, + unsigned depth_stride); + + +void +lp_jit_screen_cleanup(struct llvmpipe_screen *screen); + + +boolean +lp_jit_screen_init(struct llvmpipe_screen *screen); + + +void +lp_jit_init_types(struct lp_fragment_shader_variant *lp); + + +#endif /* LP_JIT_H */ 
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_limits.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_limits.h new file mode 100644 index 000000000..5294ced3c --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_limits.h @@ -0,0 +1,96 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Implementation limits for LLVMpipe driver. + */ + +#ifndef LP_LIMITS_H +#define LP_LIMITS_H + + +/** + * Tile size (width and height). This needs to be a power of two. 
/*
 * Tile size (width and height).  This needs to be a power of two.
 */
#define TILE_ORDER 6
#define TILE_SIZE (1 << TILE_ORDER)


/*
 * Max texture sizes
 */
#define LP_MAX_TEXTURE_SIZE (1 * 1024 * 1024 * 1024ULL) /* 1GB for now */
#define LP_MAX_TEXTURE_2D_LEVELS 14  /* 8K x 8K for now */
#define LP_MAX_TEXTURE_3D_LEVELS 12  /* 2K x 2K x 2K for now */
#define LP_MAX_TEXTURE_CUBE_LEVELS 14  /* 8K x 8K for now */
#define LP_MAX_TEXTURE_ARRAY_LAYERS 512 /* 8K x 512 / 8K x 8K x 512 */


/* This must be the larger of LP_MAX_TEXTURE_2D/3D_LEVELS */
#define LP_MAX_TEXTURE_LEVELS LP_MAX_TEXTURE_2D_LEVELS


/* Max drawing surface size is the max texture size */
#define LP_MAX_HEIGHT (1 << (LP_MAX_TEXTURE_LEVELS - 1))
#define LP_MAX_WIDTH (1 << (LP_MAX_TEXTURE_LEVELS - 1))


/* Upper bound on rasterizer worker threads */
#define LP_MAX_THREADS 16


/* Max bytes per scene.  This may be replaced by a runtime parameter. */
#define LP_MAX_SCENE_SIZE (512 * 1024 * 1024)

/*
 * Max number of shader variants (for all shaders combined,
 * per context) that will be kept around.
 */
#define LP_MAX_SHADER_VARIANTS 1024

/*
 * Max number of instructions (for all fragment shaders combined per context)
 * that will be kept around (counted in terms of llvm ir).
 * Note: the definition looks odd, but there's branches which use a different
 * number of max shader variants.
 */
#define LP_MAX_SHADER_INSTRUCTIONS MAX2(256*1024, 512*LP_MAX_SHADER_VARIANTS)

/*
 * Max number of setup variants that will be kept around.
 *
 * These are determined by the combination of the fragment shader
 * input signature and a small amount of rasterization state (eg
 * flatshading).  It is likely that many active fragment shaders will
 * share the same setup variant.
 */
+ */ +#define LP_MAX_SETUP_VARIANTS 64 + +#endif /* LP_LIMITS_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_memory.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_memory.c new file mode 100644 index 000000000..712e28ea3 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_memory.c @@ -0,0 +1,36 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_debug.h" +#include "lp_limits.h" +#include "lp_memory.h" + +/* A single dummy tile used in a couple of out-of-memory situations. 
+ */ +PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) +uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_memory.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_memory.h new file mode 100644 index 000000000..0acd4e6b8 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_memory.h @@ -0,0 +1,40 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#ifndef LP_MEMORY_H +#define LP_MEMORY_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "lp_limits.h" +#include "gallivm/lp_bld_type.h" + +extern PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) +uint8_t lp_dummy_tile[TILE_SIZE * TILE_SIZE * 4]; + +#endif /* LP_MEMORY_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_perf.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_perf.c new file mode 100644 index 000000000..a4548bccf --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_perf.c @@ -0,0 +1,110 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "util/u_debug.h" +#include "lp_debug.h" +#include "lp_perf.h" + + + +struct lp_counters lp_count; + + +void +lp_reset_counters(void) +{ + memset(&lp_count, 0, sizeof(lp_count)); +} + + +void +lp_print_counters(void) +{ + if (LP_DEBUG & DEBUG_COUNTERS) { + unsigned total_64, total_16, total_4; + float p1, p2, p3, p4, p5, p6; + + debug_printf("llvmpipe: nr_triangles: %9u\n", lp_count.nr_tris); + debug_printf("llvmpipe: nr_culled_triangles: %9u\n", lp_count.nr_culled_tris); + + total_64 = (lp_count.nr_empty_64 + + lp_count.nr_fully_covered_64 + + lp_count.nr_partially_covered_64); + + p1 = 100.0 * (float) lp_count.nr_empty_64 / (float) total_64; + p2 = 100.0 * (float) lp_count.nr_fully_covered_64 / (float) total_64; + p3 = 100.0 * (float) lp_count.nr_partially_covered_64 / (float) total_64; + p5 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64; + p6 = 100.0 * (float) lp_count.nr_shade_64 / (float) total_64; + + debug_printf("llvmpipe: nr_64x64: %9u\n", total_64); + debug_printf("llvmpipe: nr_fully_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64); + debug_printf("llvmpipe: nr_shade_opaque_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p5, total_64); + debug_printf("llvmpipe: nr_pure_shade_opaque: %9u (%3.0f%% of %u)\n", lp_count.nr_pure_shade_opaque_64, 0.0, lp_count.nr_shade_opaque_64); + debug_printf("llvmpipe: nr_shade_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_shade_64, p6, total_64); + debug_printf("llvmpipe: nr_pure_shade: %9u (%3.0f%% of %u)\n", lp_count.nr_pure_shade_64, 0.0, lp_count.nr_shade_64); + debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64); + debug_printf("llvmpipe: nr_empty_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64); + + total_16 = (lp_count.nr_empty_16 + + lp_count.nr_fully_covered_16 + + 
lp_count.nr_partially_covered_16); + + p1 = 100.0 * (float) lp_count.nr_empty_16 / (float) total_16; + p2 = 100.0 * (float) lp_count.nr_fully_covered_16 / (float) total_16; + p3 = 100.0 * (float) lp_count.nr_partially_covered_16 / (float) total_16; + + debug_printf("llvmpipe: nr_16x16: %9u\n", total_16); + debug_printf("llvmpipe: nr_fully_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_16, p2, total_16); + debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16); + debug_printf("llvmpipe: nr_empty_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16); + + total_4 = (lp_count.nr_empty_4 + + lp_count.nr_fully_covered_4 + + lp_count.nr_partially_covered_4); + + p1 = 100.0 * (float) lp_count.nr_empty_4 / (float) total_4; + p2 = 100.0 * (float) lp_count.nr_fully_covered_4 / (float) total_4; + p3 = 100.0 * (float) lp_count.nr_partially_covered_4 / (float) total_4; + p4 = 100.0 * (float) lp_count.nr_non_empty_4 / (float) total_4; + + debug_printf("llvmpipe: nr_tri_4x4: %9u\n", total_4); + debug_printf("llvmpipe: nr_fully_covered_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_4, p2, total_4); + debug_printf("llvmpipe: nr_partially_covered_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_4, p3, total_4); + debug_printf("llvmpipe: nr_empty_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4); + debug_printf("llvmpipe: nr_non_empty_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_non_empty_4, p4, total_4); + + debug_printf("llvmpipe: nr_color_tile_clear: %9u\n", lp_count.nr_color_tile_clear); + debug_printf("llvmpipe: nr_color_tile_load: %9u\n", lp_count.nr_color_tile_load); + debug_printf("llvmpipe: nr_color_tile_store: %9u\n", lp_count.nr_color_tile_store); + + debug_printf("llvmpipe: nr_llvm_compiles: %u\n", lp_count.nr_llvm_compiles); + debug_printf("llvmpipe: total LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0); + 
debug_printf("llvmpipe: average LLVM compile time: %.2f sec\n", lp_count.llvm_compile_time / 1000000.0 / lp_count.nr_llvm_compiles); + + } +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_perf.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_perf.h new file mode 100644 index 000000000..455adf7d6 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_perf.h @@ -0,0 +1,91 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Performance / statistic counters, etc. 
+ */ + + +#ifndef LP_PERF_H +#define LP_PERF_H + +#include "pipe/p_compiler.h" + +/** + * Various counters + */ +struct lp_counters +{ + unsigned nr_tris; + unsigned nr_culled_tris; + unsigned nr_empty_64; + unsigned nr_fully_covered_64; + unsigned nr_partially_covered_64; + unsigned nr_pure_shade_opaque_64; + unsigned nr_pure_shade_64; + unsigned nr_shade_64; + unsigned nr_shade_opaque_64; + unsigned nr_empty_16; + unsigned nr_fully_covered_16; + unsigned nr_partially_covered_16; + unsigned nr_empty_4; + unsigned nr_fully_covered_4; + unsigned nr_partially_covered_4; + unsigned nr_non_empty_4; + unsigned nr_llvm_compiles; + int64_t llvm_compile_time; /**< total, in microseconds */ + + unsigned nr_color_tile_clear; + unsigned nr_color_tile_load; + unsigned nr_color_tile_store; +}; + + +extern struct lp_counters lp_count; + + +/** Increment the named counter (only for debug builds) */ +#ifdef DEBUG +#define LP_COUNT(counter) lp_count.counter++ +#define LP_COUNT_ADD(counter, incr) lp_count.counter += (incr) +#define LP_COUNT_GET(counter) (lp_count.counter) +#else +#define LP_COUNT(counter) +#define LP_COUNT_ADD(counter, incr) (void)(incr) +#define LP_COUNT_GET(counter) 0 +#endif + + +extern void +lp_reset_counters(void); + + +extern void +lp_print_counters(void); + + +#endif /* LP_PERF_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_public.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_public.h new file mode 100644 index 000000000..27ab1baef --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_public.h @@ -0,0 +1,18 @@ +#ifndef LP_PUBLIC_H +#define LP_PUBLIC_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct pipe_screen; +struct sw_winsys; + +struct pipe_screen * +llvmpipe_create_screen(struct sw_winsys *winsys); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_query.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_query.c new file mode 100644 index 000000000..fc5936706 --- /dev/null +++ 
b/lib/mesa/src/gallium/drivers/llvmpipe/lp_query.c @@ -0,0 +1,332 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: + * Keith Whitwell, Qicheng Christopher Li, Brian Paul + */ + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "os/os_time.h" +#include "lp_context.h" +#include "lp_flush.h" +#include "lp_fence.h" +#include "lp_query.h" +#include "lp_screen.h" +#include "lp_state.h" +#include "lp_rast.h" + + +static struct llvmpipe_query *llvmpipe_query( struct pipe_query *p ) +{ + return (struct llvmpipe_query *)p; +} + +static struct pipe_query * +llvmpipe_create_query(struct pipe_context *pipe, + unsigned type, + unsigned index) +{ + struct llvmpipe_query *pq; + + assert(type < PIPE_QUERY_TYPES); + + pq = CALLOC_STRUCT( llvmpipe_query ); + + if (pq) { + pq->type = type; + } + + return (struct pipe_query *) pq; +} + + +static void +llvmpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct llvmpipe_query *pq = llvmpipe_query(q); + + /* Ideally we would refcount queries & not get destroyed until the + * last scene had finished with us. 
+ */ + if (pq->fence) { + if (!lp_fence_issued(pq->fence)) + llvmpipe_flush(pipe, NULL, __FUNCTION__); + + if (!lp_fence_signalled(pq->fence)) + lp_fence_wait(pq->fence); + + lp_fence_reference(&pq->fence, NULL); + } + + FREE(pq); +} + + +static boolean +llvmpipe_get_query_result(struct pipe_context *pipe, + struct pipe_query *q, + boolean wait, + union pipe_query_result *vresult) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen); + unsigned num_threads = MAX2(1, screen->num_threads); + struct llvmpipe_query *pq = llvmpipe_query(q); + uint64_t *result = (uint64_t *)vresult; + int i; + + if (pq->fence) { + /* only have a fence if there was a scene */ + if (!lp_fence_signalled(pq->fence)) { + if (!lp_fence_issued(pq->fence)) + llvmpipe_flush(pipe, NULL, __FUNCTION__); + + if (!wait) + return FALSE; + + lp_fence_wait(pq->fence); + } + } + + /* Sum the results from each of the threads: + */ + *result = 0; + + switch (pq->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + for (i = 0; i < num_threads; i++) { + *result += pq->end[i]; + } + break; + case PIPE_QUERY_OCCLUSION_PREDICATE: + for (i = 0; i < num_threads; i++) { + /* safer (still not guaranteed) when there's an overflow */ + vresult->b = vresult->b || pq->end[i]; + } + break; + case PIPE_QUERY_TIMESTAMP: + for (i = 0; i < num_threads; i++) { + if (pq->end[i] > *result) { + *result = pq->end[i]; + } + } + break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: { + struct pipe_query_data_timestamp_disjoint *td = + (struct pipe_query_data_timestamp_disjoint *)vresult; + /* os_get_time_nano return nanoseconds */ + td->frequency = UINT64_C(1000000000); + td->disjoint = FALSE; + } + break; + case PIPE_QUERY_GPU_FINISHED: + vresult->b = TRUE; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + *result = pq->num_primitives_generated; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + *result = pq->num_primitives_written; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + vresult->b = pq->num_primitives_generated > 
pq->num_primitives_written; + break; + case PIPE_QUERY_SO_STATISTICS: { + struct pipe_query_data_so_statistics *stats = + (struct pipe_query_data_so_statistics *)vresult; + stats->num_primitives_written = pq->num_primitives_written; + stats->primitives_storage_needed = pq->num_primitives_generated; + } + break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + struct pipe_query_data_pipeline_statistics *stats = + (struct pipe_query_data_pipeline_statistics *)vresult; + /* only ps_invocations come from binned query */ + for (i = 0; i < num_threads; i++) { + pq->stats.ps_invocations += pq->end[i]; + } + pq->stats.ps_invocations *= LP_RASTER_BLOCK_SIZE * LP_RASTER_BLOCK_SIZE; + *stats = pq->stats; + } + break; + default: + assert(0); + break; + } + + return TRUE; +} + + +static boolean +llvmpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe ); + struct llvmpipe_query *pq = llvmpipe_query(q); + + /* Check if the query is already in the scene. If so, we need to + * flush the scene now. Real apps shouldn't re-use a query in a + * frame of rendering. 
+ */ + if (pq->fence && !lp_fence_issued(pq->fence)) { + llvmpipe_finish(pipe, __FUNCTION__); + } + + + memset(pq->start, 0, sizeof(pq->start)); + memset(pq->end, 0, sizeof(pq->end)); + lp_setup_begin_query(llvmpipe->setup, pq); + + switch (pq->type) { + case PIPE_QUERY_PRIMITIVES_EMITTED: + pq->num_primitives_written = llvmpipe->so_stats.num_primitives_written; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + pq->num_primitives_generated = llvmpipe->so_stats.primitives_storage_needed; + break; + case PIPE_QUERY_SO_STATISTICS: + pq->num_primitives_written = llvmpipe->so_stats.num_primitives_written; + pq->num_primitives_generated = llvmpipe->so_stats.primitives_storage_needed; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + pq->num_primitives_written = llvmpipe->so_stats.num_primitives_written; + pq->num_primitives_generated = llvmpipe->so_stats.primitives_storage_needed; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + /* reset our cache */ + if (llvmpipe->active_statistics_queries == 0) { + memset(&llvmpipe->pipeline_statistics, 0, + sizeof(llvmpipe->pipeline_statistics)); + } + memcpy(&pq->stats, &llvmpipe->pipeline_statistics, sizeof(pq->stats)); + llvmpipe->active_statistics_queries++; + break; + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + llvmpipe->active_occlusion_queries++; + llvmpipe->dirty |= LP_NEW_OCCLUSION_QUERY; + break; + default: + break; + } + return true; +} + + +static void +llvmpipe_end_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe ); + struct llvmpipe_query *pq = llvmpipe_query(q); + + lp_setup_end_query(llvmpipe->setup, pq); + + switch (pq->type) { + + case PIPE_QUERY_PRIMITIVES_EMITTED: + pq->num_primitives_written = + llvmpipe->so_stats.num_primitives_written - pq->num_primitives_written; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + pq->num_primitives_generated = + llvmpipe->so_stats.primitives_storage_needed - 
pq->num_primitives_generated; + break; + case PIPE_QUERY_SO_STATISTICS: + pq->num_primitives_written = + llvmpipe->so_stats.num_primitives_written - pq->num_primitives_written; + pq->num_primitives_generated = + llvmpipe->so_stats.primitives_storage_needed - pq->num_primitives_generated; + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + pq->num_primitives_written = + llvmpipe->so_stats.num_primitives_written - pq->num_primitives_written; + pq->num_primitives_generated = + llvmpipe->so_stats.primitives_storage_needed - pq->num_primitives_generated; + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + pq->stats.ia_vertices = + llvmpipe->pipeline_statistics.ia_vertices - pq->stats.ia_vertices; + pq->stats.ia_primitives = + llvmpipe->pipeline_statistics.ia_primitives - pq->stats.ia_primitives; + pq->stats.vs_invocations = + llvmpipe->pipeline_statistics.vs_invocations - pq->stats.vs_invocations; + pq->stats.gs_invocations = + llvmpipe->pipeline_statistics.gs_invocations - pq->stats.gs_invocations; + pq->stats.gs_primitives = + llvmpipe->pipeline_statistics.gs_primitives - pq->stats.gs_primitives; + pq->stats.c_invocations = + llvmpipe->pipeline_statistics.c_invocations - pq->stats.c_invocations; + pq->stats.c_primitives = + llvmpipe->pipeline_statistics.c_primitives - pq->stats.c_primitives; + pq->stats.ps_invocations = + llvmpipe->pipeline_statistics.ps_invocations - pq->stats.ps_invocations; + + llvmpipe->active_statistics_queries--; + break; + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + assert(llvmpipe->active_occlusion_queries); + llvmpipe->active_occlusion_queries--; + llvmpipe->dirty |= LP_NEW_OCCLUSION_QUERY; + break; + default: + break; + } +} + +boolean +llvmpipe_check_render_cond(struct llvmpipe_context *lp) +{ + struct pipe_context *pipe = &lp->pipe; + boolean b, wait; + uint64_t result; + + if (!lp->render_cond_query) + return TRUE; /* no query predicate, draw normally */ + + wait = (lp->render_cond_mode == 
PIPE_RENDER_COND_WAIT || + lp->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT); + + b = pipe->get_query_result(pipe, lp->render_cond_query, wait, (void*)&result); + if (b) + return ((!result) == lp->render_cond_cond); + else + return TRUE; +} + +void llvmpipe_init_query_funcs(struct llvmpipe_context *llvmpipe ) +{ + llvmpipe->pipe.create_query = llvmpipe_create_query; + llvmpipe->pipe.destroy_query = llvmpipe_destroy_query; + llvmpipe->pipe.begin_query = llvmpipe_begin_query; + llvmpipe->pipe.end_query = llvmpipe_end_query; + llvmpipe->pipe.get_query_result = llvmpipe_get_query_result; +} + + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_query.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_query.h new file mode 100644 index 000000000..797375c88 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_query.h @@ -0,0 +1,60 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: + * Keith Whitwell, Qicheng Christopher Li, Brian Paul + */ + +#ifndef LP_QUERY_H +#define LP_QUERY_H + +#include <limits.h> +#include "os/os_thread.h" +#include "lp_limits.h" + + +struct llvmpipe_context; + + +struct llvmpipe_query { + uint64_t start[LP_MAX_THREADS]; /* start count value for each thread */ + uint64_t end[LP_MAX_THREADS]; /* end count value for each thread */ + struct lp_fence *fence; /* fence from last scene this was binned in */ + unsigned type; /* PIPE_QUERY_* */ + unsigned num_primitives_generated; + unsigned num_primitives_written; + + struct pipe_query_data_pipeline_statistics stats; +}; + + +extern void llvmpipe_init_query_funcs(struct llvmpipe_context * ); + +extern boolean llvmpipe_check_render_cond(struct llvmpipe_context *); + +#endif /* LP_QUERY_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast.c new file mode 100644 index 000000000..c726707c0 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast.c @@ -0,0 +1,935 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include <limits.h> +#include "util/u_memory.h" +#include "util/u_math.h" +#include "util/u_rect.h" +#include "util/u_surface.h" +#include "util/u_pack_color.h" +#include "util/u_string.h" + +#include "os/os_time.h" + +#include "lp_scene_queue.h" +#include "lp_context.h" +#include "lp_debug.h" +#include "lp_fence.h" +#include "lp_perf.h" +#include "lp_query.h" +#include "lp_rast.h" +#include "lp_rast_priv.h" +#include "gallivm/lp_bld_debug.h" +#include "lp_scene.h" +#include "lp_tex_sample.h" + + +#ifdef DEBUG +int jit_line = 0; +const struct lp_rast_state *jit_state = NULL; +const struct lp_rasterizer_task *jit_task = NULL; +#endif + + +/** + * Begin rasterizing a scene. + * Called once per scene by one thread. 
+ */ +static void +lp_rast_begin( struct lp_rasterizer *rast, + struct lp_scene *scene ) +{ + rast->curr_scene = scene; + + LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); + + lp_scene_begin_rasterization( scene ); + lp_scene_bin_iter_begin( scene ); +} + + +static void +lp_rast_end( struct lp_rasterizer *rast ) +{ + lp_scene_end_rasterization( rast->curr_scene ); + + rast->curr_scene = NULL; +} + + +/** + * Beginning rasterization of a tile. + * \param x window X position of the tile, in pixels + * \param y window Y position of the tile, in pixels + */ +static void +lp_rast_tile_begin(struct lp_rasterizer_task *task, + const struct cmd_bin *bin, + int x, int y) +{ + unsigned i; + struct lp_scene *scene = task->scene; + + LP_DBG(DEBUG_RAST, "%s %d,%d\n", __FUNCTION__, x, y); + + task->bin = bin; + task->x = x * TILE_SIZE; + task->y = y * TILE_SIZE; + task->width = TILE_SIZE + x * TILE_SIZE > task->scene->fb.width ? + task->scene->fb.width - x * TILE_SIZE : TILE_SIZE; + task->height = TILE_SIZE + y * TILE_SIZE > task->scene->fb.height ? + task->scene->fb.height - y * TILE_SIZE : TILE_SIZE; + + task->thread_data.vis_counter = 0; + task->ps_invocations = 0; + + for (i = 0; i < task->scene->fb.nr_cbufs; i++) { + if (task->scene->fb.cbufs[i]) { + task->color_tiles[i] = scene->cbufs[i].map + + scene->cbufs[i].stride * task->y + + scene->cbufs[i].format_bytes * task->x; + } + } + if (task->scene->fb.zsbuf) { + task->depth_tile = scene->zsbuf.map + + scene->zsbuf.stride * task->y + + scene->zsbuf.format_bytes * task->x; + } +} + + +/** + * Clear the rasterizer's current color tile. + * This is a bin command called during bin processing. + * Clear commands always clear all bound layers. 
+ */ +static void +lp_rast_clear_color(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_scene *scene = task->scene; + unsigned cbuf = arg.clear_rb->cbuf; + union util_color uc; + enum pipe_format format; + + /* we never bin clear commands for non-existing buffers */ + assert(cbuf < scene->fb.nr_cbufs); + assert(scene->fb.cbufs[cbuf]); + + format = scene->fb.cbufs[cbuf]->format; + uc = arg.clear_rb->color_val; + + /* + * this is pretty rough since we have target format (bunch of bytes...) here. + * dump it as raw 4 dwords. + */ + LP_DBG(DEBUG_RAST, "%s clear value (target format %d) raw 0x%x,0x%x,0x%x,0x%x\n", + __FUNCTION__, format, uc.ui[0], uc.ui[1], uc.ui[2], uc.ui[3]); + + + util_fill_box(scene->cbufs[cbuf].map, + format, + scene->cbufs[cbuf].stride, + scene->cbufs[cbuf].layer_stride, + task->x, + task->y, + 0, + task->width, + task->height, + scene->fb_max_layer + 1, + &uc); + + /* this will increase for each rb which probably doesn't mean much */ + LP_COUNT(nr_color_tile_clear); +} + + +/** + * Clear the rasterizer's current z/stencil tile. + * This is a bin command called during bin processing. + * Clear commands always clear all bound layers. + */ +static void +lp_rast_clear_zstencil(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_scene *scene = task->scene; + uint64_t clear_value64 = arg.clear_zstencil.value; + uint64_t clear_mask64 = arg.clear_zstencil.mask; + uint32_t clear_value = (uint32_t) clear_value64; + uint32_t clear_mask = (uint32_t) clear_mask64; + const unsigned height = task->height; + const unsigned width = task->width; + const unsigned dst_stride = scene->zsbuf.stride; + uint8_t *dst; + unsigned i, j; + unsigned block_size; + + LP_DBG(DEBUG_RAST, "%s: value=0x%08x, mask=0x%08x\n", + __FUNCTION__, clear_value, clear_mask); + + /* + * Clear the area of the depth/depth buffer matching this tile. 
+ */ + + if (scene->fb.zsbuf) { + unsigned layer; + uint8_t *dst_layer = task->depth_tile; + block_size = util_format_get_blocksize(scene->fb.zsbuf->format); + + clear_value &= clear_mask; + + for (layer = 0; layer <= scene->fb_max_layer; layer++) { + dst = dst_layer; + + switch (block_size) { + case 1: + assert(clear_mask == 0xff); + memset(dst, (uint8_t) clear_value, height * width); + break; + case 2: + if (clear_mask == 0xffff) { + for (i = 0; i < height; i++) { + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) + *row++ = (uint16_t) clear_value; + dst += dst_stride; + } + } + else { + for (i = 0; i < height; i++) { + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) { + uint16_t tmp = ~clear_mask & *row; + *row++ = clear_value | tmp; + } + dst += dst_stride; + } + } + break; + case 4: + if (clear_mask == 0xffffffff) { + for (i = 0; i < height; i++) { + uint32_t *row = (uint32_t *)dst; + for (j = 0; j < width; j++) + *row++ = clear_value; + dst += dst_stride; + } + } + else { + for (i = 0; i < height; i++) { + uint32_t *row = (uint32_t *)dst; + for (j = 0; j < width; j++) { + uint32_t tmp = ~clear_mask & *row; + *row++ = clear_value | tmp; + } + dst += dst_stride; + } + } + break; + case 8: + clear_value64 &= clear_mask64; + if (clear_mask64 == 0xffffffffffULL) { + for (i = 0; i < height; i++) { + uint64_t *row = (uint64_t *)dst; + for (j = 0; j < width; j++) + *row++ = clear_value64; + dst += dst_stride; + } + } + else { + for (i = 0; i < height; i++) { + uint64_t *row = (uint64_t *)dst; + for (j = 0; j < width; j++) { + uint64_t tmp = ~clear_mask64 & *row; + *row++ = clear_value64 | tmp; + } + dst += dst_stride; + } + } + break; + + default: + assert(0); + break; + } + dst_layer += scene->zsbuf.layer_stride; + } + } +} + + + +/** + * Run the shader on all blocks in a tile. This is used when a tile is + * completely contained inside a triangle. + * This is a bin command called during bin processing. 
+ */ +static void +lp_rast_shade_tile(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_scene *scene = task->scene; + const struct lp_rast_shader_inputs *inputs = arg.shade_tile; + const struct lp_rast_state *state; + struct lp_fragment_shader_variant *variant; + const unsigned tile_x = task->x, tile_y = task->y; + unsigned x, y; + + if (inputs->disable) { + /* This command was partially binned and has been disabled */ + return; + } + + LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); + + state = task->state; + assert(state); + if (!state) { + return; + } + variant = state->variant; + + /* render the whole 64x64 tile in 4x4 chunks */ + for (y = 0; y < task->height; y += 4){ + for (x = 0; x < task->width; x += 4) { + uint8_t *color[PIPE_MAX_COLOR_BUFS]; + unsigned stride[PIPE_MAX_COLOR_BUFS]; + uint8_t *depth = NULL; + unsigned depth_stride = 0; + unsigned i; + + /* color buffer */ + for (i = 0; i < scene->fb.nr_cbufs; i++){ + if (scene->fb.cbufs[i]) { + stride[i] = scene->cbufs[i].stride; + color[i] = lp_rast_get_color_block_pointer(task, i, tile_x + x, + tile_y + y, inputs->layer); + } + else { + stride[i] = 0; + color[i] = NULL; + } + } + + /* depth buffer */ + if (scene->zsbuf.map) { + depth = lp_rast_get_depth_block_pointer(task, tile_x + x, + tile_y + y, inputs->layer); + depth_stride = scene->zsbuf.stride; + } + + /* Propagate non-interpolated raster state. */ + task->thread_data.raster_state.viewport_index = inputs->viewport_index; + + /* run shader on 4x4 block */ + BEGIN_JIT_CALL(state, task); + variant->jit_function[RAST_WHOLE]( &state->jit_context, + tile_x + x, tile_y + y, + inputs->frontfacing, + GET_A0(inputs), + GET_DADX(inputs), + GET_DADY(inputs), + color, + depth, + 0xffff, + &task->thread_data, + stride, + depth_stride); + END_JIT_CALL(); + } + } +} + + +/** + * Run the shader on all blocks in a tile. This is used when a tile is + * completely contained inside a triangle, and the shader is opaque. 
+ * This is a bin command called during bin processing. + */ +static void +lp_rast_shade_tile_opaque(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__); + + assert(task->state); + if (!task->state) { + return; + } + + lp_rast_shade_tile(task, arg); +} + + +/** + * Compute shading for a 4x4 block of pixels inside a triangle. + * This is a bin command called during bin processing. + * \param x X position of quad in window coords + * \param y Y position of quad in window coords + */ +void +lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, + const struct lp_rast_shader_inputs *inputs, + unsigned x, unsigned y, + unsigned mask) +{ + const struct lp_rast_state *state = task->state; + struct lp_fragment_shader_variant *variant = state->variant; + const struct lp_scene *scene = task->scene; + uint8_t *color[PIPE_MAX_COLOR_BUFS]; + unsigned stride[PIPE_MAX_COLOR_BUFS]; + uint8_t *depth = NULL; + unsigned depth_stride = 0; + unsigned i; + + assert(state); + + /* Sanity checks */ + assert(x < scene->tiles_x * TILE_SIZE); + assert(y < scene->tiles_y * TILE_SIZE); + assert(x % TILE_VECTOR_WIDTH == 0); + assert(y % TILE_VECTOR_HEIGHT == 0); + + assert((x % 4) == 0); + assert((y % 4) == 0); + + /* color buffer */ + for (i = 0; i < scene->fb.nr_cbufs; i++) { + if (scene->fb.cbufs[i]) { + stride[i] = scene->cbufs[i].stride; + color[i] = lp_rast_get_color_block_pointer(task, i, x, y, + inputs->layer); + } + else { + stride[i] = 0; + color[i] = NULL; + } + } + + /* depth buffer */ + if (scene->zsbuf.map) { + depth_stride = scene->zsbuf.stride; + depth = lp_rast_get_depth_block_pointer(task, x, y, inputs->layer); + } + + assert(lp_check_alignment(state->jit_context.u8_blend_color, 16)); + + /* + * The rasterizer may produce fragments outside our + * allocated 4x4 blocks hence need to filter them out here. 
    */
   if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
      /* not very accurate would need a popcount on the mask */
      /* always count this not worth bothering? */
      task->ps_invocations += 1 * variant->ps_inv_multiplier;

      /* Propagate non-interpolated raster state. */
      task->thread_data.raster_state.viewport_index = inputs->viewport_index;

      /* run shader on 4x4 block */
      BEGIN_JIT_CALL(state, task);
      variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
                                            x, y,
                                            inputs->frontfacing,
                                            GET_A0(inputs),
                                            GET_DADX(inputs),
                                            GET_DADY(inputs),
                                            color,
                                            depth,
                                            mask,
                                            &task->thread_data,
                                            stride,
                                            depth_stride);
      END_JIT_CALL();
   }
}



/**
 * Begin a new occlusion query.
 * This is a bin command put in all bins.
 * Called per thread.
 */
static void
lp_rast_begin_query(struct lp_rasterizer_task *task,
                    const union lp_rast_cmd_arg arg)
{
   struct llvmpipe_query *pq = arg.query_obj;

   /* Snapshot the relevant per-thread counter; lp_rast_end_query later
    * computes the delta against this starting value.
    */
   switch (pq->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      pq->start[task->thread_index] = task->thread_data.vis_counter;
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      pq->start[task->thread_index] = task->ps_invocations;
      break;
   default:
      /* unexpected query type for a binned begin-query command */
      assert(0);
      break;
   }
}


/**
 * End the current occlusion query.
 * This is a bin command put in all bins.
 * Called per thread.
 */
static void
lp_rast_end_query(struct lp_rasterizer_task *task,
                  const union lp_rast_cmd_arg arg)
{
   struct llvmpipe_query *pq = arg.query_obj;

   switch (pq->type) {
   case PIPE_QUERY_OCCLUSION_COUNTER:
   case PIPE_QUERY_OCCLUSION_PREDICATE:
      /* accumulate the per-thread delta since lp_rast_begin_query */
      pq->end[task->thread_index] +=
            task->thread_data.vis_counter - pq->start[task->thread_index];
      pq->start[task->thread_index] = 0;
      break;
   case PIPE_QUERY_TIMESTAMP:
      pq->end[task->thread_index] = os_time_get_nano();
      break;
   case PIPE_QUERY_PIPELINE_STATISTICS:
      pq->end[task->thread_index] +=
            task->ps_invocations - pq->start[task->thread_index];
      pq->start[task->thread_index] = 0;
      break;
   default:
      /* unexpected query type for a binned end-query command */
      assert(0);
      break;
   }
}


/**
 * Bind new rasterization state for subsequent commands in this bin.
 */
void
lp_rast_set_state(struct lp_rasterizer_task *task,
                  const union lp_rast_cmd_arg arg)
{
   task->state = arg.state;
}



/**
 * Called when we're done writing to a color tile.
 */
static void
lp_rast_tile_end(struct lp_rasterizer_task *task)
{
   unsigned i;

   /* flush all still-active queries so their counters cover this tile */
   for (i = 0; i < task->scene->num_active_queries; ++i) {
      lp_rast_end_query(task, lp_rast_arg_query(task->scene->active_queries[i]));
   }

   /* debug */
   memset(task->color_tiles, 0, sizeof(task->color_tiles));
   task->depth_tile = NULL;

   task->bin = NULL;
}

/* Command dispatch table.  Entry order must match the LP_RAST_OP_*
 * opcode values defined in lp_rast.h.
 */
static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
{
   lp_rast_clear_color,
   lp_rast_clear_zstencil,
   lp_rast_triangle_1,
   lp_rast_triangle_2,
   lp_rast_triangle_3,
   lp_rast_triangle_4,
   lp_rast_triangle_5,
   lp_rast_triangle_6,
   lp_rast_triangle_7,
   lp_rast_triangle_8,
   lp_rast_triangle_3_4,
   lp_rast_triangle_3_16,
   lp_rast_triangle_4_16,
   lp_rast_shade_tile,
   lp_rast_shade_tile_opaque,
   lp_rast_begin_query,
   lp_rast_end_query,
   lp_rast_set_state,
   lp_rast_triangle_32_1,
   lp_rast_triangle_32_2,
   lp_rast_triangle_32_3,
   lp_rast_triangle_32_4,
   lp_rast_triangle_32_5,
   lp_rast_triangle_32_6,
   lp_rast_triangle_32_7,
   lp_rast_triangle_32_8,
   lp_rast_triangle_32_3_4,
   lp_rast_triangle_32_3_16,
   lp_rast_triangle_32_4_16
};


/**
 * Execute every command in every command block of the given bin
 * through the dispatch[] table.
 */
static void
do_rasterize_bin(struct lp_rasterizer_task *task,
                 const struct cmd_bin *bin,
                 int x, int y)
{
   const struct cmd_block *block;
   unsigned k;

   /* flip to 1 for ad-hoc bin dumping while debugging */
   if (0)
      lp_debug_bin(bin, x, y);

   for (block = bin->head; block; block = block->next) {
      for (k = 0; k < block->count; k++) {
         dispatch[block->cmd[k]]( task, block->arg[k] );
      }
   }
}



/**
 * Rasterize commands for a single bin.
 * \param x, y position of the bin's tile in the framebuffer
 * Must be called between lp_rast_begin() and lp_rast_end().
 * Called per thread.
 */
static void
rasterize_bin(struct lp_rasterizer_task *task,
              const struct cmd_bin *bin, int x, int y )
{
   lp_rast_tile_begin( task, bin, x, y );

   do_rasterize_bin(task, bin, x, y);

   lp_rast_tile_end(task);


   /* Debug/Perf flags:
    * bin->head is non-NULL here: callers skip empty bins (is_empty_bin).
    */
   if (bin->head->count == 1) {
      if (bin->head->cmd[0] == LP_RAST_OP_SHADE_TILE_OPAQUE)
         LP_COUNT(nr_pure_shade_opaque_64);
      else if (bin->head->cmd[0] == LP_RAST_OP_SHADE_TILE)
         LP_COUNT(nr_pure_shade_64);
   }
}


/* An empty bin is one that just loads the contents of the tile and
 * stores them again unchanged.  This typically happens when bins have
 * been flushed for some reason in the middle of a frame, or when
 * incremental updates are being made to a render target.
 *
 * Try to avoid doing pointless work in this case.
 */
static boolean
is_empty_bin( const struct cmd_bin *bin )
{
   return bin->head == NULL;
}


/**
 * Rasterize/execute all bins within a scene.
 * Called per thread.
 */
static void
rasterize_scene(struct lp_rasterizer_task *task,
                struct lp_scene *scene)
{
   task->scene = scene;

   /* Skip all work when rasterization is globally disabled (LP_NO_RAST)
    * or the scene was marked for discard; the fence is still signalled
    * below so waiters make progress.
    */
   if (!task->rast->no_rast && !scene->discard) {
      /* loop over scene bins, rasterize each */
      {
         struct cmd_bin *bin;
         int i, j;

         assert(scene);
         /* the bin iterator hands out bins to competing threads */
         while ((bin = lp_scene_bin_iter_next(scene, &i, &j))) {
            if (!is_empty_bin( bin ))
               rasterize_bin(task, bin, i, j);
         }
      }
   }


   if (scene->fence) {
      lp_fence_signal(scene->fence);
   }

   task->scene = NULL;
}


/**
 * Called by setup module when it has something for us to render.
 */
void
lp_rast_queue_scene( struct lp_rasterizer *rast,
                     struct lp_scene *scene)
{
   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);

   if (rast->num_threads == 0) {
      /* no threading: rasterize synchronously on the caller's thread */
      unsigned fpstate = util_fpstate_get();

      /* Make sure that denorms are treated like zeros. This is
       * the behavior required by D3D10. OpenGL doesn't care.
       */
      util_fpstate_set_denorms_to_zero(fpstate);

      lp_rast_begin( rast, scene );

      rasterize_scene( &rast->tasks[0], scene );

      lp_rast_end( rast );

      /* restore the caller's FPU state */
      util_fpstate_set(fpstate);

      rast->curr_scene = NULL;
   }
   else {
      /* threaded rendering! */
      unsigned i;

      lp_scene_enqueue( rast->full_scenes, scene );

      /* signal the threads that there's work to do */
      for (i = 0; i < rast->num_threads; i++) {
         pipe_semaphore_signal(&rast->tasks[i].work_ready);
      }
   }

   LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__);
}


/**
 * Block until all queued scenes have been rasterized.
 */
void
lp_rast_finish( struct lp_rasterizer *rast )
{
   if (rast->num_threads == 0) {
      /* nothing to do: synchronous path already finished in queue_scene */
   }
   else {
      int i;

      /* wait for work to complete */
      for (i = 0; i < rast->num_threads; i++) {
         pipe_semaphore_wait(&rast->tasks[i].work_done);
      }
   }
}


/**
 * This is the thread's main entrypoint.
 * It's a simple loop:
 *   1. wait for work
 *   2. do work
 *   3.
 signal that we're done
 */
static PIPE_THREAD_ROUTINE( thread_function, init_data )
{
   struct lp_rasterizer_task *task = (struct lp_rasterizer_task *) init_data;
   struct lp_rasterizer *rast = task->rast;
   boolean debug = false;
   char thread_name[16];
   unsigned fpstate;

   util_snprintf(thread_name, sizeof thread_name, "llvmpipe-%u", task->thread_index);
   pipe_thread_setname(thread_name);

   /* Make sure that denorms are treated like zeros. This is
    * the behavior required by D3D10. OpenGL doesn't care.
    */
   fpstate = util_fpstate_get();
   util_fpstate_set_denorms_to_zero(fpstate);

   while (1) {
      /* wait for work */
      if (debug)
         debug_printf("thread %d waiting for work\n", task->thread_index);
      pipe_semaphore_wait(&task->work_ready);

      /* exit_flag is set by lp_rast_destroy before signalling work_ready */
      if (rast->exit_flag)
         break;

      if (task->thread_index == 0) {
         /* thread[0]:
          *  - get next scene to rasterize
          *  - map the framebuffer surfaces
          */
         lp_rast_begin( rast,
                        lp_scene_dequeue( rast->full_scenes, TRUE ) );
      }

      /* Wait for all threads to get here so that threads[1+] don't
       * get a null rast->curr_scene pointer.
       */
      pipe_barrier_wait( &rast->barrier );

      /* do work */
      if (debug)
         debug_printf("thread %d doing work\n", task->thread_index);

      rasterize_scene(task,
                      rast->curr_scene);

      /* wait for all threads to finish with this scene */
      pipe_barrier_wait( &rast->barrier );

      /* XXX: shouldn't be necessary:
       */
      if (task->thread_index == 0) {
         lp_rast_end( rast );
      }

      /* signal done with work */
      if (debug)
         debug_printf("thread %d done working\n", task->thread_index);

      pipe_semaphore_signal(&task->work_done);
   }

#ifdef _WIN32
   /* On Windows lp_rast_destroy waits on work_done instead of joining
    * the thread (see lp_rast_destroy), so signal once more on exit.
    */
   pipe_semaphore_signal(&task->work_done);
#endif

   return 0;
}


/**
 * Initialize semaphores and spawn the threads.
 */
static void
create_rast_threads(struct lp_rasterizer *rast)
{
   unsigned i;

   /* NOTE: if num_threads is zero, we won't use any threads */
   for (i = 0; i < rast->num_threads; i++) {
      /* both semaphores start unsignalled; lp_rast_queue_scene posts
       * work_ready, the worker posts work_done
       */
      pipe_semaphore_init(&rast->tasks[i].work_ready, 0);
      pipe_semaphore_init(&rast->tasks[i].work_done, 0);
      rast->threads[i] = pipe_thread_create(thread_function,
                                            (void *) &rast->tasks[i]);
   }
}



/**
 * Create new lp_rasterizer.  If num_threads is zero, don't create any
 * new threads, do rendering synchronously.
 * \param num_threads number of rasterizer threads to create
 * \return the new rasterizer, or NULL on allocation failure
 */
struct lp_rasterizer *
lp_rast_create( unsigned num_threads )
{
   struct lp_rasterizer *rast;
   unsigned i;

   rast = CALLOC_STRUCT(lp_rasterizer);
   if (!rast) {
      goto no_rast;
   }

   rast->full_scenes = lp_scene_queue_create();
   if (!rast->full_scenes) {
      goto no_full_scenes;
   }

   /* initialize every task slot, even those beyond num_threads
    * (task[0] is used directly on the synchronous path)
    */
   for (i = 0; i < Elements(rast->tasks); i++) {
      struct lp_rasterizer_task *task = &rast->tasks[i];
      task->rast = rast;
      task->thread_index = i;
   }

   rast->num_threads = num_threads;

   rast->no_rast = debug_get_bool_option("LP_NO_RAST", FALSE);

   create_rast_threads(rast);

   /* for synchronizing rasterization threads */
   pipe_barrier_init( &rast->barrier, rast->num_threads );

   memset(lp_dummy_tile, 0, sizeof lp_dummy_tile);

   return rast;

   /* goto-based cleanup: unwind in reverse order of acquisition */
no_full_scenes:
   FREE(rast);
no_rast:
   return NULL;
}


/* Shutdown:
 */
void lp_rast_destroy( struct lp_rasterizer *rast )
{
   unsigned i;

   /* Set exit_flag and signal each thread's work_ready semaphore.
    * Each thread will be woken up, notice that the exit_flag is set and
    * break out of its main loop.  The thread will then exit.
    */
   rast->exit_flag = TRUE;
   for (i = 0; i < rast->num_threads; i++) {
      pipe_semaphore_signal(&rast->tasks[i].work_ready);
   }

   /* Wait for threads to terminate before cleaning up per-thread data.
+ * We don't actually call pipe_thread_wait to avoid dead lock on Windows + * per https://bugs.freedesktop.org/show_bug.cgi?id=76252 */ + for (i = 0; i < rast->num_threads; i++) { +#ifdef _WIN32 + pipe_semaphore_wait(&rast->tasks[i].work_done); +#else + pipe_thread_wait(rast->threads[i]); +#endif + } + + /* Clean up per-thread data */ + for (i = 0; i < rast->num_threads; i++) { + pipe_semaphore_destroy(&rast->tasks[i].work_ready); + pipe_semaphore_destroy(&rast->tasks[i].work_done); + } + + /* for synchronizing rasterization threads */ + pipe_barrier_destroy( &rast->barrier ); + + lp_scene_queue_destroy(rast->full_scenes); + + FREE(rast); +} + + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast.h new file mode 100644 index 000000000..c19f93180 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast.h @@ -0,0 +1,324 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * The rast code is concerned with rasterization of command bins. + * Each screen tile has a bin associated with it. To render the + * scene we iterate over the tile bins and execute the commands + * in each bin. + * We'll do that with multiple threads... + */ + + +#ifndef LP_RAST_H +#define LP_RAST_H + +#include "pipe/p_compiler.h" +#include "util/u_pack_color.h" +#include "lp_jit.h" + + +struct lp_rasterizer; +struct lp_scene; +struct lp_fence; +struct cmd_bin; + +#define FIXED_TYPE_WIDTH 64 +/** For sub-pixel positioning */ +#define FIXED_ORDER 8 +#define FIXED_ONE (1<<FIXED_ORDER) +#define FIXED_SHIFT (FIXED_TYPE_WIDTH - 1) +/** Maximum length of an edge in a primitive in pixels. + * If the framebuffer is large we have to think about fixed-point + * integer overflow. Coordinates need ((FIXED_TYPE_WIDTH/2) - 1) bits + * to be able to fit product of two such coordinates inside + * FIXED_TYPE_WIDTH, any larger and we could overflow a + * FIXED_TYPE_WIDTH_-bit int. + */ +#define MAX_FIXED_LENGTH (1 << (((FIXED_TYPE_WIDTH/2) - 1) - FIXED_ORDER)) + +#define MAX_FIXED_LENGTH32 (1 << (((32/2) - 1) - FIXED_ORDER)) + +/* Rasterizer output size going to jit fs, width/height */ +#define LP_RASTER_BLOCK_SIZE 4 + +#define LP_MAX_ACTIVE_BINNED_QUERIES 64 + +#define IMUL64(a, b) (((int64_t)(a)) * ((int64_t)(b))) + +struct lp_rasterizer_task; + + +/** + * Rasterization state. + * Objects of this type are put into the shared data bin and pointed + * to by commands in the per-tile bins. + */ +struct lp_rast_state { + /* State for the shader. 
This also contains state which feeds into + * the fragment shader, such as blend color and alpha ref value. + */ + struct lp_jit_context jit_context; + + /* The shader itself. Probably we also need to pass a pointer to + * the tile color/z/stencil data somehow + */ + struct lp_fragment_shader_variant *variant; +}; + + +/** + * Coefficients necessary to run the shader at a given location. + * First coefficient is position. + * These pointers point into the bin data buffer. + */ +struct lp_rast_shader_inputs { + unsigned frontfacing:1; /** True for front-facing */ + unsigned disable:1; /** Partially binned, disable this command */ + unsigned opaque:1; /** Is opaque */ + unsigned pad0:29; /* wasted space */ + unsigned stride; /* how much to advance data between a0, dadx, dady */ + unsigned layer; /* the layer to render to (from gs, already clamped) */ + unsigned viewport_index; /* the active viewport index (from gs, already clamped) */ + /* followed by a0, dadx, dady and planes[] */ +}; + +struct lp_rast_plane { + /* edge function values at minx,miny ?? */ + int64_t c; + + int32_t dcdx; + int32_t dcdy; + + /* one-pixel sized trivial reject offsets for each plane */ + int64_t eo; +}; + +/** + * Rasterization information for a triangle known to be in this bin, + * plus inputs to run the shader: + * These fields are tile- and bin-independent. + * Objects of this type are put into the lp_setup_context::data buffer. 
+ */ +struct lp_rast_triangle { +#ifdef DEBUG + float v[3][2]; + float pad0; + float pad1; +#endif + + /* inputs for the shader */ + struct lp_rast_shader_inputs inputs; + /* planes are also allocated here */ +}; + + +struct lp_rast_clear_rb { + union util_color color_val; + unsigned cbuf; +}; + + +#define GET_A0(inputs) ((float (*)[4])((inputs)+1)) +#define GET_DADX(inputs) ((float (*)[4])((char *)((inputs) + 1) + (inputs)->stride)) +#define GET_DADY(inputs) ((float (*)[4])((char *)((inputs) + 1) + 2 * (inputs)->stride)) +#define GET_PLANES(tri) ((struct lp_rast_plane *)((char *)(&(tri)->inputs + 1) + 3 * (tri)->inputs.stride)) + + + +struct lp_rasterizer * +lp_rast_create( unsigned num_threads ); + +void +lp_rast_destroy( struct lp_rasterizer * ); + +void +lp_rast_queue_scene( struct lp_rasterizer *rast, + struct lp_scene *scene ); + +void +lp_rast_finish( struct lp_rasterizer *rast ); + + +union lp_rast_cmd_arg { + const struct lp_rast_shader_inputs *shade_tile; + struct { + const struct lp_rast_triangle *tri; + unsigned plane_mask; + } triangle; + const struct lp_rast_state *set_state; + const struct lp_rast_clear_rb *clear_rb; + struct { + uint64_t value; + uint64_t mask; + } clear_zstencil; + const struct lp_rast_state *state; + struct lp_fence *fence; + struct llvmpipe_query *query_obj; +}; + + +/* Cast wrappers. Hopefully these compile to noops! + */ +static inline union lp_rast_cmd_arg +lp_rast_arg_inputs( const struct lp_rast_shader_inputs *shade_tile ) +{ + union lp_rast_cmd_arg arg; + arg.shade_tile = shade_tile; + return arg; +} + +static inline union lp_rast_cmd_arg +lp_rast_arg_triangle( const struct lp_rast_triangle *triangle, + unsigned plane_mask) +{ + union lp_rast_cmd_arg arg; + arg.triangle.tri = triangle; + arg.triangle.plane_mask = plane_mask; + return arg; +} + +/** + * Build argument for a contained triangle. 
+ * + * All planes are enabled, so instead of the plane mask we pass the upper + * left coordinates of the a block that fully encloses the triangle. + */ +static inline union lp_rast_cmd_arg +lp_rast_arg_triangle_contained( const struct lp_rast_triangle *triangle, + unsigned x, unsigned y) +{ + union lp_rast_cmd_arg arg; + arg.triangle.tri = triangle; + arg.triangle.plane_mask = x | (y << 8); + return arg; +} + +static inline union lp_rast_cmd_arg +lp_rast_arg_state( const struct lp_rast_state *state ) +{ + union lp_rast_cmd_arg arg; + arg.set_state = state; + return arg; +} + +static inline union lp_rast_cmd_arg +lp_rast_arg_fence( struct lp_fence *fence ) +{ + union lp_rast_cmd_arg arg; + arg.fence = fence; + return arg; +} + + +static inline union lp_rast_cmd_arg +lp_rast_arg_clearzs( uint64_t value, uint64_t mask ) +{ + union lp_rast_cmd_arg arg; + arg.clear_zstencil.value = value; + arg.clear_zstencil.mask = mask; + return arg; +} + + +static inline union lp_rast_cmd_arg +lp_rast_arg_query( struct llvmpipe_query *pq ) +{ + union lp_rast_cmd_arg arg; + arg.query_obj = pq; + return arg; +} + +static inline union lp_rast_cmd_arg +lp_rast_arg_null( void ) +{ + union lp_rast_cmd_arg arg; + arg.set_state = NULL; + return arg; +} + + +/** + * Binnable Commands. + * These get put into bins by the setup code and are called when + * the bins are executed. 
+ */ +#define LP_RAST_OP_CLEAR_COLOR 0x0 +#define LP_RAST_OP_CLEAR_ZSTENCIL 0x1 +#define LP_RAST_OP_TRIANGLE_1 0x2 +#define LP_RAST_OP_TRIANGLE_2 0x3 +#define LP_RAST_OP_TRIANGLE_3 0x4 +#define LP_RAST_OP_TRIANGLE_4 0x5 +#define LP_RAST_OP_TRIANGLE_5 0x6 +#define LP_RAST_OP_TRIANGLE_6 0x7 +#define LP_RAST_OP_TRIANGLE_7 0x8 +#define LP_RAST_OP_TRIANGLE_8 0x9 +#define LP_RAST_OP_TRIANGLE_3_4 0xa +#define LP_RAST_OP_TRIANGLE_3_16 0xb +#define LP_RAST_OP_TRIANGLE_4_16 0xc +#define LP_RAST_OP_SHADE_TILE 0xd +#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xe +#define LP_RAST_OP_BEGIN_QUERY 0xf +#define LP_RAST_OP_END_QUERY 0x10 +#define LP_RAST_OP_SET_STATE 0x11 +#define LP_RAST_OP_TRIANGLE_32_1 0x12 +#define LP_RAST_OP_TRIANGLE_32_2 0x13 +#define LP_RAST_OP_TRIANGLE_32_3 0x14 +#define LP_RAST_OP_TRIANGLE_32_4 0x15 +#define LP_RAST_OP_TRIANGLE_32_5 0x16 +#define LP_RAST_OP_TRIANGLE_32_6 0x17 +#define LP_RAST_OP_TRIANGLE_32_7 0x18 +#define LP_RAST_OP_TRIANGLE_32_8 0x19 +#define LP_RAST_OP_TRIANGLE_32_3_4 0x1a +#define LP_RAST_OP_TRIANGLE_32_3_16 0x1b +#define LP_RAST_OP_TRIANGLE_32_4_16 0x1c + +#define LP_RAST_OP_MAX 0x1d +#define LP_RAST_OP_MASK 0xff + +void +lp_debug_bins( struct lp_scene *scene ); +void +lp_debug_draw_bins_by_cmd_length( struct lp_scene *scene ); +void +lp_debug_draw_bins_by_coverage( struct lp_scene *scene ); + + +#ifdef PIPE_ARCH_SSE +#include <emmintrin.h> +#include "util/u_sse.h" + +static inline __m128i +lp_plane_to_m128i(const struct lp_rast_plane *plane) +{ + return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx, + (int32_t)plane->dcdy, (int32_t)plane->eo); +} + +#endif + +#endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_debug.c new file mode 100644 index 000000000..b5ae9dadf --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_debug.c @@ -0,0 +1,438 @@ +#include <inttypes.h> /* for PRIu64 macro */ +#include "util/u_math.h" +#include "lp_rast_priv.h" +#include 
"lp_state_fs.h" + +struct tile { + int coverage; + int overdraw; + const struct lp_rast_state *state; + char data[TILE_SIZE][TILE_SIZE]; +}; + +static char get_label( int i ) +{ + static const char *cmd_labels = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; + unsigned max_label = (2*26+10); + + if (i < max_label) + return cmd_labels[i]; + else + return '?'; +} + + + +static const char *cmd_names[LP_RAST_OP_MAX] = +{ + "clear_color", + "clear_zstencil", + "triangle_1", + "triangle_2", + "triangle_3", + "triangle_4", + "triangle_5", + "triangle_6", + "triangle_7", + "triangle_8", + "triangle_3_4", + "triangle_3_16", + "triangle_4_16", + "shade_tile", + "shade_tile_opaque", + "begin_query", + "end_query", + "set_state", + "triangle_32_1", + "triangle_32_2", + "triangle_32_3", + "triangle_32_4", + "triangle_32_5", + "triangle_32_6", + "triangle_32_7", + "triangle_32_8", + "triangle_32_3_4", + "triangle_32_3_16", + "triangle_32_4_16", +}; + +static const char *cmd_name(unsigned cmd) +{ + assert(Elements(cmd_names) > cmd); + return cmd_names[cmd]; +} + +static const struct lp_fragment_shader_variant * +get_variant( const struct lp_rast_state *state, + const struct cmd_block *block, + int k ) +{ + if (!state) + return NULL; + + if (block->cmd[k] == LP_RAST_OP_SHADE_TILE || + block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE || + block->cmd[k] == LP_RAST_OP_TRIANGLE_1 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_2 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_3 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_4 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_5 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_6 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_7) + return state->variant; + + return NULL; +} + + +static boolean +is_blend( const struct lp_rast_state *state, + const struct cmd_block *block, + int k ) +{ + const struct lp_fragment_shader_variant *variant = get_variant(state, block, k); + + if (variant) + return variant->key.blend.rt[0].blend_enable; + + return FALSE; +} + + + 
+static void +debug_bin( const struct cmd_bin *bin, int x, int y ) +{ + const struct lp_rast_state *state = NULL; + const struct cmd_block *head = bin->head; + int i, j = 0; + + debug_printf("bin %d,%d:\n", x, y); + + while (head) { + for (i = 0; i < head->count; i++, j++) { + if (head->cmd[i] == LP_RAST_OP_SET_STATE) + state = head->arg[i].state; + + debug_printf("%d: %s %s\n", j, + cmd_name(head->cmd[i]), + is_blend(state, head, i) ? "blended" : ""); + } + head = head->next; + } +} + + +static void plot(struct tile *tile, + int x, int y, + char val, + boolean blend) +{ + if (tile->data[x][y] == ' ') + tile->coverage++; + else + tile->overdraw++; + + tile->data[x][y] = val; +} + + + + + + +static int +debug_shade_tile(int x, int y, + const union lp_rast_cmd_arg arg, + struct tile *tile, + char val) +{ + const struct lp_rast_shader_inputs *inputs = arg.shade_tile; + boolean blend; + unsigned i,j; + + if (!tile->state) + return 0; + + blend = tile->state->variant->key.blend.rt[0].blend_enable; + + if (inputs->disable) + return 0; + + for (i = 0; i < TILE_SIZE; i++) + for (j = 0; j < TILE_SIZE; j++) + plot(tile, i, j, val, blend); + + return TILE_SIZE * TILE_SIZE; +} + +static int +debug_clear_tile(int x, int y, + const union lp_rast_cmd_arg arg, + struct tile *tile, + char val) +{ + unsigned i,j; + + for (i = 0; i < TILE_SIZE; i++) + for (j = 0; j < TILE_SIZE; j++) + plot(tile, i, j, val, FALSE); + + return TILE_SIZE * TILE_SIZE; + +} + + +static int +debug_triangle(int tilex, int tiley, + const union lp_rast_cmd_arg arg, + struct tile *tile, + char val) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + unsigned plane_mask = arg.triangle.plane_mask; + const struct lp_rast_plane *tri_plane = GET_PLANES(tri); + struct lp_rast_plane plane[8]; + int x, y; + int count = 0; + unsigned i, nr_planes = 0; + boolean blend = tile->state->variant->key.blend.rt[0].blend_enable; + + if (tri->inputs.disable) { + /* This triangle was partially binned and has been 
disabled */ + return 0; + } + + while (plane_mask) { + plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)]; + plane[nr_planes].c = (plane[nr_planes].c + + IMUL64(plane[nr_planes].dcdy, tiley) - + IMUL64(plane[nr_planes].dcdx, tilex)); + nr_planes++; + } + + for(y = 0; y < TILE_SIZE; y++) + { + for(x = 0; x < TILE_SIZE; x++) + { + for (i = 0; i < nr_planes; i++) + if (plane[i].c <= 0) + goto out; + + plot(tile, x, y, val, blend); + count++; + + out: + for (i = 0; i < nr_planes; i++) + plane[i].c -= plane[i].dcdx; + } + + for (i = 0; i < nr_planes; i++) { + plane[i].c += IMUL64(plane[i].dcdx, TILE_SIZE); + plane[i].c += plane[i].dcdy; + } + } + return count; +} + + + + + +static void +do_debug_bin( struct tile *tile, + const struct cmd_bin *bin, + int x, int y, + boolean print_cmds) +{ + unsigned k, j = 0; + const struct cmd_block *block; + + int tx = x * TILE_SIZE; + int ty = y * TILE_SIZE; + + memset(tile->data, ' ', sizeof tile->data); + tile->coverage = 0; + tile->overdraw = 0; + tile->state = NULL; + + for (block = bin->head; block; block = block->next) { + for (k = 0; k < block->count; k++, j++) { + boolean blend = is_blend(tile->state, block, k); + char val = get_label(j); + int count = 0; + + if (print_cmds) + debug_printf("%c: %15s", val, cmd_name(block->cmd[k])); + + if (block->cmd[k] == LP_RAST_OP_SET_STATE) + tile->state = block->arg[k].state; + + if (block->cmd[k] == LP_RAST_OP_CLEAR_COLOR || + block->cmd[k] == LP_RAST_OP_CLEAR_ZSTENCIL) + count = debug_clear_tile(tx, ty, block->arg[k], tile, val); + + if (block->cmd[k] == LP_RAST_OP_SHADE_TILE || + block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE) + count = debug_shade_tile(tx, ty, block->arg[k], tile, val); + + if (block->cmd[k] == LP_RAST_OP_TRIANGLE_1 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_2 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_3 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_4 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_5 || + block->cmd[k] == LP_RAST_OP_TRIANGLE_6 || + block->cmd[k] == 
LP_RAST_OP_TRIANGLE_7) + count = debug_triangle(tx, ty, block->arg[k], tile, val); + + if (print_cmds) { + debug_printf(" % 5d", count); + + if (blend) + debug_printf(" blended"); + + debug_printf("\n"); + } + } + } +} + +void +lp_debug_bin( const struct cmd_bin *bin, int i, int j) +{ + struct tile tile; + int x,y; + + if (bin->head) { + do_debug_bin(&tile, bin, i, j, TRUE); + + debug_printf("------------------------------------------------------------------\n"); + for (y = 0; y < TILE_SIZE; y++) { + for (x = 0; x < TILE_SIZE; x++) { + debug_printf("%c", tile.data[y][x]); + } + debug_printf("|\n"); + } + debug_printf("------------------------------------------------------------------\n"); + + debug_printf("each pixel drawn avg %f times\n", + ((float)tile.overdraw + tile.coverage)/(float)tile.coverage); + } +} + + + + + + +/** Return number of bytes used for a single bin */ +static unsigned +lp_scene_bin_size( const struct lp_scene *scene, unsigned x, unsigned y ) +{ + struct cmd_bin *bin = lp_scene_get_bin((struct lp_scene *) scene, x, y); + const struct cmd_block *cmd; + unsigned size = 0; + for (cmd = bin->head; cmd; cmd = cmd->next) { + size += (cmd->count * + (sizeof(uint8_t) + sizeof(union lp_rast_cmd_arg))); + } + return size; +} + + + +void +lp_debug_draw_bins_by_coverage( struct lp_scene *scene ) +{ + unsigned x, y; + unsigned total = 0; + unsigned possible = 0; + static uint64_t _total = 0; + static uint64_t _possible = 0; + + for (x = 0; x < scene->tiles_x; x++) + debug_printf("-"); + debug_printf("\n"); + + for (y = 0; y < scene->tiles_y; y++) { + for (x = 0; x < scene->tiles_x; x++) { + struct cmd_bin *bin = lp_scene_get_bin(scene, x, y); + const char *bits = "0123456789"; + struct tile tile; + + if (bin->head) { + //lp_debug_bin(bin, x, y); + + do_debug_bin(&tile, bin, x, y, FALSE); + + total += tile.coverage; + possible += 64*64; + + if (tile.coverage == 64*64) + debug_printf("*"); + else if (tile.coverage) { + int bit = tile.coverage/(64.0*64.0)*10; 
+ debug_printf("%c", bits[MIN2(bit,10)]); + } + else + debug_printf("?"); + } + else { + debug_printf(" "); + } + } + debug_printf("|\n"); + } + + for (x = 0; x < scene->tiles_x; x++) + debug_printf("-"); + debug_printf("\n"); + + debug_printf("this tile total: %u possible %u: percentage: %f\n", + total, + possible, + total * 100.0 / (float)possible); + + _total += total; + _possible += possible; + + + debug_printf("overall total: %" PRIu64 + " possible %" PRIu64 ": percentage: %f\n", + _total, + _possible, + (double) _total * 100.0 / (double)_possible); +} + + +void +lp_debug_draw_bins_by_cmd_length( struct lp_scene *scene ) +{ + unsigned x, y; + + for (y = 0; y < scene->tiles_y; y++) { + for (x = 0; x < scene->tiles_x; x++) { + const char *bits = " ...,-~:;=o+xaw*#XAWWWWWWWWWWWWWWWW"; + unsigned sz = lp_scene_bin_size(scene, x, y); + unsigned sz2 = util_logbase2(sz); + debug_printf("%c", bits[MIN2(sz2,32)]); + } + debug_printf("\n"); + } +} + + +void +lp_debug_bins( struct lp_scene *scene ) +{ + unsigned x, y; + + for (y = 0; y < scene->tiles_y; y++) { + for (x = 0; x < scene->tiles_x; x++) { + struct cmd_bin *bin = lp_scene_get_bin(scene, x, y); + if (bin->head) { + debug_bin(bin, x, y); + } + } + } +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_priv.h new file mode 100644 index 000000000..9aa7e8746 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_priv.h @@ -0,0 +1,347 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_RAST_PRIV_H +#define LP_RAST_PRIV_H + +#include "os/os_thread.h" +#include "util/u_format.h" +#include "gallivm/lp_bld_debug.h" +#include "lp_memory.h" +#include "lp_rast.h" +#include "lp_scene.h" +#include "lp_state.h" +#include "lp_texture.h" +#include "lp_limits.h" + + +#define TILE_VECTOR_HEIGHT 4 +#define TILE_VECTOR_WIDTH 4 + +/* If we crash in a jitted function, we can examine jit_line and jit_state + * to get some info. This is not thread-safe, however. 
 */
#ifdef DEBUG

struct lp_rasterizer_task;

/* Crash breadcrumbs (see comment above): the source line, rasterizer
 * state and task that were live when we last entered JIT code.  Shared
 * globals -- explicitly not thread-safe, debug builds only.
 */
extern int jit_line;
extern const struct lp_rast_state *jit_state;
extern const struct lp_rasterizer_task *jit_task;

/* Record where / with what state we are about to call into a
 * JIT-compiled shader, so a crash inside it can be attributed.
 */
#define BEGIN_JIT_CALL(state, task) \
   do { \
      jit_line = __LINE__; \
      jit_state = state; \
      jit_task = task; \
   } while (0)

/* Clear the breadcrumbs on return from JIT code.
 * NOTE(review): jit_task is left set here while jit_line/jit_state are
 * cleared -- looks intentional (last task remains inspectable), confirm.
 */
#define END_JIT_CALL() \
   do { \
      jit_line = 0; \
      jit_state = NULL; \
   } while (0)

#else

/* Release builds: the breadcrumb bookkeeping compiles away entirely. */
#define BEGIN_JIT_CALL(X, Y)
#define END_JIT_CALL()

#endif


struct lp_rasterizer; 
struct cmd_bin;

/**
 * Per-thread rasterization state
 */
struct lp_rasterizer_task
{
   const struct cmd_bin *bin;         /**< command bin currently being run */
   const struct lp_rast_state *state; /**< current raster state for this bin */

   struct lp_scene *scene;            /**< scene being rasterized */
   unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
   unsigned width, height; /**< width, height of current tile, in pixels */

   /* Per-render-target pointers into the mapped surface data for the
    * tile this task is working on (see lp_rast_get_color_block_pointer).
    */
   uint8_t *color_tiles[PIPE_MAX_COLOR_BUFS];
   uint8_t *depth_tile;

   /** "back" pointer */
   struct lp_rasterizer *rast;

   /** "my" index */
   unsigned thread_index;

   /** Non-interpolated passthru state and occlusion counter for visible pixels */
   struct lp_jit_thread_data thread_data;
   uint64_t ps_invocations;     /**< fragment-shader invocation count,
                                 *   scaled by ps_inv_multiplier */
   uint8_t ps_inv_multiplier;

   /* Semaphores used to hand work to / collect completion from this
    * thread -- signalled from the rasterizer core (not visible here).
    */
   pipe_semaphore work_ready;
   pipe_semaphore work_done;
};


/**
 * This is the state required while rasterizing tiles.
 * Note that this contains per-thread information too.
 * The tile size is TILE_SIZE x TILE_SIZE pixels.
+ */ +struct lp_rasterizer +{ + boolean exit_flag; + boolean no_rast; /**< For debugging/profiling */ + + /** The incoming queue of scenes ready to rasterize */ + struct lp_scene_queue *full_scenes; + + /** The scene currently being rasterized by the threads */ + struct lp_scene *curr_scene; + + /** A task object for each rasterization thread */ + struct lp_rasterizer_task tasks[LP_MAX_THREADS]; + + unsigned num_threads; + pipe_thread threads[LP_MAX_THREADS]; + + /** For synchronizing the rasterization threads */ + pipe_barrier barrier; +}; + + +void +lp_rast_shade_quads_mask(struct lp_rasterizer_task *task, + const struct lp_rast_shader_inputs *inputs, + unsigned x, unsigned y, + unsigned mask); + + +/** + * Get the pointer to a 4x4 color block (within a 64x64 tile). + * \param x, y location of 4x4 block in window coords + */ +static inline uint8_t * +lp_rast_get_color_block_pointer(struct lp_rasterizer_task *task, + unsigned buf, unsigned x, unsigned y, + unsigned layer) +{ + unsigned px, py, pixel_offset; + uint8_t *color; + + assert(x < task->scene->tiles_x * TILE_SIZE); + assert(y < task->scene->tiles_y * TILE_SIZE); + assert((x % TILE_VECTOR_WIDTH) == 0); + assert((y % TILE_VECTOR_HEIGHT) == 0); + assert(buf < task->scene->fb.nr_cbufs); + + assert(task->color_tiles[buf]); + + /* + * We don't actually benefit from having per tile cbuf/zsbuf pointers, + * it's just extra work - the mul/add would be exactly the same anyway. + * Fortunately the extra work (modulo) here is very cheap at least... + */ + px = x % TILE_SIZE; + py = y % TILE_SIZE; + + pixel_offset = px * task->scene->cbufs[buf].format_bytes + + py * task->scene->cbufs[buf].stride; + color = task->color_tiles[buf] + pixel_offset; + + if (layer) { + color += layer * task->scene->cbufs[buf].layer_stride; + } + + assert(lp_check_alignment(color, llvmpipe_get_format_alignment(task->scene->fb.cbufs[buf]->format))); + return color; +} + + +/** + * Get the pointer to a 4x4 depth block (within a 64x64 tile). 
+ * \param x, y location of 4x4 block in window coords + */ +static inline uint8_t * +lp_rast_get_depth_block_pointer(struct lp_rasterizer_task *task, + unsigned x, unsigned y, unsigned layer) +{ + unsigned px, py, pixel_offset; + uint8_t *depth; + + assert(x < task->scene->tiles_x * TILE_SIZE); + assert(y < task->scene->tiles_y * TILE_SIZE); + assert((x % TILE_VECTOR_WIDTH) == 0); + assert((y % TILE_VECTOR_HEIGHT) == 0); + + assert(task->depth_tile); + + px = x % TILE_SIZE; + py = y % TILE_SIZE; + + pixel_offset = px * task->scene->zsbuf.format_bytes + + py * task->scene->zsbuf.stride; + depth = task->depth_tile + pixel_offset; + + if (layer) { + depth += layer * task->scene->zsbuf.layer_stride; + } + + assert(lp_check_alignment(depth, llvmpipe_get_format_alignment(task->scene->fb.zsbuf->format))); + return depth; +} + + + +/** + * Shade all pixels in a 4x4 block. The fragment code omits the + * triangle in/out tests. + * \param x, y location of 4x4 block in window coords + */ +static inline void +lp_rast_shade_quads_all( struct lp_rasterizer_task *task, + const struct lp_rast_shader_inputs *inputs, + unsigned x, unsigned y ) +{ + const struct lp_scene *scene = task->scene; + const struct lp_rast_state *state = task->state; + struct lp_fragment_shader_variant *variant = state->variant; + uint8_t *color[PIPE_MAX_COLOR_BUFS]; + unsigned stride[PIPE_MAX_COLOR_BUFS]; + uint8_t *depth = NULL; + unsigned depth_stride = 0; + unsigned i; + + /* color buffer */ + for (i = 0; i < scene->fb.nr_cbufs; i++) { + if (scene->fb.cbufs[i]) { + stride[i] = scene->cbufs[i].stride; + color[i] = lp_rast_get_color_block_pointer(task, i, x, y, + inputs->layer); + } + else { + stride[i] = 0; + color[i] = NULL; + } + } + + if (scene->zsbuf.map) { + depth = lp_rast_get_depth_block_pointer(task, x, y, inputs->layer); + depth_stride = scene->zsbuf.stride; + } + + /* + * The rasterizer may produce fragments outside our + * allocated 4x4 blocks hence need to filter them out here. 
+ */ + if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) { + /* not very accurate would need a popcount on the mask */ + /* always count this not worth bothering? */ + task->ps_invocations += 1 * variant->ps_inv_multiplier; + + /* Propagate non-interpolated raster state. */ + task->thread_data.raster_state.viewport_index = inputs->viewport_index; + + /* run shader on 4x4 block */ + BEGIN_JIT_CALL(state, task); + variant->jit_function[RAST_WHOLE]( &state->jit_context, + x, y, + inputs->frontfacing, + GET_A0(inputs), + GET_DADX(inputs), + GET_DADY(inputs), + color, + depth, + 0xffff, + &task->thread_data, + stride, + depth_stride); + END_JIT_CALL(); + } +} + +void lp_rast_triangle_1( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_2( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_3( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_4( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_5( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_6( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_7( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_8( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_3_4(struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_3_16( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_4_16( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + + +void lp_rast_triangle_32_1( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_2( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_3( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_4( struct lp_rasterizer_task *, 
+ const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_5( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_6( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_7( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); +void lp_rast_triangle_32_8( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_32_3_4(struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_32_3_16( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void lp_rast_triangle_32_4_16( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); + +void +lp_rast_set_state(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg); + +void +lp_debug_bin( const struct cmd_bin *bin, int x, int y ); + +#endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_tri.c new file mode 100644 index 000000000..c9b9221d8 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -0,0 +1,558 @@ +/************************************************************************** + * + * Copyright 2007-2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"

/**
 * Shade all pixels in a 4x4 block.
 * The block is known to be fully inside the triangle, so the per-pixel
 * in/out edge tests are skipped (lp_rast_shade_quads_all uses the
 * shader variant that omits them).
 * \param x, y  window coords of the 4x4 block
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}


/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;
   assert(x % 16 == 0);
   assert(y % 16 == 0);
   /* A fully-covered 16x16 block is just sixteen fully-covered 4x4 blocks. */
   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}

/**
 * Evaluate one edge function at the 16 sample points of a 4x4 grid and
 * gather the sign bits into a coverage mask.
 *
 * \param c     fixed-point edge-function value at the grid origin
 * \param dcdx  per-step change in x
 * \param dcdy  per-step change in y
 *
 * Bit i of the result corresponds to sample (i % 4, i / 4); a set bit
 * means the edge function is negative there (sample outside the edge).
 * NOTE(review): the arithmetic right shift by FIXED_SHIFT is relied on
 * to leave bit i set exactly when the value is negative -- this depends
 * on the fixed-point ranges the plane setup guarantees (not visible in
 * this file); inherited from upstream, confirm against FIXED_SHIFT use.
 */
static inline unsigned
build_mask_linear(int64_t c, int64_t dcdx, int64_t dcdy)
{
   unsigned mask = 0;

   /* Edge-function value at the start of each of the four rows. */
   int64_t c0 = c;
   int64_t c1 = c0 + dcdy;
   int64_t c2 = c1 + dcdy;
   int64_t c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> FIXED_SHIFT) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> FIXED_SHIFT) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> FIXED_SHIFT) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> FIXED_SHIFT) & (1 << 15);

   return mask;
}


/**
 * Accumulate the trivial-reject and trivial-accept sign masks for one
 * plane: *outmask collects sign bits evaluated at c (the reject-corner
 * value), *partmask collects sign bits at c + cdiff (the accept-corner
 * value).  See the callers' comments: "sign bits from c[i] + cox/cio".
 */
static inline void
build_masks(int64_t c,
            int64_t cdiff,
            int64_t dcdx,
            int64_t dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}

/* 16x16 entry point for a 3-plane (ordinary) triangle: forward to the
 * general 3-plane rasterizer with all three planes enabled.
 */
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;   /* planes 0..2 */
   lp_rast_triangle_3(task, arg2);
}

void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union
lp_rast_cmd_arg arg) +{ + lp_rast_triangle_3_16(task, arg); +} + +void +lp_rast_triangle_4_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<4)-1; + lp_rast_triangle_4(task, arg2); +} + +#if !defined(PIPE_ARCH_SSE) + +void +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<3)-1; + lp_rast_triangle_32_3(task, arg2); +} + +void +lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + union lp_rast_cmd_arg arg2; + arg2.triangle.tri = arg.triangle.tri; + arg2.triangle.plane_mask = (1<<4)-1; + lp_rast_triangle_32_4(task, arg2); +} + +void +lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + lp_rast_triangle_32_3_16(task, arg); +} + +#else +#include <emmintrin.h> +#include "util/u_sse.h" + + +static inline void +build_masks_32(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) +{ + __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = _mm_set1_epi32(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); + __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); + __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); + + { + __m128i cstep01, cstep23, result; + + cstep01 = _mm_packs_epi32(cstep0, cstep1); + cstep23 = _mm_packs_epi32(cstep2, cstep3); + result = _mm_packs_epi16(cstep01, cstep23); + + *outmask |= _mm_movemask_epi8(result); + } + + + { + __m128i cio4 = _mm_set1_epi32(cdiff); + __m128i cstep01, cstep23, result; + + cstep0 = _mm_add_epi32(cstep0, cio4); + cstep1 = _mm_add_epi32(cstep1, cio4); + cstep2 = _mm_add_epi32(cstep2, cio4); + cstep3 = _mm_add_epi32(cstep3, cio4); + + cstep01 = _mm_packs_epi32(cstep0, cstep1); + 
cstep23 = _mm_packs_epi32(cstep2, cstep3); + result = _mm_packs_epi16(cstep01, cstep23); + + *partmask |= _mm_movemask_epi8(result); + } +} + + +static inline unsigned +build_mask_linear_32(int c, int dcdx, int dcdy) +{ + __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = _mm_set1_epi32(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); + __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); + __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); + + /* pack pairs of results into epi16 + */ + __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); + __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); + + /* pack into epi8, preserving sign bits + */ + __m128i result = _mm_packs_epi16(cstep01, cstep23); + + /* extract sign bits to create mask + */ + return _mm_movemask_epi8(result); +} + +static inline unsigned +sign_bits4(const __m128i *cstep, int cdiff) +{ + + /* Adjust the step values + */ + __m128i cio4 = _mm_set1_epi32(cdiff); + __m128i cstep0 = _mm_add_epi32(cstep[0], cio4); + __m128i cstep1 = _mm_add_epi32(cstep[1], cio4); + __m128i cstep2 = _mm_add_epi32(cstep[2], cio4); + __m128i cstep3 = _mm_add_epi32(cstep[3], cio4); + + /* Pack down to epi8 + */ + __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); + __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); + __m128i result = _mm_packs_epi16(cstep01, cstep23); + + /* Extract the sign bits + */ + return _mm_movemask_epi8(result); +} + + +#define NR_PLANES 3 + + + + + + + +void +lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = GET_PLANES(tri); + int x = (arg.triangle.plane_mask & 0xff) + task->x; + int y = (arg.triangle.plane_mask >> 8) + task->y; + unsigned i, j; + + struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; + unsigned nr = 0; + + __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ 
+ __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ + __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ + __m128i zero = _mm_setzero_si128(); + + __m128i c; + __m128i dcdx; + __m128i dcdy; + __m128i rej4; + + __m128i dcdx2; + __m128i dcdx3; + + __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ + __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ + __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ + __m128i unused; + + transpose4_epi32(&p0, &p1, &p2, &zero, + &c, &dcdx, &dcdy, &rej4); + + /* Adjust dcdx; + */ + dcdx = _mm_sub_epi32(zero, dcdx); + + c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); + c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); + rej4 = _mm_slli_epi32(rej4, 2); + + /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ + c = _mm_sub_epi32(c, _mm_set1_epi32(1)); + rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1)); + + dcdx2 = _mm_add_epi32(dcdx, dcdx); + dcdx3 = _mm_add_epi32(dcdx2, dcdx); + + transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, + &span_0, &span_1, &span_2, &unused); + + for (i = 0; i < 4; i++) { + __m128i cx = c; + + for (j = 0; j < 4; j++) { + __m128i c4rej = _mm_add_epi32(cx, rej4); + __m128i rej_masks = _mm_srai_epi32(c4rej, 31); + + /* if (is_zero(rej_masks)) */ + if (_mm_movemask_epi8(rej_masks) == 0) { + __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0); + __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1); + __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2); + + __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); + + __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); + __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); + __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); + + __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); + __m128i c_01 = _mm_packs_epi32(c_0, c_1); + + __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); + 
__m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); + __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); + + __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); + + __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); + __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); + __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); + + __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); + __m128i c_23 = _mm_packs_epi32(c_2, c_3); + __m128i c_0123 = _mm_packs_epi16(c_01, c_23); + + unsigned mask = _mm_movemask_epi8(c_0123); + + out[nr].i = i; + out[nr].j = j; + out[nr].mask = mask; + if (mask != 0xffff) + nr++; + } + cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2)); + } + + c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2)); + } + + for (i = 0; i < nr; i++) + lp_rast_shade_quads_mask(task, + &tri->inputs, + x + 4 * out[i].j, + y + 4 * out[i].i, + 0xffff & ~out[i].mask); +} + + + + + +void +lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = GET_PLANES(tri); + unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; + unsigned y = (arg.triangle.plane_mask >> 8) + task->y; + + __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ + __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ + __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ + __m128i zero = _mm_setzero_si128(); + + __m128i c; + __m128i dcdx; + __m128i dcdy; + + __m128i dcdx2; + __m128i dcdx3; + + __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ + __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ + __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ + __m128i unused; + + transpose4_epi32(&p0, &p1, &p2, &zero, + &c, &dcdx, &dcdy, &unused); + + /* Adjust dcdx; + */ + dcdx = _mm_sub_epi32(zero, dcdx); + + c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); + c = 
_mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); + + /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ + c = _mm_sub_epi32(c, _mm_set1_epi32(1)); + + dcdx2 = _mm_add_epi32(dcdx, dcdx); + dcdx3 = _mm_add_epi32(dcdx2, dcdx); + + transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, + &span_0, &span_1, &span_2, &unused); + + + { + __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0); + __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1); + __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2); + + __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); + + __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); + __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); + __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); + + __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); + __m128i c_01 = _mm_packs_epi32(c_0, c_1); + + __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); + __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); + __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); + + __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); + + __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); + __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); + __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); + + __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); + __m128i c_23 = _mm_packs_epi32(c_2, c_3); + __m128i c_0123 = _mm_packs_epi16(c_01, c_23); + + unsigned mask = _mm_movemask_epi8(c_0123); + + if (mask != 0xffff) + lp_rast_shade_quads_mask(task, + &tri->inputs, + x, + y, + 0xffff & ~mask); + } +} + +#undef NR_PLANES +#endif + + +#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks(c, cdiff, dcdx, dcdy, omask, pmask) +#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear(c, dcdx, dcdy) + +#define TAG(x) x##_1 +#define NR_PLANES 1 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_2 
+#define NR_PLANES 2 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_3 +#define NR_PLANES 3 +/*#define TRI_4 lp_rast_triangle_3_4*/ +/*#define TRI_16 lp_rast_triangle_3_16*/ +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_4 +#define NR_PLANES 4 +/*#define TRI_16 lp_rast_triangle_4_16*/ +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_5 +#define NR_PLANES 5 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_6 +#define NR_PLANES 6 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_7 +#define NR_PLANES 7 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_8 +#define NR_PLANES 8 +#include "lp_rast_tri_tmp.h" + +#ifdef PIPE_ARCH_SSE +#undef BUILD_MASKS +#undef BUILD_MASK_LINEAR +#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask) +#define BUILD_MASK_LINEAR(c, dcdx, dcdy) build_mask_linear_32((int)c, dcdx, dcdy) +#endif + +#define TAG(x) x##_32_1 +#define NR_PLANES 1 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_2 +#define NR_PLANES 2 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_3 +#define NR_PLANES 3 +/*#define TRI_4 lp_rast_triangle_3_4*/ +/*#define TRI_16 lp_rast_triangle_3_16*/ +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_4 +#define NR_PLANES 4 +#ifdef PIPE_ARCH_SSE +#define TRI_16 lp_rast_triangle_32_4_16 +#endif +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_5 +#define NR_PLANES 5 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_6 +#define NR_PLANES 6 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_7 +#define NR_PLANES 7 +#include "lp_rast_tri_tmp.h" + +#define TAG(x) x##_32_8 +#define NR_PLANES 8 +#include "lp_rast_tri_tmp.h" + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h new file mode 100644 index 000000000..52f6e9996 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -0,0 +1,380 @@ 
/**************************************************************************
 *
 * Copyright 2007-2010 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */



/**
 * Prototype for an 8-plane rasterizer function.  Will code-generate
 * several of these.
 *
 * XXX: Variants for more/fewer planes.
 * XXX: Need ways of dropping planes as we descend.
+ * XXX: SIMD + */ +static void +TAG(do_block_4)(struct lp_rasterizer_task *task, + const struct lp_rast_triangle *tri, + const struct lp_rast_plane *plane, + int x, int y, + const int64_t *c) +{ + unsigned mask = 0xffff; + int j; + + for (j = 0; j < NR_PLANES; j++) { + mask &= ~BUILD_MASK_LINEAR(c[j] - 1, + -plane[j].dcdx, + plane[j].dcdy); + } + + /* Now pass to the shader: + */ + if (mask) + lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); +} + +/** + * Evaluate a 16x16 block of pixels to determine which 4x4 subblocks are in/out + * of the triangle's bounds. + */ +static void +TAG(do_block_16)(struct lp_rasterizer_task *task, + const struct lp_rast_triangle *tri, + const struct lp_rast_plane *plane, + int x, int y, + const int64_t *c) +{ + unsigned outmask, inmask, partmask, partial_mask; + unsigned j; + + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ + + for (j = 0; j < NR_PLANES; j++) { + const int64_t dcdx = -IMUL64(plane[j].dcdx, 4); + const int64_t dcdy = IMUL64(plane[j].dcdy, 4); + const int64_t cox = IMUL64(plane[j].eo, 4); + const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo; + const int64_t cio = IMUL64(ei, 4) - 1; + + BUILD_MASKS(c[j] + cox, + cio - cox, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ + } + + if (outmask == 0xffff) + return; + + /* Mask of sub-blocks which are inside all trivial accept planes: + */ + inmask = ~partmask & 0xffff; + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = partmask & ~outmask; + + assert((partial_mask & inmask) == 0); + + LP_COUNT_ADD(nr_empty_4, util_bitcount(0xffff & ~(partial_mask | inmask))); + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + 
int py = y + iy; + int64_t cx[NR_PLANES]; + + partial_mask &= ~(1 << i); + + LP_COUNT(nr_partially_covered_4); + + for (j = 0; j < NR_PLANES; j++) + cx[j] = (c[j] + - IMUL64(plane[j].dcdx, ix) + + IMUL64(plane[j].dcdy, iy)); + + TAG(do_block_4)(task, tri, plane, px, py, cx); + } + + /* Iterate over fulls: + */ + while (inmask) { + int i = ffs(inmask) - 1; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; + + inmask &= ~(1 << i); + + LP_COUNT(nr_fully_covered_4); + block_full_4(task, tri, px, py); + } +} + + +/** + * Scan the tile in chunks and figure out which pixels to rasterize + * for this triangle. + */ +void +TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + unsigned plane_mask = arg.triangle.plane_mask; + const struct lp_rast_plane *tri_plane = GET_PLANES(tri); + const int x = task->x, y = task->y; + struct lp_rast_plane plane[NR_PLANES]; + int64_t c[NR_PLANES]; + unsigned outmask, inmask, partmask, partial_mask; + unsigned j = 0; + + if (tri->inputs.disable) { + /* This triangle was partially binned and has been disabled */ + return; + } + + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ + + while (plane_mask) { + int i = ffs(plane_mask) - 1; + plane[j] = tri_plane[i]; + plane_mask &= ~(1 << i); + c[j] = plane[j].c + IMUL64(plane[j].dcdy, y) - IMUL64(plane[j].dcdx, x); + + { + const int64_t dcdx = -IMUL64(plane[j].dcdx, 16); + const int64_t dcdy = IMUL64(plane[j].dcdy, 16); + const int64_t cox = IMUL64(plane[j].eo, 16); + const int64_t ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo; + const int64_t cio = IMUL64(ei, 16) - 1; + + BUILD_MASKS(c[j] + cox, + cio - cox, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ + } + + j++; + } + + if (outmask == 0xffff) + return; + + /* 
Mask of sub-blocks which are inside all trivial accept planes: + */ + inmask = ~partmask & 0xffff; + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = partmask & ~outmask; + + assert((partial_mask & inmask) == 0); + + LP_COUNT_ADD(nr_empty_16, util_bitcount(0xffff & ~(partial_mask | inmask))); + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int ix = (i & 3) * 16; + int iy = (i >> 2) * 16; + int px = x + ix; + int py = y + iy; + int64_t cx[NR_PLANES]; + + for (j = 0; j < NR_PLANES; j++) + cx[j] = (c[j] + - IMUL64(plane[j].dcdx, ix) + + IMUL64(plane[j].dcdy, iy)); + + partial_mask &= ~(1 << i); + + LP_COUNT(nr_partially_covered_16); + TAG(do_block_16)(task, tri, plane, px, py, cx); + } + + /* Iterate over fulls: + */ + while (inmask) { + int i = ffs(inmask) - 1; + int ix = (i & 3) * 16; + int iy = (i >> 2) * 16; + int px = x + ix; + int py = y + iy; + + inmask &= ~(1 << i); + + LP_COUNT(nr_fully_covered_16); + block_full_16(task, tri, px, py); + } +} + +#if defined(PIPE_ARCH_SSE) && defined(TRI_16) +/* XXX: special case this when intersection is not required. + * - tile completely within bbox, + * - bbox completely within tile. 
+ */ +void +TRI_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = GET_PLANES(tri); + unsigned mask = arg.triangle.plane_mask; + unsigned outmask, partial_mask; + unsigned j; + __m128i cstep4[NR_PLANES][4]; + + int x = (mask & 0xff); + int y = (mask >> 8); + + outmask = 0; /* outside one or more trivial reject planes */ + + x += task->x; + y += task->y; + + for (j = 0; j < NR_PLANES; j++) { + const int dcdx = -plane[j].dcdx * 4; + const int dcdy = plane[j].dcdy * 4; + __m128i xdcdy = _mm_set1_epi32(dcdy); + + cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3); + cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy); + cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy); + cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy); + + { + const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; + const int cox = plane[j].eo * 4; + + outmask |= sign_bits4(cstep4[j], c + cox); + } + } + + if (outmask == 0xffff) + return; + + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = 0xffff & ~outmask; + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; + unsigned mask = 0xffff; + + partial_mask &= ~(1 << i); + + for (j = 0; j < NR_PLANES; j++) { + const int cx = (plane[j].c - 1 + - plane[j].dcdx * px + + plane[j].dcdy * py) * 4; + + mask &= ~sign_bits4(cstep4[j], cx); + } + + if (mask) + lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask); + } +} +#endif + +#if defined(PIPE_ARCH_SSE) && defined(TRI_4) +void +TRI_4(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = GET_PLANES(tri); + unsigned mask = arg.triangle.plane_mask; + 
const int x = task->x + (mask & 0xff); + const int y = task->y + (mask >> 8); + unsigned j; + + /* Iterate over partials: + */ + { + unsigned mask = 0xffff; + + for (j = 0; j < NR_PLANES; j++) { + const int cx = (plane[j].c + - plane[j].dcdx * x + + plane[j].dcdy * y); + + const int dcdx = -plane[j].dcdx; + const int dcdy = plane[j].dcdy; + __m128i xdcdy = _mm_set1_epi32(dcdy); + + __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3); + __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); + __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); + __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); + + __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); + __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); + __m128i result = _mm_packs_epi16(cstep01, cstep23); + + /* Extract the sign bits + */ + mask &= ~_mm_movemask_epi8(result); + } + + if (mask) + lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask); + } +} +#endif + + + +#undef TAG +#undef TRI_4 +#undef TRI_16 +#undef NR_PLANES + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene.c new file mode 100644 index 000000000..2441b3c0d --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene.c @@ -0,0 +1,564 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. 
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "util/u_framebuffer.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/simple_list.h"
#include "util/u_format.h"
#include "lp_scene.h"
#include "lp_fence.h"
#include "lp_debug.h"


/* Number of resource pointers held by one resource_ref list node. */
#define RESOURCE_REF_SZ 32

/** One node in the scene's linked list of referenced pipe_resources. */
struct resource_ref {
   struct pipe_resource *resource[RESOURCE_REF_SZ]; /**< referenced resources */
   int count;                  /**< number of valid entries in resource[] */
   struct resource_ref *next;  /**< next node in the list, or NULL */
};


/**
 * Create a new scene object.
+ * \param queue the queue to put newly rendered/emptied scenes into + */ +struct lp_scene * +lp_scene_create( struct pipe_context *pipe ) +{ + struct lp_scene *scene = CALLOC_STRUCT(lp_scene); + if (!scene) + return NULL; + + scene->pipe = pipe; + + scene->data.head = + CALLOC_STRUCT(data_block); + + pipe_mutex_init(scene->mutex); + +#ifdef DEBUG + /* Do some scene limit sanity checks here */ + { + size_t maxBins = TILES_X * TILES_Y; + size_t maxCommandBytes = sizeof(struct cmd_block) * maxBins; + size_t maxCommandPlusData = maxCommandBytes + DATA_BLOCK_SIZE; + /* We'll need at least one command block per bin. Make sure that's + * less than the max allowed scene size. + */ + assert(maxCommandBytes < LP_SCENE_MAX_SIZE); + /* We'll also need space for at least one other data block */ + assert(maxCommandPlusData <= LP_SCENE_MAX_SIZE); + } +#endif + + return scene; +} + + +/** + * Free all data associated with the given scene, and the scene itself. + */ +void +lp_scene_destroy(struct lp_scene *scene) +{ + lp_fence_reference(&scene->fence, NULL); + pipe_mutex_destroy(scene->mutex); + assert(scene->data.head->next == NULL); + FREE(scene->data.head); + FREE(scene); +} + + +/** + * Check if the scene's bins are all empty. + * For debugging purposes. + */ +boolean +lp_scene_is_empty(struct lp_scene *scene ) +{ + unsigned x, y; + + for (y = 0; y < TILES_Y; y++) { + for (x = 0; x < TILES_X; x++) { + const struct cmd_bin *bin = lp_scene_get_bin(scene, x, y); + if (bin->head) { + return FALSE; + } + } + } + return TRUE; +} + + +/* Returns true if there has ever been a failed allocation attempt in + * this scene. Used in triangle emit to avoid having to check success + * at each bin. + */ +boolean +lp_scene_is_oom(struct lp_scene *scene) +{ + return scene->alloc_failed; +} + + +/* Remove all commands from a bin. Tries to reuse some of the memory + * allocated to the bin, however. 
 */
void
lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y)
{
   struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);

   bin->last_state = NULL;
   /* Keep the tail command block (if any) as the new head so its
    * storage is reused; just mark it empty.
    */
   bin->head = bin->tail;
   if (bin->tail) {
      bin->tail->next = NULL;
      bin->tail->count = 0;
   }
}


/**
 * Map the scene's framebuffer surfaces in preparation for rasterization.
 * Fills in scene->cbufs[] and scene->zsbuf (map pointer, strides, pixel
 * size).  The mappings remain valid until lp_scene_end_rasterization().
 */
void
lp_scene_begin_rasterization(struct lp_scene *scene)
{
   const struct pipe_framebuffer_state *fb = &scene->fb;
   int i;

   //LP_DBG(DEBUG_RAST, "%s\n", __FUNCTION__);

   for (i = 0; i < scene->fb.nr_cbufs; i++) {
      struct pipe_surface *cbuf = scene->fb.cbufs[i];

      if (!cbuf) {
         /* Unbound color attachment: leave the slot unmapped. */
         scene->cbufs[i].stride = 0;
         scene->cbufs[i].layer_stride = 0;
         scene->cbufs[i].map = NULL;
         continue;
      }

      if (llvmpipe_resource_is_texture(cbuf->texture)) {
         scene->cbufs[i].stride = llvmpipe_resource_stride(cbuf->texture,
                                                           cbuf->u.tex.level);
         scene->cbufs[i].layer_stride = llvmpipe_layer_stride(cbuf->texture,
                                                              cbuf->u.tex.level);

         scene->cbufs[i].map = llvmpipe_resource_map(cbuf->texture,
                                                     cbuf->u.tex.level,
                                                     cbuf->u.tex.first_layer,
                                                     LP_TEX_USAGE_READ_WRITE);
         scene->cbufs[i].format_bytes = util_format_get_blocksize(cbuf->format);
      }
      else {
         /* Rendering to a buffer resource: linear data, no mip levels or
          * layers.  Note the buffer width (width0) is stored in 'stride'.
          */
         struct llvmpipe_resource *lpr = llvmpipe_resource(cbuf->texture);
         unsigned pixstride = util_format_get_blocksize(cbuf->format);
         scene->cbufs[i].stride = cbuf->texture->width0;
         scene->cbufs[i].layer_stride = 0;
         scene->cbufs[i].map = lpr->data;
         scene->cbufs[i].map += cbuf->u.buf.first_element * pixstride;
         scene->cbufs[i].format_bytes = util_format_get_blocksize(cbuf->format);
      }
   }

   if (fb->zsbuf) {
      struct pipe_surface *zsbuf = scene->fb.zsbuf;
      scene->zsbuf.stride = llvmpipe_resource_stride(zsbuf->texture, zsbuf->u.tex.level);
      scene->zsbuf.layer_stride = llvmpipe_layer_stride(zsbuf->texture, zsbuf->u.tex.level);

      scene->zsbuf.map = llvmpipe_resource_map(zsbuf->texture,
                                               zsbuf->u.tex.level,
                                               zsbuf->u.tex.first_layer,
                                               LP_TEX_USAGE_READ_WRITE);
      scene->zsbuf.format_bytes = util_format_get_blocksize(zsbuf->format);
   }
}

+ + +/** + * Free all the temporary data in a scene. + */ +void +lp_scene_end_rasterization(struct lp_scene *scene ) +{ + int i, j; + + /* Unmap color buffers */ + for (i = 0; i < scene->fb.nr_cbufs; i++) { + if (scene->cbufs[i].map) { + struct pipe_surface *cbuf = scene->fb.cbufs[i]; + if (llvmpipe_resource_is_texture(cbuf->texture)) { + llvmpipe_resource_unmap(cbuf->texture, + cbuf->u.tex.level, + cbuf->u.tex.first_layer); + } + scene->cbufs[i].map = NULL; + } + } + + /* Unmap z/stencil buffer */ + if (scene->zsbuf.map) { + struct pipe_surface *zsbuf = scene->fb.zsbuf; + llvmpipe_resource_unmap(zsbuf->texture, + zsbuf->u.tex.level, + zsbuf->u.tex.first_layer); + scene->zsbuf.map = NULL; + } + + /* Reset all command lists: + */ + for (i = 0; i < scene->tiles_x; i++) { + for (j = 0; j < scene->tiles_y; j++) { + struct cmd_bin *bin = lp_scene_get_bin(scene, i, j); + bin->head = NULL; + bin->tail = NULL; + bin->last_state = NULL; + } + } + + /* If there are any bins which weren't cleared by the loop above, + * they will be caught (on debug builds at least) by this assert: + */ + assert(lp_scene_is_empty(scene)); + + /* Decrement texture ref counts + */ + { + struct resource_ref *ref; + int i, j = 0; + + for (ref = scene->resources; ref; ref = ref->next) { + for (i = 0; i < ref->count; i++) { + if (LP_DEBUG & DEBUG_SETUP) + debug_printf("resource %d: %p %dx%d sz %d\n", + j, + (void *) ref->resource[i], + ref->resource[i]->width0, + ref->resource[i]->height0, + llvmpipe_resource_size(ref->resource[i])); + j++; + pipe_resource_reference(&ref->resource[i], NULL); + } + } + + if (LP_DEBUG & DEBUG_SETUP) + debug_printf("scene %d resources, sz %d\n", + j, scene->resource_reference_size); + } + + /* Free all scene data blocks: + */ + { + struct data_block_list *list = &scene->data; + struct data_block *block, *tmp; + + for (block = list->head->next; block; block = tmp) { + tmp = block->next; + FREE(block); + } + + list->head->next = NULL; + list->head->used = 0; + } + + 
lp_fence_reference(&scene->fence, NULL); + + scene->resources = NULL; + scene->scene_size = 0; + scene->resource_reference_size = 0; + + scene->alloc_failed = FALSE; + + util_unreference_framebuffer_state( &scene->fb ); +} + + + + + + +struct cmd_block * +lp_scene_new_cmd_block( struct lp_scene *scene, + struct cmd_bin *bin ) +{ + struct cmd_block *block = lp_scene_alloc(scene, sizeof(struct cmd_block)); + if (block) { + if (bin->tail) { + bin->tail->next = block; + bin->tail = block; + } + else { + bin->head = block; + bin->tail = block; + } + //memset(block, 0, sizeof *block); + block->next = NULL; + block->count = 0; + } + return block; +} + + +struct data_block * +lp_scene_new_data_block( struct lp_scene *scene ) +{ + if (scene->scene_size + DATA_BLOCK_SIZE > LP_SCENE_MAX_SIZE) { + if (0) debug_printf("%s: failed\n", __FUNCTION__); + scene->alloc_failed = TRUE; + return NULL; + } + else { + struct data_block *block = MALLOC_STRUCT(data_block); + if (block == NULL) + return NULL; + + scene->scene_size += sizeof *block; + + block->used = 0; + block->next = scene->data.head; + scene->data.head = block; + + return block; + } +} + + +/** + * Return number of bytes used for all bin data within a scene. + * This does not include resources (textures) referenced by the scene. + */ +static unsigned +lp_scene_data_size( const struct lp_scene *scene ) +{ + unsigned size = 0; + const struct data_block *block; + for (block = scene->data.head; block; block = block->next) { + size += block->used; + } + return size; +} + + + +/** + * Add a reference to a resource by the scene. 
 *
 * \return TRUE on success (including when the resource was already
 *         referenced); FALSE on allocation failure or when the
 *         referenced-texture heuristic below advises a scene flush.
 */
boolean
lp_scene_add_resource_reference(struct lp_scene *scene,
                                struct pipe_resource *resource,
                                boolean initializing_scene)
{
   struct resource_ref *ref, **last = &scene->resources;
   int i;

   /* Look at existing resource blocks:
    */
   for (ref = scene->resources; ref; ref = ref->next) {
      last = &ref->next;

      /* Search for this resource:
       */
      for (i = 0; i < ref->count; i++)
         if (ref->resource[i] == resource)
            return TRUE;   /* already referenced -- nothing to do */

      if (ref->count < RESOURCE_REF_SZ) {
         /* If the block is half-empty, then append the reference here.
          */
         break;
      }
   }

   /* Create a new block if no half-empty block was found.
    */
   if (!ref) {
      assert(*last == NULL);
      *last = lp_scene_alloc(scene, sizeof *ref);
      if (*last == NULL)
          return FALSE;

      ref = *last;
      memset(ref, 0, sizeof *ref);
   }

   /* Append the reference to the reference block.
    */
   pipe_resource_reference(&ref->resource[ref->count++], resource);
   scene->resource_reference_size += llvmpipe_resource_size(resource);

   /* Heuristic to advise scene flushes.  This isn't helpful in the
    * initial setup of the scene, but after that point flush on the
    * next resource added which exceeds 64MB in referenced texture
    * data.
    */
   if (!initializing_scene &&
       scene->resource_reference_size >= LP_SCENE_MAX_RESOURCE_SIZE)
      return FALSE;

   return TRUE;
}


/**
 * Does this scene have a reference to the given resource?
 */
boolean
lp_scene_is_resource_referenced(const struct lp_scene *scene,
                                const struct pipe_resource *resource)
{
   const struct resource_ref *ref;
   int i;

   /* Linear scan over all reference blocks. */
   for (ref = scene->resources; ref; ref = ref->next) {
      for (i = 0; i < ref->count; i++)
         if (ref->resource[i] == resource)
            return TRUE;
   }

   return FALSE;
}




/** advance curr_x,y to the next bin */
static boolean
next_bin(struct lp_scene *scene)
{
   scene->curr_x++;
   if (scene->curr_x >= scene->tiles_x) {
      /* wrap to the start of the next row */
      scene->curr_x = 0;
      scene->curr_y++;
   }
   if (scene->curr_y >= scene->tiles_y) {
      /* no more bins */
      return FALSE;
   }
   return TRUE;
}


/** Reset the bin iterator; call before lp_scene_bin_iter_next(). */
void
lp_scene_bin_iter_begin( struct lp_scene *scene )
{
   scene->curr_x = scene->curr_y = -1;   /* -1 == "before the first bin" */
}


/**
 * Return pointer to next bin to be rendered.
 * The lp_scene::curr_x and ::curr_y fields will be advanced.
 * Multiple rendering threads will call this function to get a chunk
 * of work (a bin) to work on.
 * The iterator state is protected by scene->mutex, so each concurrent
 * caller receives a distinct bin (or NULL when all bins are consumed).
 */
struct cmd_bin *
lp_scene_bin_iter_next( struct lp_scene *scene , int *x, int *y)
{
   struct cmd_bin *bin = NULL;

   pipe_mutex_lock(scene->mutex);

   if (scene->curr_x < 0) {
      /* first bin */
      scene->curr_x = 0;
      scene->curr_y = 0;
   }
   else if (!next_bin(scene)) {
      /* no more bins left */
      goto end;
   }

   bin = lp_scene_get_bin(scene, scene->curr_x, scene->curr_y);
   *x = scene->curr_x;
   *y = scene->curr_y;

end:
   /*printf("return bin %p at %d, %d\n", (void *) bin, *bin_x, *bin_y);*/
   pipe_mutex_unlock(scene->mutex);
   return bin;
}


/**
 * Prepare an (empty) scene for binning: record the framebuffer state,
 * compute the active tile counts and the clamped maximum layer index.
 */
void lp_scene_begin_binning( struct lp_scene *scene,
                             struct pipe_framebuffer_state *fb, boolean discard )
{
   int i;
   unsigned max_layer = ~0;

   assert(lp_scene_is_empty(scene));

   scene->discard = discard;
   util_copy_framebuffer_state(&scene->fb, fb);

   scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE;
   scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE;
   assert(scene->tiles_x <= TILES_X);
   assert(scene->tiles_y <= TILES_Y);

   /*
    * Determine how many layers the fb has (used for clamping layer value).
    * OpenGL (but not d3d10) permits different amount of layers per rt, however
    * results are undefined if layer exceeds the amount of layers of ANY
    * attachment hence don't need separate per cbuf and zsbuf max.
    */
   for (i = 0; i < scene->fb.nr_cbufs; i++) {
      struct pipe_surface *cbuf = scene->fb.cbufs[i];
      if (cbuf) {
         if (llvmpipe_resource_is_texture(cbuf->texture)) {
            max_layer = MIN2(max_layer,
                             cbuf->u.tex.last_layer - cbuf->u.tex.first_layer);
         }
         else {
            /* buffer attachment: a single "layer" */
            max_layer = 0;
         }
      }
   }
   if (fb->zsbuf) {
      struct pipe_surface *zsbuf = scene->fb.zsbuf;
      max_layer = MIN2(max_layer, zsbuf->u.tex.last_layer - zsbuf->u.tex.first_layer);
   }
   scene->fb_max_layer = max_layer;
}


/** Finish binning; optionally dump scene statistics for debugging. */
void lp_scene_end_binning( struct lp_scene *scene )
{
   if (LP_DEBUG & DEBUG_SCENE) {
      debug_printf("rasterize scene:\n");
      debug_printf("  scene_size: %u\n",
                   scene->scene_size);
      debug_printf("  data size: %u\n",
                   lp_scene_data_size(scene));

      if (0)
         lp_debug_bins( scene );
   }
}
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene.h
new file mode 100644
index 000000000..b1464bb54
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -0,0 +1,412 @@
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Binner data structures and bin-related functions. + * Note: the "setup" code is concerned with building scenes while + * The "rast" code is concerned with consuming/executing scenes. + */ + +#ifndef LP_SCENE_H +#define LP_SCENE_H + +#include "os/os_thread.h" +#include "lp_rast.h" +#include "lp_debug.h" + +struct lp_scene_queue; +struct lp_rast_state; + +/* We're limited to 2K by 2K for 32bit fixed point rasterization. + * Will need a 64-bit version for larger framebuffers. + */ +#define TILES_X (LP_MAX_WIDTH / TILE_SIZE) +#define TILES_Y (LP_MAX_HEIGHT / TILE_SIZE) + + +/* Commands per command block (ideally so sizeof(cmd_block) is a power of + * two in size.) + */ +#define CMD_BLOCK_MAX 29 + +/* Bytes per data block. 
 */
#define DATA_BLOCK_SIZE (64 * 1024)

/* Scene temporary storage is clamped to this size:
 */
#define LP_SCENE_MAX_SIZE (9*1024*1024)

/* The maximum amount of texture storage referenced by a scene is
 * clamped to this size:
 */
#define LP_SCENE_MAX_RESOURCE_SIZE (64*1024*1024)


/* switch to a non-pointer value for this:
 */
typedef void (*lp_rast_cmd_func)( struct lp_rasterizer_task *,
                                  const union lp_rast_cmd_arg );


/** A fixed-capacity batch of binned commands (opcode + argument pairs). */
struct cmd_block {
   uint8_t cmd[CMD_BLOCK_MAX];                /**< command opcodes */
   union lp_rast_cmd_arg arg[CMD_BLOCK_MAX];  /**< one argument per opcode */
   unsigned count;                            /**< number of entries used */
   struct cmd_block *next;                    /**< next block in the bin's list */
};


/** One fixed-size chunk of the scene's bump-allocator storage. */
struct data_block {
   ubyte data[DATA_BLOCK_SIZE];
   unsigned used;              /**< bytes allocated so far within data[] */
   struct data_block *next;
};



/**
 * For each screen tile we have one of these bins.
 */
struct cmd_bin {
   const struct lp_rast_state *last_state;  /* most recent state set in bin */
   struct cmd_block *head;   /**< first command block in the bin */
   struct cmd_block *tail;   /**< last command block (new commands append here) */
};


/**
 * This stores bulk data which is used for all memory allocations
 * within a scene.
 *
 * Examples include triangle data and state data.  The commands in
 * the per-tile bins will point to chunks of data in this structure.
 *
 * Include the first block of data statically to ensure we can always
 * initiate a scene without relying on malloc succeeding.
 *
 * NOTE(review): lp_scene_create() appears to CALLOC a separate head
 * block rather than pointing 'head' at 'first' -- confirm whether the
 * static 'first' member is actually used anywhere.
 */
struct data_block_list {
   struct data_block first;
   struct data_block *head;   /**< block that current allocations come from */
};

struct resource_ref;

/**
 * All bins and bin data are contained here.
 * Per-bin data goes into the 'tile' bins.
 * Shared data goes into the 'data' buffer.
 *
 * When there are multiple threads, will want to double-buffer between
 * scenes:
 */
struct lp_scene {
   struct pipe_context *pipe;   /**< owning context */
   struct lp_fence *fence;      /**< fence for this scene's rasterization */

   /* The queries still active at end of scene */
   struct llvmpipe_query *active_queries[LP_MAX_ACTIVE_BINNED_QUERIES];
   unsigned num_active_queries;
   /* If queries were either active or there were begin/end query commands */
   boolean had_queries;

   /* Framebuffer mappings - valid only between begin_rasterization()
    * and end_rasterization().
    */
   struct {
      uint8_t *map;            /**< mapped surface memory */
      unsigned stride;         /**< row stride in bytes */
      unsigned layer_stride;   /**< per-layer stride in bytes */
      unsigned format_bytes;   /**< bytes per pixel/block */
   } zsbuf, cbufs[PIPE_MAX_COLOR_BUFS];

   /* The amount of layers in the fb (minimum of all attachments) */
   unsigned fb_max_layer;

   /** the framebuffer to render the scene into */
   struct pipe_framebuffer_state fb;

   /** list of resources referenced by the scene commands */
   struct resource_ref *resources;

   /** Total memory used by the scene (in bytes).  This sums all the
    * data blocks and counts all bins, state, resource references and
    * other random allocations within the scene.
    */
   unsigned scene_size;

   /** Sum of sizes of all resources referenced by the scene.  Sums
    * all the textures read by the scene:
    */
   unsigned resource_reference_size;

   boolean alloc_failed;   /**< any allocation in this scene ever failed */
   boolean discard;        /**< set from lp_scene_begin_binning()'s arg */
   /**
    * Number of active tiles in each dimension.
    * This basically the framebuffer size divided by tile size
    */
   unsigned tiles_x, tiles_y;

   int curr_x, curr_y;   /**< for iterating over bins */
   pipe_mutex mutex;     /**< protects the bin iterator state (curr_x/curr_y) */

   struct cmd_bin tile[TILES_X][TILES_Y];   /**< per-tile command bins */
   struct data_block_list data;             /**< bulk allocation pool */
};



struct lp_scene *lp_scene_create(struct pipe_context *pipe);

void lp_scene_destroy(struct lp_scene *scene);

boolean lp_scene_is_empty(struct lp_scene *scene );
boolean lp_scene_is_oom(struct lp_scene *scene );


struct data_block *lp_scene_new_data_block( struct lp_scene *scene );

struct cmd_block *lp_scene_new_cmd_block( struct lp_scene *scene,
                                          struct cmd_bin *bin );

boolean lp_scene_add_resource_reference(struct lp_scene *scene,
                                        struct pipe_resource *resource,
                                        boolean initializing_scene);

boolean lp_scene_is_resource_referenced(const struct lp_scene *scene,
                                        const struct pipe_resource *resource );


/**
 * Allocate space for a command/data in the bin's data buffer.
 * Grow the block list if needed.
 * \return pointer to 'size' fresh bytes, or NULL if the scene has hit
 *         its memory cap (LP_SCENE_MAX_SIZE).
 */
static inline void *
lp_scene_alloc( struct lp_scene *scene, unsigned size)
{
   struct data_block_list *list = &scene->data;
   struct data_block *block = list->head;

   assert(size <= DATA_BLOCK_SIZE);
   assert(block != NULL);

   if (LP_DEBUG & DEBUG_MEM)
      debug_printf("alloc %u block %u/%u tot %u/%u\n",
                   size, block->used, DATA_BLOCK_SIZE,
                   scene->scene_size, LP_SCENE_MAX_SIZE);

   if (block->used + size > DATA_BLOCK_SIZE) {
      /* current block is full -- grow the pool */
      block = lp_scene_new_data_block( scene );
      if (!block) {
         /* out of memory */
         return NULL;
      }
   }

   {
      ubyte *data = block->data + block->used;
      block->used += size;
      return data;
   }
}


/**
 * As above, but with specific alignment.
 */
static inline void *
lp_scene_alloc_aligned( struct lp_scene *scene, unsigned size,
                        unsigned alignment )
{
   struct data_block_list *list = &scene->data;
   struct data_block *block = list->head;

   assert(block != NULL);

   if (LP_DEBUG & DEBUG_MEM)
      debug_printf("alloc %u block %u/%u tot %u/%u\n",
                   size + alignment - 1,
                   block->used, DATA_BLOCK_SIZE,
                   scene->scene_size, LP_SCENE_MAX_SIZE);

   /* Reserve worst-case padding so the aligned result still fits. */
   if (block->used + size + alignment - 1 > DATA_BLOCK_SIZE) {
      block = lp_scene_new_data_block( scene );
      if (!block)
         return NULL;
   }

   {
      ubyte *data = block->data + block->used;
      /* Distance from 'data' up to the next 'alignment' boundary
       * (alignment is assumed to be a power of two).
       */
      unsigned offset = (((uintptr_t)data + alignment - 1) & ~(alignment - 1)) - (uintptr_t)data;
      block->used += offset + size;
      return data + offset;
   }
}


/* Put back data if we decide not to use it, eg. culled triangles.
 * NOTE: only subtracts from the head block, so this is only valid for
 * the most recent allocation.
 */
static inline void
lp_scene_putback_data( struct lp_scene *scene, unsigned size)
{
   struct data_block_list *list = &scene->data;
   assert(list->head && list->head->used >= size);
   list->head->used -= size;
}


/** Return pointer to a particular tile's bin. */
static inline struct cmd_bin *
lp_scene_get_bin(struct lp_scene *scene, unsigned x, unsigned y)
{
   return &scene->tile[x][y];
}


/** Remove all commands from a bin */
void
lp_scene_bin_reset(struct lp_scene *scene, unsigned x, unsigned y);


/* Add a command to bin[x][y].
 */
static inline boolean
lp_scene_bin_command( struct lp_scene *scene,
                      unsigned x, unsigned y,
                      unsigned cmd,
                      union lp_rast_cmd_arg arg )
{
   struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);
   struct cmd_block *tail = bin->tail;

   assert(x < scene->tiles_x);
   assert(y < scene->tiles_y);
   assert(cmd < LP_RAST_OP_MAX);

   /* Start a new command block when the bin is empty or the tail is full. */
   if (tail == NULL || tail->count == CMD_BLOCK_MAX) {
      tail = lp_scene_new_cmd_block( scene, bin );
      if (!tail) {
         return FALSE;   /* scene out of memory */
      }
      assert(tail->count == 0);
   }

   {
      unsigned i = tail->count;
      tail->cmd[i] = cmd & LP_RAST_OP_MASK;
      tail->arg[i] = arg;
      tail->count++;
   }

   return TRUE;
}


/**
 * Bin a command, preceded by a SET_STATE command when the given state
 * differs from the last state binned in this bin (tracked via
 * bin->last_state so redundant state changes aren't binned).
 * \return FALSE on out-of-memory.
 */
static inline boolean
lp_scene_bin_cmd_with_state( struct lp_scene *scene,
                             unsigned x, unsigned y,
                             const struct lp_rast_state *state,
                             unsigned cmd,
                             union lp_rast_cmd_arg arg )
{
   struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);

   if (state != bin->last_state) {
      bin->last_state = state;
      if (!lp_scene_bin_command(scene, x, y,
                                LP_RAST_OP_SET_STATE,
                                lp_rast_arg_state(state)))
         return FALSE;
   }

   if (!lp_scene_bin_command( scene, x, y, cmd, arg ))
      return FALSE;

   return TRUE;
}


/* Add a command to all active bins.
+ */ +static inline boolean +lp_scene_bin_everywhere( struct lp_scene *scene, + unsigned cmd, + const union lp_rast_cmd_arg arg ) +{ + unsigned i, j; + for (i = 0; i < scene->tiles_x; i++) { + for (j = 0; j < scene->tiles_y; j++) { + if (!lp_scene_bin_command( scene, i, j, cmd, arg )) + return FALSE; + } + } + + return TRUE; +} + + +static inline unsigned +lp_scene_get_num_bins( const struct lp_scene *scene ) +{ + return scene->tiles_x * scene->tiles_y; +} + + +void +lp_scene_bin_iter_begin( struct lp_scene *scene ); + +struct cmd_bin * +lp_scene_bin_iter_next( struct lp_scene *scene, int *x, int *y ); + + + +/* Begin/end binning of a scene + */ +void +lp_scene_begin_binning( struct lp_scene *scene, + struct pipe_framebuffer_state *fb, + boolean discard ); + +void +lp_scene_end_binning( struct lp_scene *scene ); + + +/* Begin/end rasterization of a scene + */ +void +lp_scene_begin_rasterization(struct lp_scene *scene); + +void +lp_scene_end_rasterization(struct lp_scene *scene ); + + + + + +#endif /* LP_BIN_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene_queue.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene_queue.c new file mode 100644 index 000000000..975db43c4 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene_queue.c @@ -0,0 +1,124 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Scene queue. We'll use two queues. One contains "full" scenes which + * are produced by the "setup" code. The other contains "empty" scenes + * which are produced by the "rast" code when it finishes rendering a scene. 
 */

#include "util/u_ringbuffer.h"
#include "util/u_memory.h"
#include "lp_scene_queue.h"



/* Maximum number of scene packets the ring buffer is sized for. */
#define MAX_SCENE_QUEUE 4

/** Ring-buffer packet wrapping a scene pointer. */
struct scene_packet {
   struct util_packet header;   /**< dwords/data24 header the ringbuffer requires */
   struct lp_scene *scene;      /**< payload: the queued scene */
};

/**
 * A queue of scenes
 */
struct lp_scene_queue
{
   struct util_ringbuffer *ring;   /**< underlying ring buffer */
};



/** Allocate a new scene queue */
struct lp_scene_queue *
lp_scene_queue_create(void)
{
   struct lp_scene_queue *queue = CALLOC_STRUCT(lp_scene_queue);
   if (queue == NULL)
      return NULL;

   /* Ring size is expressed in dwords, hence the /4. */
   queue->ring = util_ringbuffer_create( MAX_SCENE_QUEUE *
                                         sizeof( struct scene_packet ) / 4);
   if (queue->ring == NULL)
      goto fail;

   return queue;

fail:
   FREE(queue);
   return NULL;
}


/** Delete a scene queue */
void
lp_scene_queue_destroy(struct lp_scene_queue *queue)
{
   util_ringbuffer_destroy(queue->ring);
   FREE(queue);
}


/** Remove first lp_scene from head of queue */
struct lp_scene *
lp_scene_dequeue(struct lp_scene_queue *queue, boolean wait)
{
   struct scene_packet packet;
   enum pipe_error ret;

   packet.scene = NULL;

   /* 'wait' presumably makes this block until a packet arrives --
    * see util_ringbuffer_dequeue() for the exact semantics.
    */
   ret = util_ringbuffer_dequeue(queue->ring,
                                 &packet.header,
                                 sizeof packet / 4,
                                 wait );
   if (ret != PIPE_OK)
      return NULL;

   return packet.scene;
}


/** Add an lp_scene to tail of queue */
void
lp_scene_enqueue(struct lp_scene_queue *queue, struct lp_scene *scene)
{
   struct scene_packet packet;

   /* header.dwords carries the packet size in dwords */
   packet.header.dwords = sizeof packet / 4;
   packet.header.data24 = 0;
   packet.scene = scene;

   util_ringbuffer_enqueue(queue->ring, &packet.header);
}




diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene_queue.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene_queue.h
new file mode 100644
index 000000000..dd9ab593b
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_scene_queue.h
@@ -0,0 +1,53 @@
/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#ifndef LP_SCENE_QUEUE +#define LP_SCENE_QUEUE + +#include "pipe/p_compiler.h" + +struct lp_scene_queue; +struct lp_scene; + + +struct lp_scene_queue * +lp_scene_queue_create(void); + +void +lp_scene_queue_destroy(struct lp_scene_queue *queue); + +struct lp_scene * +lp_scene_dequeue(struct lp_scene_queue *queue, boolean wait); + +void +lp_scene_enqueue(struct lp_scene_queue *queue, struct lp_scene *scene); + + + + +#endif /* LP_BIN_QUEUE */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_screen.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_screen.c new file mode 100644 index 000000000..14eeab033 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_screen.c @@ -0,0 +1,623 @@ +/************************************************************************** + * + * Copyright 2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "util/u_math.h" +#include "util/u_cpu_detect.h" +#include "util/u_format.h" +#include "util/u_string.h" +#include "util/u_format_s3tc.h" +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" +#include "draw/draw_context.h" +#include "gallivm/lp_bld_type.h" + +#include "os/os_misc.h" +#include "os/os_time.h" +#include "lp_texture.h" +#include "lp_fence.h" +#include "lp_jit.h" +#include "lp_screen.h" +#include "lp_context.h" +#include "lp_debug.h" +#include "lp_public.h" +#include "lp_limits.h" +#include "lp_rast.h" + +#include "state_tracker/sw_winsys.h" + +#ifdef DEBUG +int LP_DEBUG = 0; + +static const struct debug_named_value lp_debug_flags[] = { + { "pipe", DEBUG_PIPE, NULL }, + { "tgsi", DEBUG_TGSI, NULL }, + { "tex", DEBUG_TEX, NULL }, + { "setup", DEBUG_SETUP, NULL }, + { "rast", DEBUG_RAST, NULL }, + { "query", DEBUG_QUERY, NULL }, + { "screen", DEBUG_SCREEN, NULL }, + { "counters", DEBUG_COUNTERS, NULL }, + { "scene", DEBUG_SCENE, NULL }, + { "fence", DEBUG_FENCE, NULL }, + { "mem", DEBUG_MEM, NULL }, + { "fs", DEBUG_FS, NULL }, + DEBUG_NAMED_VALUE_END +}; +#endif + +int LP_PERF = 0; +static const struct debug_named_value lp_perf_flags[] = { + { "texmem", PERF_TEX_MEM, NULL }, + { "no_mipmap", PERF_NO_MIPMAPS, NULL }, + { "no_linear", PERF_NO_LINEAR, NULL }, + { "no_mip_linear", PERF_NO_MIP_LINEAR, NULL }, + { "no_tex", PERF_NO_TEX, NULL }, + { "no_blend", PERF_NO_BLEND, NULL }, + { "no_depth", PERF_NO_DEPTH, NULL }, + { "no_alphatest", PERF_NO_ALPHATEST, NULL }, + DEBUG_NAMED_VALUE_END +}; + + +static const char * +llvmpipe_get_vendor(struct pipe_screen *screen) +{ + 
return "VMware, Inc."; +} + + +static const char * +llvmpipe_get_name(struct pipe_screen *screen) +{ + static char buf[100]; + util_snprintf(buf, sizeof(buf), "llvmpipe (LLVM %u.%u, %u bits)", + HAVE_LLVM >> 8, HAVE_LLVM & 0xff, + lp_native_vector_width ); + return buf; +} + + +static int +llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) +{ + switch (param) { + case PIPE_CAP_NPOT_TEXTURES: + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_SM3: + return 1; + case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: + return 1; + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + return PIPE_MAX_SO_BUFFERS; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + return PIPE_MAX_COLOR_BUFS; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_QUERY_TIME_ELAPSED: + return 0; + case PIPE_CAP_QUERY_TIMESTAMP: + return 1; + case PIPE_CAP_QUERY_PIPELINE_STATISTICS: + return 0; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_TEXTURE_SWIZZLE: + return 1; + case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK: + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return LP_MAX_TEXTURE_2D_LEVELS; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return LP_MAX_TEXTURE_3D_LEVELS; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return LP_MAX_TEXTURE_CUBE_LEVELS; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return LP_MAX_TEXTURE_ARRAY_LAYERS; + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + return 1; + case PIPE_CAP_INDEP_BLEND_ENABLE: + return 1; + case PIPE_CAP_INDEP_BLEND_FUNC: + return 1; + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + return 1; + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: + return 0; + case PIPE_CAP_PRIMITIVE_RESTART: + return 1; + case PIPE_CAP_DEPTH_CLIP_DISABLE: + 
return 1; + case PIPE_CAP_SHADER_STENCIL_EXPORT: + return 1; + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_START_INSTANCE: + return 1; + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: + return 0; + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + return 1; + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + return 1; + /* this is a lie could support arbitrary large offsets */ + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MIN_TEXEL_OFFSET: + return -32; + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MAX_TEXEL_OFFSET: + return 31; + case PIPE_CAP_CONDITIONAL_RENDER: + return 1; + case PIPE_CAP_TEXTURE_BARRIER: + return 0; + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return 16*4; + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return 1024; + case PIPE_CAP_MAX_VERTEX_STREAMS: + return 1; + case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: + return 2048; + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + return 1; + case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: + return 0; + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + case PIPE_CAP_VERTEX_COLOR_CLAMPED: + return 1; + case PIPE_CAP_GLSL_FEATURE_LEVEL: + return 330; + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + return 0; + case PIPE_CAP_COMPUTE: + return 0; + case PIPE_CAP_USER_VERTEX_BUFFERS: + case PIPE_CAP_USER_INDEX_BUFFERS: + return 1; + case PIPE_CAP_USER_CONSTANT_BUFFERS: + return 0; + case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_TGSI_TEXCOORD: + return 0; + case PIPE_CAP_DRAW_INDIRECT: + return 1; + + case PIPE_CAP_CUBE_MAP_ARRAY: + return 1; + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + return 16; + case PIPE_CAP_TEXTURE_MULTISAMPLE: + return 0; + case 
PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return 64; + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + return 1; + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + return 65536; + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + return 1; + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + return 0; + case PIPE_CAP_MAX_VIEWPORTS: + return PIPE_MAX_VIEWPORTS; + case PIPE_CAP_ENDIANNESS: + return PIPE_ENDIAN_NATIVE; + case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: + return 1; + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + return 1; + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + return 4; + case PIPE_CAP_TEXTURE_GATHER_SM5: + case PIPE_CAP_TEXTURE_QUERY_LOD: + case PIPE_CAP_SAMPLE_SHADING: + case PIPE_CAP_TEXTURE_GATHER_OFFSETS: + return 0; + case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: + return 1; + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + return 0; + case PIPE_CAP_SAMPLER_VIEW_TARGET: + return 1; + case PIPE_CAP_FAKE_SW_MSAA: + return 1; + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + return 1; + + case PIPE_CAP_VENDOR_ID: + return 0xFFFFFFFF; + case PIPE_CAP_DEVICE_ID: + return 0xFFFFFFFF; + case PIPE_CAP_ACCELERATED: + return 0; + case PIPE_CAP_VIDEO_MEMORY: { + /* XXX: Do we want to return the full amount of system memory ? 
*/ + uint64_t system_memory; + + if (!os_get_total_physical_memory(&system_memory)) + return 0; + + return (int)(system_memory >> 20); + } + case PIPE_CAP_UMA: + return 0; + case PIPE_CAP_CLIP_HALFZ: + return 1; + case PIPE_CAP_VERTEXID_NOBASE: + return 0; + case PIPE_CAP_POLYGON_OFFSET_CLAMP: + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + return 1; + case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: + case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + case PIPE_CAP_DEPTH_BOUNDS_TEST: + return 0; + } + /* should only get here on unhandled cases */ + debug_printf("Unexpected PIPE_CAP %d query\n", param); + return 0; +} + +static int +llvmpipe_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_shader_cap param) +{ + switch(shader) + { + case PIPE_SHADER_FRAGMENT: + switch (param) { + default: + return gallivm_get_shader_param(param); + } + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_GEOMETRY: + switch (param) { + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + /* At this time, the draw module and llvmpipe driver only + * support vertex shader texture lookups when LLVM is enabled in + * the draw module. 
+ */ + if (debug_get_bool_option("DRAW_USE_LLVM", TRUE)) + return PIPE_MAX_SAMPLERS; + else + return 0; + case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: + if (debug_get_bool_option("DRAW_USE_LLVM", TRUE)) + return PIPE_MAX_SHADER_SAMPLER_VIEWS; + else + return 0; + default: + return draw_get_shader_param(shader, param); + } + default: + return 0; + } +} + +static float +llvmpipe_get_paramf(struct pipe_screen *screen, enum pipe_capf param) +{ + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + /* fall-through */ + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + return 255.0; /* arbitrary */ + case PIPE_CAPF_MAX_POINT_WIDTH: + /* fall-through */ + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + return 255.0; /* arbitrary */ + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0; /* not actually signficant at this time */ + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 16.0; /* arbitrary */ + case PIPE_CAPF_GUARD_BAND_LEFT: + case PIPE_CAPF_GUARD_BAND_TOP: + case PIPE_CAPF_GUARD_BAND_RIGHT: + case PIPE_CAPF_GUARD_BAND_BOTTOM: + return 0.0; + } + /* should only get here on unhandled cases */ + debug_printf("Unexpected PIPE_CAP %d query\n", param); + return 0.0; +} + + +/** + * Query format support for creating a texture, drawing surface, etc. 
+ * \param format the format to test + * \param type one of PIPE_TEXTURE, PIPE_SURFACE + */ +static boolean +llvmpipe_is_format_supported( struct pipe_screen *_screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned bind) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(_screen); + struct sw_winsys *winsys = screen->winsys; + const struct util_format_description *format_desc; + + format_desc = util_format_description(format); + if (!format_desc) + return FALSE; + + assert(target == PIPE_BUFFER || + target == PIPE_TEXTURE_1D || + target == PIPE_TEXTURE_1D_ARRAY || + target == PIPE_TEXTURE_2D || + target == PIPE_TEXTURE_2D_ARRAY || + target == PIPE_TEXTURE_RECT || + target == PIPE_TEXTURE_3D || + target == PIPE_TEXTURE_CUBE || + target == PIPE_TEXTURE_CUBE_ARRAY); + + if (sample_count > 1) + return FALSE; + + if (bind & PIPE_BIND_RENDER_TARGET) { + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + /* this is a lie actually other formats COULD exist where we would fail */ + if (format_desc->nr_channels < 3) + return FALSE; + } + else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB) + return FALSE; + + if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN && + format != PIPE_FORMAT_R11G11B10_FLOAT) + return FALSE; + + assert(format_desc->block.width == 1); + assert(format_desc->block.height == 1); + + if (format_desc->is_mixed) + return FALSE; + + if (!format_desc->is_array && !format_desc->is_bitmask && + format != PIPE_FORMAT_R11G11B10_FLOAT) + return FALSE; + + /* + * XXX refuse formats known to crash in generate_unswizzled_blend(). + * These include all 3-channel 24bit RGB8 variants, plus 48bit + * (except those using floats) 3-channel RGB16 variants (the latter + * seems to be more of a llvm bug though). + * The mesa state tracker only seems to use these for SINT/UINT formats. 
+ */ + if (format_desc->is_array && format_desc->nr_channels == 3) { + if (format_desc->block.bits == 24 || (format_desc->block.bits == 48 && + !util_format_is_float(format))) { + return FALSE; + } + } + } + + if (bind & PIPE_BIND_DISPLAY_TARGET) { + if(!winsys->is_displaytarget_format_supported(winsys, bind, format)) + return FALSE; + } + + if (bind & PIPE_BIND_DEPTH_STENCIL) { + if (format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) + return FALSE; + + if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS) + return FALSE; + + /* TODO: Support stencil-only formats */ + if (format_desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) { + return FALSE; + } + } + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC) { + /* Software decoding is not hooked up. */ + return FALSE; + } + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_ETC && + format != PIPE_FORMAT_ETC1_RGB8) + return FALSE; + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + return util_format_s3tc_enabled; + } + + /* + * Everything can be supported by u_format + * (those without fetch_rgba_float might be not but shouldn't hit that) + */ + + return TRUE; +} + + + + +static void +llvmpipe_flush_frontbuffer(struct pipe_screen *_screen, + struct pipe_resource *resource, + unsigned level, unsigned layer, + void *context_private, + struct pipe_box *sub_box) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(_screen); + struct sw_winsys *winsys = screen->winsys; + struct llvmpipe_resource *texture = llvmpipe_resource(resource); + + assert(texture->dt); + if (texture->dt) + winsys->displaytarget_display(winsys, texture->dt, context_private, sub_box); +} + +static void +llvmpipe_destroy_screen( struct pipe_screen *_screen ) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(_screen); + struct sw_winsys *winsys = screen->winsys; + + if (screen->rast) + lp_rast_destroy(screen->rast); + + lp_jit_screen_cleanup(screen); + + if(winsys->destroy) + winsys->destroy(winsys); + + 
pipe_mutex_destroy(screen->rast_mutex); + + FREE(screen); +} + + + + +/** + * Fence reference counting. + */ +static void +llvmpipe_fence_reference(struct pipe_screen *screen, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence) +{ + struct lp_fence **old = (struct lp_fence **) ptr; + struct lp_fence *f = (struct lp_fence *) fence; + + lp_fence_reference(old, f); +} + + +/** + * Wait for the fence to finish. + * NOTE(review): only timeout == 0 acts as a non-blocking poll; any nonzero + * timeout blocks until the fence signals -- the actual duration is ignored. + */ +static boolean +llvmpipe_fence_finish(struct pipe_screen *screen, + struct pipe_fence_handle *fence_handle, + uint64_t timeout) +{ + struct lp_fence *f = (struct lp_fence *) fence_handle; + + if (!timeout) + return lp_fence_signalled(f); + + lp_fence_wait(f); + return TRUE; +} + +static uint64_t +llvmpipe_get_timestamp(struct pipe_screen *_screen) +{ + return os_time_get_nano(); +} + +/** + * Create a new pipe_screen object + * Note: we're not presently subclassing pipe_screen (no llvmpipe_screen). + */ +struct pipe_screen * +llvmpipe_create_screen(struct sw_winsys *winsys) +{ + struct llvmpipe_screen *screen; + + util_cpu_detect(); + +#ifdef DEBUG + LP_DEBUG = debug_get_flags_option("LP_DEBUG", lp_debug_flags, 0 ); +#endif + + LP_PERF = debug_get_flags_option("LP_PERF", lp_perf_flags, 0 ); + + screen = CALLOC_STRUCT(llvmpipe_screen); + if (!screen) + return NULL; + + if (!lp_jit_screen_init(screen)) { + FREE(screen); + return NULL; + } + + screen->winsys = winsys; + + screen->base.destroy = llvmpipe_destroy_screen; + + screen->base.get_name = llvmpipe_get_name; + screen->base.get_vendor = llvmpipe_get_vendor; + screen->base.get_device_vendor = llvmpipe_get_vendor; // TODO should be the CPU vendor + screen->base.get_param = llvmpipe_get_param; + screen->base.get_shader_param = llvmpipe_get_shader_param; + screen->base.get_paramf = llvmpipe_get_paramf; + screen->base.is_format_supported = llvmpipe_is_format_supported; + + screen->base.context_create = llvmpipe_create_context; + screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer; + 
screen->base.fence_reference = llvmpipe_fence_reference; + screen->base.fence_finish = llvmpipe_fence_finish; + + screen->base.get_timestamp = llvmpipe_get_timestamp; + + llvmpipe_init_screen_resource_funcs(&screen->base); + + screen->num_threads = util_cpu_caps.nr_cpus > 1 ? util_cpu_caps.nr_cpus : 0; +#ifdef PIPE_SUBSYSTEM_EMBEDDED + screen->num_threads = 0; +#endif + screen->num_threads = debug_get_num_option("LP_NUM_THREADS", screen->num_threads); + screen->num_threads = MIN2(screen->num_threads, LP_MAX_THREADS); + + screen->rast = lp_rast_create(screen->num_threads); + if (!screen->rast) { + lp_jit_screen_cleanup(screen); + FREE(screen); + return NULL; + } + pipe_mutex_init(screen->rast_mutex); + + util_format_s3tc_init(); + + return &screen->base; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_screen.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_screen.h new file mode 100644 index 000000000..00bf20c8c --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_screen.h @@ -0,0 +1,73 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @author Jose Fonseca <jfonseca@vmware.com> + * @author Keith Whitwell <keithw@vmware.com> + */ + +#ifndef LP_SCREEN_H +#define LP_SCREEN_H + +#include "pipe/p_screen.h" +#include "pipe/p_defines.h" +#include "os/os_thread.h" +#include "gallivm/lp_bld.h" + + +struct sw_winsys; + + +struct llvmpipe_screen +{ + struct pipe_screen base; + + struct sw_winsys *winsys; + + unsigned num_threads; + + /* Increments whenever textures are modified. Contexts can track this. + */ + unsigned timestamp; + + struct lp_rasterizer *rast; + pipe_mutex rast_mutex; +}; + + + + +static inline struct llvmpipe_screen * +llvmpipe_screen( struct pipe_screen *pipe ) +{ + return (struct llvmpipe_screen *)pipe; +} + + + +#endif /* LP_SCREEN_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup.c new file mode 100644 index 000000000..4c8167a9e --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup.c @@ -0,0 +1,1493 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Tiling engine. + * + * Builds per-tile display lists and executes them on calls to + * lp_setup_flush(). 
+ */ + +#include <limits.h> + +#include "pipe/p_defines.h" +#include "util/u_framebuffer.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" +#include "draw/draw_pipe.h" +#include "os/os_time.h" +#include "lp_context.h" +#include "lp_memory.h" +#include "lp_scene.h" +#include "lp_texture.h" +#include "lp_debug.h" +#include "lp_fence.h" +#include "lp_query.h" +#include "lp_rast.h" +#include "lp_setup_context.h" +#include "lp_screen.h" +#include "lp_state.h" +#include "state_tracker/sw_winsys.h" + +#include "draw/draw_context.h" +#include "draw/draw_vbuf.h" + + +static boolean set_scene_state( struct lp_setup_context *, enum setup_state, + const char *reason); +static boolean try_update_scene_state( struct lp_setup_context *setup ); + + +static void +lp_setup_get_empty_scene(struct lp_setup_context *setup) +{ + assert(setup->scene == NULL); + + setup->scene_idx++; + setup->scene_idx %= Elements(setup->scenes); + + setup->scene = setup->scenes[setup->scene_idx]; + + if (setup->scene->fence) { + if (LP_DEBUG & DEBUG_SETUP) + debug_printf("%s: wait for scene %d\n", + __FUNCTION__, setup->scene->fence->id); + + lp_fence_wait(setup->scene->fence); + } + + lp_scene_begin_binning(setup->scene, &setup->fb, setup->rasterizer_discard); + +} + + +static void +first_triangle( struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) +{ + assert(setup->state == SETUP_ACTIVE); + lp_setup_choose_triangle( setup ); + setup->triangle( setup, v0, v1, v2 ); +} + +static void +first_line( struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4]) +{ + assert(setup->state == SETUP_ACTIVE); + lp_setup_choose_line( setup ); + setup->line( setup, v0, v1 ); +} + +static void +first_point( struct lp_setup_context *setup, + const float (*v0)[4]) +{ + assert(setup->state == SETUP_ACTIVE); + lp_setup_choose_point( setup ); + setup->point( setup, v0 ); +} + +void lp_setup_reset( struct 
lp_setup_context *setup ) +{ + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + /* Reset derived state */ + for (i = 0; i < Elements(setup->constants); ++i) { + setup->constants[i].stored_size = 0; + setup->constants[i].stored_data = NULL; + } + setup->fs.stored = NULL; + setup->dirty = ~0; + + /* no current bin */ + setup->scene = NULL; + + /* Reset some state: + */ + memset(&setup->clear, 0, sizeof setup->clear); + + /* Have an explicit "start-binning" call and get rid of this + * pointer twiddling? + */ + setup->line = first_line; + setup->point = first_point; + setup->triangle = first_triangle; +} + + +/** Rasterize all scene's bins */ +static void +lp_setup_rasterize_scene( struct lp_setup_context *setup ) +{ + struct lp_scene *scene = setup->scene; + struct llvmpipe_screen *screen = llvmpipe_screen(scene->pipe->screen); + + scene->num_active_queries = setup->active_binned_queries; + memcpy(scene->active_queries, setup->active_queries, + scene->num_active_queries * sizeof(scene->active_queries[0])); + + lp_scene_end_binning(scene); + + lp_fence_reference(&setup->last_fence, scene->fence); + + if (setup->last_fence) + setup->last_fence->issued = TRUE; + + pipe_mutex_lock(screen->rast_mutex); + + /* FIXME: We enqueue the scene then wait on the rasterizer to finish. + * This means we never actually run any vertex stuff in parallel to + * rasterization (not in the same context at least) which is what the + * multiple scenes per setup is about - when we get a new empty scene + * any old one is already empty again because we waited here for + * raster tasks to be finished. Ideally, we shouldn't need to wait here + * and rely on fences elsewhere when waiting is necessary. + * Certainly, lp_scene_end_rasterization() would need to be deferred too + * and there's probably other bits why this doesn't actually work. 
+ */ + lp_rast_queue_scene(screen->rast, scene); + lp_rast_finish(screen->rast); + pipe_mutex_unlock(screen->rast_mutex); + + lp_scene_end_rasterization(setup->scene); + lp_setup_reset( setup ); + + LP_DBG(DEBUG_SETUP, "%s done \n", __FUNCTION__); +} + + + +static boolean +begin_binning( struct lp_setup_context *setup ) +{ + struct lp_scene *scene = setup->scene; + boolean need_zsload = FALSE; + boolean ok; + + assert(scene); + assert(scene->fence == NULL); + + /* Always create a fence: + */ + scene->fence = lp_fence_create(MAX2(1, setup->num_threads)); + if (!scene->fence) + return FALSE; + + ok = try_update_scene_state(setup); + if (!ok) + return FALSE; + + if (setup->fb.zsbuf && + ((setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) && + util_format_is_depth_and_stencil(setup->fb.zsbuf->format)) + need_zsload = TRUE; + + LP_DBG(DEBUG_SETUP, "%s color clear bufs: %x depth: %s\n", __FUNCTION__, + setup->clear.flags >> 2, + need_zsload ? "clear": "load"); + + if (setup->clear.flags & PIPE_CLEAR_COLOR) { + unsigned cbuf; + for (cbuf = 0; cbuf < setup->fb.nr_cbufs; cbuf++) { + assert(PIPE_CLEAR_COLOR0 == 1 << 2); + if (setup->clear.flags & (1 << (2 + cbuf))) { + union lp_rast_cmd_arg clearrb_arg; + struct lp_rast_clear_rb *cc_scene = + (struct lp_rast_clear_rb *) + lp_scene_alloc(scene, sizeof(struct lp_rast_clear_rb)); + + if (!cc_scene) { + return FALSE; + } + + cc_scene->cbuf = cbuf; + cc_scene->color_val = setup->clear.color_val[cbuf]; + clearrb_arg.clear_rb = cc_scene; + + if (!lp_scene_bin_everywhere(scene, + LP_RAST_OP_CLEAR_COLOR, + clearrb_arg)) + return FALSE; + } + } + } + + if (setup->fb.zsbuf) { + if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) { + ok = lp_scene_bin_everywhere( scene, + LP_RAST_OP_CLEAR_ZSTENCIL, + lp_rast_arg_clearzs( + setup->clear.zsvalue, + setup->clear.zsmask)); + if (!ok) + return FALSE; + } + } + + setup->clear.flags = 0; + setup->clear.zsmask = 0; + setup->clear.zsvalue = 0; + + scene->had_queries = 
!!setup->active_binned_queries; + + LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__); + return TRUE; +} + + +/* This basically bins and then flushes any outstanding full-screen + * clears. + * + * TODO: fast path for fullscreen clears and no triangles. + */ +static boolean +execute_clears( struct lp_setup_context *setup ) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + return begin_binning( setup ); +} + +/* Debug names for enum setup_state, indexed by state value. + * NOTE(review): file-scope table with external linkage and a very generic + * name; presumably only used in this file -- consider making it static. + */ +const char *states[] = { + "FLUSHED", + "CLEARED", + "ACTIVE " +}; + + +static boolean +set_scene_state( struct lp_setup_context *setup, + enum setup_state new_state, + const char *reason) +{ + unsigned old_state = setup->state; + + if (old_state == new_state) + return TRUE; + + if (LP_DEBUG & DEBUG_SCENE) { + debug_printf("%s old %s new %s%s%s\n", + __FUNCTION__, + states[old_state], + states[new_state], + (new_state == SETUP_FLUSHED) ? ": " : "", + (new_state == SETUP_FLUSHED) ? reason : ""); + + if (new_state == SETUP_FLUSHED && setup->scene) + lp_debug_draw_bins_by_cmd_length(setup->scene); + } + + /* wait for a free/empty scene + */ + if (old_state == SETUP_FLUSHED) + lp_setup_get_empty_scene(setup); + + switch (new_state) { + case SETUP_CLEARED: + break; + + case SETUP_ACTIVE: + if (!begin_binning( setup )) + goto fail; + break; + + case SETUP_FLUSHED: + if (old_state == SETUP_CLEARED) + if (!execute_clears( setup )) + goto fail; + + lp_setup_rasterize_scene( setup ); + assert(setup->scene == NULL); + break; + + default: + assert(0 && "invalid setup state mode"); + goto fail; + } + + setup->state = new_state; + return TRUE; + +fail: + if (setup->scene) { + lp_scene_end_rasterization(setup->scene); + setup->scene = NULL; + } + + setup->state = SETUP_FLUSHED; + lp_setup_reset( setup ); + return FALSE; +} + + +void +lp_setup_flush( struct lp_setup_context *setup, + struct pipe_fence_handle **fence, + const char *reason) +{ + set_scene_state( setup, SETUP_FLUSHED, reason ); + + if (fence) { + lp_fence_reference((struct lp_fence **)fence, setup->last_fence); + 
} +} + + +void +lp_setup_bind_framebuffer( struct lp_setup_context *setup, + const struct pipe_framebuffer_state *fb ) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + /* Flush any old scene. + */ + set_scene_state( setup, SETUP_FLUSHED, __FUNCTION__ ); + + /* + * Ensure the old scene is not reused. + */ + assert(!setup->scene); + + /* Set new state. This will be picked up later when we next need a + * scene. + */ + util_copy_framebuffer_state(&setup->fb, fb); + setup->framebuffer.x0 = 0; + setup->framebuffer.y0 = 0; + setup->framebuffer.x1 = fb->width-1; + setup->framebuffer.y1 = fb->height-1; + setup->dirty |= LP_SETUP_NEW_SCISSOR; +} + + +/* + * Try to clear one color buffer of the attached fb, either by binning a clear + * command or queuing up the clear for later (when binning is started). + */ +static boolean +lp_setup_try_clear_color_buffer(struct lp_setup_context *setup, + const union pipe_color_union *color, + unsigned cbuf) +{ + union lp_rast_cmd_arg clearrb_arg; + union util_color uc; + enum pipe_format format = setup->fb.cbufs[cbuf]->format; + + LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state); + + if (util_format_is_pure_integer(format)) { + /* + * We expect int/uint clear values here, though some APIs + * might disagree (but in any case util_pack_color() + * couldn't handle it)... + */ + if (util_format_is_pure_sint(format)) { + util_format_write_4i(format, color->i, 0, &uc, 0, 0, 0, 1, 1); + } + else { + assert(util_format_is_pure_uint(format)); + util_format_write_4ui(format, color->ui, 0, &uc, 0, 0, 0, 1, 1); + } + } + else { + util_pack_color(color->f, format, &uc); + } + + if (setup->state == SETUP_ACTIVE) { + struct lp_scene *scene = setup->scene; + + /* Add the clear to existing scene. In the unusual case where + * both color and depth-stencil are being cleared when there's + * already been some rendering, we could discard the currently + * binned scene and start again, but I don't see that as being + * a common usage. 
+ */ + struct lp_rast_clear_rb *cc_scene = + (struct lp_rast_clear_rb *) + lp_scene_alloc_aligned(scene, sizeof(struct lp_rast_clear_rb), 8); + + if (!cc_scene) { + return FALSE; + } + + cc_scene->cbuf = cbuf; + cc_scene->color_val = uc; + clearrb_arg.clear_rb = cc_scene; + + if (!lp_scene_bin_everywhere(scene, + LP_RAST_OP_CLEAR_COLOR, + clearrb_arg)) + return FALSE; + } + else { + /* Put ourselves into the 'pre-clear' state, specifically to try + * and accumulate multiple clears to color and depth_stencil + * buffers which the app or state-tracker might issue + * separately. + */ + set_scene_state( setup, SETUP_CLEARED, __FUNCTION__ ); + + assert(PIPE_CLEAR_COLOR0 == (1 << 2)); + setup->clear.flags |= 1 << (cbuf + 2); + setup->clear.color_val[cbuf] = uc; + } + + return TRUE; +} + +static boolean +lp_setup_try_clear_zs(struct lp_setup_context *setup, + double depth, + unsigned stencil, + unsigned flags) +{ + uint64_t zsmask = 0; + uint64_t zsvalue = 0; + uint32_t zmask32; + uint8_t smask8; + + LP_DBG(DEBUG_SETUP, "%s state %d\n", __FUNCTION__, setup->state); + + zmask32 = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0; + smask8 = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0; + + zsvalue = util_pack64_z_stencil(setup->fb.zsbuf->format, + depth, + stencil); + + zsmask = util_pack64_mask_z_stencil(setup->fb.zsbuf->format, + zmask32, + smask8); + + zsvalue &= zsmask; + + if (setup->state == SETUP_ACTIVE) { + struct lp_scene *scene = setup->scene; + + /* Add the clear to existing scene. In the unusual case where + * both color and depth-stencil are being cleared when there's + * already been some rendering, we could discard the currently + * binned scene and start again, but I don't see that as being + * a common usage. 
+ */ + if (!lp_scene_bin_everywhere(scene, + LP_RAST_OP_CLEAR_ZSTENCIL, + lp_rast_arg_clearzs(zsvalue, zsmask))) + return FALSE; + } + else { + /* Put ourselves into the 'pre-clear' state, specifically to try + * and accumulate multiple clears to color and depth_stencil + * buffers which the app or state-tracker might issue + * separately. + */ + set_scene_state( setup, SETUP_CLEARED, __FUNCTION__ ); + + setup->clear.flags |= flags; + + setup->clear.zsmask |= zsmask; + setup->clear.zsvalue = + (setup->clear.zsvalue & ~zsmask) | (zsvalue & zsmask); + } + + return TRUE; +} + +void +lp_setup_clear( struct lp_setup_context *setup, + const union pipe_color_union *color, + double depth, + unsigned stencil, + unsigned flags ) +{ + unsigned i; + + /* + * Note any of these (max 9) clears could fail (but at most there should + * be just one failure!). This avoids doing the previous succeeded + * clears again (we still clear tiles twice if a clear command succeeded + * partially for one buffer). + */ + if (flags & PIPE_CLEAR_DEPTHSTENCIL) { + unsigned flagszs = flags & PIPE_CLEAR_DEPTHSTENCIL; + if (!lp_setup_try_clear_zs(setup, depth, stencil, flagszs)) { + lp_setup_flush(setup, NULL, __FUNCTION__); + + if (!lp_setup_try_clear_zs(setup, depth, stencil, flagszs)) + assert(0); + } + } + + if (flags & PIPE_CLEAR_COLOR) { + assert(PIPE_CLEAR_COLOR0 == (1 << 2)); + for (i = 0; i < setup->fb.nr_cbufs; i++) { + if ((flags & (1 << (2 + i))) && setup->fb.cbufs[i]) { + if (!lp_setup_try_clear_color_buffer(setup, color, i)) { + lp_setup_flush(setup, NULL, __FUNCTION__); + + if (!lp_setup_try_clear_color_buffer(setup, color, i)) + assert(0); + } + } + } + } +} + + + +void +lp_setup_set_triangle_state( struct lp_setup_context *setup, + unsigned cull_mode, + boolean ccw_is_frontface, + boolean scissor, + boolean half_pixel_center, + boolean bottom_edge_rule) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + setup->ccw_is_frontface = ccw_is_frontface; + setup->cullmode = cull_mode; + 
setup->triangle = first_triangle; + setup->pixel_offset = half_pixel_center ? 0.5f : 0.0f; + setup->bottom_edge_rule = bottom_edge_rule; + + if (setup->scissor_test != scissor) { + setup->dirty |= LP_SETUP_NEW_SCISSOR; + setup->scissor_test = scissor; + } +} + +void +lp_setup_set_line_state( struct lp_setup_context *setup, + float line_width) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + setup->line_width = line_width; +} + +void +lp_setup_set_point_state( struct lp_setup_context *setup, + float point_size, + boolean point_size_per_vertex, + uint sprite_coord_enable, + uint sprite_coord_origin) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + setup->point_size = point_size; + setup->sprite_coord_enable = sprite_coord_enable; + setup->sprite_coord_origin = sprite_coord_origin; + setup->point_size_per_vertex = point_size_per_vertex; +} + +void +lp_setup_set_setup_variant( struct lp_setup_context *setup, + const struct lp_setup_variant *variant) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + setup->setup.variant = variant; +} + +void +lp_setup_set_fs_variant( struct lp_setup_context *setup, + struct lp_fragment_shader_variant *variant) +{ + LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, + variant); + /* FIXME: reference count */ + + setup->fs.current.variant = variant; + setup->dirty |= LP_SETUP_NEW_FS; +} + +void +lp_setup_set_fs_constants(struct lp_setup_context *setup, + unsigned num, + struct pipe_constant_buffer *buffers) +{ + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s %p\n", __FUNCTION__, (void *) buffers); + + assert(num <= Elements(setup->constants)); + + for (i = 0; i < num; ++i) { + util_copy_constant_buffer(&setup->constants[i].current, &buffers[i]); + } + for (; i < Elements(setup->constants); i++) { + util_copy_constant_buffer(&setup->constants[i].current, NULL); + } + setup->dirty |= LP_SETUP_NEW_CONSTANTS; +} + + +void +lp_setup_set_alpha_ref_value( struct lp_setup_context *setup, + float alpha_ref_value ) +{ + LP_DBG(DEBUG_SETUP, "%s %f\n", 
__FUNCTION__, alpha_ref_value); + + if(setup->fs.current.jit_context.alpha_ref_value != alpha_ref_value) { + setup->fs.current.jit_context.alpha_ref_value = alpha_ref_value; + setup->dirty |= LP_SETUP_NEW_FS; + } +} + +void +lp_setup_set_stencil_ref_values( struct lp_setup_context *setup, + const ubyte refs[2] ) +{ + LP_DBG(DEBUG_SETUP, "%s %d %d\n", __FUNCTION__, refs[0], refs[1]); + + if (setup->fs.current.jit_context.stencil_ref_front != refs[0] || + setup->fs.current.jit_context.stencil_ref_back != refs[1]) { + setup->fs.current.jit_context.stencil_ref_front = refs[0]; + setup->fs.current.jit_context.stencil_ref_back = refs[1]; + setup->dirty |= LP_SETUP_NEW_FS; + } +} + +void +lp_setup_set_blend_color( struct lp_setup_context *setup, + const struct pipe_blend_color *blend_color ) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + assert(blend_color); + + if(memcmp(&setup->blend_color.current, blend_color, sizeof *blend_color) != 0) { + memcpy(&setup->blend_color.current, blend_color, sizeof *blend_color); + setup->dirty |= LP_SETUP_NEW_BLEND_COLOR; + } +} + + +void +lp_setup_set_scissors( struct lp_setup_context *setup, + const struct pipe_scissor_state *scissors ) +{ + unsigned i; + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + assert(scissors); + + for (i = 0; i < PIPE_MAX_VIEWPORTS; ++i) { + setup->scissors[i].x0 = scissors[i].minx; + setup->scissors[i].x1 = scissors[i].maxx-1; + setup->scissors[i].y0 = scissors[i].miny; + setup->scissors[i].y1 = scissors[i].maxy-1; + } + setup->dirty |= LP_SETUP_NEW_SCISSOR; +} + + +void +lp_setup_set_flatshade_first( struct lp_setup_context *setup, + boolean flatshade_first ) +{ + setup->flatshade_first = flatshade_first; +} + +void +lp_setup_set_rasterizer_discard( struct lp_setup_context *setup, + boolean rasterizer_discard ) +{ + if (setup->rasterizer_discard != rasterizer_discard) { + setup->rasterizer_discard = rasterizer_discard; + set_scene_state( setup, SETUP_FLUSHED, __FUNCTION__ ); + } +} + +void 
+lp_setup_set_vertex_info( struct lp_setup_context *setup, + struct vertex_info *vertex_info ) +{ + /* XXX: just silently holding onto the pointer: + */ + setup->vertex_info = vertex_info; +} + + +/** + * Called during state validation when LP_NEW_VIEWPORT is set. + */ +void +lp_setup_set_viewports(struct lp_setup_context *setup, + unsigned num_viewports, + const struct pipe_viewport_state *viewports) +{ + struct llvmpipe_context *lp = llvmpipe_context(setup->pipe); + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + assert(num_viewports <= PIPE_MAX_VIEWPORTS); + assert(viewports); + + /* + * For use in lp_state_fs.c, propagate the viewport values for all viewports. + */ + for (i = 0; i < num_viewports; i++) { + float min_depth; + float max_depth; + + if (lp->rasterizer->clip_halfz == 0) { + float half_depth = viewports[i].scale[2]; + min_depth = viewports[i].translate[2] - half_depth; + max_depth = min_depth + half_depth * 2.0f; + } else { + min_depth = viewports[i].translate[2]; + max_depth = min_depth + viewports[i].scale[2]; + } + + if (setup->viewports[i].min_depth != min_depth || + setup->viewports[i].max_depth != max_depth) { + setup->viewports[i].min_depth = min_depth; + setup->viewports[i].max_depth = max_depth; + setup->dirty |= LP_SETUP_NEW_VIEWPORTS; + } + } +} + + +/** + * Called during state validation when LP_NEW_SAMPLER_VIEW is set. + */ +void +lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, + unsigned num, + struct pipe_sampler_view **views) +{ + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); + + for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + struct pipe_sampler_view *view = i < num ? 
views[i] : NULL; + + if (view) { + struct pipe_resource *res = view->texture; + struct llvmpipe_resource *lp_tex = llvmpipe_resource(res); + struct lp_jit_texture *jit_tex; + jit_tex = &setup->fs.current.jit_context.textures[i]; + + /* We're referencing the texture's internal data, so save a + * reference to it. + */ + pipe_resource_reference(&setup->fs.current_tex[i], res); + + if (!lp_tex->dt) { + /* regular texture - setup array of mipmap level offsets */ + int j; + unsigned first_level = 0; + unsigned last_level = 0; + + if (llvmpipe_resource_is_texture(res)) { + first_level = view->u.tex.first_level; + last_level = view->u.tex.last_level; + assert(first_level <= last_level); + assert(last_level <= res->last_level); + jit_tex->base = lp_tex->tex_data; + } + else { + jit_tex->base = lp_tex->data; + } + + if (LP_PERF & PERF_TEX_MEM) { + /* use dummy tile memory */ + jit_tex->base = lp_dummy_tile; + jit_tex->width = TILE_SIZE/8; + jit_tex->height = TILE_SIZE/8; + jit_tex->depth = 1; + jit_tex->first_level = 0; + jit_tex->last_level = 0; + jit_tex->mip_offsets[0] = 0; + jit_tex->row_stride[0] = 0; + jit_tex->img_stride[0] = 0; + } + else { + jit_tex->width = res->width0; + jit_tex->height = res->height0; + jit_tex->depth = res->depth0; + jit_tex->first_level = first_level; + jit_tex->last_level = last_level; + + if (llvmpipe_resource_is_texture(res)) { + for (j = first_level; j <= last_level; j++) { + jit_tex->mip_offsets[j] = lp_tex->mip_offsets[j]; + jit_tex->row_stride[j] = lp_tex->row_stride[j]; + jit_tex->img_stride[j] = lp_tex->img_stride[j]; + } + + if (view->target == PIPE_TEXTURE_1D_ARRAY || + view->target == PIPE_TEXTURE_2D_ARRAY || + view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { + /* + * For array textures, we don't have first_layer, instead + * adjust last_layer (stored as depth) plus the mip level offsets + * (as we have mip-first layout can't just adjust base ptr). + * XXX For mip levels, could do something similar. 
+ */ + jit_tex->depth = view->u.tex.last_layer - view->u.tex.first_layer + 1; + for (j = first_level; j <= last_level; j++) { + jit_tex->mip_offsets[j] += view->u.tex.first_layer * + lp_tex->img_stride[j]; + } + if (view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { + assert(jit_tex->depth % 6 == 0); + } + assert(view->u.tex.first_layer <= view->u.tex.last_layer); + assert(view->u.tex.last_layer < res->array_size); + } + } + else { + /* + * For buffers, we don't have first_element, instead adjust + * last_element (stored as width) plus the base pointer. + */ + unsigned view_blocksize = util_format_get_blocksize(view->format); + /* probably don't really need to fill that out */ + jit_tex->mip_offsets[0] = 0; + jit_tex->row_stride[0] = 0; + jit_tex->img_stride[0] = 0; + + /* everything specified in number of elements here. */ + jit_tex->width = view->u.buf.last_element - view->u.buf.first_element + 1; + jit_tex->base = (uint8_t *)jit_tex->base + view->u.buf.first_element * + view_blocksize; + /* XXX Unsure if we need to sanitize parameters? */ + assert(view->u.buf.first_element <= view->u.buf.last_element); + assert(view->u.buf.last_element * view_blocksize < res->width0); + } + } + } + else { + /* display target texture/surface */ + /* + * XXX: Where should this be unmapped? + */ + struct llvmpipe_screen *screen = llvmpipe_screen(res->screen); + struct sw_winsys *winsys = screen->winsys; + jit_tex->base = winsys->displaytarget_map(winsys, lp_tex->dt, + PIPE_TRANSFER_READ); + jit_tex->row_stride[0] = lp_tex->row_stride[0]; + jit_tex->img_stride[0] = lp_tex->img_stride[0]; + jit_tex->mip_offsets[0] = 0; + jit_tex->width = res->width0; + jit_tex->height = res->height0; + jit_tex->depth = res->depth0; + jit_tex->first_level = jit_tex->last_level = 0; + assert(jit_tex->base); + } + } + } + + setup->dirty |= LP_SETUP_NEW_FS; +} + + +/** + * Called during state validation when LP_NEW_SAMPLER is set. 
+ */ +void +lp_setup_set_fragment_sampler_state(struct lp_setup_context *setup, + unsigned num, + struct pipe_sampler_state **samplers) +{ + unsigned i; + + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + assert(num <= PIPE_MAX_SAMPLERS); + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + const struct pipe_sampler_state *sampler = i < num ? samplers[i] : NULL; + + if (sampler) { + struct lp_jit_sampler *jit_sam; + jit_sam = &setup->fs.current.jit_context.samplers[i]; + + jit_sam->min_lod = sampler->min_lod; + jit_sam->max_lod = sampler->max_lod; + jit_sam->lod_bias = sampler->lod_bias; + COPY_4V(jit_sam->border_color, sampler->border_color.f); + } + } + + setup->dirty |= LP_SETUP_NEW_FS; +} + + +/** + * Is the given texture referenced by any scene? + * Note: we have to check all scenes including any scenes currently + * being rendered and the current scene being built. + */ +unsigned +lp_setup_is_resource_referenced( const struct lp_setup_context *setup, + const struct pipe_resource *texture ) +{ + unsigned i; + + /* check the render targets */ + for (i = 0; i < setup->fb.nr_cbufs; i++) { + if (setup->fb.cbufs[i] && setup->fb.cbufs[i]->texture == texture) + return LP_REFERENCED_FOR_READ | LP_REFERENCED_FOR_WRITE; + } + if (setup->fb.zsbuf && setup->fb.zsbuf->texture == texture) { + return LP_REFERENCED_FOR_READ | LP_REFERENCED_FOR_WRITE; + } + + /* check textures referenced by the scene */ + for (i = 0; i < Elements(setup->scenes); i++) { + if (lp_scene_is_resource_referenced(setup->scenes[i], texture)) { + return LP_REFERENCED_FOR_READ; + } + } + + return LP_UNREFERENCED; +} + + +/** + * Called by vbuf code when we're about to draw something. + * + * This function stores all dirty state in the current scene's display list + * memory, via lp_scene_alloc(). We can not pass pointers of mutable state to + * the JIT functions, as the JIT functions will be called later on, most likely + * on a different thread. 
+ * + * When processing dirty state it is imperative that we don't refer to any + * pointers previously allocated with lp_scene_alloc() in this function (or any + * function) as they may belong to a scene freed since then. + */ +static boolean +try_update_scene_state( struct lp_setup_context *setup ) +{ + static const float fake_const_buf[4]; + boolean new_scene = (setup->fs.stored == NULL); + struct lp_scene *scene = setup->scene; + unsigned i; + + assert(scene); + + if (setup->dirty & LP_SETUP_NEW_VIEWPORTS) { + /* + * Record new depth range state for changes due to viewport updates. + * + * TODO: Collapse the existing viewport and depth range information + * into one structure, for access by JIT. + */ + struct lp_jit_viewport *stored; + + stored = (struct lp_jit_viewport *) + lp_scene_alloc(scene, sizeof setup->viewports); + + if (!stored) { + assert(!new_scene); + return FALSE; + } + + memcpy(stored, setup->viewports, sizeof setup->viewports); + + setup->fs.current.jit_context.viewports = stored; + setup->dirty |= LP_SETUP_NEW_FS; + } + + if(setup->dirty & LP_SETUP_NEW_BLEND_COLOR) { + uint8_t *stored; + float* fstored; + unsigned i, j; + unsigned size; + + /* Alloc u8_blend_color (16 x i8) and f_blend_color (4 or 8 x f32) */ + size = 4 * 16 * sizeof(uint8_t); + size += (LP_MAX_VECTOR_LENGTH / 4) * sizeof(float); + stored = lp_scene_alloc_aligned(scene, size, LP_MIN_VECTOR_ALIGN); + + if (!stored) { + assert(!new_scene); + return FALSE; + } + + /* Store floating point colour */ + fstored = (float*)(stored + 4*16); + for (i = 0; i < (LP_MAX_VECTOR_LENGTH / 4); ++i) { + fstored[i] = setup->blend_color.current.color[i % 4]; + } + + /* smear each blend color component across 16 ubyte elements */ + for (i = 0; i < 4; ++i) { + uint8_t c = float_to_ubyte(setup->blend_color.current.color[i]); + for (j = 0; j < 16; ++j) + stored[i*16 + j] = c; + } + + setup->blend_color.stored = stored; + setup->fs.current.jit_context.u8_blend_color = stored; + 
setup->fs.current.jit_context.f_blend_color = fstored; + setup->dirty |= LP_SETUP_NEW_FS; + } + + if (setup->dirty & LP_SETUP_NEW_CONSTANTS) { + for (i = 0; i < Elements(setup->constants); ++i) { + struct pipe_resource *buffer = setup->constants[i].current.buffer; + const unsigned current_size = MIN2(setup->constants[i].current.buffer_size, + LP_MAX_TGSI_CONST_BUFFER_SIZE); + const ubyte *current_data = NULL; + int num_constants; + + STATIC_ASSERT(DATA_BLOCK_SIZE >= LP_MAX_TGSI_CONST_BUFFER_SIZE); + + if (buffer) { + /* resource buffer */ + current_data = (ubyte *) llvmpipe_resource_data(buffer); + } + else if (setup->constants[i].current.user_buffer) { + /* user-space buffer */ + current_data = (ubyte *) setup->constants[i].current.user_buffer; + } + + if (current_data) { + current_data += setup->constants[i].current.buffer_offset; + + /* TODO: copy only the actually used constants? */ + + if (setup->constants[i].stored_size != current_size || + !setup->constants[i].stored_data || + memcmp(setup->constants[i].stored_data, + current_data, + current_size) != 0) { + void *stored; + + stored = lp_scene_alloc(scene, current_size); + if (!stored) { + assert(!new_scene); + return FALSE; + } + + memcpy(stored, + current_data, + current_size); + setup->constants[i].stored_size = current_size; + setup->constants[i].stored_data = stored; + } + setup->fs.current.jit_context.constants[i] = + setup->constants[i].stored_data; + } + else { + setup->constants[i].stored_size = 0; + setup->constants[i].stored_data = NULL; + setup->fs.current.jit_context.constants[i] = fake_const_buf; + } + + num_constants = + setup->constants[i].stored_size / (sizeof(float) * 4); + setup->fs.current.jit_context.num_constants[i] = num_constants; + setup->dirty |= LP_SETUP_NEW_FS; + } + } + + + if (setup->dirty & LP_SETUP_NEW_FS) { + if (!setup->fs.stored || + memcmp(setup->fs.stored, + &setup->fs.current, + sizeof setup->fs.current) != 0) + { + struct lp_rast_state *stored; + + /* The fs state that's 
been stored in the scene is different from + * the new, current state. So allocate a new lp_rast_state object + * and append it to the bin's setup data buffer. + */ + stored = (struct lp_rast_state *) lp_scene_alloc(scene, sizeof *stored); + if (!stored) { + assert(!new_scene); + return FALSE; + } + + memcpy(stored, + &setup->fs.current, + sizeof setup->fs.current); + setup->fs.stored = stored; + + /* The scene now references the textures in the rasterization + * state record. Note that now. + */ + for (i = 0; i < Elements(setup->fs.current_tex); i++) { + if (setup->fs.current_tex[i]) { + if (!lp_scene_add_resource_reference(scene, + setup->fs.current_tex[i], + new_scene)) { + assert(!new_scene); + return FALSE; + } + } + } + } + } + + if (setup->dirty & LP_SETUP_NEW_SCISSOR) { + unsigned i; + for (i = 0; i < PIPE_MAX_VIEWPORTS; ++i) { + setup->draw_regions[i] = setup->framebuffer; + if (setup->scissor_test) { + u_rect_possible_intersection(&setup->scissors[i], + &setup->draw_regions[i]); + } + } + } + + setup->dirty = 0; + + assert(setup->fs.stored); + return TRUE; +} + +boolean +lp_setup_update_state( struct lp_setup_context *setup, + boolean update_scene ) +{ + /* Some of the 'draw' pipeline stages may have changed some driver state. + * Make sure we've processed those state changes before anything else. + * + * XXX this is the only place where llvmpipe_context is used in the + * setup code. This may get refactored/changed... + */ + { + struct llvmpipe_context *lp = llvmpipe_context(setup->pipe); + if (lp->dirty) { + llvmpipe_update_derived(lp); + } + + if (lp->setup->dirty) { + llvmpipe_update_setup(lp); + } + + assert(setup->setup.variant); + + /* Will probably need to move this somewhere else, just need + * to know about vertex shader point size attribute. 
+ */ + setup->psize = lp->psize_slot; + setup->viewport_index_slot = lp->viewport_index_slot; + setup->layer_slot = lp->layer_slot; + setup->face_slot = lp->face_slot; + + assert(lp->dirty == 0); + + assert(lp->setup_variant.key.size == + setup->setup.variant->key.size); + + assert(memcmp(&lp->setup_variant.key, + &setup->setup.variant->key, + setup->setup.variant->key.size) == 0); + } + + if (update_scene && setup->state != SETUP_ACTIVE) { + if (!set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ )) + return FALSE; + } + + /* Only call into update_scene_state() if we already have a + * scene: + */ + if (update_scene && setup->scene) { + assert(setup->state == SETUP_ACTIVE); + + if (try_update_scene_state(setup)) + return TRUE; + + /* Update failed, try to restart the scene. + * + * Cannot call lp_setup_flush_and_restart() directly here + * because of potential recursion. + */ + if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__)) + return FALSE; + + if (!set_scene_state(setup, SETUP_ACTIVE, __FUNCTION__)) + return FALSE; + + if (!setup->scene) + return FALSE; + + return try_update_scene_state(setup); + } + + return TRUE; +} + + + +/* Only caller is lp_setup_vbuf_destroy() + */ +void +lp_setup_destroy( struct lp_setup_context *setup ) +{ + uint i; + + lp_setup_reset( setup ); + + util_unreference_framebuffer_state(&setup->fb); + + for (i = 0; i < Elements(setup->fs.current_tex); i++) { + pipe_resource_reference(&setup->fs.current_tex[i], NULL); + } + + for (i = 0; i < Elements(setup->constants); i++) { + pipe_resource_reference(&setup->constants[i].current.buffer, NULL); + } + + /* free the scenes in the 'empty' queue */ + for (i = 0; i < Elements(setup->scenes); i++) { + struct lp_scene *scene = setup->scenes[i]; + + if (scene->fence) + lp_fence_wait(scene->fence); + + lp_scene_destroy(scene); + } + + lp_fence_reference(&setup->last_fence, NULL); + + FREE( setup ); +} + + +/** + * Create a new primitive tiling engine. 
Plug it into the backend of + * the draw module. Currently also creates a rasterizer to use with + * it. + */ +struct lp_setup_context * +lp_setup_create( struct pipe_context *pipe, + struct draw_context *draw ) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen); + struct lp_setup_context *setup; + unsigned i; + + setup = CALLOC_STRUCT(lp_setup_context); + if (!setup) { + goto no_setup; + } + + lp_setup_init_vbuf(setup); + + /* Used only in update_state(): + */ + setup->pipe = pipe; + + + setup->num_threads = screen->num_threads; + setup->vbuf = draw_vbuf_stage(draw, &setup->base); + if (!setup->vbuf) { + goto no_vbuf; + } + + draw_set_rasterize_stage(draw, setup->vbuf); + draw_set_render(draw, &setup->base); + + /* create some empty scenes */ + for (i = 0; i < MAX_SCENES; i++) { + setup->scenes[i] = lp_scene_create( pipe ); + if (!setup->scenes[i]) { + goto no_scenes; + } + } + + setup->triangle = first_triangle; + setup->line = first_line; + setup->point = first_point; + + setup->dirty = ~0; + + return setup; + +no_scenes: + for (i = 0; i < MAX_SCENES; i++) { + if (setup->scenes[i]) { + lp_scene_destroy(setup->scenes[i]); + } + } + + setup->vbuf->destroy(setup->vbuf); +no_vbuf: + FREE(setup); +no_setup: + return NULL; +} + + +/** + * Put a BeginQuery command into all bins. 
+ */ +void +lp_setup_begin_query(struct lp_setup_context *setup, + struct llvmpipe_query *pq) +{ + + set_scene_state(setup, SETUP_ACTIVE, "begin_query"); + + if (!(pq->type == PIPE_QUERY_OCCLUSION_COUNTER || + pq->type == PIPE_QUERY_OCCLUSION_PREDICATE || + pq->type == PIPE_QUERY_PIPELINE_STATISTICS)) + return; + + /* init the query to its beginning state */ + assert(setup->active_binned_queries < LP_MAX_ACTIVE_BINNED_QUERIES); + /* exceeding list size so just ignore the query */ + if (setup->active_binned_queries >= LP_MAX_ACTIVE_BINNED_QUERIES) { + return; + } + assert(setup->active_queries[setup->active_binned_queries] == NULL); + setup->active_queries[setup->active_binned_queries] = pq; + setup->active_binned_queries++; + + assert(setup->scene); + if (setup->scene) { + if (!lp_scene_bin_everywhere(setup->scene, + LP_RAST_OP_BEGIN_QUERY, + lp_rast_arg_query(pq))) { + + if (!lp_setup_flush_and_restart(setup)) + return; + + if (!lp_scene_bin_everywhere(setup->scene, + LP_RAST_OP_BEGIN_QUERY, + lp_rast_arg_query(pq))) { + return; + } + } + setup->scene->had_queries |= TRUE; + } +} + + +/** + * Put an EndQuery command into all bins. + */ +void +lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq) +{ + set_scene_state(setup, SETUP_ACTIVE, "end_query"); + + assert(setup->scene); + if (setup->scene) { + /* pq->fence should be the fence of the *last* scene which + * contributed to the query result. + */ + lp_fence_reference(&pq->fence, setup->scene->fence); + + if (pq->type == PIPE_QUERY_OCCLUSION_COUNTER || + pq->type == PIPE_QUERY_OCCLUSION_PREDICATE || + pq->type == PIPE_QUERY_PIPELINE_STATISTICS || + pq->type == PIPE_QUERY_TIMESTAMP) { + if (pq->type == PIPE_QUERY_TIMESTAMP && + !(setup->scene->tiles_x | setup->scene->tiles_y)) { + /* + * If there's a zero width/height framebuffer, there's no bins and + * hence no rast task is ever run. So fill in something here instead. 
+ */ + pq->end[0] = os_time_get_nano(); + } + + if (!lp_scene_bin_everywhere(setup->scene, + LP_RAST_OP_END_QUERY, + lp_rast_arg_query(pq))) { + if (!lp_setup_flush_and_restart(setup)) + goto fail; + + if (!lp_scene_bin_everywhere(setup->scene, + LP_RAST_OP_END_QUERY, + lp_rast_arg_query(pq))) { + goto fail; + } + } + setup->scene->had_queries |= TRUE; + } + } + else { + lp_fence_reference(&pq->fence, setup->last_fence); + } + +fail: + /* Need to do this now not earlier since it still needs to be marked as + * active when binning it would cause a flush. + */ + if (pq->type == PIPE_QUERY_OCCLUSION_COUNTER || + pq->type == PIPE_QUERY_OCCLUSION_PREDICATE || + pq->type == PIPE_QUERY_PIPELINE_STATISTICS) { + unsigned i; + + /* remove from active binned query list */ + for (i = 0; i < setup->active_binned_queries; i++) { + if (setup->active_queries[i] == pq) + break; + } + assert(i < setup->active_binned_queries); + if (i == setup->active_binned_queries) + return; + setup->active_binned_queries--; + setup->active_queries[i] = setup->active_queries[setup->active_binned_queries]; + setup->active_queries[setup->active_binned_queries] = NULL; + } +} + + +boolean +lp_setup_flush_and_restart(struct lp_setup_context *setup) +{ + if (0) debug_printf("%s\n", __FUNCTION__); + + assert(setup->state == SETUP_ACTIVE); + + if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__)) + return FALSE; + + if (!lp_setup_update_state(setup, TRUE)) + return FALSE; + + return TRUE; +} + + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup.h new file mode 100644 index 000000000..a42df2dc9 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup.h @@ -0,0 +1,168 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ +#ifndef LP_SETUP_H +#define LP_SETUP_H + +#include "pipe/p_compiler.h" +#include "lp_jit.h" + +struct draw_context; +struct vertex_info; + + +struct pipe_resource; +struct pipe_query; +struct pipe_surface; +struct pipe_blend_color; +struct pipe_screen; +struct pipe_framebuffer_state; +struct lp_fragment_shader_variant; +struct lp_jit_context; +struct llvmpipe_query; +struct pipe_fence_handle; +struct lp_setup_variant; +struct lp_setup_context; + +void lp_setup_reset( struct lp_setup_context *setup ); + +struct lp_setup_context * +lp_setup_create( struct pipe_context *pipe, + struct draw_context *draw ); + +void +lp_setup_clear(struct lp_setup_context *setup, + const union pipe_color_union *clear_color, + double clear_depth, + unsigned clear_stencil, + unsigned flags); + + + +void +lp_setup_flush( struct lp_setup_context *setup, + struct pipe_fence_handle **fence, + const char *reason); + + +void +lp_setup_bind_framebuffer( struct lp_setup_context *setup, + const struct pipe_framebuffer_state *fb ); + +void +lp_setup_set_triangle_state( struct lp_setup_context *setup, + unsigned cullmode, + boolean front_is_ccw, + boolean scissor, + boolean half_pixel_center, + boolean bottom_edge_rule); + +void +lp_setup_set_line_state( struct lp_setup_context *setup, + float line_width); + +void +lp_setup_set_point_state( struct lp_setup_context *setup, + float point_size, + boolean point_size_per_vertex, + uint sprite_coord_enable, + uint sprite_coord_origin); + +void +lp_setup_set_setup_variant( struct lp_setup_context *setup, + const struct lp_setup_variant *variant ); + +void +lp_setup_set_fs_variant( struct lp_setup_context *setup, + struct lp_fragment_shader_variant *variant ); + +void +lp_setup_set_fs_constants(struct lp_setup_context *setup, + unsigned num, + struct pipe_constant_buffer *buffers); + +void +lp_setup_set_alpha_ref_value( struct lp_setup_context *setup, + float alpha_ref_value 
); + +void +lp_setup_set_stencil_ref_values( struct lp_setup_context *setup, + const ubyte refs[2] ); + +void +lp_setup_set_blend_color( struct lp_setup_context *setup, + const struct pipe_blend_color *blend_color ); + +void +lp_setup_set_scissors( struct lp_setup_context *setup, + const struct pipe_scissor_state *scissors ); + +void +lp_setup_set_viewports(struct lp_setup_context *setup, + unsigned num_viewports, + const struct pipe_viewport_state *viewports); + +void +lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup, + unsigned num, + struct pipe_sampler_view **views); + +void +lp_setup_set_fragment_sampler_state(struct lp_setup_context *setup, + unsigned num, + struct pipe_sampler_state **samplers); + +unsigned +lp_setup_is_resource_referenced( const struct lp_setup_context *setup, + const struct pipe_resource *texture ); + +void +lp_setup_set_flatshade_first( struct lp_setup_context *setup, + boolean flatshade_first ); + +void +lp_setup_set_rasterizer_discard( struct lp_setup_context *setup, + boolean rasterizer_discard ); + +void +lp_setup_set_vertex_info( struct lp_setup_context *setup, + struct vertex_info *info ); + +void +lp_setup_begin_query(struct lp_setup_context *setup, + struct llvmpipe_query *pq); + +void +lp_setup_end_query(struct lp_setup_context *setup, + struct llvmpipe_query *pq); + +static inline unsigned +lp_clamp_viewport_idx(int idx) +{ + return (PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0; +} + +#endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_context.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_context.h new file mode 100644 index 000000000..2410e2384 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -0,0 +1,208 @@ +/************************************************************************** + * + * Copyright 2007-2009 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * The setup code is concerned with point/line/triangle setup and + * putting commands/data into the bins. 
+ */ + + +#ifndef LP_SETUP_CONTEXT_H +#define LP_SETUP_CONTEXT_H + +#include "lp_setup.h" +#include "lp_rast.h" +#include "lp_scene.h" +#include "lp_bld_interp.h" /* for struct lp_shader_input */ + +#include "draw/draw_vbuf.h" +#include "util/u_rect.h" +#include "util/u_pack_color.h" + +#define LP_SETUP_NEW_FS 0x01 +#define LP_SETUP_NEW_CONSTANTS 0x02 +#define LP_SETUP_NEW_BLEND_COLOR 0x04 +#define LP_SETUP_NEW_SCISSOR 0x08 +#define LP_SETUP_NEW_VIEWPORTS 0x10 + + +struct lp_setup_variant; + + +/** Max number of scenes */ +/* XXX: make multiple scenes per context work, see lp_setup_rasterize_scene */ +#define MAX_SCENES 1 + + + +/** + * Point/line/triangle setup context. + * Note: "stored" below indicates data which is stored in the bins, + * not arbitrary malloc'd memory. + * + * + * Subclass of vbuf_render, plugged directly into the draw module as + * the rendering backend. + */ +struct lp_setup_context +{ + struct vbuf_render base; + + struct pipe_context *pipe; + struct vertex_info *vertex_info; + uint prim; + uint vertex_size; + uint nr_vertices; + uint sprite_coord_enable, sprite_coord_origin; + uint vertex_buffer_size; + void *vertex_buffer; + + /* Final pipeline stage for draw module. Draw module should + * create/install this itself now. 
+ */ + struct draw_stage *vbuf; + unsigned num_threads; + unsigned scene_idx; + struct lp_scene *scenes[MAX_SCENES]; /**< all the scenes */ + struct lp_scene *scene; /**< current scene being built */ + + struct lp_fence *last_fence; + struct llvmpipe_query *active_queries[LP_MAX_ACTIVE_BINNED_QUERIES]; + unsigned active_binned_queries; + + boolean flatshade_first; + boolean ccw_is_frontface; + boolean scissor_test; + boolean point_size_per_vertex; + boolean rasterizer_discard; + unsigned cullmode; + unsigned bottom_edge_rule; + float pixel_offset; + float line_width; + float point_size; + float psize; + unsigned viewport_index_slot; + unsigned layer_slot; + int face_slot; + + struct pipe_framebuffer_state fb; + struct u_rect framebuffer; + struct u_rect scissors[PIPE_MAX_VIEWPORTS]; + struct u_rect draw_regions[PIPE_MAX_VIEWPORTS]; /* intersection of fb & scissor */ + struct lp_jit_viewport viewports[PIPE_MAX_VIEWPORTS]; + + struct { + unsigned flags; + union util_color color_val[PIPE_MAX_COLOR_BUFS]; + uint64_t zsmask; + uint64_t zsvalue; /**< lp_rast_clear_zstencil() cmd */ + } clear; + + enum setup_state { + SETUP_FLUSHED, /**< scene is null */ + SETUP_CLEARED, /**< scene exists but has only clears */ + SETUP_ACTIVE /**< scene exists and has at least one draw/query */ + } state; + + struct { + const struct lp_rast_state *stored; /**< what's in the scene */ + struct lp_rast_state current; /**< currently set state */ + struct pipe_resource *current_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + } fs; + + /** fragment shader constants */ + struct { + struct pipe_constant_buffer current; + unsigned stored_size; + const void *stored_data; + } constants[LP_MAX_TGSI_CONST_BUFFERS]; + + struct { + struct pipe_blend_color current; + uint8_t *stored; + } blend_color; + + + struct { + const struct lp_setup_variant *variant; + } setup; + + unsigned dirty; /**< bitmask of LP_SETUP_NEW_x bits */ + + void (*point)( struct lp_setup_context *, + const float (*v0)[4]); + + void (*line)( 
struct lp_setup_context *, + const float (*v0)[4], + const float (*v1)[4]); + + void (*triangle)( struct lp_setup_context *, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]); +}; + +void lp_setup_choose_triangle( struct lp_setup_context *setup ); +void lp_setup_choose_line( struct lp_setup_context *setup ); +void lp_setup_choose_point( struct lp_setup_context *setup ); + +void lp_setup_init_vbuf(struct lp_setup_context *setup); + +boolean lp_setup_update_state( struct lp_setup_context *setup, + boolean update_scene); + +void lp_setup_destroy( struct lp_setup_context *setup ); + +boolean lp_setup_flush_and_restart(struct lp_setup_context *setup); + +void +lp_setup_print_triangle(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]); + +void +lp_setup_print_vertex(struct lp_setup_context *setup, + const char *name, + const float (*v)[4]); + + +struct lp_rast_triangle * +lp_setup_alloc_triangle(struct lp_scene *scene, + unsigned num_inputs, + unsigned nr_planes, + unsigned *tri_size); + +boolean +lp_setup_bin_triangle( struct lp_setup_context *setup, + struct lp_rast_triangle *tri, + const struct u_rect *bbox, + int nr_planes, + unsigned scissor_index ); + +#endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_line.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_line.c new file mode 100644 index 000000000..a190254d9 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -0,0 +1,748 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Binning code for lines + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "lp_perf.h" +#include "lp_setup_context.h" +#include "lp_rast.h" +#include "lp_state_fs.h" +#include "lp_state_setup.h" +#include "lp_context.h" +#include "draw/draw_context.h" + +#define NUM_CHANNELS 4 + +struct lp_line_info { + + float dx; + float dy; + float oneoverarea; + boolean frontfacing; + + const float (*v1)[4]; + const float (*v2)[4]; + + float (*a0)[4]; + float (*dadx)[4]; + float (*dady)[4]; +}; + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). 
+ */ +static void constant_coef( struct lp_setup_context *setup, + struct lp_line_info *info, + unsigned slot, + const float value, + unsigned i ) +{ + info->a0[slot][i] = value; + info->dadx[slot][i] = 0.0f; + info->dady[slot][i] = 0.0f; +} + + +/** + * Compute a0, dadx and dady for a linearly interpolated coefficient, + * for a triangle. + */ +static void linear_coef( struct lp_setup_context *setup, + struct lp_line_info *info, + unsigned slot, + unsigned vert_attr, + unsigned i) +{ + float a1 = info->v1[vert_attr][i]; + float a2 = info->v2[vert_attr][i]; + + float da21 = a1 - a2; + float dadx = da21 * info->dx * info->oneoverarea; + float dady = da21 * info->dy * info->oneoverarea; + + info->dadx[slot][i] = dadx; + info->dady[slot][i] = dady; + + info->a0[slot][i] = (a1 - + (dadx * (info->v1[0][0] - setup->pixel_offset) + + dady * (info->v1[0][1] - setup->pixel_offset))); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. 
+ */ +static void perspective_coef( struct lp_setup_context *setup, + struct lp_line_info *info, + unsigned slot, + unsigned vert_attr, + unsigned i) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + float a1 = info->v1[vert_attr][i] * info->v1[0][3]; + float a2 = info->v2[vert_attr][i] * info->v2[0][3]; + + float da21 = a1 - a2; + float dadx = da21 * info->dx * info->oneoverarea; + float dady = da21 * info->dy * info->oneoverarea; + + info->dadx[slot][i] = dadx; + info->dady[slot][i] = dady; + + info->a0[slot][i] = (a1 - + (dadx * (info->v1[0][0] - setup->pixel_offset) + + dady * (info->v1[0][1] - setup->pixel_offset))); +} + +static void +setup_fragcoord_coef( struct lp_setup_context *setup, + struct lp_line_info *info, + unsigned slot, + unsigned usage_mask) +{ + /*X*/ + if (usage_mask & TGSI_WRITEMASK_X) { + info->a0[slot][0] = 0.0; + info->dadx[slot][0] = 1.0; + info->dady[slot][0] = 0.0; + } + + /*Y*/ + if (usage_mask & TGSI_WRITEMASK_Y) { + info->a0[slot][1] = 0.0; + info->dadx[slot][1] = 0.0; + info->dady[slot][1] = 1.0; + } + + /*Z*/ + if (usage_mask & TGSI_WRITEMASK_Z) { + linear_coef(setup, info, slot, 0, 2); + } + + /*W*/ + if (usage_mask & TGSI_WRITEMASK_W) { + linear_coef(setup, info, slot, 0, 3); + } +} + +/** + * Compute the tri->coef[] array dadx, dady, a0 values. 
+ */ +static void setup_line_coefficients( struct lp_setup_context *setup, + struct lp_line_info *info) +{ + const struct lp_setup_variant_key *key = &setup->setup.variant->key; + unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; + unsigned slot; + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + unsigned usage_mask = key->inputs[slot].usage_mask; + unsigned i; + + switch (key->inputs[slot].interp) { + case LP_INTERP_CONSTANT: + if (key->flatshade_first) { + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + constant_coef(setup, info, slot+1, info->v1[vert_attr][i], i); + } + else { + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + constant_coef(setup, info, slot+1, info->v2[vert_attr][i], i); + } + break; + + case LP_INTERP_LINEAR: + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + linear_coef(setup, info, slot+1, vert_attr, i); + break; + + case LP_INTERP_PERSPECTIVE: + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + perspective_coef(setup, info, slot+1, vert_attr, i); + fragcoord_usage_mask |= TGSI_WRITEMASK_W; + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0, so all need to ensure that the usage mask is covers all + * usages. + */ + fragcoord_usage_mask |= usage_mask; + break; + + case LP_INTERP_FACING: + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + constant_coef(setup, info, slot+1, + info->frontfacing ? 1.0f : -1.0f, i); + break; + + default: + assert(0); + } + } + + /* The internal position input is in slot zero: + */ + setup_fragcoord_coef(setup, info, 0, + fragcoord_usage_mask); +} + + + +static inline int subpixel_snap( float a ) +{ + return util_iround(FIXED_ONE * a); +} + + +/** + * Print line vertex attribs (for debug). 
+ */ +static void +print_line(struct lp_setup_context *setup, + const float (*v1)[4], + const float (*v2)[4]) +{ + const struct lp_setup_variant_key *key = &setup->setup.variant->key; + uint i; + + debug_printf("llvmpipe line\n"); + for (i = 0; i < 1 + key->num_inputs; i++) { + debug_printf(" v1[%d]: %f %f %f %f\n", i, + v1[i][0], v1[i][1], v1[i][2], v1[i][3]); + } + for (i = 0; i < 1 + key->num_inputs; i++) { + debug_printf(" v2[%d]: %f %f %f %f\n", i, + v2[i][0], v2[i][1], v2[i][2], v2[i][3]); + } +} + + +static inline boolean sign(float x){ + return x >= 0; +} + + +/* Used on positive floats only: + */ +static inline float fracf(float f) +{ + return f - floorf(f); +} + + + +static boolean +try_setup_line( struct lp_setup_context *setup, + const float (*v1)[4], + const float (*v2)[4]) +{ + struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe; + struct lp_scene *scene = setup->scene; + const struct lp_setup_variant_key *key = &setup->setup.variant->key; + struct lp_rast_triangle *line; + struct lp_rast_plane *plane; + struct lp_line_info info; + float width = MAX2(1.0, setup->line_width); + struct u_rect bbox; + unsigned tri_bytes; + int x[4]; + int y[4]; + int i; + int nr_planes = 4; + unsigned viewport_index = 0; + unsigned layer = 0; + + /* linewidth should be interpreted as integer */ + int fixed_width = util_iround(width) * FIXED_ONE; + + float x_offset=0; + float y_offset=0; + float x_offset_end=0; + float y_offset_end=0; + + float x1diff; + float y1diff; + float x2diff; + float y2diff; + float dx, dy; + float area; + + boolean draw_start; + boolean draw_end; + boolean will_draw_start; + boolean will_draw_end; + + if (0) + print_line(setup, v1, v2); + + if (setup->scissor_test) { + nr_planes = 8; + if (setup->viewport_index_slot > 0) { + unsigned *udata = (unsigned*)v1[setup->viewport_index_slot]; + viewport_index = lp_clamp_viewport_idx(*udata); + } + } + else { + nr_planes = 4; + } + + if (setup->layer_slot > 0) { + layer = 
*(unsigned*)v1[setup->layer_slot]; + layer = MIN2(layer, scene->fb_max_layer); + } + + dx = v1[0][0] - v2[0][0]; + dy = v1[0][1] - v2[0][1]; + area = (dx * dx + dy * dy); + if (area == 0) { + LP_COUNT(nr_culled_tris); + return TRUE; + } + + info.oneoverarea = 1.0f / area; + info.dx = dx; + info.dy = dy; + info.v1 = v1; + info.v2 = v2; + + + /* X-MAJOR LINE */ + if (fabsf(dx) >= fabsf(dy)) { + float dydx = dy / dx; + + x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5; + y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5; + x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5; + y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5; + + if (y2diff==-0.5 && dy<0){ + y2diff = 0.5; + } + + /* + * Diamond exit rule test for starting point + */ + if (fabsf(x1diff) + fabsf(y1diff) < 0.5) { + draw_start = TRUE; + } + else if (sign(x1diff) == sign(-dx)) { + draw_start = FALSE; + } + else if (sign(-y1diff) != sign(dy)) { + draw_start = TRUE; + } + else { + /* do intersection test */ + float yintersect = fracf(v1[0][1]) + x1diff * dydx; + draw_start = (yintersect < 1.0 && yintersect > 0.0); + } + + + /* + * Diamond exit rule test for ending point + */ + if (fabsf(x2diff) + fabsf(y2diff) < 0.5) { + draw_end = FALSE; + } + else if (sign(x2diff) != sign(-dx)) { + draw_end = FALSE; + } + else if (sign(-y2diff) == sign(dy)) { + draw_end = TRUE; + } + else { + /* do intersection test */ + float yintersect = fracf(v2[0][1]) + x2diff * dydx; + draw_end = (yintersect < 1.0 && yintersect > 0.0); + } + + /* Are we already drawing start/end? 
+ */ + will_draw_start = sign(-x1diff) != sign(dx); + will_draw_end = (sign(x2diff) == sign(-dx)) || x2diff==0; + + if (dx < 0) { + /* if v2 is to the right of v1, swap pointers */ + const float (*temp)[4] = v1; + v1 = v2; + v2 = temp; + dx = -dx; + dy = -dy; + /* Otherwise shift planes appropriately */ + if (will_draw_start != draw_start) { + x_offset_end = - x1diff - 0.5; + y_offset_end = x_offset_end * dydx; + + } + if (will_draw_end != draw_end) { + x_offset = - x2diff - 0.5; + y_offset = x_offset * dydx; + } + + } + else{ + /* Otherwise shift planes appropriately */ + if (will_draw_start != draw_start) { + x_offset = - x1diff + 0.5; + y_offset = x_offset * dydx; + } + if (will_draw_end != draw_end) { + x_offset_end = - x2diff + 0.5; + y_offset_end = x_offset_end * dydx; + } + } + + /* x/y positions in fixed point */ + x[0] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset); + x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset); + x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset); + x[3] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset); + + y[0] = subpixel_snap(v1[0][1] + y_offset - setup->pixel_offset) - fixed_width/2; + y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) - fixed_width/2; + y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) + fixed_width/2; + y[3] = subpixel_snap(v1[0][1] + y_offset - setup->pixel_offset) + fixed_width/2; + + } + else { + const float dxdy = dx / dy; + + /* Y-MAJOR LINE */ + x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5; + y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5; + x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5; + y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5; + + if (x2diff==-0.5 && dx<0) { + x2diff = 0.5; + } + + /* + * Diamond exit rule test for starting point + */ + if (fabsf(x1diff) + fabsf(y1diff) < 0.5) { + draw_start = TRUE; + } + else if (sign(-y1diff) == sign(dy)) { + draw_start = FALSE; + } + else if 
(sign(x1diff) != sign(-dx)) { + draw_start = TRUE; + } + else { + /* do intersection test */ + float xintersect = fracf(v1[0][0]) + y1diff * dxdy; + draw_start = (xintersect < 1.0 && xintersect > 0.0); + } + + /* + * Diamond exit rule test for ending point + */ + if (fabsf(x2diff) + fabsf(y2diff) < 0.5) { + draw_end = FALSE; + } + else if (sign(-y2diff) != sign(dy) ) { + draw_end = FALSE; + } + else if (sign(x2diff) == sign(-dx) ) { + draw_end = TRUE; + } + else { + /* do intersection test */ + float xintersect = fracf(v2[0][0]) + y2diff * dxdy; + draw_end = (xintersect < 1.0 && xintersect >= 0.0); + } + + /* Are we already drawing start/end? + */ + will_draw_start = sign(y1diff) == sign(dy); + will_draw_end = (sign(-y2diff) == sign(dy)) || y2diff==0; + + if (dy > 0) { + /* if v2 is on top of v1, swap pointers */ + const float (*temp)[4] = v1; + v1 = v2; + v2 = temp; + dx = -dx; + dy = -dy; + + /* Otherwise shift planes appropriately */ + if (will_draw_start != draw_start) { + y_offset_end = - y1diff + 0.5; + x_offset_end = y_offset_end * dxdy; + } + if (will_draw_end != draw_end) { + y_offset = - y2diff + 0.5; + x_offset = y_offset * dxdy; + } + } + else { + /* Otherwise shift planes appropriately */ + if (will_draw_start != draw_start) { + y_offset = - y1diff - 0.5; + x_offset = y_offset * dxdy; + + } + if (will_draw_end != draw_end) { + y_offset_end = - y2diff - 0.5; + x_offset_end = y_offset_end * dxdy; + } + } + + /* x/y positions in fixed point */ + x[0] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset) - fixed_width/2; + x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2; + x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) + fixed_width/2; + x[3] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset) + fixed_width/2; + + y[0] = subpixel_snap(v1[0][1] + y_offset - setup->pixel_offset); + y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset); + y[2] = subpixel_snap(v2[0][1] + 
y_offset_end - setup->pixel_offset); + y[3] = subpixel_snap(v1[0][1] + y_offset - setup->pixel_offset); + } + + /* Bounding rectangle (in pixels) */ + { + /* Yes this is necessary to accurately calculate bounding boxes + * with the two fill-conventions we support. GL (normally) ends + * up needing a bottom-left fill convention, which requires + * slightly different rounding. + */ + int adj = (setup->bottom_edge_rule != 0) ? 1 : 0; + + bbox.x0 = (MIN4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.x1 = (MAX4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.y0 = (MIN4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + bbox.y1 = (MAX4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + + /* Inclusive coordinates: + */ + bbox.x1--; + bbox.y1--; + } + + if (bbox.x1 < bbox.x0 || + bbox.y1 < bbox.y0) { + if (0) debug_printf("empty bounding box\n"); + LP_COUNT(nr_culled_tris); + return TRUE; + } + + if (!u_rect_test_intersection(&setup->draw_regions[viewport_index], &bbox)) { + if (0) debug_printf("offscreen\n"); + LP_COUNT(nr_culled_tris); + return TRUE; + } + + /* Can safely discard negative regions: + */ + bbox.x0 = MAX2(bbox.x0, 0); + bbox.y0 = MAX2(bbox.y0, 0); + + line = lp_setup_alloc_triangle(scene, + key->num_inputs, + nr_planes, + &tri_bytes); + if (!line) + return FALSE; + +#ifdef DEBUG + line->v[0][0] = v1[0][0]; + line->v[1][0] = v2[0][0]; + line->v[0][1] = v1[0][1]; + line->v[1][1] = v2[0][1]; +#endif + + LP_COUNT(nr_tris); + + if (lp_context->active_statistics_queries && + !llvmpipe_rasterization_disabled(lp_context)) { + lp_context->pipeline_statistics.c_primitives++; + } + + /* calculate the deltas */ + plane = GET_PLANES(line); + plane[0].dcdy = x[0] - x[1]; + plane[1].dcdy = x[1] - x[2]; + plane[2].dcdy = x[2] - x[3]; + plane[3].dcdy = x[3] - x[0]; + + plane[0].dcdx = y[0] - y[1]; + plane[1].dcdx = y[1] - y[2]; + plane[2].dcdx = y[2] - y[3]; + plane[3].dcdx = y[3] - y[0]; + + if 
(draw_will_inject_frontface(lp_context->draw) && + setup->face_slot > 0) { + line->inputs.frontfacing = v1[setup->face_slot][0]; + } else { + line->inputs.frontfacing = TRUE; + } + + /* Setup parameter interpolants: + */ + info.a0 = GET_A0(&line->inputs); + info.dadx = GET_DADX(&line->inputs); + info.dady = GET_DADY(&line->inputs); + info.frontfacing = line->inputs.frontfacing; + setup_line_coefficients(setup, &info); + + line->inputs.disable = FALSE; + line->inputs.opaque = FALSE; + line->inputs.layer = layer; + line->inputs.viewport_index = viewport_index; + + for (i = 0; i < 4; i++) { + + /* half-edge constants, will be interated over the whole render + * target. + */ + plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]); + + + /* correct for top-left vs. bottom-left fill convention. + */ + if (plane[i].dcdx < 0) { + /* both fill conventions want this - adjust for left edges */ + plane[i].c++; + } + else if (plane[i].dcdx == 0) { + if (setup->pixel_offset == 0) { + /* correct for top-left fill convention: + */ + if (plane[i].dcdy > 0) plane[i].c++; + } + else { + /* correct for bottom-left fill convention: + */ + if (plane[i].dcdy < 0) plane[i].c++; + } + } + + plane[i].dcdx *= FIXED_ONE; + plane[i].dcdy *= FIXED_ONE; + + /* find trivial reject offsets for each edge for a single-pixel + * sized block. These will be scaled up at each recursive level to + * match the active blocksize. Scaling in this way works best if + * the blocks are square. + */ + plane[i].eo = 0; + if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx; + if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy; + } + + + /* + * When rasterizing scissored tris, use the intersection of the + * triangle bounding box and the scissor rect to generate the + * scissor planes. 
+ * + * This permits us to cut off the triangle "tails" that are present + * in the intermediate recursive levels caused when two of the + * triangles edges don't diverge quickly enough to trivially reject + * exterior blocks from the triangle. + * + * It's not really clear if it's worth worrying about these tails, + * but since we generate the planes for each scissored tri, it's + * free to trim them in this case. + * + * Note that otherwise, the scissor planes only vary in 'C' value, + * and even then only on state-changes. Could alternatively store + * these planes elsewhere. + */ + if (nr_planes == 8) { + const struct u_rect *scissor = + &setup->scissors[viewport_index]; + + plane[4].dcdx = -1; + plane[4].dcdy = 0; + plane[4].c = 1-scissor->x0; + plane[4].eo = 1; + + plane[5].dcdx = 1; + plane[5].dcdy = 0; + plane[5].c = scissor->x1+1; + plane[5].eo = 0; + + plane[6].dcdx = 0; + plane[6].dcdy = 1; + plane[6].c = 1-scissor->y0; + plane[6].eo = 1; + + plane[7].dcdx = 0; + plane[7].dcdy = -1; + plane[7].c = scissor->y1+1; + plane[7].eo = 0; + } + + return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, viewport_index); +} + + +static void lp_setup_line( struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4] ) +{ + if (!try_setup_line( setup, v0, v1 )) + { + if (!lp_setup_flush_and_restart(setup)) + return; + + if (!try_setup_line( setup, v0, v1 )) + return; + } +} + + +void lp_setup_choose_line( struct lp_setup_context *setup ) +{ + setup->line = lp_setup_line; +} + + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_point.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_point.c new file mode 100644 index 000000000..75544b524 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -0,0 +1,541 @@ +/************************************************************************** + * + * Copyright 2010, VMware Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Binning code for points + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "lp_setup_context.h" +#include "lp_perf.h" +#include "lp_rast.h" +#include "lp_state_fs.h" +#include "lp_state_setup.h" +#include "lp_context.h" +#include "tgsi/tgsi_scan.h" +#include "draw/draw_context.h" + +#define NUM_CHANNELS 4 + +struct point_info { + /* x,y deltas */ + int dy01, dy12; + int dx01, dx12; + + const float (*v0)[4]; + + float (*a0)[4]; + float (*dadx)[4]; + float (*dady)[4]; + + boolean frontfacing; +}; + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). 
+ */ +static void +constant_coef(struct lp_setup_context *setup, + struct point_info *info, + unsigned slot, + const float value, + unsigned i) +{ + info->a0[slot][i] = value; + info->dadx[slot][i] = 0.0f; + info->dady[slot][i] = 0.0f; +} + + +static void +point_persp_coeff(struct lp_setup_context *setup, + const struct point_info *info, + unsigned slot, + unsigned i) +{ + /* + * Fragment shader expects pre-multiplied w for LP_INTERP_PERSPECTIVE. A + * better stratergy would be to take the primitive in consideration when + * generating the fragment shader key, and therefore avoid the per-fragment + * perspective divide. + */ + + float w0 = info->v0[0][3]; + + assert(i < 4); + + info->a0[slot][i] = info->v0[slot][i]*w0; + info->dadx[slot][i] = 0.0f; + info->dady[slot][i] = 0.0f; +} + + +/** + * Setup automatic texcoord coefficients (for sprite rendering). + * \param slot the vertex attribute slot to setup + * \param i the attribute channel in [0,3] + * \param sprite_coord_origin one of PIPE_SPRITE_COORD_x + * \param perspective does the shader expects pre-multiplied w, i.e., + * LP_INTERP_PERSPECTIVE is specified in the shader key + */ +static void +texcoord_coef(struct lp_setup_context *setup, + const struct point_info *info, + unsigned slot, + unsigned i, + unsigned sprite_coord_origin, + boolean perspective) +{ + float w0 = info->v0[0][3]; + + assert(i < 4); + + if (i == 0) { + float dadx = FIXED_ONE / (float)info->dx12; + float dady = 0.0f; + float x0 = info->v0[0][0] - setup->pixel_offset; + float y0 = info->v0[0][1] - setup->pixel_offset; + + info->dadx[slot][0] = dadx; + info->dady[slot][0] = dady; + info->a0[slot][0] = 0.5 - (dadx * x0 + dady * y0); + + if (perspective) { + info->dadx[slot][0] *= w0; + info->dady[slot][0] *= w0; + info->a0[slot][0] *= w0; + } + } + else if (i == 1) { + float dadx = 0.0f; + float dady = FIXED_ONE / (float)info->dx12; + float x0 = info->v0[0][0] - setup->pixel_offset; + float y0 = info->v0[0][1] - setup->pixel_offset; + + if 
(sprite_coord_origin == PIPE_SPRITE_COORD_LOWER_LEFT) { + dady = -dady; + } + + info->dadx[slot][1] = dadx; + info->dady[slot][1] = dady; + info->a0[slot][1] = 0.5 - (dadx * x0 + dady * y0); + + if (perspective) { + info->dadx[slot][1] *= w0; + info->dady[slot][1] *= w0; + info->a0[slot][1] *= w0; + } + } + else if (i == 2) { + info->a0[slot][2] = 0.0f; + info->dadx[slot][2] = 0.0f; + info->dady[slot][2] = 0.0f; + } + else { + info->a0[slot][3] = perspective ? w0 : 1.0f; + info->dadx[slot][3] = 0.0f; + info->dady[slot][3] = 0.0f; + } +} + + +/** + * Special coefficient setup for gl_FragCoord. + * X and Y are trivial + * Z and W are copied from position_coef which should have already been computed. + * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. + */ +static void +setup_point_fragcoord_coef(struct lp_setup_context *setup, + struct point_info *info, + unsigned slot, + unsigned usage_mask) +{ + /*X*/ + if (usage_mask & TGSI_WRITEMASK_X) { + info->a0[slot][0] = 0.0; + info->dadx[slot][0] = 1.0; + info->dady[slot][0] = 0.0; + } + + /*Y*/ + if (usage_mask & TGSI_WRITEMASK_Y) { + info->a0[slot][1] = 0.0; + info->dadx[slot][1] = 0.0; + info->dady[slot][1] = 1.0; + } + + /*Z*/ + if (usage_mask & TGSI_WRITEMASK_Z) { + constant_coef(setup, info, slot, info->v0[0][2], 2); + } + + /*W*/ + if (usage_mask & TGSI_WRITEMASK_W) { + constant_coef(setup, info, slot, info->v0[0][3], 3); + } +} + + +/** + * Compute the point->coef[] array dadx, dady, a0 values. 
+ */ +static void +setup_point_coefficients( struct lp_setup_context *setup, + struct point_info *info) +{ + const struct lp_setup_variant_key *key = &setup->setup.variant->key; + const struct lp_fragment_shader *shader = setup->fs.current.variant->shader; + unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; + unsigned slot; + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned vert_attr = key->inputs[slot].src_index; + unsigned usage_mask = key->inputs[slot].usage_mask; + enum lp_interp interp = key->inputs[slot].interp; + boolean perspective = !!(interp == LP_INTERP_PERSPECTIVE); + unsigned i; + + if (perspective & usage_mask) { + fragcoord_usage_mask |= TGSI_WRITEMASK_W; + } + + switch (interp) { + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0, so all need to ensure that the usage mask is covers all + * usages. + */ + fragcoord_usage_mask |= usage_mask; + break; + + case LP_INTERP_LINEAR: + /* Sprite tex coords may use linear interpolation someday */ + /* fall-through */ + case LP_INTERP_PERSPECTIVE: + /* check if the sprite coord flag is set for this attribute. + * If so, set it up so it up so x and y vary from 0 to 1. + */ + if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) { + unsigned semantic_index = shader->info.base.input_semantic_index[slot]; + /* Note that sprite_coord enable is a bitfield of + * PIPE_MAX_SHADER_OUTPUTS bits. 
+ */ + if (semantic_index < PIPE_MAX_SHADER_OUTPUTS && + (setup->sprite_coord_enable & (1 << semantic_index))) { + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { + texcoord_coef(setup, info, slot + 1, i, + setup->sprite_coord_origin, + perspective); + } + } + break; + } + } + /* fall-through */ + case LP_INTERP_CONSTANT: + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { + if (perspective) { + point_persp_coeff(setup, info, slot+1, i); + } + else { + constant_coef(setup, info, slot+1, info->v0[vert_attr][i], i); + } + } + } + break; + + case LP_INTERP_FACING: + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + constant_coef(setup, info, slot+1, + info->frontfacing ? 1.0f : -1.0f, i); + break; + + default: + assert(0); + break; + } + } + + /* The internal position input is in slot zero: + */ + setup_point_fragcoord_coef(setup, info, 0, + fragcoord_usage_mask); +} + + +static inline int +subpixel_snap(float a) +{ + return util_iround(FIXED_ONE * a); +} + +/** + * Print point vertex attribs (for debug). + */ +static void +print_point(struct lp_setup_context *setup, + const float (*v0)[4], + const float size) +{ + const struct lp_setup_variant_key *key = &setup->setup.variant->key; + uint i; + + debug_printf("llvmpipe point, width %f\n", size); + for (i = 0; i < 1 + key->num_inputs; i++) { + debug_printf(" v0[%d]: %f %f %f %f\n", i, + v0[i][0], v0[i][1], v0[i][2], v0[i][3]); + } +} + + +static boolean +try_setup_point( struct lp_setup_context *setup, + const float (*v0)[4] ) +{ + struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe; + /* x/y positions in fixed point */ + const struct lp_setup_variant_key *key = &setup->setup.variant->key; + const int sizeAttr = setup->psize; + const float size + = (setup->point_size_per_vertex && sizeAttr > 0) ? 
v0[sizeAttr][0] + : setup->point_size; + + /* Yes this is necessary to accurately calculate bounding boxes + * with the two fill-conventions we support. GL (normally) ends + * up needing a bottom-left fill convention, which requires + * slightly different rounding. + */ + int adj = (setup->bottom_edge_rule != 0) ? 1 : 0; + + struct lp_scene *scene = setup->scene; + struct lp_rast_triangle *point; + unsigned bytes; + struct u_rect bbox; + unsigned nr_planes = 4; + struct point_info info; + unsigned viewport_index = 0; + unsigned layer = 0; + int fixed_width; + + if (setup->viewport_index_slot > 0) { + unsigned *udata = (unsigned*)v0[setup->viewport_index_slot]; + viewport_index = lp_clamp_viewport_idx(*udata); + } + if (setup->layer_slot > 0) { + layer = *(unsigned*)v0[setup->layer_slot]; + layer = MIN2(layer, scene->fb_max_layer); + } + + if (0) + print_point(setup, v0, size); + + /* Bounding rectangle (in pixels) */ + if (!lp_context->rasterizer || + lp_context->rasterizer->point_quad_rasterization) { + /* + * Rasterize points as quads. + */ + int x0, y0; + /* Point size as fixed point integer, remove rounding errors + * and gives minimum width for very small points. + */ + fixed_width = MAX2(FIXED_ONE, subpixel_snap(size)); + + x0 = subpixel_snap(v0[0][0] - setup->pixel_offset) - fixed_width/2; + y0 = subpixel_snap(v0[0][1] - setup->pixel_offset) - fixed_width/2; + + bbox.x0 = (x0 + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.x1 = (x0 + fixed_width + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.y0 = (y0 + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + bbox.y1 = (y0 + fixed_width + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + + /* Inclusive coordinates: + */ + bbox.x1--; + bbox.y1--; + } else { + /* + * OpenGL legacy rasterization rules for non-sprite points. + * + * Per OpenGL 2.1 spec, section 3.3.1, "Basic Point Rasterization". + * + * This type of point rasterization is only available in pre 3.0 contexts + * (or compatibilility contexts which we don't support) anyway. 
+ */ + + const int x0 = subpixel_snap(v0[0][0]); + const int y0 = subpixel_snap(v0[0][1]) - adj; + + int int_width; + /* Point size as fixed point integer. For GL legacy points + * the point size is always a whole integer. + */ + fixed_width = MAX2(FIXED_ONE, + (subpixel_snap(size) + FIXED_ONE/2 - 1) & ~(FIXED_ONE-1)); + int_width = fixed_width >> FIXED_ORDER; + + assert(setup->pixel_offset != 0); + + if (int_width == 1) { + bbox.x0 = x0 >> FIXED_ORDER; + bbox.y0 = y0 >> FIXED_ORDER; + bbox.x1 = bbox.x0; + bbox.y1 = bbox.y0; + } else { + if (int_width & 1) { + /* Odd width */ + bbox.x0 = (x0 >> FIXED_ORDER) - (int_width - 1)/2; + bbox.y0 = (y0 >> FIXED_ORDER) - (int_width - 1)/2; + } else { + /* Even width */ + bbox.x0 = ((x0 + FIXED_ONE/2) >> FIXED_ORDER) - int_width/2; + bbox.y0 = ((y0 + FIXED_ONE/2) >> FIXED_ORDER) - int_width/2; + } + + bbox.x1 = bbox.x0 + int_width - 1; + bbox.y1 = bbox.y0 + int_width - 1; + } + } + + if (0) { + debug_printf(" bbox: (%i, %i) - (%i, %i)\n", + bbox.x0, bbox.y0, + bbox.x1, bbox.y1); + } + + if (!u_rect_test_intersection(&setup->draw_regions[viewport_index], &bbox)) { + if (0) debug_printf("offscreen\n"); + LP_COUNT(nr_culled_tris); + return TRUE; + } + + u_rect_find_intersection(&setup->draw_regions[viewport_index], &bbox); + + point = lp_setup_alloc_triangle(scene, + key->num_inputs, + nr_planes, + &bytes); + if (!point) + return FALSE; + +#ifdef DEBUG + point->v[0][0] = v0[0][0]; + point->v[0][1] = v0[0][1]; +#endif + + LP_COUNT(nr_tris); + + if (lp_context->active_statistics_queries && + !llvmpipe_rasterization_disabled(lp_context)) { + lp_context->pipeline_statistics.c_primitives++; + } + + if (draw_will_inject_frontface(lp_context->draw) && + setup->face_slot > 0) { + point->inputs.frontfacing = v0[setup->face_slot][0]; + } else { + point->inputs.frontfacing = TRUE; + } + + info.v0 = v0; + info.dx01 = 0; + info.dx12 = fixed_width; + info.dy01 = fixed_width; + info.dy12 = 0; + info.a0 = GET_A0(&point->inputs); + info.dadx = 
GET_DADX(&point->inputs); + info.dady = GET_DADY(&point->inputs); + info.frontfacing = point->inputs.frontfacing; + + /* Setup parameter interpolants: + */ + setup_point_coefficients(setup, &info); + + point->inputs.disable = FALSE; + point->inputs.opaque = FALSE; + point->inputs.layer = layer; + point->inputs.viewport_index = viewport_index; + + { + struct lp_rast_plane *plane = GET_PLANES(point); + + plane[0].dcdx = -1; + plane[0].dcdy = 0; + plane[0].c = 1-bbox.x0; + plane[0].eo = 1; + + plane[1].dcdx = 1; + plane[1].dcdy = 0; + plane[1].c = bbox.x1+1; + plane[1].eo = 0; + + plane[2].dcdx = 0; + plane[2].dcdy = 1; + plane[2].c = 1-bbox.y0; + plane[2].eo = 1; + + plane[3].dcdx = 0; + plane[3].dcdy = -1; + plane[3].c = bbox.y1+1; + plane[3].eo = 0; + } + + return lp_setup_bin_triangle(setup, point, &bbox, nr_planes, viewport_index); +} + + +static void +lp_setup_point(struct lp_setup_context *setup, + const float (*v0)[4]) +{ + if (!try_setup_point( setup, v0 )) + { + if (!lp_setup_flush_and_restart(setup)) + return; + + if (!try_setup_point( setup, v0 )) + return; + } +} + + +void +lp_setup_choose_point( struct lp_setup_context *setup ) +{ + setup->point = lp_setup_point; +} + + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_tri.c new file mode 100644 index 000000000..98a9d4bc2 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -0,0 +1,1027 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Binning code for triangles
 */

#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_rect.h"
#include "util/u_sse.h"
#include "lp_perf.h"
#include "lp_setup_context.h"
#include "lp_rast.h"
#include "lp_state_fs.h"
#include "lp_state_setup.h"
#include "lp_context.h"

#include <inttypes.h>

#define NUM_CHANNELS 4

#if defined(PIPE_ARCH_SSE)
#include <emmintrin.h>
#endif

/* Convert a float coordinate into sub-pixel fixed point (FIXED_ONE units). */
static inline int
subpixel_snap(float a)
{
   return util_iround(FIXED_ONE * a);
}

/* Inverse of subpixel_snap(): sub-pixel fixed point back to float. */
static inline float
fixed_to_float(int a)
{
   return a * (1.0f / FIXED_ONE);
}


/* Position and area in fixed point coordinates */
struct fixed_position {
   int32_t x[4];        /* vertex x, fixed point; x[3] is padding (zero) */
   int32_t y[4];        /* vertex y, fixed point; y[3] is padding (zero) */
   int64_t area;        /* signed area: dx01*dy20 - dx20*dy01 (64-bit) */
   int32_t dx01;        /* x[0] - x[1] */
   int32_t dy01;        /* y[0] - y[1] */
   int32_t dx20;        /* x[2] - x[0] */
   int32_t dy20;        /* y[2] - y[0] */
};


/**
 * Alloc space for a new triangle plus the input.a0/dadx/dady arrays
 * immediately after it.
 * The memory is allocated from the per-scene pool, not per-tile.
 * \param tri_size returns number of bytes allocated
 * \param num_inputs number of fragment shader inputs
 * \return pointer to triangle space
 */
struct lp_rast_triangle *
lp_setup_alloc_triangle(struct lp_scene *scene,
                        unsigned nr_inputs,
                        unsigned nr_planes,
                        unsigned *tri_size)
{
   /* Each of the three coefficient arrays holds (nr_inputs + 1) vec4s. */
   unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
   unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
   struct lp_rast_triangle *tri;

   /* Triangle header, then a0/dadx/dady arrays, then the edge planes. */
   *tri_size = (sizeof(struct lp_rast_triangle) +
                3 * input_array_sz +
                plane_sz);

   tri = lp_scene_alloc_aligned( scene, *tri_size, 16 );
   if (tri == NULL)
      return NULL;

   tri->inputs.stride = input_array_sz;

   {
      /* Sanity check: the plane array must end exactly *tri_size bytes in. */
      char *a = (char *)tri;
      char *b = (char *)&GET_PLANES(tri)[nr_planes];
      assert(b - a == *tri_size);
   }

   return tri;
}

/**
 * Print a vertex's window position and the interpolated attribute values
 * named by the current setup variant key (for debug).
 */
void
lp_setup_print_vertex(struct lp_setup_context *setup,
                      const char *name,
                      const float (*v)[4])
{
   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
   int i, j;

   debug_printf("   wpos (%s[0]) xyzw %f %f %f %f\n",
                name,
                v[0][0], v[0][1], v[0][2], v[0][3]);

   for (i = 0; i < key->num_inputs; i++) {
      const float *in = v[key->inputs[i].src_index];

      debug_printf("  in[%d] (%s[%d]) %s%s%s%s ",
                   i,
                   name, key->inputs[i].src_index,
                   (key->inputs[i].usage_mask & 0x1) ? "x" : " ",
                   (key->inputs[i].usage_mask & 0x2) ? "y" : " ",
                   (key->inputs[i].usage_mask & 0x4) ? "z" : " ",
                   (key->inputs[i].usage_mask & 0x8) ? "w" : " ");

      for (j = 0; j < 4; j++)
         if (key->inputs[i].usage_mask & (1<<j))
            debug_printf("%.5f ", in[j]);

      debug_printf("\n");
   }
}


/**
 * Print triangle vertex attribs (for debug).
 */
void
lp_setup_print_triangle(struct lp_setup_context *setup,
                        const float (*v0)[4],
                        const float (*v1)[4],
                        const float (*v2)[4])
{
   debug_printf("triangle\n");

   {
      const float ex = v0[0][0] - v2[0][0];
      const float ey = v0[0][1] - v2[0][1];
      const float fx = v1[0][0] - v2[0][0];
      const float fy = v1[0][1] - v2[0][1];

      /* det = cross(e,f).z */
      const float det = ex * fy - ey * fx;
      if (det < 0.0f)
         debug_printf("   - ccw\n");
      else if (det > 0.0f)
         debug_printf("   - cw\n");
      else
         debug_printf("   - zero area\n");
   }

   lp_setup_print_vertex(setup, "v0", v0);
   lp_setup_print_vertex(setup, "v1", v1);
   lp_setup_print_vertex(setup, "v2", v2);
}


#define MAX_PLANES 8

/* Rasterizer triangle opcode indexed by plane count (1..MAX_PLANES). */
static unsigned
lp_rast_tri_tab[MAX_PLANES+1] = {
   0,               /* should be impossible */
   LP_RAST_OP_TRIANGLE_1,
   LP_RAST_OP_TRIANGLE_2,
   LP_RAST_OP_TRIANGLE_3,
   LP_RAST_OP_TRIANGLE_4,
   LP_RAST_OP_TRIANGLE_5,
   LP_RAST_OP_TRIANGLE_6,
   LP_RAST_OP_TRIANGLE_7,
   LP_RAST_OP_TRIANGLE_8
};

/* Same, for the *_32_* opcode variants (chosen via use_32bits below). */
static unsigned
lp_rast_32_tri_tab[MAX_PLANES+1] = {
   0,               /* should be impossible */
   LP_RAST_OP_TRIANGLE_32_1,
   LP_RAST_OP_TRIANGLE_32_2,
   LP_RAST_OP_TRIANGLE_32_3,
   LP_RAST_OP_TRIANGLE_32_4,
   LP_RAST_OP_TRIANGLE_32_5,
   LP_RAST_OP_TRIANGLE_32_6,
   LP_RAST_OP_TRIANGLE_32_7,
   LP_RAST_OP_TRIANGLE_32_8
};



/**
 * The primitive covers the whole tile- shade whole tile.
 *
 * \param tx, ty  the tile position in tiles, not pixels
 */
static boolean
lp_setup_whole_tile(struct lp_setup_context *setup,
                    const struct lp_rast_shader_inputs *inputs,
                    int tx, int ty)
{
   struct lp_scene *scene = setup->scene;

   LP_COUNT(nr_fully_covered_64);

   /* if variant is opaque and scissor doesn't effect the tile */
   if (inputs->opaque) {
      /* Several things prevent this optimization from working:
       * - For layered rendering we can't determine if this covers the same layer
       * as previous rendering (or in case of clears those actually always cover
       * all layers so optimization is impossible). Need to use fb_max_layer and
       * not setup->layer_slot to determine this since even if there's currently
       * no slot assigned previous rendering could have used one.
       * - If there were any Begin/End query commands in the scene then those
       * would get removed which would be very wrong. Furthermore, if queries
       * were just active we also can't do the optimization since to get
       * accurate query results we unfortunately need to execute the rendering
       * commands.
       */
      if (!scene->fb.zsbuf && scene->fb_max_layer == 0 && !scene->had_queries) {
         /*
          * All previous rendering will be overwritten so reset the bin.
          */
         lp_scene_bin_reset( scene, tx, ty );
      }

      LP_COUNT(nr_shade_opaque_64);
      return lp_scene_bin_cmd_with_state( scene, tx, ty,
                                          setup->fs.stored,
                                          LP_RAST_OP_SHADE_TILE_OPAQUE,
                                          lp_rast_arg_inputs(inputs) );
   } else {
      LP_COUNT(nr_shade_64);
      return lp_scene_bin_cmd_with_state( scene, tx, ty,
                                          setup->fs.stored,
                                          LP_RAST_OP_SHADE_TILE,
                                          lp_rast_arg_inputs(inputs) );
   }
}


/**
 * Do basic setup for triangle rasterization and determine which
 * framebuffer tiles are touched.  Put the triangle in the scene's
 * bins for the tiles which we overlap.
 */
static boolean
do_triangle_ccw(struct lp_setup_context *setup,
                struct fixed_position* position,
                const float (*v0)[4],
                const float (*v1)[4],
                const float (*v2)[4],
                boolean frontfacing )
{
   struct lp_scene *scene = setup->scene;
   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
   struct lp_rast_triangle *tri;
   struct lp_rast_plane *plane;
   struct u_rect bbox;
   unsigned tri_bytes;
   int nr_planes = 3;
   unsigned viewport_index = 0;
   unsigned layer = 0;

   /* Area should always be positive here */
   assert(position->area > 0);

   if (0)
      lp_setup_print_triangle(setup, v0, v1, v2);

   /* Scissoring adds four extra clip planes (3 edges + 4 scissor sides). */
   if (setup->scissor_test) {
      nr_planes = 7;
      if (setup->viewport_index_slot > 0) {
         unsigned *udata = (unsigned*)v0[setup->viewport_index_slot];
         viewport_index = lp_clamp_viewport_idx(*udata);
      }
   }
   else {
      nr_planes = 3;
   }
   if (setup->layer_slot > 0) {
      layer = *(unsigned*)v1[setup->layer_slot];
      layer = MIN2(layer, scene->fb_max_layer);
   }

   /* Bounding rectangle (in pixels) */
   {
      /* Yes this is necessary to accurately calculate bounding boxes
       * with the two fill-conventions we support.  GL (normally) ends
       * up needing a bottom-left fill convention, which requires
       * slightly different rounding.
       */
      int adj = (setup->bottom_edge_rule != 0) ? 1 : 0;

      /* Inclusive x0, exclusive x1 */
      bbox.x0 = MIN3(position->x[0], position->x[1], position->x[2]) >> FIXED_ORDER;
      bbox.x1 = (MAX3(position->x[0], position->x[1], position->x[2]) - 1) >> FIXED_ORDER;

      /* Inclusive / exclusive depending upon adj (bottom-left or top-right) */
      bbox.y0 = (MIN3(position->y[0], position->y[1], position->y[2]) + adj) >> FIXED_ORDER;
      bbox.y1 = (MAX3(position->y[0], position->y[1], position->y[2]) - 1 + adj) >> FIXED_ORDER;
   }

   if (bbox.x1 < bbox.x0 ||
       bbox.y1 < bbox.y0) {
      if (0) debug_printf("empty bounding box\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   if (!u_rect_test_intersection(&setup->draw_regions[viewport_index], &bbox)) {
      if (0) debug_printf("offscreen\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   /* Can safely discard negative regions, but need to keep hold of
    * information about when the triangle extends past screen
    * boundaries.  See trimmed_box in lp_setup_bin_triangle().
    */
   bbox.x0 = MAX2(bbox.x0, 0);
   bbox.y0 = MAX2(bbox.y0, 0);

   tri = lp_setup_alloc_triangle(scene,
                                 key->num_inputs,
                                 nr_planes,
                                 &tri_bytes);
   if (!tri)
      return FALSE;

#if 0
   tri->v[0][0] = v0[0][0];
   tri->v[1][0] = v1[0][0];
   tri->v[2][0] = v2[0][0];
   tri->v[0][1] = v0[0][1];
   tri->v[1][1] = v1[0][1];
   tri->v[2][1] = v2[0][1];
#endif

   LP_COUNT(nr_tris);

   /* Setup parameter interpolants:
    */
   setup->setup.variant->jit_function( v0,
                                       v1,
                                       v2,
                                       frontfacing,
                                       GET_A0(&tri->inputs),
                                       GET_DADX(&tri->inputs),
                                       GET_DADY(&tri->inputs) );

   tri->inputs.frontfacing = frontfacing;
   tri->inputs.disable = FALSE;
   tri->inputs.opaque = setup->fs.current.variant->opaque;
   tri->inputs.layer = layer;
   tri->inputs.viewport_index = viewport_index;

   if (0)
      lp_dump_setup_coef(&setup->setup.variant->key,
                         (const float (*)[4])GET_A0(&tri->inputs),
                         (const float (*)[4])GET_DADX(&tri->inputs),
                         (const float (*)[4])GET_DADY(&tri->inputs));

   plane = GET_PLANES(tri);

#if defined(PIPE_ARCH_SSE)
   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
       setup->fb.height <= MAX_FIXED_LENGTH32 &&
       (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
       (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
      __m128i vertx, verty;
      __m128i shufx, shufy;
      __m128i dcdx, dcdy, c;
      __m128i unused;
      __m128i dcdx_neg_mask;
      __m128i dcdy_neg_mask;
      __m128i dcdx_zero_mask;
      __m128i top_left_flag;
      __m128i c_inc_mask, c_inc;
      __m128i eo, p0, p1, p2;
      __m128i zero = _mm_setzero_si128();
      PIPE_ALIGN_VAR(16) int32_t temp_vec[4];

      vertx = _mm_loadu_si128((__m128i *)position->x); /* vertex x coords */
      verty = _mm_loadu_si128((__m128i *)position->y); /* vertex y coords */

      shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
      shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));

      dcdx = _mm_sub_epi32(verty, shufy);
      dcdy = _mm_sub_epi32(vertx, shufx);

      dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
      dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero);
      dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);

      top_left_flag = _mm_set1_epi32((setup->bottom_edge_rule == 0) ? ~0 : 0);

      c_inc_mask = _mm_or_si128(dcdx_neg_mask,
                                _mm_and_si128(dcdx_zero_mask,
                                              _mm_xor_si128(dcdy_neg_mask,
                                                            top_left_flag)));

      c_inc = _mm_srli_epi32(c_inc_mask, 31);

      c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
                        mm_mullo_epi32(dcdy, verty));

      c = _mm_add_epi32(c, c_inc);

      /* Scale up to match c:
       */
      dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
      dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);

      /* Calculate trivial reject values:
       */
      eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
                         _mm_and_si128(dcdx_neg_mask, dcdx));

      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */

      /* Pointless transpose which gets undone immediately in
       * rasterization:
       */
      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
                       &p0, &p1, &p2, &unused);

#define STORE_PLANE(plane, vec) do {                 \
         _mm_store_si128((__m128i *)&temp_vec, vec); \
         plane.c = (int64_t)temp_vec[0];             \
         plane.dcdx = temp_vec[1];                   \
         plane.dcdy = temp_vec[2];                   \
         plane.eo = temp_vec[3];                     \
      } while(0)

      STORE_PLANE(plane[0], p0);
      STORE_PLANE(plane[1], p1);
      STORE_PLANE(plane[2], p2);
#undef STORE_PLANE
   } else
#endif
   {
      int i;
      plane[0].dcdy = position->dx01;
      plane[1].dcdy = position->x[1] - position->x[2];
      plane[2].dcdy = position->dx20;
      plane[0].dcdx = position->dy01;
      plane[1].dcdx = position->y[1] - position->y[2];
      plane[2].dcdx = position->dy20;

      for (i = 0; i < 3; i++) {
         /* half-edge constants, will be iterated over the whole render
          * target.
          */
         plane[i].c = IMUL64(plane[i].dcdx, position->x[i]) -
                      IMUL64(plane[i].dcdy, position->y[i]);

         /* correct for top-left vs. bottom-left fill convention.
          */
         if (plane[i].dcdx < 0) {
            /* both fill conventions want this - adjust for left edges */
            plane[i].c++;
         }
         else if (plane[i].dcdx == 0) {
            if (setup->bottom_edge_rule == 0){
               /* correct for top-left fill convention:
                */
               if (plane[i].dcdy > 0) plane[i].c++;
            }
            else {
               /* correct for bottom-left fill convention:
                */
               if (plane[i].dcdy < 0) plane[i].c++;
            }
         }

         /* Scale up to match c:
          */
         assert((plane[i].dcdx << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdx);
         assert((plane[i].dcdy << FIXED_ORDER) >> FIXED_ORDER == plane[i].dcdy);
         plane[i].dcdx <<= FIXED_ORDER;
         plane[i].dcdy <<= FIXED_ORDER;

         /* find trivial reject offsets for each edge for a single-pixel
          * sized block.  These will be scaled up at each recursive level to
          * match the active blocksize.  Scaling in this way works best if
          * the blocks are square.
          */
         plane[i].eo = 0;
         if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
         if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
      }
   }

   if (0) {
      debug_printf("p0: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
                   plane[0].c,
                   plane[0].dcdx,
                   plane[0].dcdy,
                   plane[0].eo);

      debug_printf("p1: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
                   plane[1].c,
                   plane[1].dcdx,
                   plane[1].dcdy,
                   plane[1].eo);

      debug_printf("p2: %"PRIx64"/%08x/%08x/%"PRIx64"\n",
                   plane[2].c,
                   plane[2].dcdx,
                   plane[2].dcdy,
                   plane[2].eo);
   }


   /*
    * When rasterizing scissored tris, use the intersection of the
    * triangle bounding box and the scissor rect to generate the
    * scissor planes.
    *
    * This permits us to cut off the triangle "tails" that are present
    * in the intermediate recursive levels caused when two of the
    * triangles edges don't diverge quickly enough to trivially reject
    * exterior blocks from the triangle.
    *
    * It's not really clear if it's worth worrying about these tails,
    * but since we generate the planes for each scissored tri, it's
    * free to trim them in this case.
    *
    * Note that otherwise, the scissor planes only vary in 'C' value,
    * and even then only on state-changes.  Could alternatively store
    * these planes elsewhere.
    */
   if (nr_planes == 7) {
      const struct u_rect *scissor = &setup->scissors[viewport_index];

      plane[3].dcdx = -1;
      plane[3].dcdy = 0;
      plane[3].c = 1-scissor->x0;
      plane[3].eo = 1;

      plane[4].dcdx = 1;
      plane[4].dcdy = 0;
      plane[4].c = scissor->x1+1;
      plane[4].eo = 0;

      plane[5].dcdx = 0;
      plane[5].dcdy = 1;
      plane[5].c = 1-scissor->y0;
      plane[5].eo = 1;

      plane[6].dcdx = 0;
      plane[6].dcdy = -1;
      plane[6].c = scissor->y1+1;
      plane[6].eo = 0;
   }

   return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);
}

/*
 * Round to nearest less or equal power of two of the input.
 *
 * Undefined if no bit set exists, so code should check against 0 first.
 */
static inline uint32_t
floor_pot(uint32_t n)
{
#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)
   if (n == 0)
      return 0;

   __asm__("bsr %1,%0"
           : "=r" (n)
           : "rm" (n));
   return 1 << n;
#else
   n |= (n >> 1);
   n |= (n >> 2);
   n |= (n >> 4);
   n |= (n >> 8);
   n |= (n >> 16);
   return n - (n >> 1);
#endif
}


/* Bin an already set-up triangle into the scene's tile bins. */
boolean
lp_setup_bin_triangle( struct lp_setup_context *setup,
                       struct lp_rast_triangle *tri,
                       const struct u_rect *bbox,
                       int nr_planes,
                       unsigned viewport_index )
{
   struct lp_scene *scene = setup->scene;
   struct u_rect trimmed_box = *bbox;
   int i;
   /* What is the largest power-of-two boundary this triangle crosses:
    */
   int dx = floor_pot((bbox->x0 ^ bbox->x1) |
                      (bbox->y0 ^ bbox->y1));

   /* The largest dimension of the rasterized area of the triangle
    * (aligned to a 4x4 grid), rounded down to the nearest power of two:
    */
   int max_sz = ((bbox->x1 - (bbox->x0 & ~3)) |
                 (bbox->y1 - (bbox->y0 & ~3)));
   int sz = floor_pot(max_sz);
   boolean use_32bits = max_sz <= MAX_FIXED_LENGTH32;

   /* Now apply scissor, etc to the bounding box.  Could do this
    * earlier, but it confuses the logic for tri-16 and would force
    * the rasterizer to also respect scissor, etc, just for the rare
    * cases where a small triangle extends beyond the scissor.
    */
   u_rect_find_intersection(&setup->draw_regions[viewport_index],
                            &trimmed_box);

   /* Determine which tile(s) intersect the triangle's bounding box
    */
   if (dx < TILE_SIZE)
   {
      /* Fast path: the bbox fits within one tile. */
      int ix0 = bbox->x0 / TILE_SIZE;
      int iy0 = bbox->y0 / TILE_SIZE;
      unsigned px = bbox->x0 & 63 & ~3;
      unsigned py = bbox->y0 & 63 & ~3;

      assert(iy0 == bbox->y1 / TILE_SIZE &&
             ix0 == bbox->x1 / TILE_SIZE);

      if (nr_planes == 3) {
         if (sz < 4)
         {
            /* Triangle is contained in a single 4x4 stamp:
             */
            assert(px + 4 <= TILE_SIZE);
            assert(py + 4 <= TILE_SIZE);
            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
                                                setup->fs.stored,
                                                use_32bits ?
                                                LP_RAST_OP_TRIANGLE_32_3_4 :
                                                LP_RAST_OP_TRIANGLE_3_4,
                                                lp_rast_arg_triangle_contained(tri, px, py) );
         }

         if (sz < 16)
         {
            /* Triangle is contained in a single 16x16 block:
             */

            /*
             * The 16x16 block is only 4x4 aligned, and can exceed the tile
             * dimensions if the triangle is 16 pixels in one dimension but 4
             * in the other.  So budge the 16x16 back inside the tile.
             */
            px = MIN2(px, TILE_SIZE - 16);
            py = MIN2(py, TILE_SIZE - 16);

            assert(px + 16 <= TILE_SIZE);
            assert(py + 16 <= TILE_SIZE);

            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
                                                setup->fs.stored,
                                                use_32bits ?
                                                LP_RAST_OP_TRIANGLE_32_3_16 :
                                                LP_RAST_OP_TRIANGLE_3_16,
                                                lp_rast_arg_triangle_contained(tri, px, py) );
         }
      }
      else if (nr_planes == 4 && sz < 16)
      {
         px = MIN2(px, TILE_SIZE - 16);
         py = MIN2(py, TILE_SIZE - 16);

         assert(px + 16 <= TILE_SIZE);
         assert(py + 16 <= TILE_SIZE);

         return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
                                            setup->fs.stored,
                                            use_32bits ?
                                            LP_RAST_OP_TRIANGLE_32_4_16 :
                                            LP_RAST_OP_TRIANGLE_4_16,
                                            lp_rast_arg_triangle_contained(tri, px, py));
      }


      /* Triangle is contained in a single tile:
       */
      return lp_scene_bin_cmd_with_state(
         scene, ix0, iy0, setup->fs.stored,
         use_32bits ? lp_rast_32_tri_tab[nr_planes] : lp_rast_tri_tab[nr_planes],
         lp_rast_arg_triangle(tri, (1<<nr_planes)-1));
   }
   else
   {
      /* General path: walk every tile the trimmed bbox touches and
       * classify it as empty / partial / fully covered using the
       * incremental edge-function values.
       */
      struct lp_rast_plane *plane = GET_PLANES(tri);
      int64_t c[MAX_PLANES];
      int64_t ei[MAX_PLANES];

      int64_t eo[MAX_PLANES];
      int64_t xstep[MAX_PLANES];
      int64_t ystep[MAX_PLANES];
      int x, y;

      int ix0 = trimmed_box.x0 / TILE_SIZE;
      int iy0 = trimmed_box.y0 / TILE_SIZE;
      int ix1 = trimmed_box.x1 / TILE_SIZE;
      int iy1 = trimmed_box.y1 / TILE_SIZE;

      for (i = 0; i < nr_planes; i++) {
         c[i] = (plane[i].c +
                 IMUL64(plane[i].dcdy, iy0) * TILE_SIZE -
                 IMUL64(plane[i].dcdx, ix0) * TILE_SIZE);

         ei[i] = (plane[i].dcdy -
                  plane[i].dcdx -
                  plane[i].eo) << TILE_ORDER;

         eo[i] = plane[i].eo << TILE_ORDER;
         xstep[i] = -(((int64_t)plane[i].dcdx) << TILE_ORDER);
         ystep[i] = ((int64_t)plane[i].dcdy) << TILE_ORDER;
      }



      /* Test tile-sized blocks against the triangle.
       * Discard blocks fully outside the tri.  If the block is fully
       * contained inside the tri, bin an lp_rast_shade_tile command.
       * Else, bin a lp_rast_triangle command.
       */
      for (y = iy0; y <= iy1; y++)
      {
         boolean in = FALSE;  /* are we inside the triangle? */
         int64_t cx[MAX_PLANES];

         for (i = 0; i < nr_planes; i++)
            cx[i] = c[i];

         for (x = ix0; x <= ix1; x++)
         {
            int out = 0;
            int partial = 0;

            for (i = 0; i < nr_planes; i++) {
               int64_t planeout = cx[i] + eo[i];
               int64_t planepartial = cx[i] + ei[i] - 1;
               out |= (int) (planeout >> 63);
               partial |= ((int) (planepartial >> 63)) & (1<<i);
            }

            if (out) {
               /* do nothing */
               if (in)
                  break;  /* exiting triangle, all done with this row */
               LP_COUNT(nr_empty_64);
            }
            else if (partial) {
               /* Not trivially accepted by at least one plane -
                * rasterize/shade partial tile
                */
               int count = util_bitcount(partial);
               in = TRUE;

               if (!lp_scene_bin_cmd_with_state( scene, x, y,
                                                 setup->fs.stored,
                                                 use_32bits ?
                                                 lp_rast_32_tri_tab[count] :
                                                 lp_rast_tri_tab[count],
                                                 lp_rast_arg_triangle(tri, partial) ))
                  goto fail;

               LP_COUNT(nr_partially_covered_64);
            }
            else {
               /* triangle covers the whole tile- shade whole tile */
               LP_COUNT(nr_fully_covered_64);
               in = TRUE;
               if (!lp_setup_whole_tile(setup, &tri->inputs, x, y))
                  goto fail;
            }

            /* Iterate cx values across the region: */
            for (i = 0; i < nr_planes; i++)
               cx[i] += xstep[i];
         }

         /* Iterate c values down the region: */
         for (i = 0; i < nr_planes; i++)
            c[i] += ystep[i];
      }
   }

   return TRUE;

fail:
   /* Need to disable any partially binned triangle.  This is easier
    * than trying to locate all the triangle, shade-tile, etc,
    * commands which may have been binned.
    */
   tri->inputs.disable = TRUE;
   return FALSE;
}


/**
 * Try to draw the triangle, restart the scene on failure.
+ */ +static void retry_triangle_ccw( struct lp_setup_context *setup, + struct fixed_position* position, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front) +{ + if (!do_triangle_ccw( setup, position, v0, v1, v2, front )) + { + if (!lp_setup_flush_and_restart(setup)) + return; + + if (!do_triangle_ccw( setup, position, v0, v1, v2, front )) + return; + } +} + +/** + * Calculate fixed position data for a triangle + */ +static inline void +calc_fixed_position( struct lp_setup_context *setup, + struct fixed_position* position, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) +{ + position->x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset); + position->x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset); + position->x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset); + position->x[3] = 0; + + position->y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset); + position->y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset); + position->y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset); + position->y[3] = 0; + + position->dx01 = position->x[0] - position->x[1]; + position->dy01 = position->y[0] - position->y[1]; + + position->dx20 = position->x[2] - position->x[0]; + position->dy20 = position->y[2] - position->y[0]; + + position->area = IMUL64(position->dx01, position->dy20) - + IMUL64(position->dx20, position->dy01); +} + + +/** + * Rotate a triangle, flipping its clockwise direction, + * Swaps values for xy[0] and xy[1] + */ +static inline void +rotate_fixed_position_01( struct fixed_position* position ) +{ + int x, y; + + x = position->x[1]; + y = position->y[1]; + position->x[1] = position->x[0]; + position->y[1] = position->y[0]; + position->x[0] = x; + position->y[0] = y; + + position->dx01 = -position->dx01; + position->dy01 = -position->dy01; + position->dx20 = position->x[2] - position->x[0]; + position->dy20 = position->y[2] - position->y[0]; + + position->area = -position->area; +} + + +/** + * 
Rotate a triangle, flipping its clockwise direction, + * Swaps values for xy[1] and xy[2] + */ +static inline void +rotate_fixed_position_12( struct fixed_position* position ) +{ + int x, y; + + x = position->x[2]; + y = position->y[2]; + position->x[2] = position->x[1]; + position->y[2] = position->y[1]; + position->x[1] = x; + position->y[1] = y; + + x = position->dx01; + y = position->dy01; + position->dx01 = -position->dx20; + position->dy01 = -position->dy20; + position->dx20 = -x; + position->dy20 = -y; + + position->area = -position->area; +} + + +/** + * Draw triangle if it's CW, cull otherwise. + */ +static void triangle_cw( struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ) +{ + struct fixed_position position; + + calc_fixed_position(setup, &position, v0, v1, v2); + + if (position.area < 0) { + if (setup->flatshade_first) { + rotate_fixed_position_12(&position); + retry_triangle_ccw(setup, &position, v0, v2, v1, !setup->ccw_is_frontface); + } else { + rotate_fixed_position_01(&position); + retry_triangle_ccw(setup, &position, v1, v0, v2, !setup->ccw_is_frontface); + } + } +} + + +static void triangle_ccw( struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) +{ + struct fixed_position position; + + calc_fixed_position(setup, &position, v0, v1, v2); + + if (position.area > 0) + retry_triangle_ccw(setup, &position, v0, v1, v2, setup->ccw_is_frontface); +} + +/** + * Draw triangle whether it's CW or CCW. 
 */
static void triangle_both( struct lp_setup_context *setup,
                           const float (*v0)[4],
                           const float (*v1)[4],
                           const float (*v2)[4] )
{
   struct fixed_position position;
   struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;

   /* Pipeline-statistics queries count every primitive, whatever its
    * winding, as long as rasterization isn't disabled.
    */
   if (lp_context->active_statistics_queries &&
       !llvmpipe_rasterization_disabled(lp_context)) {
      lp_context->pipeline_statistics.c_primitives++;
   }

   calc_fixed_position(setup, &position, v0, v1, v2);

   if (0) {
      assert(!util_is_inf_or_nan(v0[0][0]));
      assert(!util_is_inf_or_nan(v0[0][1]));
      assert(!util_is_inf_or_nan(v1[0][0]));
      assert(!util_is_inf_or_nan(v1[0][1]));
      assert(!util_is_inf_or_nan(v2[0][0]));
      assert(!util_is_inf_or_nan(v2[0][1]));
   }

   /* CCW goes straight to the binner; CW is rotated to CCW first.
    * Zero-area triangles are dropped.
    */
   if (position.area > 0)
      retry_triangle_ccw( setup, &position, v0, v1, v2, setup->ccw_is_frontface );
   else if (position.area < 0) {
      if (setup->flatshade_first) {
         rotate_fixed_position_12( &position );
         retry_triangle_ccw( setup, &position, v0, v2, v1, !setup->ccw_is_frontface );
      } else {
         rotate_fixed_position_01( &position );
         retry_triangle_ccw( setup, &position, v1, v0, v2, !setup->ccw_is_frontface );
      }
   }
}


/* No-op triangle callback: bins nothing (used when everything is culled). */
static void triangle_nop( struct lp_setup_context *setup,
                          const float (*v0)[4],
                          const float (*v1)[4],
                          const float (*v2)[4] )
{
}


/* Select the triangle binning callback from the current cull state. */
void
lp_setup_choose_triangle( struct lp_setup_context *setup )
{
   switch (setup->cullmode) {
   case PIPE_FACE_NONE:
      setup->triangle = triangle_both;
      break;
   case PIPE_FACE_BACK:
      setup->triangle = setup->ccw_is_frontface ? triangle_ccw : triangle_cw;
      break;
   case PIPE_FACE_FRONT:
      setup->triangle = setup->ccw_is_frontface ? triangle_cw : triangle_ccw;
      break;
   default:
      setup->triangle = triangle_nop;
      break;
   }
}
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
new file mode 100644
index 000000000..534c5f48a
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -0,0 +1,602 @@
/**************************************************************************
 *
 * Copyright 2007 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * Interface between 'draw' module's output and the llvmpipe rasterizer/setup
 * code.  When the 'draw' module has finished filling a vertex buffer, the
 * draw_arrays() functions below will be called.  Loop over the vertices and
 * call the point/line/tri setup functions.
+ * + * Authors + * Brian Paul + */ + + +#include "lp_setup_context.h" +#include "lp_context.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "util/u_memory.h" + + +#define LP_MAX_VBUF_INDEXES 1024 +#define LP_MAX_VBUF_SIZE 4096 + + + +/** cast wrapper */ +static struct lp_setup_context * +lp_setup_context(struct vbuf_render *vbr) +{ + return (struct lp_setup_context *) vbr; +} + + + +static const struct vertex_info * +lp_setup_get_vertex_info(struct vbuf_render *vbr) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + + /* Vertex size/info depends on the latest state. + * The draw module may have issued additional state-change commands. + */ + lp_setup_update_state(setup, FALSE); + + return setup->vertex_info; +} + + +static boolean +lp_setup_allocate_vertices(struct vbuf_render *vbr, + ushort vertex_size, ushort nr_vertices) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + unsigned size = vertex_size * nr_vertices; + + if (setup->vertex_buffer_size < size) { + align_free(setup->vertex_buffer); + setup->vertex_buffer = align_malloc(size, 16); + setup->vertex_buffer_size = size; + } + + setup->vertex_size = vertex_size; + setup->nr_vertices = nr_vertices; + + return setup->vertex_buffer != NULL; +} + +static void +lp_setup_release_vertices(struct vbuf_render *vbr) +{ + /* keep the old allocation for next time */ +} + +static void * +lp_setup_map_vertices(struct vbuf_render *vbr) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + return setup->vertex_buffer; +} + +static void +lp_setup_unmap_vertices(struct vbuf_render *vbr, + ushort min_index, + ushort max_index ) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + assert( setup->vertex_buffer_size >= (max_index+1) * setup->vertex_size ); + /* do nothing */ +} + + +static void +lp_setup_set_primitive(struct vbuf_render *vbr, unsigned prim) +{ + lp_setup_context(vbr)->prim = prim; +} + +typedef const float (*const_float4_ptr)[4]; + +static 
inline const_float4_ptr get_vert( const void *vertex_buffer, + int index, + int stride ) +{ + return (const_float4_ptr)((char *)vertex_buffer + index * stride); +} + +/** + * draw elements / indexed primitives + */ +static void +lp_setup_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + const unsigned stride = setup->vertex_info->size * sizeof(float); + const void *vertex_buffer = setup->vertex_buffer; + const boolean flatshade_first = setup->flatshade_first; + unsigned i; + + assert(setup->setup.variant); + + if (!lp_setup_update_state(setup, TRUE)) + return; + + switch (setup->prim) { + case PIPE_PRIM_POINTS: + for (i = 0; i < nr; i++) { + setup->point( setup, + get_vert(vertex_buffer, indices[i-0], stride) ); + } + break; + + case PIPE_PRIM_LINES: + for (i = 1; i < nr; i += 2) { + setup->line( setup, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + break; + + case PIPE_PRIM_LINE_STRIP: + for (i = 1; i < nr; i ++) { + setup->line( setup, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + break; + + case PIPE_PRIM_LINE_LOOP: + for (i = 1; i < nr; i ++) { + setup->line( setup, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + if (nr) { + setup->line( setup, + get_vert(vertex_buffer, indices[nr-1], stride), + get_vert(vertex_buffer, indices[0], stride) ); + } + break; + + case PIPE_PRIM_TRIANGLES: + for (i = 2; i < nr; i += 3) { + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP: + if (flatshade_first) { + for (i = 2; i < nr; i += 1) { + /* emit first triangle vertex as first triangle vertex */ + setup->triangle( setup, + get_vert(vertex_buffer, 
indices[i-2], stride), + get_vert(vertex_buffer, indices[i+(i&1)-1], stride), + get_vert(vertex_buffer, indices[i-(i&1)], stride) ); + + } + } + else { + for (i = 2; i < nr; i += 1) { + /* emit last triangle vertex as last triangle vertex */ + setup->triangle( setup, + get_vert(vertex_buffer, indices[i+(i&1)-2], stride), + get_vert(vertex_buffer, indices[i-(i&1)-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + } + break; + + case PIPE_PRIM_TRIANGLE_FAN: + if (flatshade_first) { + for (i = 2; i < nr; i += 1) { + /* emit first non-spoke vertex as first vertex */ + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride), + get_vert(vertex_buffer, indices[0], stride) ); + } + } + else { + for (i = 2; i < nr; i += 1) { + /* emit last non-spoke vertex as last vertex */ + setup->triangle( setup, + get_vert(vertex_buffer, indices[0], stride), + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + } + break; + + case PIPE_PRIM_QUADS: + /* GL quads don't follow provoking vertex convention */ + if (flatshade_first) { + /* emit last quad vertex as first triangle vertex */ + for (i = 3; i < nr; i += 4) { + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-0], stride), + get_vert(vertex_buffer, indices[i-3], stride), + get_vert(vertex_buffer, indices[i-2], stride) ); + + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-0], stride), + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-1], stride) ); + } + } + else { + /* emit last quad vertex as last triangle vertex */ + for (i = 3; i < nr; i += 4) { + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-3], stride), + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-1], 
stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + } + break; + + case PIPE_PRIM_QUAD_STRIP: + /* GL quad strips don't follow provoking vertex convention */ + if (flatshade_first) { + /* emit last quad vertex as first triangle vertex */ + for (i = 3; i < nr; i += 2) { + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-0], stride), + get_vert(vertex_buffer, indices[i-3], stride), + get_vert(vertex_buffer, indices[i-2], stride) ); + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-0], stride), + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-3], stride) ); + } + } + else { + /* emit last quad vertex as last triangle vertex */ + for (i = 3; i < nr; i += 2) { + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-3], stride), + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-3], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + } + break; + + case PIPE_PRIM_POLYGON: + /* Almost same as tri fan but the _first_ vertex specifies the flat + * shading color. + */ + if (flatshade_first) { + /* emit first polygon vertex as first triangle vertex */ + for (i = 2; i < nr; i += 1) { + setup->triangle( setup, + get_vert(vertex_buffer, indices[0], stride), + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + } + else { + /* emit first polygon vertex as last triangle vertex */ + for (i = 2; i < nr; i += 1) { + setup->triangle( setup, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride), + get_vert(vertex_buffer, indices[0], stride) ); + } + } + break; + + default: + assert(0); + } +} + + +/** + * This function is hit when the draw module is working in pass-through mode. 
+ * It's up to us to convert the vertex array into point/line/tri prims. + */ +static void +lp_setup_draw_arrays(struct vbuf_render *vbr, uint start, uint nr) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + const unsigned stride = setup->vertex_info->size * sizeof(float); + const void *vertex_buffer = + (void *) get_vert(setup->vertex_buffer, start, stride); + const boolean flatshade_first = setup->flatshade_first; + unsigned i; + + if (!lp_setup_update_state(setup, TRUE)) + return; + + switch (setup->prim) { + case PIPE_PRIM_POINTS: + for (i = 0; i < nr; i++) { + setup->point( setup, + get_vert(vertex_buffer, i-0, stride) ); + } + break; + + case PIPE_PRIM_LINES: + for (i = 1; i < nr; i += 2) { + setup->line( setup, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + break; + + case PIPE_PRIM_LINE_STRIP: + for (i = 1; i < nr; i ++) { + setup->line( setup, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + break; + + case PIPE_PRIM_LINE_LOOP: + for (i = 1; i < nr; i ++) { + setup->line( setup, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + if (nr) { + setup->line( setup, + get_vert(vertex_buffer, nr-1, stride), + get_vert(vertex_buffer, 0, stride) ); + } + break; + + case PIPE_PRIM_TRIANGLES: + for (i = 2; i < nr; i += 3) { + setup->triangle( setup, + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP: + if (flatshade_first) { + for (i = 2; i < nr; i++) { + /* emit first triangle vertex as first triangle vertex */ + setup->triangle( setup, + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i+(i&1)-1, stride), + get_vert(vertex_buffer, i-(i&1), stride) ); + } + } + else { + for (i = 2; i < nr; i++) { + /* emit last triangle vertex as last triangle vertex */ + setup->triangle( setup, + get_vert(vertex_buffer, 
i+(i&1)-2, stride), + get_vert(vertex_buffer, i-(i&1)-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + } + break; + + case PIPE_PRIM_TRIANGLE_FAN: + if (flatshade_first) { + for (i = 2; i < nr; i += 1) { + /* emit first non-spoke vertex as first vertex */ + setup->triangle( setup, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride), + get_vert(vertex_buffer, 0, stride) ); + } + } + else { + for (i = 2; i < nr; i += 1) { + /* emit last non-spoke vertex as last vertex */ + setup->triangle( setup, + get_vert(vertex_buffer, 0, stride), + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + } + break; + + case PIPE_PRIM_QUADS: + /* GL quads don't follow provoking vertex convention */ + if (flatshade_first) { + /* emit last quad vertex as first triangle vertex */ + for (i = 3; i < nr; i += 4) { + setup->triangle( setup, + get_vert(vertex_buffer, i-0, stride), + get_vert(vertex_buffer, i-3, stride), + get_vert(vertex_buffer, i-2, stride) ); + setup->triangle( setup, + get_vert(vertex_buffer, i-0, stride), + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-1, stride) ); + } + } + else { + /* emit last quad vertex as last triangle vertex */ + for (i = 3; i < nr; i += 4) { + setup->triangle( setup, + get_vert(vertex_buffer, i-3, stride), + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-0, stride) ); + setup->triangle( setup, + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + } + break; + + case PIPE_PRIM_QUAD_STRIP: + /* GL quad strips don't follow provoking vertex convention */ + if (flatshade_first) { + /* emit last quad vertex as first triangle vertex */ + for (i = 3; i < nr; i += 2) { + setup->triangle( setup, + get_vert(vertex_buffer, i-0, stride), + get_vert(vertex_buffer, i-3, stride), + get_vert(vertex_buffer, i-2, stride) ); + setup->triangle( setup, + get_vert(vertex_buffer, i-0, 
stride), + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-3, stride) ); + } + } + else { + /* emit last quad vertex as last triangle vertex */ + for (i = 3; i < nr; i += 2) { + setup->triangle( setup, + get_vert(vertex_buffer, i-3, stride), + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-0, stride) ); + setup->triangle( setup, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-3, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + } + break; + + case PIPE_PRIM_POLYGON: + /* Almost same as tri fan but the _first_ vertex specifies the flat + * shading color. + */ + if (flatshade_first) { + /* emit first polygon vertex as first triangle vertex */ + for (i = 2; i < nr; i += 1) { + setup->triangle( setup, + get_vert(vertex_buffer, 0, stride), + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + } + else { + /* emit first polygon vertex as last triangle vertex */ + for (i = 2; i < nr; i += 1) { + setup->triangle( setup, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride), + get_vert(vertex_buffer, 0, stride) ); + } + } + break; + + default: + assert(0); + } +} + + + +static void +lp_setup_vbuf_destroy(struct vbuf_render *vbr) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + if (setup->vertex_buffer) { + align_free(setup->vertex_buffer); + setup->vertex_buffer = NULL; + } + lp_setup_destroy(setup); +} + +/* + * FIXME: it is unclear if primitives_storage_needed (which is generally + * the same as pipe query num_primitives_generated) should increase + * if SO is disabled for d3d10, but for GL we definitely need to + * increase num_primitives_generated and this is only called for active + * SO. If it must not increase for d3d10 need to disambiguate the counters + * in the driver and do some work for getting correct values, if it should + * increase too should call this from outside streamout code. 
+ */ +static void +lp_setup_so_info(struct vbuf_render *vbr, uint primitives, uint prim_generated) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + struct llvmpipe_context *lp = llvmpipe_context(setup->pipe); + + lp->so_stats.num_primitives_written += primitives; + lp->so_stats.primitives_storage_needed += prim_generated; +} + +static void +lp_setup_pipeline_statistics( + struct vbuf_render *vbr, + const struct pipe_query_data_pipeline_statistics *stats) +{ + struct lp_setup_context *setup = lp_setup_context(vbr); + struct llvmpipe_context *llvmpipe = llvmpipe_context(setup->pipe); + + llvmpipe->pipeline_statistics.ia_vertices += + stats->ia_vertices; + llvmpipe->pipeline_statistics.ia_primitives += + stats->ia_primitives; + llvmpipe->pipeline_statistics.vs_invocations += + stats->vs_invocations; + llvmpipe->pipeline_statistics.gs_invocations += + stats->gs_invocations; + llvmpipe->pipeline_statistics.gs_primitives += + stats->gs_primitives; + if (!llvmpipe_rasterization_disabled(llvmpipe)) { + llvmpipe->pipeline_statistics.c_invocations += + stats->c_invocations; + } else { + llvmpipe->pipeline_statistics.c_invocations = 0; + } +} + +/** + * Create the post-transform vertex handler for the given context. 
+ */ +void +lp_setup_init_vbuf(struct lp_setup_context *setup) +{ + setup->base.max_indices = LP_MAX_VBUF_INDEXES; + setup->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE; + + setup->base.get_vertex_info = lp_setup_get_vertex_info; + setup->base.allocate_vertices = lp_setup_allocate_vertices; + setup->base.map_vertices = lp_setup_map_vertices; + setup->base.unmap_vertices = lp_setup_unmap_vertices; + setup->base.set_primitive = lp_setup_set_primitive; + setup->base.draw_elements = lp_setup_draw_elements; + setup->base.draw_arrays = lp_setup_draw_arrays; + setup->base.release_vertices = lp_setup_release_vertices; + setup->base.destroy = lp_setup_vbuf_destroy; + setup->base.set_stream_output_info = lp_setup_so_info; + setup->base.pipeline_statistics = lp_setup_pipeline_statistics; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state.h new file mode 100644 index 000000000..2da6caaef --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state.h @@ -0,0 +1,145 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keithw@vmware.com> + */ + +#ifndef LP_STATE_H +#define LP_STATE_H + +#include "pipe/p_state.h" +#include "lp_jit.h" +#include "lp_state_fs.h" +#include "gallivm/lp_bld.h" + + +#define LP_NEW_VIEWPORT 0x1 +#define LP_NEW_RASTERIZER 0x2 +#define LP_NEW_FS 0x4 +#define LP_NEW_BLEND 0x8 +#define LP_NEW_CLIP 0x10 +#define LP_NEW_SCISSOR 0x20 +#define LP_NEW_STIPPLE 0x40 +#define LP_NEW_FRAMEBUFFER 0x80 +#define LP_NEW_DEPTH_STENCIL_ALPHA 0x100 +#define LP_NEW_CONSTANTS 0x200 +#define LP_NEW_SAMPLER 0x400 +#define LP_NEW_SAMPLER_VIEW 0x800 +#define LP_NEW_VERTEX 0x1000 +#define LP_NEW_VS 0x2000 +#define LP_NEW_OCCLUSION_QUERY 0x4000 +#define LP_NEW_BLEND_COLOR 0x8000 +#define LP_NEW_GS 0x10000 +#define LP_NEW_SO 0x20000 +#define LP_NEW_SO_BUFFERS 0x40000 + + + +struct vertex_info; +struct pipe_context; +struct llvmpipe_context; + + + +struct lp_geometry_shader { + boolean no_tokens; + struct pipe_stream_output_info stream_output; + struct draw_geometry_shader *dgs; +}; + +/** Vertex element state */ +struct lp_velems_state +{ + unsigned count; + struct pipe_vertex_element velem[PIPE_MAX_ATTRIBS]; +}; + +struct lp_so_state { + struct pipe_stream_output_info base; +}; + + +void +llvmpipe_set_framebuffer_state(struct pipe_context *, + const struct pipe_framebuffer_state *); + +void +llvmpipe_update_fs(struct llvmpipe_context *lp); + +void +llvmpipe_update_setup(struct 
llvmpipe_context *lp); + +void +llvmpipe_update_derived(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_blend_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_vertex_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_draw_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_clip_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_vs_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_gs_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_rasterizer_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_init_so_funcs(struct llvmpipe_context *llvmpipe); + +void +llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *ctx, + unsigned num, + struct pipe_sampler_view **views); +void +llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx); + + +void +llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *ctx, + unsigned num, + struct pipe_sampler_view **views); +void +llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx); + + +#endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_blend.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_blend.c new file mode 100644 index 000000000..e38de9aca --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_blend.c @@ -0,0 +1,206 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * @author Jose Fonseca <jfonseca@vmware.com> + * @author Keith Whitwell <keithw@vmware.com> + */ + +#include "util/u_memory.h" +#include "util/u_math.h" +#include "util/u_dump.h" +#include "draw/draw_context.h" +#include "lp_screen.h" +#include "lp_context.h" +#include "lp_state.h" +#include "lp_debug.h" + + +static void * +llvmpipe_create_blend_state(struct pipe_context *pipe, + const struct pipe_blend_state *blend) +{ + struct pipe_blend_state *state = mem_dup(blend, sizeof *blend); + int i; + + if (LP_PERF & PERF_NO_BLEND) { + state->independent_blend_enable = 0; + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) + state->rt[i].blend_enable = 0; + } + + return state; +} + + +static void +llvmpipe_bind_blend_state(struct pipe_context *pipe, void *blend) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + if (llvmpipe->blend == blend) + return; + + draw_flush(llvmpipe->draw); + + llvmpipe->blend = blend; + + llvmpipe->dirty |= LP_NEW_BLEND; +} + + +static void +llvmpipe_delete_blend_state(struct pipe_context *pipe, void *blend) +{ + FREE( blend ); +} + + +static void +llvmpipe_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *blend_color) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + if(!blend_color) + return; + + if(memcmp(&llvmpipe->blend_color, blend_color, sizeof *blend_color) == 0) + return; + + draw_flush(llvmpipe->draw); + + memcpy(&llvmpipe->blend_color, blend_color, sizeof *blend_color); + + llvmpipe->dirty |= LP_NEW_BLEND_COLOR; +} + + +/** XXX move someday? Or consolidate all these simple state setters + * into one file. 
+ */ + + +static void * +llvmpipe_create_depth_stencil_state(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ + struct pipe_depth_stencil_alpha_state *state; + + state = mem_dup(depth_stencil, sizeof *depth_stencil); + + if (LP_PERF & PERF_NO_DEPTH) { + state->depth.enabled = 0; + state->depth.writemask = 0; + state->stencil[0].enabled = 0; + state->stencil[1].enabled = 0; + } + + if (LP_PERF & PERF_NO_ALPHATEST) { + state->alpha.enabled = 0; + } + + return state; +} + + +static void +llvmpipe_bind_depth_stencil_state(struct pipe_context *pipe, + void *depth_stencil) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + if (llvmpipe->depth_stencil == depth_stencil) + return; + + draw_flush(llvmpipe->draw); + + llvmpipe->depth_stencil = depth_stencil; + + llvmpipe->dirty |= LP_NEW_DEPTH_STENCIL_ALPHA; +} + + +static void +llvmpipe_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) +{ + FREE( depth ); +} + + +static void +llvmpipe_set_stencil_ref(struct pipe_context *pipe, + const struct pipe_stencil_ref *stencil_ref) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + if(!stencil_ref) + return; + + if(memcmp(&llvmpipe->stencil_ref, stencil_ref, sizeof *stencil_ref) == 0) + return; + + draw_flush(llvmpipe->draw); + + memcpy(&llvmpipe->stencil_ref, stencil_ref, sizeof *stencil_ref); + + /* not sure. want new flag? 
*/ + llvmpipe->dirty |= LP_NEW_DEPTH_STENCIL_ALPHA; +} + +static void +llvmpipe_set_sample_mask(struct pipe_context *pipe, + unsigned sample_mask) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + if (sample_mask != llvmpipe->sample_mask) { + llvmpipe->sample_mask = sample_mask; + + llvmpipe->dirty |= LP_NEW_RASTERIZER; + } +} + +void +llvmpipe_init_blend_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.create_blend_state = llvmpipe_create_blend_state; + llvmpipe->pipe.bind_blend_state = llvmpipe_bind_blend_state; + llvmpipe->pipe.delete_blend_state = llvmpipe_delete_blend_state; + + llvmpipe->pipe.create_depth_stencil_alpha_state = llvmpipe_create_depth_stencil_state; + llvmpipe->pipe.bind_depth_stencil_alpha_state = llvmpipe_bind_depth_stencil_state; + llvmpipe->pipe.delete_depth_stencil_alpha_state = llvmpipe_delete_depth_stencil_state; + + llvmpipe->pipe.set_blend_color = llvmpipe_set_blend_color; + + llvmpipe->pipe.set_stencil_ref = llvmpipe_set_stencil_ref; + llvmpipe->pipe.set_sample_mask = llvmpipe_set_sample_mask; + + llvmpipe->sample_mask = ~0; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_clip.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_clip.c new file mode 100644 index 000000000..1b9b84c08 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_clip.c @@ -0,0 +1,105 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keithw@vmware.com> + */ +#include "lp_context.h" +#include "lp_state.h" +#include "draw/draw_context.h" + + +static void +llvmpipe_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + /* pass the clip state to the draw module */ + draw_set_clip_state(llvmpipe->draw, clip); +} + + +static void +llvmpipe_set_viewport_states(struct pipe_context *pipe, + unsigned start_slot, + unsigned num_viewports, + const struct pipe_viewport_state *viewports) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + /* pass the viewport info to the draw module */ + draw_set_viewport_states(llvmpipe->draw, start_slot, num_viewports, + viewports); + + memcpy(llvmpipe->viewports + start_slot, viewports, + sizeof(struct pipe_viewport_state) * num_viewports); + llvmpipe->dirty |= LP_NEW_VIEWPORT; +} + + +static void +llvmpipe_set_scissor_states(struct pipe_context *pipe, + unsigned start_slot, + unsigned num_scissors, + const struct pipe_scissor_state *scissors) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + draw_flush(llvmpipe->draw); + + debug_assert(start_slot < PIPE_MAX_VIEWPORTS); + debug_assert((start_slot + num_scissors) <= PIPE_MAX_VIEWPORTS); + + memcpy(llvmpipe->scissors + start_slot, scissors, + sizeof(struct pipe_scissor_state) * num_scissors); + + llvmpipe->dirty |= LP_NEW_SCISSOR; +} + + +static void +llvmpipe_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + draw_flush(llvmpipe->draw); + + llvmpipe->poly_stipple = *stipple; /* struct copy */ + llvmpipe->dirty |= LP_NEW_STIPPLE; +} + + + +void +llvmpipe_init_clip_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.set_clip_state = llvmpipe_set_clip_state; + 
llvmpipe->pipe.set_polygon_stipple = llvmpipe_set_polygon_stipple; + llvmpipe->pipe.set_scissor_states = llvmpipe_set_scissor_states; + llvmpipe->pipe.set_viewport_states = llvmpipe_set_viewport_states; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_derived.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_derived.c new file mode 100644 index 000000000..a25e83261 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -0,0 +1,244 @@ +/************************************************************************** + * + * Copyright 2003 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "draw/draw_private.h" +#include "lp_context.h" +#include "lp_screen.h" +#include "lp_setup.h" +#include "lp_state.h" + + + +/** + * The vertex info describes how to convert the post-transformed vertices + * (simple float[][4]) used by the 'draw' module into vertices for + * rasterization. + * + * This function validates the vertex layout. + */ +static void +compute_vertex_info(struct llvmpipe_context *llvmpipe) +{ + const struct lp_fragment_shader *lpfs = llvmpipe->fs; + struct vertex_info *vinfo = &llvmpipe->vertex_info; + int vs_index; + uint i; + + draw_prepare_shader_outputs(llvmpipe->draw); + + llvmpipe->color_slot[0] = -1; + llvmpipe->color_slot[1] = -1; + llvmpipe->bcolor_slot[0] = -1; + llvmpipe->bcolor_slot[1] = -1; + + /* + * Match FS inputs against VS outputs, emitting the necessary + * attributes. Could cache these structs and look them up with a + * combination of fragment shader, vertex shader ids. 
+ */ + + vinfo->num_attribs = 0; + + vs_index = draw_find_shader_output(llvmpipe->draw, + TGSI_SEMANTIC_POSITION, + 0); + + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); + + for (i = 0; i < lpfs->info.base.num_inputs; i++) { + /* + * Search for each input in current vs output: + */ + + vs_index = draw_find_shader_output(llvmpipe->draw, + lpfs->info.base.input_semantic_name[i], + lpfs->info.base.input_semantic_index[i]); + + if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_COLOR && + lpfs->info.base.input_semantic_index[i] < 2) { + int idx = lpfs->info.base.input_semantic_index[i]; + llvmpipe->color_slot[idx] = (int)vinfo->num_attribs; + } + + if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) { + llvmpipe->face_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) { + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } else { + /* + * Emit the requested fs attribute for all but position. + */ + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); + } + } + /* Figure out if we need bcolor as well. + */ + for (i = 0; i < 2; i++) { + vs_index = draw_find_shader_output(llvmpipe->draw, + TGSI_SEMANTIC_BCOLOR, i); + + if (vs_index >= 0) { + llvmpipe->bcolor_slot[i] = (int)vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); + } + } + + + /* Figure out if we need pointsize as well. 
+ */ + vs_index = draw_find_shader_output(llvmpipe->draw, + TGSI_SEMANTIC_PSIZE, 0); + + if (vs_index >= 0) { + llvmpipe->psize_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } + + /* Figure out if we need viewport index */ + vs_index = draw_find_shader_output(llvmpipe->draw, + TGSI_SEMANTIC_VIEWPORT_INDEX, + 0); + if (vs_index >= 0) { + llvmpipe->viewport_index_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } else { + llvmpipe->viewport_index_slot = 0; + } + + /* Figure out if we need layer */ + vs_index = draw_find_shader_output(llvmpipe->draw, + TGSI_SEMANTIC_LAYER, + 0); + if (vs_index >= 0) { + llvmpipe->layer_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } else { + llvmpipe->layer_slot = 0; + } + + draw_compute_vertex_size(vinfo); + lp_setup_set_vertex_info(llvmpipe->setup, vinfo); +} + + +/** + * Handle state changes. + * Called just prior to drawing anything (pipe::draw_arrays(), etc). + * + * Hopefully this will remain quite simple, otherwise need to pull in + * something like the state tracker mechanism. + */ +void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe ) +{ + struct llvmpipe_screen *lp_screen = llvmpipe_screen(llvmpipe->pipe.screen); + + /* Check for updated textures. 
+ */ + if (llvmpipe->tex_timestamp != lp_screen->timestamp) { + llvmpipe->tex_timestamp = lp_screen->timestamp; + llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; + } + + if (llvmpipe->dirty & (LP_NEW_RASTERIZER | + LP_NEW_FS | + LP_NEW_VS)) + compute_vertex_info( llvmpipe ); + + if (llvmpipe->dirty & (LP_NEW_FS | + LP_NEW_FRAMEBUFFER | + LP_NEW_BLEND | + LP_NEW_SCISSOR | + LP_NEW_DEPTH_STENCIL_ALPHA | + LP_NEW_RASTERIZER | + LP_NEW_SAMPLER | + LP_NEW_SAMPLER_VIEW | + LP_NEW_OCCLUSION_QUERY)) + llvmpipe_update_fs( llvmpipe ); + + if (llvmpipe->dirty & (LP_NEW_RASTERIZER)) { + boolean discard = + (llvmpipe->sample_mask & 1) == 0 || + (llvmpipe->rasterizer ? llvmpipe->rasterizer->rasterizer_discard : FALSE); + + lp_setup_set_rasterizer_discard(llvmpipe->setup, discard); + } + + if (llvmpipe->dirty & (LP_NEW_FS | + LP_NEW_FRAMEBUFFER | + LP_NEW_RASTERIZER)) + llvmpipe_update_setup( llvmpipe ); + + if (llvmpipe->dirty & LP_NEW_BLEND_COLOR) + lp_setup_set_blend_color(llvmpipe->setup, + &llvmpipe->blend_color); + + if (llvmpipe->dirty & LP_NEW_SCISSOR) + lp_setup_set_scissors(llvmpipe->setup, llvmpipe->scissors); + + if (llvmpipe->dirty & LP_NEW_DEPTH_STENCIL_ALPHA) { + lp_setup_set_alpha_ref_value(llvmpipe->setup, + llvmpipe->depth_stencil->alpha.ref_value); + lp_setup_set_stencil_ref_values(llvmpipe->setup, + llvmpipe->stencil_ref.ref_value); + } + + if (llvmpipe->dirty & LP_NEW_CONSTANTS) + lp_setup_set_fs_constants(llvmpipe->setup, + Elements(llvmpipe->constants[PIPE_SHADER_FRAGMENT]), + llvmpipe->constants[PIPE_SHADER_FRAGMENT]); + + if (llvmpipe->dirty & (LP_NEW_SAMPLER_VIEW)) + lp_setup_set_fragment_sampler_views(llvmpipe->setup, + llvmpipe->num_sampler_views[PIPE_SHADER_FRAGMENT], + llvmpipe->sampler_views[PIPE_SHADER_FRAGMENT]); + + if (llvmpipe->dirty & (LP_NEW_SAMPLER)) + lp_setup_set_fragment_sampler_state(llvmpipe->setup, + llvmpipe->num_samplers[PIPE_SHADER_FRAGMENT], + llvmpipe->samplers[PIPE_SHADER_FRAGMENT]); + + if (llvmpipe->dirty & LP_NEW_VIEWPORT) { + /* + * 
Update setup and fragment's view of the active viewport state. + * + * XXX TODO: It is possible to only loop over the active viewports + * instead of all viewports (PIPE_MAX_VIEWPORTS). + */ + lp_setup_set_viewports(llvmpipe->setup, + PIPE_MAX_VIEWPORTS, + llvmpipe->viewports); + } + + llvmpipe->dirty = 0; +} + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_fs.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_fs.c new file mode 100644 index 000000000..fd6c49aac --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -0,0 +1,3217 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * @file + * Code generate the whole fragment pipeline. + * + * The fragment pipeline consists of the following stages: + * - early depth test + * - fragment shader + * - alpha test + * - depth/stencil test + * - blending + * + * This file has only the glue to assemble the fragment pipeline. The actual + * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the + * lp_bld_*.[ch] files, and in a completely generic and reusable way. Here we + * muster the LLVM JIT execution engine to create a function that follows an + * established binary interface and that can be called from C directly. + * + * A big source of complexity here is that we often want to run different + * stages with different data types and precisions. For example, + * the fragment shader typically needs to be done in floats, but the + * depth/stencil test and blending is better done in the type that most closely + * matches the depth/stencil and color buffer respectively. + * + * Since the width of a SIMD vector register stays the same regardless of the + * element type, different types imply different numbers of elements, so we must + * code generate more instances of the stages with larger types to be able to + * feed/consume the stages with smaller types.
+ * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + +#include <limits.h> +#include "pipe/p_defines.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_pointer.h" +#include "util/u_format.h" +#include "util/u_dump.h" +#include "util/u_string.h" +#include "util/simple_list.h" +#include "util/u_dual_blend.h" +#include "os/os_time.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_parse.h" +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_conv.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_intr.h" +#include "gallivm/lp_bld_logic.h" +#include "gallivm/lp_bld_tgsi.h" +#include "gallivm/lp_bld_swizzle.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_pack.h" +#include "gallivm/lp_bld_format.h" +#include "gallivm/lp_bld_quad.h" + +#include "lp_bld_alpha.h" +#include "lp_bld_blend.h" +#include "lp_bld_depth.h" +#include "lp_bld_interp.h" +#include "lp_context.h" +#include "lp_debug.h" +#include "lp_perf.h" +#include "lp_setup.h" +#include "lp_state.h" +#include "lp_tex_sample.h" +#include "lp_flush.h" +#include "lp_state_fs.h" +#include "lp_rast.h" + + +/** Fragment shader number (for debugging) */ +static unsigned fs_no = 0; + + +/** + * Expand the relevant bits of mask_input to a n*4-dword mask for the + * n*four pixels in n 2x2 quads. This will set the n*four elements of the + * quad mask vector to 0 or ~0. + * Grouping is 01, 23 for 2 quad mode hence only 0 and 2 are valid + * quad arguments with fs length 8. 
+ * + * \param first_quad which quad(s) of the quad group to test, in [0,3] + * \param mask_input bitwise mask for the whole 4x4 stamp + */ +static LLVMValueRef +generate_quad_mask(struct gallivm_state *gallivm, + struct lp_type fs_type, + unsigned first_quad, + LLVMValueRef mask_input) /* int32 */ +{ + LLVMBuilderRef builder = gallivm->builder; + struct lp_type mask_type; + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef bits[16]; + LLVMValueRef mask; + int shift, i; + + /* + * XXX: We'll need a different path for 16 x u8 + */ + assert(fs_type.width == 32); + assert(fs_type.length <= Elements(bits)); + mask_type = lp_int_type(fs_type); + + /* + * mask_input >>= (quad * 4) + */ + switch (first_quad) { + case 0: + shift = 0; + break; + case 1: + assert(fs_type.length == 4); + shift = 2; + break; + case 2: + shift = 8; + break; + case 3: + assert(fs_type.length == 4); + shift = 10; + break; + default: + assert(0); + shift = 0; + } + + mask_input = LLVMBuildLShr(builder, + mask_input, + LLVMConstInt(i32t, shift, 0), + ""); + + /* + * mask = { mask_input & (1 << i), for i in [0,3] } + */ + mask = lp_build_broadcast(gallivm, + lp_build_vec_type(gallivm, mask_type), + mask_input); + + for (i = 0; i < fs_type.length / 4; i++) { + unsigned j = 2 * (i % 2) + (i / 2) * 8; + bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0); + bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0); + bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0); + bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0); + } + mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, fs_type.length), ""); + + /* + * mask = mask != 0 ? 
~0 : 0 + */ + mask = lp_build_compare(gallivm, + mask_type, PIPE_FUNC_NOTEQUAL, + mask, + lp_build_const_int_vec(gallivm, mask_type, 0)); + + return mask; +} + + +#define EARLY_DEPTH_TEST 0x1 +#define LATE_DEPTH_TEST 0x2 +#define EARLY_DEPTH_WRITE 0x4 +#define LATE_DEPTH_WRITE 0x8 + +static int +find_output_by_semantic( const struct tgsi_shader_info *info, + unsigned semantic, + unsigned index ) +{ + int i; + + for (i = 0; i < info->num_outputs; i++) + if (info->output_semantic_name[i] == semantic && + info->output_semantic_index[i] == index) + return i; + + return -1; +} + + +/** + * Fetch the specified lp_jit_viewport structure for a given viewport_index. + */ +static LLVMValueRef +lp_llvm_viewport(LLVMValueRef context_ptr, + struct gallivm_state *gallivm, + LLVMValueRef viewport_index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef ptr; + LLVMValueRef res; + struct lp_type viewport_type = + lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS); + + ptr = lp_jit_context_viewports(gallivm, context_ptr); + ptr = LLVMBuildPointerCast(builder, ptr, + LLVMPointerType(lp_build_vec_type(gallivm, viewport_type), 0), ""); + + res = lp_build_pointer_get(builder, ptr, viewport_index); + + return res; +} + + +/** + * Generate the fragment shader, depth/stencil test, and alpha tests. 
+ */ +static void +generate_fs_loop(struct gallivm_state *gallivm, + struct lp_fragment_shader *shader, + const struct lp_fragment_shader_variant_key *key, + LLVMBuilderRef builder, + struct lp_type type, + LLVMValueRef context_ptr, + LLVMValueRef num_loop, + struct lp_build_interp_soa_context *interp, + struct lp_build_sampler_soa *sampler, + LLVMValueRef mask_store, + LLVMValueRef (*out_color)[4], + LLVMValueRef depth_ptr, + LLVMValueRef depth_stride, + LLVMValueRef facing, + LLVMValueRef thread_data_ptr) +{ + const struct util_format_description *zs_format_desc = NULL; + const struct tgsi_token *tokens = shader->base.tokens; + struct lp_type int_type = lp_int_type(type); + LLVMTypeRef vec_type, int_vec_type; + LLVMValueRef mask_ptr, mask_val; + LLVMValueRef consts_ptr, num_consts_ptr; + LLVMValueRef z; + LLVMValueRef z_value, s_value; + LLVMValueRef z_fb, s_fb; + LLVMValueRef stencil_refs[2]; + LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; + struct lp_build_for_loop_state loop_state; + struct lp_build_mask_context mask; + /* + * TODO: figure out if simple_shader optimization is really worthwhile to + * keep. Disabled because it may hide some real bugs in the (depth/stencil) + * code since tests tend to take another codepath than real shaders.
+ */ + boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 && + shader->info.base.num_inputs < 3 && + shader->info.base.num_instructions < 8) && 0; + const boolean dual_source_blend = key->blend.rt[0].blend_enable && + util_blend_state_is_dual(&key->blend, 0); + unsigned attrib; + unsigned chan; + unsigned cbuf; + unsigned depth_mode; + + struct lp_bld_tgsi_system_values system_values; + + memset(&system_values, 0, sizeof(system_values)); + + if (key->depth.enabled || + key->stencil[0].enabled) { + + zs_format_desc = util_format_description(key->zsbuf_format); + assert(zs_format_desc); + + if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) { + if (key->alpha.enabled || + key->blend.alpha_to_coverage || + shader->info.base.uses_kill) { + /* With alpha test and kill, can do the depth test early + * and hopefully eliminate some quads. But need to do a + * special deferred depth write once the final mask value + * is known. This only works though if there's either no + * stencil test or the stencil value isn't written. 
+ */ + if (key->stencil[0].enabled && (key->stencil[0].writemask || + (key->stencil[1].enabled && + key->stencil[1].writemask))) + depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE; + else + depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE; + } + else + depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE; + } + else { + depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE; + } + + if (!(key->depth.enabled && key->depth.writemask) && + !(key->stencil[0].enabled && (key->stencil[0].writemask || + (key->stencil[1].enabled && + key->stencil[1].writemask)))) + depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE); + } + else { + depth_mode = 0; + } + + vec_type = lp_build_vec_type(gallivm, type); + int_vec_type = lp_build_vec_type(gallivm, int_type); + + stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr); + stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr); + /* convert scalar stencil refs into vectors */ + stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]); + stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]); + + consts_ptr = lp_jit_context_constants(gallivm, context_ptr); + num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr); + + lp_build_for_loop_begin(&loop_state, gallivm, + lp_build_const_int32(gallivm, 0), + LLVMIntULT, + num_loop, + lp_build_const_int32(gallivm, 1)); + + mask_ptr = LLVMBuildGEP(builder, mask_store, + &loop_state.counter, 1, "mask_ptr"); + mask_val = LLVMBuildLoad(builder, mask_ptr, ""); + + memset(outputs, 0, sizeof outputs); + + for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) { + for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + out_color[cbuf][chan] = lp_build_array_alloca(gallivm, + lp_build_vec_type(gallivm, + type), + num_loop, "color"); + } + } + if (dual_source_blend) { + assert(key->nr_cbufs <= 1); + for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + out_color[1][chan] = lp_build_array_alloca(gallivm, + 
lp_build_vec_type(gallivm, + type), + num_loop, "color1"); + } + } + + + /* 'mask' will control execution based on quad's pixel alive/killed state */ + lp_build_mask_begin(&mask, gallivm, type, mask_val); + + if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader) + lp_build_mask_check(&mask); + + lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter); + z = interp->pos[2]; + + if (depth_mode & EARLY_DEPTH_TEST) { + lp_build_depth_stencil_load_swizzled(gallivm, type, + zs_format_desc, key->resource_1d, + depth_ptr, depth_stride, + &z_fb, &s_fb, loop_state.counter); + lp_build_depth_stencil_test(gallivm, + &key->depth, + key->stencil, + type, + zs_format_desc, + &mask, + stencil_refs, + z, z_fb, s_fb, + facing, + &z_value, &s_value, + !simple_shader); + + if (depth_mode & EARLY_DEPTH_WRITE) { + lp_build_depth_stencil_write_swizzled(gallivm, type, + zs_format_desc, key->resource_1d, + NULL, NULL, NULL, loop_state.counter, + depth_ptr, depth_stride, + z_value, s_value); + } + /* + * Note mask check if stencil is enabled must be after ds write not after + * stencil test otherwise new stencil values may not get written if all + * fragments got killed by depth/stencil test. 
+ */ + if (!simple_shader && key->stencil[0].enabled) + lp_build_mask_check(&mask); + } + + lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter); + + /* Build the actual shader */ + lp_build_tgsi_soa(gallivm, tokens, type, &mask, + consts_ptr, num_consts_ptr, &system_values, + interp->inputs, + outputs, context_ptr, + sampler, &shader->info.base, NULL); + + /* Alpha test */ + if (key->alpha.enabled) { + int color0 = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_COLOR, + 0); + + if (color0 != -1 && outputs[color0][3]) { + const struct util_format_description *cbuf_format_desc; + LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha"); + LLVMValueRef alpha_ref_value; + + alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_ptr); + alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value); + + cbuf_format_desc = util_format_description(key->cbuf_format[0]); + + lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc, + &mask, alpha, alpha_ref_value, + (depth_mode & LATE_DEPTH_TEST) != 0); + } + } + + /* Emulate Alpha to Coverage with Alpha test */ + if (key->blend.alpha_to_coverage) { + int color0 = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_COLOR, + 0); + + if (color0 != -1 && outputs[color0][3]) { + LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha"); + + lp_build_alpha_to_coverage(gallivm, type, + &mask, alpha, + (depth_mode & LATE_DEPTH_TEST) != 0); + } + } + + /* Late Z test */ + if (depth_mode & LATE_DEPTH_TEST) { + int pos0 = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_POSITION, + 0); + int s_out = find_output_by_semantic(&shader->info.base, + TGSI_SEMANTIC_STENCIL, + 0); + if (pos0 != -1 && outputs[pos0][2]) { + z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z"); + + /* + * Clamp according to ARB_depth_clamp semantics. 
+ */ + if (key->depth_clamp) { + LLVMValueRef viewport, min_depth, max_depth; + LLVMValueRef viewport_index; + struct lp_build_context f32_bld; + + assert(type.floating); + lp_build_context_init(&f32_bld, gallivm, type); + + /* + * Assumes clamping of the viewport index will occur in setup/gs. Value + * is passed through the rasterization stage via lp_rast_shader_inputs. + * + * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping + * semantics. + */ + viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm, + thread_data_ptr); + + /* + * Load the min and max depth from the lp_jit_context.viewports + * array of lp_jit_viewport structures. + */ + viewport = lp_llvm_viewport(context_ptr, gallivm, viewport_index); + + /* viewports[viewport_index].min_depth */ + min_depth = LLVMBuildExtractElement(builder, viewport, + lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), + ""); + min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth); + + /* viewports[viewport_index].max_depth */ + max_depth = LLVMBuildExtractElement(builder, viewport, + lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), + ""); + max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth); + + /* + * Clamp to the min and max depth values for the given viewport. 
+ */ + z = lp_build_clamp(&f32_bld, z, min_depth, max_depth); + } + } + + if (s_out != -1 && outputs[s_out][1]) { + /* there's only one value, and spec says to discard additional bits */ + LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255); + stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s"); + stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, ""); + stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, ""); + stencil_refs[1] = stencil_refs[0]; + } + + lp_build_depth_stencil_load_swizzled(gallivm, type, + zs_format_desc, key->resource_1d, + depth_ptr, depth_stride, + &z_fb, &s_fb, loop_state.counter); + + lp_build_depth_stencil_test(gallivm, + &key->depth, + key->stencil, + type, + zs_format_desc, + &mask, + stencil_refs, + z, z_fb, s_fb, + facing, + &z_value, &s_value, + !simple_shader); + /* Late Z write */ + if (depth_mode & LATE_DEPTH_WRITE) { + lp_build_depth_stencil_write_swizzled(gallivm, type, + zs_format_desc, key->resource_1d, + NULL, NULL, NULL, loop_state.counter, + depth_ptr, depth_stride, + z_value, s_value); + } + } + else if ((depth_mode & EARLY_DEPTH_TEST) && + (depth_mode & LATE_DEPTH_WRITE)) + { + /* Need to apply a reduced mask to the depth write. Reload the + * depth value, update from zs_value with the new mask value and + * write that out. 
+ */ + lp_build_depth_stencil_write_swizzled(gallivm, type, + zs_format_desc, key->resource_1d, + &mask, z_fb, s_fb, loop_state.counter, + depth_ptr, depth_stride, + z_value, s_value); + } + + + /* Color write */ + for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib) + { + unsigned cbuf = shader->info.base.output_semantic_index[attrib]; + if ((shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR) && + ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend))) + { + for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + if(outputs[attrib][chan]) { + /* XXX: just initialize outputs to point at colors[] and + * skip this. + */ + LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], ""); + LLVMValueRef color_ptr; + color_ptr = LLVMBuildGEP(builder, out_color[cbuf][chan], + &loop_state.counter, 1, ""); + lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]); + LLVMBuildStore(builder, out, color_ptr); + } + } + } + } + + if (key->occlusion_count) { + LLVMValueRef counter = lp_jit_thread_data_counter(gallivm, thread_data_ptr); + lp_build_name(counter, "counter"); + lp_build_occlusion_count(gallivm, type, + lp_build_mask_value(&mask), counter); + } + + mask_val = lp_build_mask_end(&mask); + LLVMBuildStore(builder, mask_val, mask_ptr); + lp_build_for_loop_end(&loop_state); +} + + +/** + * This function will reorder pixels from the fragment shader SoA to memory layout AoS + * + * Fragment Shader outputs pixels in small 2x2 blocks + * e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ... + * + * However in memory pixels are stored in rows + * e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ... 
+ * + * @param type fragment shader type (4x or 8x float) + * @param num_fs number of fs_src + * @param dst_channels number of output channels + * @param fs_src output from fragment shader + * @param dst pointer to store result + * @param pad_inline is channel padding inline or at end of row + * @return the number of dsts + */ +static int +generate_fs_twiddle(struct gallivm_state *gallivm, + struct lp_type type, + unsigned num_fs, + unsigned dst_channels, + LLVMValueRef fs_src[][4], + LLVMValueRef* dst, + bool pad_inline) +{ + LLVMValueRef src[16]; + + bool swizzle_pad; + bool twiddle; + bool split; + + unsigned pixels = type.length / 4; + unsigned reorder_group; + unsigned src_channels; + unsigned src_count; + unsigned i; + + src_channels = dst_channels < 3 ? dst_channels : 4; + src_count = num_fs * src_channels; + + assert(pixels == 2 || pixels == 1); + assert(num_fs * src_channels <= Elements(src)); + + /* + * Transpose from SoA -> AoS + */ + for (i = 0; i < num_fs; ++i) { + lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels, &src[i * src_channels]); + } + + /* + * Pick transformation options + */ + swizzle_pad = false; + twiddle = false; + split = false; + reorder_group = 0; + + if (dst_channels == 1) { + twiddle = true; + + if (pixels == 2) { + split = true; + } + } else if (dst_channels == 2) { + if (pixels == 1) { + reorder_group = 1; + } + } else if (dst_channels > 2) { + if (pixels == 1) { + reorder_group = 2; + } else { + twiddle = true; + } + + if (!pad_inline && dst_channels == 3 && pixels > 1) { + swizzle_pad = true; + } + } + + /* + * Split the src in half + */ + if (split) { + for (i = num_fs; i > 0; --i) { + src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4); + src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4); + } + + src_count *= 2; + type.length = 4; + } + + /* + * Ensure pixels are in memory order + */ + if (reorder_group) { + /*
Twiddle pixels by reordering the array, e.g.: + * + * src_count = 8 -> 0 2 1 3 4 6 5 7 + * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15 + */ + const unsigned reorder_sw[] = { 0, 2, 1, 3 }; + + for (i = 0; i < src_count; ++i) { + unsigned group = i / reorder_group; + unsigned block = (group / 4) * 4 * reorder_group; + unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group); + dst[i] = src[j]; + } + } else if (twiddle) { + /* Twiddle pixels across elements of array */ + lp_bld_quad_twiddle(gallivm, type, src, src_count, dst); + } else { + /* Do nothing */ + memcpy(dst, src, sizeof(LLVMValueRef) * src_count); + } + + /* + * Moves any padding between pixels to the end + * e.g. RGBXRGBX -> RGBRGBXX + */ + if (swizzle_pad) { + unsigned char swizzles[16]; + unsigned elems = pixels * dst_channels; + + for (i = 0; i < type.length; ++i) { + if (i < elems) + swizzles[i] = i % dst_channels + (i / dst_channels) * 4; + else + swizzles[i] = LP_BLD_SWIZZLE_DONTCARE; + } + + for (i = 0; i < src_count; ++i) { + dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles, type.length, type.length); + } + } + + return src_count; +} + + +/** + * Load an unswizzled block of pixels from memory + */ +static void +load_unswizzled_block(struct gallivm_state *gallivm, + LLVMValueRef base_ptr, + LLVMValueRef stride, + unsigned block_width, + unsigned block_height, + LLVMValueRef* dst, + struct lp_type dst_type, + unsigned dst_count, + unsigned dst_alignment) +{ + LLVMBuilderRef builder = gallivm->builder; + unsigned row_size = dst_count / block_height; + unsigned i; + + /* Ensure block exactly fits into dst */ + assert((block_width * block_height) % dst_count == 0); + + for (i = 0; i < dst_count; ++i) { + unsigned x = i % row_size; + unsigned y = i / row_size; + + LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length); + LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, ""); + + 
LLVMValueRef gep[2]; + LLVMValueRef dst_ptr; + + gep[0] = lp_build_const_int32(gallivm, 0); + gep[1] = LLVMBuildAdd(builder, bx, by, ""); + + dst_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, ""); + dst_ptr = LLVMBuildBitCast(builder, dst_ptr, LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), ""); + + dst[i] = LLVMBuildLoad(builder, dst_ptr, ""); + + lp_set_load_alignment(dst[i], dst_alignment); + } +} + + +/** + * Store an unswizzled block of pixels to memory + */ +static void +store_unswizzled_block(struct gallivm_state *gallivm, + LLVMValueRef base_ptr, + LLVMValueRef stride, + unsigned block_width, + unsigned block_height, + LLVMValueRef* src, + struct lp_type src_type, + unsigned src_count, + unsigned src_alignment) +{ + LLVMBuilderRef builder = gallivm->builder; + unsigned row_size = src_count / block_height; + unsigned i; + + /* Ensure src exactly fits into block */ + assert((block_width * block_height) % src_count == 0); + + for (i = 0; i < src_count; ++i) { + unsigned x = i % row_size; + unsigned y = i / row_size; + + LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length); + LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, ""); + + LLVMValueRef gep[2]; + LLVMValueRef src_ptr; + + gep[0] = lp_build_const_int32(gallivm, 0); + gep[1] = LLVMBuildAdd(builder, bx, by, ""); + + src_ptr = LLVMBuildGEP(builder, base_ptr, gep, 2, ""); + src_ptr = LLVMBuildBitCast(builder, src_ptr, LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), ""); + + src_ptr = LLVMBuildStore(builder, src[i], src_ptr); + + lp_set_store_alignment(src_ptr, src_alignment); + } +} + + +/** + * Checks if a format description is an arithmetic format + * + * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5. 
+ */ +static inline boolean +is_arithmetic_format(const struct util_format_description *format_desc) +{ + boolean arith = false; + unsigned i; + + for (i = 0; i < format_desc->nr_channels; ++i) { + arith |= format_desc->channel[i].size != format_desc->channel[0].size; + arith |= (format_desc->channel[i].size % 8) != 0; + } + + return arith; +} + + +/** + * Checks if this format requires special handling due to required expansion + * to floats for blending, and furthermore has "natural" packed AoS -> unpacked + * SoA conversion. + */ +static inline boolean +format_expands_to_float_soa(const struct util_format_description *format_desc) +{ + if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT || + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + return true; + } + return false; +} + + +/** + * Retrieves the type representing the memory layout for a format + * + * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte + */ +static inline void +lp_mem_type_from_format_desc(const struct util_format_description *format_desc, + struct lp_type* type) +{ + unsigned i; + unsigned chan; + + if (format_expands_to_float_soa(format_desc)) { + /* just make this a uint with width of block */ + type->floating = false; + type->fixed = false; + type->sign = false; + type->norm = false; + type->width = format_desc->block.bits; + type->length = 1; + return; + } + + for (i = 0; i < 4; i++) + if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) + break; + chan = i; + + memset(type, 0, sizeof(struct lp_type)); + type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT; + type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED; + type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED; + type->norm = format_desc->channel[chan].normalized; + + if (is_arithmetic_format(format_desc)) { + type->width = 0; + type->length = 1; + + for (i = 0; i < format_desc->nr_channels; ++i) { + type->width += format_desc->channel[i].size; + } + 
} else { + type->width = format_desc->channel[chan].size; + type->length = format_desc->nr_channels; + } +} + + +/** + * Retrieves the type for a format which is usable in the blending code. + * + * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte + */ +static inline void +lp_blend_type_from_format_desc(const struct util_format_description *format_desc, + struct lp_type* type) +{ + unsigned i; + unsigned chan; + + if (format_expands_to_float_soa(format_desc)) { + /* always use ordinary floats for blending */ + type->floating = true; + type->fixed = false; + type->sign = true; + type->norm = false; + type->width = 32; + type->length = 4; + return; + } + + for (i = 0; i < 4; i++) + if (format_desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) + break; + chan = i; + + memset(type, 0, sizeof(struct lp_type)); + type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT; + type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED; + type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED; + type->norm = format_desc->channel[chan].normalized; + type->width = format_desc->channel[chan].size; + type->length = format_desc->nr_channels; + + for (i = 1; i < format_desc->nr_channels; ++i) { + if (format_desc->channel[i].size > type->width) + type->width = format_desc->channel[i].size; + } + + if (type->floating) { + type->width = 32; + } else { + if (type->width <= 8) { + type->width = 8; + } else if (type->width <= 16) { + type->width = 16; + } else { + type->width = 32; + } + } + + if (is_arithmetic_format(format_desc) && type->length == 3) { + type->length = 4; + } +} + + +/** + * Scale a normalized value from src_bits to dst_bits. 
+ * + * The exact calculation is + * + * dst = iround(src * dst_mask / src_mask) + * + * or with integer rounding + * + * dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask) + * + * where + * + * src_mask = (1 << src_bits) - 1 + * dst_mask = (1 << dst_bits) - 1 + * + * but we try to avoid division and multiplication through shifts. + */ +static inline LLVMValueRef +scale_bits(struct gallivm_state *gallivm, + int src_bits, + int dst_bits, + LLVMValueRef src, + struct lp_type src_type) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef result = src; + + if (dst_bits < src_bits) { + int delta_bits = src_bits - dst_bits; + + if (delta_bits <= dst_bits) { + /* + * Approximate the rescaling with a single shift. + * + * This gives the wrong rounding. + */ + + result = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, delta_bits), + ""); + + } else { + /* + * Try more accurate rescaling. + */ + + /* + * Drop the least significant bits to make space for the multiplication. + * + * XXX: A better approach would be to use a wider integer type as intermediate. But + * this is enough to convert alpha from 16bits -> 2 when rendering to + * PIPE_FORMAT_R10G10B10A2_UNORM. + */ + result = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, dst_bits), + ""); + + + result = LLVMBuildMul(builder, + result, + lp_build_const_int_vec(gallivm, src_type, (1LL << dst_bits) - 1), + ""); + + /* + * Add a rounding term before the division. + * + * TODO: Handle signed integers too. + */ + if (!src_type.sign) { + result = LLVMBuildAdd(builder, + result, + lp_build_const_int_vec(gallivm, src_type, (1LL << (delta_bits - 1))), + ""); + } + + /* + * Approximate the division by src_mask with a src_bits shift. + * + * Given the src has already been shifted by dst_bits, all we need + * to do is to shift by the difference. 
+ */ + + result = LLVMBuildLShr(builder, + result, + lp_build_const_int_vec(gallivm, src_type, delta_bits), + ""); + } + + } else if (dst_bits > src_bits) { + /* Scale up bits */ + int db = dst_bits - src_bits; + + /* Shift left by difference in bits */ + result = LLVMBuildShl(builder, + src, + lp_build_const_int_vec(gallivm, src_type, db), + ""); + + if (db < src_bits) { + /* Enough bits in src to fill the remainder */ + LLVMValueRef lower = LLVMBuildLShr(builder, + src, + lp_build_const_int_vec(gallivm, src_type, src_bits - db), + ""); + + result = LLVMBuildOr(builder, result, lower, ""); + } else if (db > src_bits) { + /* Need to repeatedly copy src bits to fill remainder in dst */ + unsigned n; + + for (n = src_bits; n < dst_bits; n *= 2) { + LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n); + + result = LLVMBuildOr(builder, + result, + LLVMBuildLShr(builder, result, shuv, ""), + ""); + } + } + } + + return result; +} + +/** + * If RT is a smallfloat (needing denorms) format + */ +static inline int +have_smallfloat_format(struct lp_type dst_type, + enum pipe_format format) +{ + return ((dst_type.floating && dst_type.width != 32) || + /* due to format handling hacks this format doesn't have floating set + * here (and actually has width set to 32 too) so special case this. */ + (format == PIPE_FORMAT_R11G11B10_FLOAT)); +} + + +/** + * Convert from memory format to blending format + * + * e.g. 
GL_R3G3B2 is 1 byte in memory but 3 bytes for blending + */ +static void +convert_to_blend_type(struct gallivm_state *gallivm, + unsigned block_size, + const struct util_format_description *src_fmt, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef* src, // and dst + unsigned num_srcs) +{ + LLVMValueRef *dst = src; + LLVMBuilderRef builder = gallivm->builder; + struct lp_type blend_type; + struct lp_type mem_type; + unsigned i, j, k; + unsigned pixels = block_size / num_srcs; + bool is_arith; + + /* + * full custom path for packed floats and srgb formats - none of the later + * functions would do anything useful, and given the lp_type representation they + * can't be fixed. Should really have some SoA blend path for these kind of + * formats rather than hacking them in here. + */ + if (format_expands_to_float_soa(src_fmt)) { + LLVMValueRef tmpsrc[4]; + /* + * This is pretty suboptimal for this case blending in SoA would be much + * better, since conversion gets us SoA values so need to convert back. + */ + assert(src_type.width == 32 || src_type.width == 16); + assert(dst_type.floating); + assert(dst_type.width == 32); + assert(dst_type.length % 4 == 0); + assert(num_srcs % 4 == 0); + + if (src_type.width == 16) { + /* expand 4x16bit values to 4x32bit */ + struct lp_type type32x4 = src_type; + LLVMTypeRef ltype32x4; + unsigned num_fetch = dst_type.length == 8 ? 
num_srcs / 2 : num_srcs / 4; + type32x4.width = 32; + ltype32x4 = lp_build_vec_type(gallivm, type32x4); + for (i = 0; i < num_fetch; i++) { + src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, ""); + } + src_type.width = 32; + } + for (i = 0; i < 4; i++) { + tmpsrc[i] = src[i]; + } + for (i = 0; i < num_srcs / 4; i++) { + LLVMValueRef tmpsoa[4]; + LLVMValueRef tmps = tmpsrc[i]; + if (dst_type.length == 8) { + LLVMValueRef shuffles[8]; + unsigned j; + /* fetch was 4 values but need 8-wide output values */ + tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2); + /* + * for 8-wide aos transpose would give us wrong order not matching + * incoming converted fs values and mask. ARGH. + */ + for (j = 0; j < 4; j++) { + shuffles[j] = lp_build_const_int32(gallivm, j * 2); + shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1); + } + tmps = LLVMBuildShuffleVector(builder, tmps, tmps, + LLVMConstVector(shuffles, 8), ""); + } + if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) { + lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa); + } + else { + lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa); + } + lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]); + } + return; + } + + lp_mem_type_from_format_desc(src_fmt, &mem_type); + lp_blend_type_from_format_desc(src_fmt, &blend_type); + + /* Is the format arithmetic */ + is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length; + is_arith &= !(mem_type.width == 16 && mem_type.floating); + + /* Pad if necessary */ + if (!is_arith && src_type.length < dst_type.length) { + for (i = 0; i < num_srcs; ++i) { + dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length); + } + + src_type.length = dst_type.length; + } + + /* Special case for half-floats */ + if (mem_type.width == 16 && mem_type.floating) { + assert(blend_type.width == 32 && blend_type.floating); + lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst); + is_arith = false; + } + + if 
(!is_arith) { + return; + } + + src_type.width = blend_type.width * blend_type.length; + blend_type.length *= pixels; + src_type.length *= pixels / (src_type.length / mem_type.length); + + for (i = 0; i < num_srcs; ++i) { + LLVMValueRef chans[4]; + LLVMValueRef res = NULL; + + dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), ""); + + for (j = 0; j < src_fmt->nr_channels; ++j) { + unsigned mask = 0; + unsigned sa = src_fmt->channel[j].shift; +#ifdef PIPE_ARCH_LITTLE_ENDIAN + unsigned from_lsb = j; +#else + unsigned from_lsb = src_fmt->nr_channels - j - 1; +#endif + + for (k = 0; k < src_fmt->channel[j].size; ++k) { + mask |= 1 << k; + } + + /* Extract bits from source */ + chans[j] = LLVMBuildLShr(builder, + dst[i], + lp_build_const_int_vec(gallivm, src_type, sa), + ""); + + chans[j] = LLVMBuildAnd(builder, + chans[j], + lp_build_const_int_vec(gallivm, src_type, mask), + ""); + + /* Scale bits */ + if (src_type.norm) { + chans[j] = scale_bits(gallivm, src_fmt->channel[j].size, + blend_type.width, chans[j], src_type); + } + + /* Insert bits into correct position */ + chans[j] = LLVMBuildShl(builder, + chans[j], + lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width), + ""); + + if (j == 0) { + res = chans[j]; + } else { + res = LLVMBuildOr(builder, res, chans[j], ""); + } + } + + dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), ""); + } +} + + +/** + * Convert from blending format to memory format + * + * e.g. 
GL_R3G3B2 is 3 bytes for blending but 1 byte in memory + */ +static void +convert_from_blend_type(struct gallivm_state *gallivm, + unsigned block_size, + const struct util_format_description *src_fmt, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef* src, // and dst + unsigned num_srcs) +{ + LLVMValueRef* dst = src; + unsigned i, j, k; + struct lp_type mem_type; + struct lp_type blend_type; + LLVMBuilderRef builder = gallivm->builder; + unsigned pixels = block_size / num_srcs; + bool is_arith; + + /* + * full custom path for packed floats and srgb formats - none of the later + * functions would do anything useful, and given the lp_type representation they + * can't be fixed. Should really have some SoA blend path for these kind of + * formats rather than hacking them in here. + */ + if (format_expands_to_float_soa(src_fmt)) { + /* + * This is pretty suboptimal for this case blending in SoA would be much + * better - we need to transpose the AoS values back to SoA values for + * conversion/packing. + */ + assert(src_type.floating); + assert(src_type.width == 32); + assert(src_type.length % 4 == 0); + assert(dst_type.width == 32 || dst_type.width == 16); + + for (i = 0; i < num_srcs / 4; i++) { + LLVMValueRef tmpsoa[4], tmpdst; + lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa); + /* really really need SoA here */ + + if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) { + tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa); + } + else { + tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt, + src_type, tmpsoa); + } + + if (src_type.length == 8) { + LLVMValueRef tmpaos, shuffles[8]; + unsigned j; + /* + * for 8-wide aos transpose has given us wrong order not matching + * output order. HMPF. Also need to split the output values manually. 
+ */ + for (j = 0; j < 4; j++) { + shuffles[j * 2] = lp_build_const_int32(gallivm, j); + shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4); + } + tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst, + LLVMConstVector(shuffles, 8), ""); + src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4); + src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4); + } + else { + src[i] = tmpdst; + } + } + if (dst_type.width == 16) { + struct lp_type type16x8 = dst_type; + struct lp_type type32x4 = dst_type; + LLVMTypeRef ltype16x4, ltypei64, ltypei128; + unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4; + type16x8.length = 8; + type32x4.width = 32; + ltypei128 = LLVMIntTypeInContext(gallivm->context, 128); + ltypei64 = LLVMIntTypeInContext(gallivm->context, 64); + ltype16x4 = lp_build_vec_type(gallivm, dst_type); + /* We could do vector truncation but it doesn't generate very good code */ + for (i = 0; i < num_fetch; i++) { + src[i] = lp_build_pack2(gallivm, type32x4, type16x8, + src[i], lp_build_zero(gallivm, type32x4)); + src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, ""); + src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, ""); + src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, ""); + } + } + return; + } + + lp_mem_type_from_format_desc(src_fmt, &mem_type); + lp_blend_type_from_format_desc(src_fmt, &blend_type); + + is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length); + + /* Special case for half-floats */ + if (mem_type.width == 16 && mem_type.floating) { + int length = dst_type.length; + assert(blend_type.width == 32 && blend_type.floating); + + dst_type.length = src_type.length; + + lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst); + + dst_type.length = length; + is_arith = false; + } + + /* Remove any padding */ + if (!is_arith && (src_type.length % mem_type.length)) { + src_type.length -= (src_type.length % mem_type.length); + + for (i = 0; i < num_srcs; ++i) 
{ + dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length); + } + } + + /* No bit arithmetic to do */ + if (!is_arith) { + return; + } + + src_type.length = pixels; + src_type.width = blend_type.length * blend_type.width; + dst_type.length = pixels; + + for (i = 0; i < num_srcs; ++i) { + LLVMValueRef chans[4]; + LLVMValueRef res = NULL; + + dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), ""); + + for (j = 0; j < src_fmt->nr_channels; ++j) { + unsigned mask = 0; + unsigned sa = src_fmt->channel[j].shift; +#ifdef PIPE_ARCH_LITTLE_ENDIAN + unsigned from_lsb = j; +#else + unsigned from_lsb = src_fmt->nr_channels - j - 1; +#endif + + assert(blend_type.width > src_fmt->channel[j].size); + + for (k = 0; k < blend_type.width; ++k) { + mask |= 1 << k; + } + + /* Extract bits */ + chans[j] = LLVMBuildLShr(builder, + dst[i], + lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width), + ""); + + chans[j] = LLVMBuildAnd(builder, + chans[j], + lp_build_const_int_vec(gallivm, src_type, mask), + ""); + + /* Scale down bits */ + if (src_type.norm) { + chans[j] = scale_bits(gallivm, blend_type.width, + src_fmt->channel[j].size, chans[j], src_type); + } + + /* Insert bits */ + chans[j] = LLVMBuildShl(builder, + chans[j], + lp_build_const_int_vec(gallivm, src_type, sa), + ""); + + sa += src_fmt->channel[j].size; + + if (j == 0) { + res = chans[j]; + } else { + res = LLVMBuildOr(builder, res, chans[j], ""); + } + } + + assert (dst_type.width != 24); + + dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), ""); + } +} + + +/** + * Convert alpha to same blend type as src + */ +static void +convert_alpha(struct gallivm_state *gallivm, + struct lp_type row_type, + struct lp_type alpha_type, + const unsigned block_size, + const unsigned block_height, + const unsigned src_count, + const unsigned dst_channels, + const bool pad_inline, + LLVMValueRef* src_alpha) +{ + LLVMBuilderRef builder = gallivm->builder; + 
unsigned i, j; + unsigned length = row_type.length; + row_type.length = alpha_type.length; + + /* Twiddle the alpha to match pixels */ + lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha); + + /* + * TODO this should use single lp_build_conv call for + * src_count == 1 && dst_channels == 1 case (dropping the concat below) + */ + for (i = 0; i < block_height; ++i) { + lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1); + } + + alpha_type = row_type; + row_type.length = length; + + /* If only one channel we can only need the single alpha value per pixel */ + if (src_count == 1 && dst_channels == 1) { + + lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height, src_alpha, src_count); + } else { + /* If there are more srcs than rows then we need to split alpha up */ + if (src_count > block_height) { + for (i = src_count; i > 0; --i) { + unsigned pixels = block_size / src_count; + unsigned idx = i - 1; + + src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4], + (idx * pixels) % 4, pixels); + } + } + + /* If there is a src for each pixel broadcast the alpha across whole row */ + if (src_count == block_size) { + for (i = 0; i < src_count; ++i) { + src_alpha[i] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, row_type), src_alpha[i]); + } + } else { + unsigned pixels = block_size / src_count; + unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels; + unsigned alpha_span = 1; + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + + /* Check if we need 2 src_alphas for our shuffles */ + if (pixels > alpha_type.length) { + alpha_span = 2; + } + + /* Broadcast alpha across all channels, e.g. 
a1a2 to a1a1a1a1a2a2a2a2 */ + for (j = 0; j < row_type.length; ++j) { + if (j < pixels * channels) { + shuffles[j] = lp_build_const_int32(gallivm, j / channels); + } else { + shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context)); + } + } + + for (i = 0; i < src_count; ++i) { + unsigned idx1 = i, idx2 = i; + + if (alpha_span > 1){ + idx1 *= alpha_span; + idx2 = idx1 + 1; + } + + src_alpha[i] = LLVMBuildShuffleVector(builder, + src_alpha[idx1], + src_alpha[idx2], + LLVMConstVector(shuffles, row_type.length), + ""); + } + } + } +} + + +/** + * Generates the blend function for unswizzled colour buffers + * Also generates the read & write from colour buffer + */ +static void +generate_unswizzled_blend(struct gallivm_state *gallivm, + unsigned rt, + struct lp_fragment_shader_variant *variant, + enum pipe_format out_format, + unsigned int num_fs, + struct lp_type fs_type, + LLVMValueRef* fs_mask, + LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4], + LLVMValueRef context_ptr, + LLVMValueRef color_ptr, + LLVMValueRef stride, + unsigned partial_mask, + boolean do_branch) +{ + const unsigned alpha_channel = 3; + const unsigned block_width = LP_RASTER_BLOCK_SIZE; + const unsigned block_height = LP_RASTER_BLOCK_SIZE; + const unsigned block_size = block_width * block_height; + const unsigned lp_integer_vector_width = 128; + + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS]; + LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS]; + LLVMValueRef src_alpha[4 * 4]; + LLVMValueRef src1_alpha[4 * 4]; + LLVMValueRef src_mask[4 * 4]; + LLVMValueRef src[4 * 4]; + LLVMValueRef src1[4 * 4]; + LLVMValueRef dst[4 * 4]; + LLVMValueRef blend_color; + LLVMValueRef blend_alpha; + LLVMValueRef i32_zero; + LLVMValueRef check_mask; + LLVMValueRef undef_src_val; + + struct lp_build_mask_context mask_ctx; + struct lp_type mask_type; + struct lp_type blend_type; + struct lp_type row_type; + struct lp_type dst_type; + + unsigned char 
swizzle[TGSI_NUM_CHANNELS]; + unsigned vector_width; + unsigned src_channels = TGSI_NUM_CHANNELS; + unsigned dst_channels; + unsigned dst_count; + unsigned src_count; + unsigned i, j; + + const struct util_format_description* out_format_desc = util_format_description(out_format); + + unsigned dst_alignment; + + bool pad_inline = is_arithmetic_format(out_format_desc); + bool has_alpha = false; + const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable && + util_blend_state_is_dual(&variant->key.blend, 0); + + const boolean is_1d = variant->key.resource_1d; + unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs; + LLVMValueRef fpstate = 0; + + /* Get type from output format */ + lp_blend_type_from_format_desc(out_format_desc, &row_type); + lp_mem_type_from_format_desc(out_format_desc, &dst_type); + + /* + * Technically this code should go into lp_build_smallfloat_to_float + * and lp_build_float_to_smallfloat but due to the + * http://llvm.org/bugs/show_bug.cgi?id=6393 + * llvm reorders the mxcsr intrinsics in a way that breaks the code. + * So the ordering is important here and there shouldn't be any + * llvm ir instrunctions in this function before + * this, otherwise half-float format conversions won't work + * (again due to llvm bug #6393). + */ + if (have_smallfloat_format(dst_type, out_format)) { + /* We need to make sure that denorms are ok for half float + conversions */ + fpstate = lp_build_fpstate_get(gallivm); + lp_build_fpstate_set_denorms_zero(gallivm, FALSE); + } + + mask_type = lp_int32_vec4_type(); + mask_type.length = fs_type.length; + + for (i = num_fs; i < num_fullblock_fs; i++) { + fs_mask[i] = lp_build_zero(gallivm, mask_type); + } + + /* Do not bother executing code when mask is empty.. 
*/ + if (do_branch) { + check_mask = LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type)); + + for (i = 0; i < num_fullblock_fs; ++i) { + check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], ""); + } + + lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask); + lp_build_mask_check(&mask_ctx); + } + + partial_mask |= !variant->opaque; + i32_zero = lp_build_const_int32(gallivm, 0); + + undef_src_val = lp_build_undef(gallivm, fs_type); + + row_type.length = fs_type.length; + vector_width = dst_type.floating ? lp_native_vector_width : lp_integer_vector_width; + + /* Compute correct swizzle and count channels */ + memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS); + dst_channels = 0; + + for (i = 0; i < TGSI_NUM_CHANNELS; ++i) { + /* Ensure channel is used */ + if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) { + continue; + } + + /* Ensure not already written to (happens in case with GL_ALPHA) */ + if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) { + continue; + } + + /* Ensure we havn't already found all channels */ + if (dst_channels >= out_format_desc->nr_channels) { + continue; + } + + swizzle[out_format_desc->swizzle[i]] = i; + ++dst_channels; + + if (i == alpha_channel) { + has_alpha = true; + } + } + + if (format_expands_to_float_soa(out_format_desc)) { + /* + * the code above can't work for layout_other + * for srgb it would sort of work but we short-circuit swizzles, etc. + * as that is done as part of unpack / pack. 
+ */ + dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */ + has_alpha = true; + swizzle[0] = 0; + swizzle[1] = 1; + swizzle[2] = 2; + swizzle[3] = 3; + pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */ + } + + /* If 3 channels then pad to include alpha for 4 element transpose */ + if (dst_channels == 3 && !has_alpha) { + for (i = 0; i < TGSI_NUM_CHANNELS; i++) { + if (swizzle[i] > TGSI_NUM_CHANNELS) + swizzle[i] = 3; + } + if (out_format_desc->nr_channels == 4) { + dst_channels = 4; + } + } + + /* + * Load shader output + */ + for (i = 0; i < num_fullblock_fs; ++i) { + /* Always load alpha for use in blending */ + LLVMValueRef alpha; + if (i < num_fs) { + alpha = LLVMBuildLoad(builder, fs_out_color[rt][alpha_channel][i], ""); + } + else { + alpha = undef_src_val; + } + + /* Load each channel */ + for (j = 0; j < dst_channels; ++j) { + assert(swizzle[j] < 4); + if (i < num_fs) { + fs_src[i][j] = LLVMBuildLoad(builder, fs_out_color[rt][swizzle[j]][i], ""); + } + else { + fs_src[i][j] = undef_src_val; + } + } + + /* If 3 channels then pad to include alpha for 4 element transpose */ + /* + * XXX If we include that here maybe could actually use it instead of + * separate alpha for blending? + */ + if (dst_channels == 3 && !has_alpha) { + fs_src[i][3] = alpha; + } + + /* We split the row_mask and row_alpha as we want 128bit interleave */ + if (fs_type.length == 8) { + src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i], 0, src_channels); + src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i], src_channels, src_channels); + + src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); + src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels); + } else { + src_mask[i] = fs_mask[i]; + src_alpha[i] = alpha; + } + } + if (dual_source_blend) { + /* same as above except different src/dst, skip masks and comments... 
*/ + for (i = 0; i < num_fullblock_fs; ++i) { + LLVMValueRef alpha; + if (i < num_fs) { + alpha = LLVMBuildLoad(builder, fs_out_color[1][alpha_channel][i], ""); + } + else { + alpha = undef_src_val; + } + + for (j = 0; j < dst_channels; ++j) { + assert(swizzle[j] < 4); + if (i < num_fs) { + fs_src1[i][j] = LLVMBuildLoad(builder, fs_out_color[1][swizzle[j]][i], ""); + } + else { + fs_src1[i][j] = undef_src_val; + } + } + if (dst_channels == 3 && !has_alpha) { + fs_src1[i][3] = alpha; + } + if (fs_type.length == 8) { + src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels); + src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha, src_channels, src_channels); + } else { + src1_alpha[i] = alpha; + } + } + } + + if (util_format_is_pure_integer(out_format)) { + /* + * In this case fs_type was really ints or uints disguised as floats, + * fix that up now. + */ + fs_type.floating = 0; + fs_type.sign = dst_type.sign; + for (i = 0; i < num_fullblock_fs; ++i) { + for (j = 0; j < dst_channels; ++j) { + fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j], + lp_build_vec_type(gallivm, fs_type), ""); + } + if (dst_channels == 3 && !has_alpha) { + fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3], + lp_build_vec_type(gallivm, fs_type), ""); + } + } + } + + /* + * Pixel twiddle from fragment shader order to memory order + */ + src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, + dst_channels, fs_src, src, pad_inline); + if (dual_source_blend) { + generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels, + fs_src1, src1, pad_inline); + } + + src_channels = dst_channels < 3 ? 
dst_channels : 4; + if (src_count != num_fullblock_fs * src_channels) { + unsigned ds = src_count / (num_fullblock_fs * src_channels); + row_type.length /= ds; + fs_type.length = row_type.length; + } + + blend_type = row_type; + mask_type.length = 4; + + /* Convert src to row_type */ + if (dual_source_blend) { + struct lp_type old_row_type = row_type; + lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src); + src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type, src1, src_count, src1); + } + else { + src_count = lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src); + } + + /* If the rows are not an SSE vector, combine them to become SSE size! */ + if ((row_type.width * row_type.length) % 128) { + unsigned bits = row_type.width * row_type.length; + unsigned combined; + + assert(src_count >= (vector_width / bits)); + + dst_count = src_count / (vector_width / bits); + + combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count); + if (dual_source_blend) { + lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count); + } + + row_type.length *= combined; + src_count /= combined; + + bits = row_type.width * row_type.length; + assert(bits == 128 || bits == 256); + } + + + /* + * Blend Colour conversion + */ + blend_color = lp_jit_context_f_blend_color(gallivm, context_ptr); + blend_color = LLVMBuildPointerCast(builder, blend_color, LLVMPointerType(lp_build_vec_type(gallivm, fs_type), 0), ""); + blend_color = LLVMBuildLoad(builder, LLVMBuildGEP(builder, blend_color, &i32_zero, 1, ""), ""); + + /* Convert */ + lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1, &blend_color, 1); + + if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + /* + * since blending is done with floats, there was no conversion. + * However, the rules according to fixed point renderbuffers still + * apply, that is we must clamp inputs to 0.0/1.0. 
+ * (This would apply to separate alpha conversion too but we currently + * force has_alpha to be true.) + * TODO: should skip this with "fake" blend, since post-blend conversion + * will clamp anyway. + * TODO: could also skip this if fragment color clamping is enabled. We + * don't support it natively so it gets baked into the shader however, so + * can't really tell here. + */ + struct lp_build_context f32_bld; + assert(row_type.floating); + lp_build_context_init(&f32_bld, gallivm, row_type); + for (i = 0; i < src_count; i++) { + src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]); + } + if (dual_source_blend) { + for (i = 0; i < src_count; i++) { + src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]); + } + } + /* probably can't be different than row_type but better safe than sorry... */ + lp_build_context_init(&f32_bld, gallivm, blend_type); + blend_color = lp_build_clamp(&f32_bld, blend_color, f32_bld.zero, f32_bld.one); + } + + /* Extract alpha */ + blend_alpha = lp_build_extract_broadcast(gallivm, blend_type, row_type, blend_color, lp_build_const_int32(gallivm, 3)); + + /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */ + pad_inline &= (dst_channels * (block_size / src_count) * row_type.width) != vector_width; + if (pad_inline) { + /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */ + blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, TGSI_NUM_CHANNELS, row_type.length); + } else { + /* Only use dst_channels e.g. 
RGBA RGBA to RG RG xxxx */ + blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle, dst_channels, row_type.length); + } + + /* + * Mask conversion + */ + lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], block_height, &src_mask[0]); + + if (src_count < block_height) { + lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count); + } else if (src_count > block_height) { + for (i = src_count; i > 0; --i) { + unsigned pixels = block_size / src_count; + unsigned idx = i - 1; + + src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4], + (idx * pixels) % 4, pixels); + } + } + + assert(mask_type.width == 32); + + for (i = 0; i < src_count; ++i) { + unsigned pixels = block_size / src_count; + unsigned pixel_width = row_type.width * dst_channels; + + if (pixel_width == 24) { + mask_type.width = 8; + mask_type.length = vector_width / mask_type.width; + } else { + mask_type.length = pixels; + mask_type.width = row_type.width * dst_channels; + + src_mask[i] = LLVMBuildIntCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), ""); + + mask_type.length *= dst_channels; + mask_type.width /= dst_channels; + } + + src_mask[i] = LLVMBuildBitCast(builder, src_mask[i], lp_build_int_vec_type(gallivm, mask_type), ""); + src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length); + } + + /* + * Alpha conversion + */ + if (!has_alpha) { + struct lp_type alpha_type = fs_type; + alpha_type.length = 4; + convert_alpha(gallivm, row_type, alpha_type, + block_size, block_height, + src_count, dst_channels, + pad_inline, src_alpha); + if (dual_source_blend) { + convert_alpha(gallivm, row_type, alpha_type, + block_size, block_height, + src_count, dst_channels, + pad_inline, src1_alpha); + } + } + + + /* + * Load dst from memory + */ + if (src_count < block_height) { + dst_count = block_height; + } else { + dst_count = src_count; + } + + dst_type.length *= block_size / dst_count; + + if 
(format_expands_to_float_soa(out_format_desc)) { + /* + * we need multiple values at once for the conversion, so can as well + * load them vectorized here too instead of concatenating later. + * (Still need concatenation later for 8-wide vectors). + */ + dst_count = block_height; + dst_type.length = block_width; + } + + /* + * Compute the alignment of the destination pointer in bytes + * We fetch 1-4 pixels, if the format has pot alignment then those fetches + * are always aligned by MIN2(16, fetch_width) except for buffers (not + * 1d tex but can't distinguish here) so need to stick with per-pixel + * alignment in this case. + */ + if (is_1d) { + dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8); + } + else { + dst_alignment = dst_type.length * dst_type.width / 8; + } + /* Force power-of-two alignment by extracting only the least-significant-bit */ + dst_alignment = 1 << (ffs(dst_alignment) - 1); + /* + * Resource base and stride pointers are aligned to 16 bytes, so that's + * the maximum alignment we can guarantee + */ + dst_alignment = MIN2(16, dst_alignment); + + if (is_1d) { + load_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, + dst, dst_type, dst_count / 4, dst_alignment); + for (i = dst_count / 4; i < dst_count; i++) { + dst[i] = lp_build_undef(gallivm, dst_type); + } + + } + else { + load_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, + dst, dst_type, dst_count, dst_alignment); + } + + + /* + * Convert from dst/output format to src/blending format. + * + * This is necessary as we can only read 1 row from memory at a time, + * so the minimum dst_count will ever be at this point is 4. + * + * With, for example, R8 format you can have all 16 pixels in a 128 bit vector, + * this will take the 4 dsts and combine them into 1 src so we can perform blending + * on all 16 pixels in that single vector at once. 
+ */ + if (dst_count > src_count) { + lp_build_concat_n(gallivm, dst_type, dst, 4, dst, src_count); + } + + /* + * Blending + */ + /* XXX this is broken for RGB8 formats - + * they get expanded from 12 to 16 elements (to include alpha) + * by convert_to_blend_type then reduced to 15 instead of 12 + * by convert_from_blend_type (a simple fix though breaks A8...). + * R16G16B16 also crashes differently however something going wrong + * inside llvm handling npot vector sizes seemingly. + * It seems some cleanup could be done here (like skipping conversion/blend + * when not needed). + */ + convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, row_type, dst, src_count); + + /* + * FIXME: Really should get logic ops / masks out of generic blend / row + * format. Logic ops will definitely not work on the blend float format + * used for SRGB here and I think OpenGL expects this to work as expected + * (that is incoming values converted to srgb then logic op applied). + */ + for (i = 0; i < src_count; ++i) { + dst[i] = lp_build_blend_aos(gallivm, + &variant->key.blend, + out_format, + row_type, + rt, + src[i], + has_alpha ? NULL : src_alpha[i], + src1[i], + has_alpha ? NULL : src1_alpha[i], + dst[i], + partial_mask ? src_mask[i] : NULL, + blend_color, + has_alpha ? NULL : blend_alpha, + swizzle, + pad_inline ? 
4 : dst_channels); + } + + convert_from_blend_type(gallivm, block_size, out_format_desc, row_type, dst_type, dst, src_count); + + /* Split the blend rows back to memory rows */ + if (dst_count > src_count) { + row_type.length = dst_type.length * (dst_count / src_count); + + if (src_count == 1) { + dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2); + dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2); + + row_type.length /= 2; + src_count *= 2; + } + + dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2, row_type.length / 2); + dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2); + dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2, row_type.length / 2); + dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2); + + row_type.length /= 2; + src_count *= 2; + } + + /* + * Store blend result to memory + */ + if (is_1d) { + store_unswizzled_block(gallivm, color_ptr, stride, block_width, 1, + dst, dst_type, dst_count / 4, dst_alignment); + } + else { + store_unswizzled_block(gallivm, color_ptr, stride, block_width, block_height, + dst, dst_type, dst_count, dst_alignment); + } + + if (have_smallfloat_format(dst_type, out_format)) { + lp_build_fpstate_set(gallivm, fpstate); + } + + if (do_branch) { + lp_build_mask_end(&mask_ctx); + } +} + + +/** + * Generate the runtime callable function for the whole fragment pipeline. + * Note that the function which we generate operates on a block of 16 + * pixels at at time. The block contains 2x2 quads. Each quad contains + * 2x2 pixels. 
+ */ +static void +generate_fragment(struct llvmpipe_context *lp, + struct lp_fragment_shader *shader, + struct lp_fragment_shader_variant *variant, + unsigned partial_mask) +{ + struct gallivm_state *gallivm = variant->gallivm; + const struct lp_fragment_shader_variant_key *key = &variant->key; + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; + char func_name[64]; + struct lp_type fs_type; + struct lp_type blend_type; + LLVMTypeRef fs_elem_type; + LLVMTypeRef blend_vec_type; + LLVMTypeRef arg_types[13]; + LLVMTypeRef func_type; + LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); + LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context); + LLVMValueRef context_ptr; + LLVMValueRef x; + LLVMValueRef y; + LLVMValueRef a0_ptr; + LLVMValueRef dadx_ptr; + LLVMValueRef dady_ptr; + LLVMValueRef color_ptr_ptr; + LLVMValueRef stride_ptr; + LLVMValueRef depth_ptr; + LLVMValueRef depth_stride; + LLVMValueRef mask_input; + LLVMValueRef thread_data_ptr; + LLVMBasicBlockRef block; + LLVMBuilderRef builder; + struct lp_build_sampler_soa *sampler; + struct lp_build_interp_soa_context interp; + LLVMValueRef fs_mask[16 / 4]; + LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4]; + LLVMValueRef function; + LLVMValueRef facing; + unsigned num_fs; + unsigned i; + unsigned chan; + unsigned cbuf; + boolean cbuf0_write_all; + const boolean dual_source_blend = key->blend.rt[0].blend_enable && + util_blend_state_is_dual(&key->blend, 0); + + assert(lp_native_vector_width / 32 >= 4); + + /* Adjust color input interpolation according to flatshade state: + */ + memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]); + for (i = 0; i < shader->info.base.num_inputs; i++) { + if (inputs[i].interp == LP_INTERP_COLOR) { + if (key->flatshade) + inputs[i].interp = LP_INTERP_CONSTANT; + else + inputs[i].interp = LP_INTERP_PERSPECTIVE; + } + } + + /* check if writes to cbuf[0] are to be copied to all cbufs */ + cbuf0_write_all 
= + shader->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]; + + /* TODO: actually pick these based on the fs and color buffer + * characteristics. */ + + memset(&fs_type, 0, sizeof fs_type); + fs_type.floating = TRUE; /* floating point values */ + fs_type.sign = TRUE; /* values are signed */ + fs_type.norm = FALSE; /* values are not limited to [0,1] or [-1,1] */ + fs_type.width = 32; /* 32-bit float */ + fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */ + + memset(&blend_type, 0, sizeof blend_type); + blend_type.floating = FALSE; /* values are integers */ + blend_type.sign = FALSE; /* values are unsigned */ + blend_type.norm = TRUE; /* values are in [0,1] or [-1,1] */ + blend_type.width = 8; /* 8-bit ubyte values */ + blend_type.length = 16; /* 16 elements per vector */ + + /* + * Generate the function prototype. Any change here must be reflected in + * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa. + */ + + fs_elem_type = lp_build_elem_type(gallivm, fs_type); + + blend_vec_type = lp_build_vec_type(gallivm, blend_type); + + util_snprintf(func_name, sizeof(func_name), "fs%u_variant%u_%s", + shader->no, variant->no, partial_mask ? 
"partial" : "whole"); + + arg_types[0] = variant->jit_context_ptr_type; /* context */ + arg_types[1] = int32_type; /* x */ + arg_types[2] = int32_type; /* y */ + arg_types[3] = int32_type; /* facing */ + arg_types[4] = LLVMPointerType(fs_elem_type, 0); /* a0 */ + arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* dadx */ + arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dady */ + arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0); /* color */ + arg_types[8] = LLVMPointerType(int8_type, 0); /* depth */ + arg_types[9] = int32_type; /* mask_input */ + arg_types[10] = variant->jit_thread_data_ptr_type; /* per thread data */ + arg_types[11] = LLVMPointerType(int32_type, 0); /* stride */ + arg_types[12] = int32_type; /* depth_stride */ + + func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), + arg_types, Elements(arg_types), 0); + + function = LLVMAddFunction(gallivm->module, func_name, func_type); + LLVMSetFunctionCallConv(function, LLVMCCallConv); + + variant->function[partial_mask] = function; + + /* XXX: need to propagate noalias down into color param now we are + * passing a pointer-to-pointer? 
+ */ + for(i = 0; i < Elements(arg_types); ++i) + if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) + LLVMAddAttribute(LLVMGetParam(function, i), LLVMNoAliasAttribute); + + context_ptr = LLVMGetParam(function, 0); + x = LLVMGetParam(function, 1); + y = LLVMGetParam(function, 2); + facing = LLVMGetParam(function, 3); + a0_ptr = LLVMGetParam(function, 4); + dadx_ptr = LLVMGetParam(function, 5); + dady_ptr = LLVMGetParam(function, 6); + color_ptr_ptr = LLVMGetParam(function, 7); + depth_ptr = LLVMGetParam(function, 8); + mask_input = LLVMGetParam(function, 9); + thread_data_ptr = LLVMGetParam(function, 10); + stride_ptr = LLVMGetParam(function, 11); + depth_stride = LLVMGetParam(function, 12); + + lp_build_name(context_ptr, "context"); + lp_build_name(x, "x"); + lp_build_name(y, "y"); + lp_build_name(a0_ptr, "a0"); + lp_build_name(dadx_ptr, "dadx"); + lp_build_name(dady_ptr, "dady"); + lp_build_name(color_ptr_ptr, "color_ptr_ptr"); + lp_build_name(depth_ptr, "depth"); + lp_build_name(thread_data_ptr, "thread_data"); + lp_build_name(mask_input, "mask_input"); + lp_build_name(stride_ptr, "stride_ptr"); + lp_build_name(depth_stride, "depth_stride"); + + /* + * Function body + */ + + block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry"); + builder = gallivm->builder; + assert(builder); + LLVMPositionBuilderAtEnd(builder, block); + + /* code generated texture sampling */ + sampler = lp_llvm_sampler_soa_create(key->state); + + num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */ + /* for 1d resources only run "upper half" of stamp */ + if (key->resource_1d) + num_fs /= 2; + + { + LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs); + LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type); + LLVMValueRef mask_store = lp_build_array_alloca(gallivm, mask_type, + num_loop, "mask_store"); + LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS]; + boolean pixel_center_integer = + 
shader->info.base.properties[TGSI_PROPERTY_FS_COORD_PIXEL_CENTER]; + + /* + * The shader input interpolation info is not explicitely baked in the + * shader key, but everything it derives from (TGSI, and flatshade) is + * already included in the shader key. + */ + lp_build_interp_soa_init(&interp, + gallivm, + shader->info.base.num_inputs, + inputs, + pixel_center_integer, + builder, fs_type, + a0_ptr, dadx_ptr, dady_ptr, + x, y); + + for (i = 0; i < num_fs; i++) { + LLVMValueRef mask; + LLVMValueRef indexi = lp_build_const_int32(gallivm, i); + LLVMValueRef mask_ptr = LLVMBuildGEP(builder, mask_store, + &indexi, 1, "mask_ptr"); + + if (partial_mask) { + mask = generate_quad_mask(gallivm, fs_type, + i*fs_type.length/4, mask_input); + } + else { + mask = lp_build_const_int_vec(gallivm, fs_type, ~0); + } + LLVMBuildStore(builder, mask, mask_ptr); + } + + generate_fs_loop(gallivm, + shader, key, + builder, + fs_type, + context_ptr, + num_loop, + &interp, + sampler, + mask_store, /* output */ + color_store, + depth_ptr, + depth_stride, + facing, + thread_data_ptr); + + for (i = 0; i < num_fs; i++) { + LLVMValueRef indexi = lp_build_const_int32(gallivm, i); + LLVMValueRef ptr = LLVMBuildGEP(builder, mask_store, + &indexi, 1, ""); + fs_mask[i] = LLVMBuildLoad(builder, ptr, "mask"); + /* This is fucked up need to reorganize things */ + for (cbuf = 0; cbuf < key->nr_cbufs; cbuf++) { + for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + ptr = LLVMBuildGEP(builder, + color_store[cbuf * !cbuf0_write_all][chan], + &indexi, 1, ""); + fs_out_color[cbuf][chan][i] = ptr; + } + } + if (dual_source_blend) { + /* only support one dual source blend target hence always use output 1 */ + for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) { + ptr = LLVMBuildGEP(builder, + color_store[1][chan], + &indexi, 1, ""); + fs_out_color[1][chan][i] = ptr; + } + } + } + } + + sampler->destroy(sampler); + + /* Loop over color outputs / color buffers to do blending. 
+ */ + for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) { + if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE) { + LLVMValueRef color_ptr; + LLVMValueRef stride; + LLVMValueRef index = lp_build_const_int32(gallivm, cbuf); + + boolean do_branch = ((key->depth.enabled + || key->stencil[0].enabled + || key->alpha.enabled) + && !shader->info.base.uses_kill); + + color_ptr = LLVMBuildLoad(builder, + LLVMBuildGEP(builder, color_ptr_ptr, + &index, 1, ""), + ""); + + lp_build_name(color_ptr, "color_ptr%d", cbuf); + + stride = LLVMBuildLoad(builder, + LLVMBuildGEP(builder, stride_ptr, &index, 1, ""), + ""); + + generate_unswizzled_blend(gallivm, cbuf, variant, + key->cbuf_format[cbuf], + num_fs, fs_type, fs_mask, fs_out_color, + context_ptr, color_ptr, stride, + partial_mask, do_branch); + } + } + + LLVMBuildRetVoid(builder); + + gallivm_verify_function(gallivm, function); +} + + +static void +dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key) +{ + unsigned i; + + debug_printf("fs variant %p:\n", (void *) key); + + if (key->flatshade) { + debug_printf("flatshade = 1\n"); + } + for (i = 0; i < key->nr_cbufs; ++i) { + debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i])); + } + if (key->depth.enabled) { + debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format)); + debug_printf("depth.func = %s\n", util_dump_func(key->depth.func, TRUE)); + debug_printf("depth.writemask = %u\n", key->depth.writemask); + } + + for (i = 0; i < 2; ++i) { + if (key->stencil[i].enabled) { + debug_printf("stencil[%u].func = %s\n", i, util_dump_func(key->stencil[i].func, TRUE)); + debug_printf("stencil[%u].fail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].fail_op, TRUE)); + debug_printf("stencil[%u].zpass_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zpass_op, TRUE)); + debug_printf("stencil[%u].zfail_op = %s\n", i, util_dump_stencil_op(key->stencil[i].zfail_op, TRUE)); + debug_printf("stencil[%u].valuemask = 0x%x\n", i, 
key->stencil[i].valuemask); + debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask); + } + } + + if (key->alpha.enabled) { + debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE)); + } + + if (key->occlusion_count) { + debug_printf("occlusion_count = 1\n"); + } + + if (key->blend.logicop_enable) { + debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE)); + } + else if (key->blend.rt[0].blend_enable) { + debug_printf("blend.rgb_func = %s\n", util_dump_blend_func (key->blend.rt[0].rgb_func, TRUE)); + debug_printf("blend.rgb_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].rgb_src_factor, TRUE)); + debug_printf("blend.rgb_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].rgb_dst_factor, TRUE)); + debug_printf("blend.alpha_func = %s\n", util_dump_blend_func (key->blend.rt[0].alpha_func, TRUE)); + debug_printf("blend.alpha_src_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_src_factor, TRUE)); + debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE)); + } + debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask); + if (key->blend.alpha_to_coverage) { + debug_printf("blend.alpha_to_coverage is enabled\n"); + } + for (i = 0; i < key->nr_samplers; ++i) { + const struct lp_static_sampler_state *sampler = &key->state[i].sampler_state; + debug_printf("sampler[%u] = \n", i); + debug_printf(" .wrap = %s %s %s\n", + util_dump_tex_wrap(sampler->wrap_s, TRUE), + util_dump_tex_wrap(sampler->wrap_t, TRUE), + util_dump_tex_wrap(sampler->wrap_r, TRUE)); + debug_printf(" .min_img_filter = %s\n", + util_dump_tex_filter(sampler->min_img_filter, TRUE)); + debug_printf(" .min_mip_filter = %s\n", + util_dump_tex_mipfilter(sampler->min_mip_filter, TRUE)); + debug_printf(" .mag_img_filter = %s\n", + util_dump_tex_filter(sampler->mag_img_filter, TRUE)); + if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) + 
debug_printf(" .compare_func = %s\n", util_dump_func(sampler->compare_func, TRUE)); + debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords); + debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal); + debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero); + debug_printf(" .apply_min_lod = %u\n", sampler->apply_min_lod); + debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod); + } + for (i = 0; i < key->nr_sampler_views; ++i) { + const struct lp_static_texture_state *texture = &key->state[i].texture_state; + debug_printf("texture[%u] = \n", i); + debug_printf(" .format = %s\n", + util_format_name(texture->format)); + debug_printf(" .target = %s\n", + util_dump_tex_target(texture->target, TRUE)); + debug_printf(" .level_zero_only = %u\n", + texture->level_zero_only); + debug_printf(" .pot = %u %u %u\n", + texture->pot_width, + texture->pot_height, + texture->pot_depth); + } +} + + +void +lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant) +{ + debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n", + variant->shader->no, variant->no); + tgsi_dump(variant->shader->base.tokens, 0); + dump_fs_variant_key(&variant->key); + debug_printf("variant->opaque = %u\n", variant->opaque); + debug_printf("\n"); +} + + +/** + * Generate a new fragment shader variant from the shader code and + * other state indicated by the key. 
+ */ +static struct lp_fragment_shader_variant * +generate_variant(struct llvmpipe_context *lp, + struct lp_fragment_shader *shader, + const struct lp_fragment_shader_variant_key *key) +{ + struct lp_fragment_shader_variant *variant; + const struct util_format_description *cbuf0_format_desc; + boolean fullcolormask; + char module_name[64]; + + variant = CALLOC_STRUCT(lp_fragment_shader_variant); + if(!variant) + return NULL; + + util_snprintf(module_name, sizeof(module_name), "fs%u_variant%u", + shader->no, shader->variants_created); + + variant->gallivm = gallivm_create(module_name, lp->context); + if (!variant->gallivm) { + FREE(variant); + return NULL; + } + + variant->shader = shader; + variant->list_item_global.base = variant; + variant->list_item_local.base = variant; + variant->no = shader->variants_created++; + + memcpy(&variant->key, key, shader->variant_key_size); + + /* + * Determine whether we are touching all channels in the color buffer. + */ + fullcolormask = FALSE; + if (key->nr_cbufs == 1) { + cbuf0_format_desc = util_format_description(key->cbuf_format[0]); + fullcolormask = util_format_colormask_full(cbuf0_format_desc, key->blend.rt[0].colormask); + } + + variant->opaque = + !key->blend.logicop_enable && + !key->blend.rt[0].blend_enable && + fullcolormask && + !key->stencil[0].enabled && + !key->alpha.enabled && + !key->blend.alpha_to_coverage && + !key->depth.enabled && + !shader->info.base.uses_kill + ? 
TRUE : FALSE; + + if ((shader->info.base.num_tokens <= 1) && + !key->depth.enabled && !key->stencil[0].enabled) { + variant->ps_inv_multiplier = 0; + } else { + variant->ps_inv_multiplier = 1; + } + + if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) { + lp_debug_fs_variant(variant); + } + + lp_jit_init_types(variant); + + if (variant->jit_function[RAST_EDGE_TEST] == NULL) + generate_fragment(lp, shader, variant, RAST_EDGE_TEST); + + if (variant->jit_function[RAST_WHOLE] == NULL) { + if (variant->opaque) { + /* Specialized shader, which doesn't need to read the color buffer. */ + generate_fragment(lp, shader, variant, RAST_WHOLE); + } + } + + /* + * Compile everything + */ + + gallivm_compile_module(variant->gallivm); + + variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module); + + if (variant->function[RAST_EDGE_TEST]) { + variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func) + gallivm_jit_function(variant->gallivm, + variant->function[RAST_EDGE_TEST]); + } + + if (variant->function[RAST_WHOLE]) { + variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func) + gallivm_jit_function(variant->gallivm, + variant->function[RAST_WHOLE]); + } else if (!variant->jit_function[RAST_WHOLE]) { + variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST]; + } + + gallivm_free_ir(variant->gallivm); + + return variant; +} + + +static void * +llvmpipe_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + struct lp_fragment_shader *shader; + int nr_samplers; + int nr_sampler_views; + int i; + + shader = CALLOC_STRUCT(lp_fragment_shader); + if (!shader) + return NULL; + + shader->no = fs_no++; + make_empty_list(&shader->variants); + + /* get/save the summary info for this shader */ + lp_build_tgsi_info(templ->tokens, &shader->info); + + /* we need to keep a local copy of the tokens */ + shader->base.tokens = tgsi_dup_tokens(templ->tokens); 
+ + shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ); + if (shader->draw_data == NULL) { + FREE((void *) shader->base.tokens); + FREE(shader); + return NULL; + } + + nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; + nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; + + shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key, + state[MAX2(nr_samplers, nr_sampler_views)]); + + for (i = 0; i < shader->info.base.num_inputs; i++) { + shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i]; + shader->inputs[i].cyl_wrap = shader->info.base.input_cylindrical_wrap[i]; + + switch (shader->info.base.input_interpolate[i]) { + case TGSI_INTERPOLATE_CONSTANT: + shader->inputs[i].interp = LP_INTERP_CONSTANT; + break; + case TGSI_INTERPOLATE_LINEAR: + shader->inputs[i].interp = LP_INTERP_LINEAR; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + shader->inputs[i].interp = LP_INTERP_PERSPECTIVE; + break; + case TGSI_INTERPOLATE_COLOR: + shader->inputs[i].interp = LP_INTERP_COLOR; + break; + default: + assert(0); + break; + } + + switch (shader->info.base.input_semantic_name[i]) { + case TGSI_SEMANTIC_FACE: + shader->inputs[i].interp = LP_INTERP_FACING; + break; + case TGSI_SEMANTIC_POSITION: + /* Position was already emitted above + */ + shader->inputs[i].interp = LP_INTERP_POSITION; + shader->inputs[i].src_index = 0; + continue; + } + + shader->inputs[i].src_index = i+1; + } + + if (LP_DEBUG & DEBUG_TGSI) { + unsigned attrib; + debug_printf("llvmpipe: Create fragment shader #%u %p:\n", + shader->no, (void *) shader); + tgsi_dump(templ->tokens, 0); + debug_printf("usage masks:\n"); + for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) { + unsigned usage_mask = shader->info.base.input_usage_mask[attrib]; + debug_printf(" IN[%u].%s%s%s%s\n", + attrib, + usage_mask & TGSI_WRITEMASK_X ? "x" : "", + usage_mask & TGSI_WRITEMASK_Y ? "y" : "", + usage_mask & TGSI_WRITEMASK_Z ? 
"z" : "", + usage_mask & TGSI_WRITEMASK_W ? "w" : ""); + } + debug_printf("\n"); + } + + return shader; +} + + +static void +llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + if (llvmpipe->fs == fs) + return; + + llvmpipe->fs = (struct lp_fragment_shader *) fs; + + draw_bind_fragment_shader(llvmpipe->draw, + (llvmpipe->fs ? llvmpipe->fs->draw_data : NULL)); + + llvmpipe->dirty |= LP_NEW_FS; +} + + +/** + * Remove shader variant from two lists: the shader's variant list + * and the context's variant list. + */ +void +llvmpipe_remove_shader_variant(struct llvmpipe_context *lp, + struct lp_fragment_shader_variant *variant) +{ + if (gallivm_debug & GALLIVM_DEBUG_IR) { + debug_printf("llvmpipe: del fs #%u var #%u v created #%u v cached" + " #%u v total cached #%u\n", + variant->shader->no, + variant->no, + variant->shader->variants_created, + variant->shader->variants_cached, + lp->nr_fs_variants); + } + + gallivm_destroy(variant->gallivm); + + /* remove from shader's list */ + remove_from_list(&variant->list_item_local); + variant->shader->variants_cached--; + + /* remove from context's list */ + remove_from_list(&variant->list_item_global); + lp->nr_fs_variants--; + lp->nr_fs_instrs -= variant->nr_instrs; + + FREE(variant); +} + + +static void +llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + struct lp_fragment_shader *shader = fs; + struct lp_fs_variant_list_item *li; + + assert(fs != llvmpipe->fs); + + /* + * XXX: we need to flush the context until we have some sort of reference + * counting in fragment shaders as they may still be binned + * Flushing alone might not sufficient we need to wait on it too. 
+ */ + llvmpipe_finish(pipe, __FUNCTION__); + + /* Delete all the variants */ + li = first_elem(&shader->variants); + while(!at_end(&shader->variants, li)) { + struct lp_fs_variant_list_item *next = next_elem(li); + llvmpipe_remove_shader_variant(llvmpipe, li->base); + li = next; + } + + /* Delete draw module's data */ + draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data); + + assert(shader->variants_cached == 0); + FREE((void *) shader->base.tokens); + FREE(shader); +} + + + +static void +llvmpipe_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + struct pipe_constant_buffer *cb) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + struct pipe_resource *constants = cb ? cb->buffer : NULL; + + assert(shader < PIPE_SHADER_TYPES); + assert(index < Elements(llvmpipe->constants[shader])); + + /* note: reference counting */ + util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb); + + if (shader == PIPE_SHADER_VERTEX || + shader == PIPE_SHADER_GEOMETRY) { + /* Pass the constants to the 'draw' module */ + const unsigned size = cb ? cb->buffer_size : 0; + const ubyte *data; + + if (constants) { + data = (ubyte *) llvmpipe_resource_data(constants); + } + else if (cb && cb->user_buffer) { + data = (ubyte *) cb->user_buffer; + } + else { + data = NULL; + } + + if (data) + data += cb->buffer_offset; + + draw_set_mapped_constant_buffer(llvmpipe->draw, shader, + index, data, size); + } + + llvmpipe->dirty |= LP_NEW_CONSTANTS; + + if (cb && cb->user_buffer) { + pipe_resource_reference(&constants, NULL); + } +} + + +/** + * Return the blend factor equivalent to a destination alpha of one. 
+ */ +static inline unsigned +force_dst_alpha_one(unsigned factor, boolean clamped_zero) +{ + switch(factor) { + case PIPE_BLENDFACTOR_DST_ALPHA: + return PIPE_BLENDFACTOR_ONE; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return PIPE_BLENDFACTOR_ZERO; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + if (clamped_zero) + return PIPE_BLENDFACTOR_ZERO; + else + return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE; + } + + return factor; +} + + +/** + * We need to generate several variants of the fragment pipeline to match + * all the combinations of the contributing state atoms. + * + * TODO: there is actually no reason to tie this to context state -- the + * generated code could be cached globally in the screen. + */ +static void +make_variant_key(struct llvmpipe_context *lp, + struct lp_fragment_shader *shader, + struct lp_fragment_shader_variant_key *key) +{ + unsigned i; + + memset(key, 0, shader->variant_key_size); + + if (lp->framebuffer.zsbuf) { + enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format; + const struct util_format_description *zsbuf_desc = + util_format_description(zsbuf_format); + + if (lp->depth_stencil->depth.enabled && + util_format_has_depth(zsbuf_desc)) { + key->zsbuf_format = zsbuf_format; + memcpy(&key->depth, &lp->depth_stencil->depth, sizeof key->depth); + } + if (lp->depth_stencil->stencil[0].enabled && + util_format_has_stencil(zsbuf_desc)) { + key->zsbuf_format = zsbuf_format; + memcpy(&key->stencil, &lp->depth_stencil->stencil, sizeof key->stencil); + } + if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) { + key->resource_1d = TRUE; + } + } + + /* + * Propagate the depth clamp setting from the rasterizer state. + * depth_clip == 0 implies depth clamping is enabled. + * + * When clip_halfz is enabled, then always clamp the depth values. + */ + if (lp->rasterizer->clip_halfz) { + key->depth_clamp = 1; + } else { + key->depth_clamp = (lp->rasterizer->depth_clip == 0) ? 
1 : 0; + } + + /* alpha test only applies if render buffer 0 is non-integer (or does not exist) */ + if (!lp->framebuffer.nr_cbufs || + !lp->framebuffer.cbufs[0] || + !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) { + key->alpha.enabled = lp->depth_stencil->alpha.enabled; + } + if(key->alpha.enabled) + key->alpha.func = lp->depth_stencil->alpha.func; + /* alpha.ref_value is passed in jit_context */ + + key->flatshade = lp->rasterizer->flatshade; + if (lp->active_occlusion_queries) { + key->occlusion_count = TRUE; + } + + if (lp->framebuffer.nr_cbufs) { + memcpy(&key->blend, lp->blend, sizeof key->blend); + } + + key->nr_cbufs = lp->framebuffer.nr_cbufs; + + if (!key->blend.independent_blend_enable) { + /* we always need independent blend otherwise the fixups below won't work */ + for (i = 1; i < key->nr_cbufs; i++) { + memcpy(&key->blend.rt[i], &key->blend.rt[0], sizeof(key->blend.rt[0])); + } + key->blend.independent_blend_enable = 1; + } + + for (i = 0; i < lp->framebuffer.nr_cbufs; i++) { + struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i]; + + if (lp->framebuffer.cbufs[i]) { + enum pipe_format format = lp->framebuffer.cbufs[i]->format; + const struct util_format_description *format_desc; + + key->cbuf_format[i] = format; + + /* + * Figure out if this is a 1d resource. Note that OpenGL allows crazy + * mixing of 2d textures with height 1 and 1d textures, so make sure + * we pick 1d if any cbuf or zsbuf is 1d. + */ + if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) { + key->resource_1d = TRUE; + } + + format_desc = util_format_description(format); + assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB); + + /* + * Mask out color channels not present in the color buffer. + */ + blend_rt->colormask &= util_format_colormask(format_desc); + + /* + * Disable blend for integer formats. 
+ */ + if (util_format_is_pure_integer(format)) { + blend_rt->blend_enable = 0; + } + + /* + * Our swizzled render tiles always have an alpha channel, but the + * linear render target format often does not, so force here the dst + * alpha to be one. + * + * This is not a mere optimization. Wrong results will be produced if + * the dst alpha is used, the dst format does not have alpha, and the + * previous rendering was not flushed from the swizzled to linear + * buffer. For example, NonPowTwo DCT. + * + * TODO: This should be generalized to all channels for better + * performance, but only alpha causes correctness issues. + * + * Also, force rgb/alpha func/factors match, to make AoS blending + * easier. + */ + if (format_desc->swizzle[3] > UTIL_FORMAT_SWIZZLE_W || + format_desc->swizzle[3] == format_desc->swizzle[0]) { + /* Doesn't cover mixed snorm/unorm but can't render to them anyway */ + boolean clamped_zero = !util_format_is_float(format) && + !util_format_is_snorm(format); + blend_rt->rgb_src_factor = + force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero); + blend_rt->rgb_dst_factor = + force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero); + blend_rt->alpha_func = blend_rt->rgb_func; + blend_rt->alpha_src_factor = blend_rt->rgb_src_factor; + blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor; + } + } + else { + /* no color buffer for this fragment output */ + key->cbuf_format[i] = PIPE_FORMAT_NONE; + blend_rt->colormask = 0x0; + blend_rt->blend_enable = 0; + } + } + + /* This value will be the same for all the variants of a given shader: + */ + key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1; + + for(i = 0; i < key->nr_samplers; ++i) { + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_sampler_state(&key->state[i].sampler_state, + lp->samplers[PIPE_SHADER_FRAGMENT][i]); + } + } + + /* + * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes + * are dx10-style? 
Can't really have mixed opcodes, at least not + * if we want to skip the holes here (without rescanning tgsi). + */ + if (shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { + key->nr_sampler_views = shader->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; + for(i = 0; i < key->nr_sampler_views; ++i) { + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { + lp_sampler_static_texture_state(&key->state[i].texture_state, + lp->sampler_views[PIPE_SHADER_FRAGMENT][i]); + } + } + } + else { + key->nr_sampler_views = key->nr_samplers; + for(i = 0; i < key->nr_sampler_views; ++i) { + if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_texture_state(&key->state[i].texture_state, + lp->sampler_views[PIPE_SHADER_FRAGMENT][i]); + } + } + } +} + + + +/** + * Update fragment shader state. This is called just prior to drawing + * something when some fragment-related state has changed. + */ +void +llvmpipe_update_fs(struct llvmpipe_context *lp) +{ + struct lp_fragment_shader *shader = lp->fs; + struct lp_fragment_shader_variant_key key; + struct lp_fragment_shader_variant *variant = NULL; + struct lp_fs_variant_list_item *li; + + make_variant_key(lp, shader, &key); + + /* Search the variants for one which matches the key */ + li = first_elem(&shader->variants); + while(!at_end(&shader->variants, li)) { + if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) { + variant = li->base; + break; + } + li = next_elem(li); + } + + if (variant) { + /* Move this variant to the head of the list to implement LRU + * deletion of shader's when we have too many. + */ + move_to_head(&lp->fs_variants_list, &variant->list_item_global); + } + else { + /* variant not found, create it now */ + int64_t t0, t1, dt; + unsigned i; + unsigned variants_to_cull; + + if (0) { + debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n", + lp->nr_fs_variants, + lp->nr_fs_instrs, + lp->nr_fs_variants ? 
lp->nr_fs_instrs / lp->nr_fs_variants : 0); + } + + /* First, check if we've exceeded the max number of shader variants. + * If so, free 25% of them (the least recently used ones). + */ + variants_to_cull = lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS ? LP_MAX_SHADER_VARIANTS / 4 : 0; + + if (variants_to_cull || + lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) { + struct pipe_context *pipe = &lp->pipe; + + /* + * XXX: we need to flush the context until we have some sort of + * reference counting in fragment shaders as they may still be binned + * Flushing alone might not be sufficient we need to wait on it too. + */ + llvmpipe_finish(pipe, __FUNCTION__); + + /* + * We need to re-check lp->nr_fs_variants because an arbitrarliy large + * number of shader variants (potentially all of them) could be + * pending for destruction on flush. + */ + + for (i = 0; i < variants_to_cull || lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS; i++) { + struct lp_fs_variant_list_item *item; + if (is_empty_list(&lp->fs_variants_list)) { + break; + } + item = last_elem(&lp->fs_variants_list); + assert(item); + assert(item->base); + llvmpipe_remove_shader_variant(lp, item->base); + } + } + + /* + * Generate the new variant. + */ + t0 = os_time_get(); + variant = generate_variant(lp, shader, &key); + t1 = os_time_get(); + dt = t1 - t0; + LP_COUNT_ADD(llvm_compile_time, dt); + LP_COUNT_ADD(nr_llvm_compiles, 2); /* emit vs. 
omit in/out test */ + + /* Put the new variant into the list */ + if (variant) { + insert_at_head(&shader->variants, &variant->list_item_local); + insert_at_head(&lp->fs_variants_list, &variant->list_item_global); + lp->nr_fs_variants++; + lp->nr_fs_instrs += variant->nr_instrs; + shader->variants_cached++; + } + } + + /* Bind this variant */ + lp_setup_set_fs_variant(lp->setup, variant); +} + + + + + +void +llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state; + llvmpipe->pipe.bind_fs_state = llvmpipe_bind_fs_state; + llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state; + + llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer; +} + +/* + * Rasterization is disabled if there is no pixel shader and + * both depth and stencil testing are disabled: + * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125 + */ +boolean +llvmpipe_rasterization_disabled(struct llvmpipe_context *lp) +{ + boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1; + + return (null_fs && + !lp->depth_stencil->depth.enabled && + !lp->depth_stencil->stencil[0].enabled); +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_fs.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_fs.h new file mode 100644 index 000000000..2ddd85188 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -0,0 +1,157 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#ifndef LP_STATE_FS_H_ +#define LP_STATE_FS_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */ +#include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */ +#include "gallivm/lp_bld_tgsi.h" /* for lp_tgsi_info */ +#include "lp_bld_interp.h" /* for struct lp_shader_input */ + + +struct tgsi_token; +struct lp_fragment_shader; + + +/** Indexes into jit_function[] array */ +#define RAST_WHOLE 0 +#define RAST_EDGE_TEST 1 + + +struct lp_sampler_static_state +{ + /* + * These attributes are effectively interleaved for more sane key handling. + * However, there might be lots of null space if the amount of samplers and + * textures isn't the same. 
+ */ + struct lp_static_sampler_state sampler_state; + struct lp_static_texture_state texture_state; +}; + + +struct lp_fragment_shader_variant_key +{ + struct pipe_depth_state depth; + struct pipe_stencil_state stencil[2]; + struct pipe_blend_state blend; + + struct { + unsigned enabled:1; + unsigned func:3; + } alpha; + + unsigned nr_cbufs:8; + unsigned nr_samplers:8; /* actually derivable from just the shader */ + unsigned nr_sampler_views:8; /* actually derivable from just the shader */ + unsigned flatshade:1; + unsigned occlusion_count:1; + unsigned resource_1d:1; + unsigned depth_clamp:1; + + enum pipe_format zsbuf_format; + enum pipe_format cbuf_format[PIPE_MAX_COLOR_BUFS]; + + struct lp_sampler_static_state state[PIPE_MAX_SHADER_SAMPLER_VIEWS]; +}; + + +/** doubly-linked list item */ +struct lp_fs_variant_list_item +{ + struct lp_fragment_shader_variant *base; + struct lp_fs_variant_list_item *next, *prev; +}; + + +struct lp_fragment_shader_variant +{ + struct lp_fragment_shader_variant_key key; + + boolean opaque; + uint8_t ps_inv_multiplier; + + struct gallivm_state *gallivm; + + LLVMTypeRef jit_context_ptr_type; + LLVMTypeRef jit_thread_data_ptr_type; + LLVMTypeRef jit_linear_context_ptr_type; + + LLVMValueRef function[2]; + + lp_jit_frag_func jit_function[2]; + + /* Total number of LLVM instructions generated */ + unsigned nr_instrs; + + struct lp_fs_variant_list_item list_item_global, list_item_local; + struct lp_fragment_shader *shader; + + /* For debugging/profiling purposes */ + unsigned no; +}; + + +/** Subclass of pipe_shader_state */ +struct lp_fragment_shader +{ + struct pipe_shader_state base; + + struct lp_tgsi_info info; + + struct lp_fs_variant_list_item variants; + + struct draw_fragment_shader *draw_data; + + /* For debugging/profiling purposes */ + unsigned variant_key_size; + unsigned no; + unsigned variants_created; + unsigned variants_cached; + + /** Fragment shader input interpolation info */ + struct lp_shader_input 
inputs[PIPE_MAX_SHADER_INPUTS]; +}; + + +void +lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant); + +void +llvmpipe_remove_shader_variant(struct llvmpipe_context *lp, + struct lp_fragment_shader_variant *variant); + +boolean +llvmpipe_rasterization_disabled(struct llvmpipe_context *lp); + + +#endif /* LP_STATE_FS_H_ */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_gs.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_gs.c new file mode 100644 index 000000000..7ea7a3906 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_gs.c @@ -0,0 +1,116 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "lp_context.h" +#include "lp_state.h" +#include "lp_texture.h" +#include "lp_debug.h" + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "draw/draw_context.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_parse.h" + + +static void * +llvmpipe_create_gs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + struct lp_geometry_shader *state; + + state = CALLOC_STRUCT(lp_geometry_shader); + if (state == NULL ) + goto no_state; + + /* debug */ + if (LP_DEBUG & DEBUG_TGSI) { + debug_printf("llvmpipe: Create geometry shader %p:\n", (void *)state); + tgsi_dump(templ->tokens, 0); + } + + /* copy stream output info */ + state->no_tokens = !templ->tokens; + memcpy(&state->stream_output, &templ->stream_output, sizeof state->stream_output); + + if (templ->tokens) { + state->dgs = draw_create_geometry_shader(llvmpipe->draw, templ); + if (state->dgs == NULL) { + goto no_dgs; + } + } + + return state; + +no_dgs: + FREE( state ); +no_state: + return NULL; +} + + +static void +llvmpipe_bind_gs_state(struct pipe_context *pipe, void *gs) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + llvmpipe->gs = (struct lp_geometry_shader *)gs; + + draw_bind_geometry_shader(llvmpipe->draw, + (llvmpipe->gs ? 
llvmpipe->gs->dgs : NULL)); + + llvmpipe->dirty |= LP_NEW_GS; +} + + +static void +llvmpipe_delete_gs_state(struct pipe_context *pipe, void *gs) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + + struct lp_geometry_shader *state = + (struct lp_geometry_shader *)gs; + + if (!state) { + return; + } + + draw_delete_geometry_shader(llvmpipe->draw, state->dgs); + FREE(state); +} + + +void +llvmpipe_init_gs_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.create_gs_state = llvmpipe_create_gs_state; + llvmpipe->pipe.bind_gs_state = llvmpipe_bind_gs_state; + llvmpipe->pipe.delete_gs_state = llvmpipe_delete_gs_state; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c new file mode 100644 index 000000000..94ebf8fff --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c @@ -0,0 +1,154 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "lp_context.h" +#include "lp_state.h" +#include "lp_setup.h" +#include "draw/draw_context.h" + +struct lp_rast_state { + struct pipe_rasterizer_state lp_state; + struct pipe_rasterizer_state draw_state; +}; + +/* State which might be handled in either the draw module or locally. + * This function is used to turn that state off in one of the two + * places. + */ +static void +clear_flags(struct pipe_rasterizer_state *rast) +{ + rast->light_twoside = 0; + rast->offset_tri = 0; + rast->offset_line = 0; + rast->offset_point = 0; + rast->offset_units = 0.0f; + rast->offset_scale = 0.0f; +} + + + +static void * +llvmpipe_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *rast) +{ + boolean need_pipeline; + + /* Partition rasterizer state into what we want the draw module to + * handle, and what we'll look after ourselves. + */ + struct lp_rast_state *state = MALLOC_STRUCT(lp_rast_state); + if (state == NULL) + return NULL; + + memcpy(&state->draw_state, rast, sizeof *rast); + memcpy(&state->lp_state, rast, sizeof *rast); + + /* We rely on draw module to do unfilled polyons, AA lines and + * points and stipple. + * + * Over time, reduce this list of conditions, and expand the list + * of flags which get cleared in clear_flags(). 
+ */ + need_pipeline = (rast->fill_front != PIPE_POLYGON_MODE_FILL || + rast->fill_back != PIPE_POLYGON_MODE_FILL || + rast->point_smooth || + rast->line_smooth || + rast->line_stipple_enable || + rast->poly_stipple_enable); + + /* If not using the pipeline, clear out the flags which we can + * handle ourselves. If we *are* using the pipeline, do everything + * on the pipeline and clear those flags on our internal copy of + * the state. + */ + if (need_pipeline) + clear_flags(&state->lp_state); + else + clear_flags(&state->draw_state); + + return state; +} + + + +static void +llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *handle) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + const struct lp_rast_state *state = + (const struct lp_rast_state *) handle; + + if (state) { + llvmpipe->rasterizer = &state->lp_state; + draw_set_rasterizer_state(llvmpipe->draw, &state->draw_state, handle); + + /* XXX: just pass lp_state directly to setup. + */ + lp_setup_set_triangle_state( llvmpipe->setup, + state->lp_state.cull_face, + state->lp_state.front_ccw, + state->lp_state.scissor, + state->lp_state.half_pixel_center, + state->lp_state.bottom_edge_rule); + lp_setup_set_flatshade_first( llvmpipe->setup, + state->lp_state.flatshade_first); + lp_setup_set_line_state( llvmpipe->setup, + state->lp_state.line_width); + lp_setup_set_point_state( llvmpipe->setup, + state->lp_state.point_size, + state->lp_state.point_size_per_vertex, + state->lp_state.sprite_coord_enable, + state->lp_state.sprite_coord_mode); + } + else { + llvmpipe->rasterizer = NULL; + draw_set_rasterizer_state(llvmpipe->draw, NULL, handle); + } + + llvmpipe->dirty |= LP_NEW_RASTERIZER; +} + + +static void +llvmpipe_delete_rasterizer_state(struct pipe_context *pipe, + void *rasterizer) +{ + FREE( rasterizer ); +} + + + +void +llvmpipe_init_rasterizer_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.create_rasterizer_state = llvmpipe_create_rasterizer_state; + 
llvmpipe->pipe.bind_rasterizer_state = llvmpipe_bind_rasterizer_state; + llvmpipe->pipe.delete_rasterizer_state = llvmpipe_delete_rasterizer_state; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_sampler.c new file mode 100644 index 000000000..b205f02fd --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_sampler.c @@ -0,0 +1,390 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: + * Brian Paul + */ + +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#include "draw/draw_context.h" + +#include "lp_context.h" +#include "lp_screen.h" +#include "lp_state.h" +#include "lp_debug.h" +#include "state_tracker/sw_winsys.h" + + +static void * +llvmpipe_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *sampler) +{ + struct pipe_sampler_state *state = mem_dup(sampler, sizeof *sampler); + + if (LP_PERF & PERF_NO_MIP_LINEAR) { + if (state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) + state->min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST; + } + + if (LP_PERF & PERF_NO_MIPMAPS) + state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; + + if (LP_PERF & PERF_NO_LINEAR) { + state->mag_img_filter = PIPE_TEX_FILTER_NEAREST; + state->min_img_filter = PIPE_TEX_FILTER_NEAREST; + } + + return state; +} + + +static void +llvmpipe_bind_sampler_states(struct pipe_context *pipe, + unsigned shader, + unsigned start, + unsigned num, + void **samplers) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + unsigned i; + + assert(shader < PIPE_SHADER_TYPES); + assert(start + num <= Elements(llvmpipe->samplers[shader])); + + draw_flush(llvmpipe->draw); + + /* set the new samplers */ + for (i = 0; i < num; i++) { + llvmpipe->samplers[shader][start + i] = samplers[i]; + } + + /* find highest non-null samplers[] entry */ + { + unsigned j = MAX2(llvmpipe->num_samplers[shader], start + num); + while (j > 0 && llvmpipe->samplers[shader][j - 1] == NULL) + j--; + llvmpipe->num_samplers[shader] = j; + } + + if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) { + draw_set_samplers(llvmpipe->draw, + shader, + llvmpipe->samplers[shader], + llvmpipe->num_samplers[shader]); + } + + llvmpipe->dirty |= LP_NEW_SAMPLER; +} + + +static void +llvmpipe_set_sampler_views(struct pipe_context *pipe, + unsigned shader, + unsigned start, + 
unsigned num, + struct pipe_sampler_view **views) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + uint i; + + assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); + + assert(shader < PIPE_SHADER_TYPES); + assert(start + num <= Elements(llvmpipe->sampler_views[shader])); + + draw_flush(llvmpipe->draw); + + /* set the new sampler views */ + for (i = 0; i < num; i++) { + /* Note: we're using pipe_sampler_view_release() here to work around + * a possible crash when the old view belongs to another context that + * was already destroyed. + */ + pipe_sampler_view_release(pipe, + &llvmpipe->sampler_views[shader][start + i]); + pipe_sampler_view_reference(&llvmpipe->sampler_views[shader][start + i], + views[i]); + } + + /* find highest non-null sampler_views[] entry */ + { + unsigned j = MAX2(llvmpipe->num_sampler_views[shader], start + num); + while (j > 0 && llvmpipe->sampler_views[shader][j - 1] == NULL) + j--; + llvmpipe->num_sampler_views[shader] = j; + } + + if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) { + draw_set_sampler_views(llvmpipe->draw, + shader, + llvmpipe->sampler_views[shader], + llvmpipe->num_sampler_views[shader]); + } + + llvmpipe->dirty |= LP_NEW_SAMPLER_VIEW; +} + + +static struct pipe_sampler_view * +llvmpipe_create_sampler_view(struct pipe_context *pipe, + struct pipe_resource *texture, + const struct pipe_sampler_view *templ) +{ + struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view); + /* + * XXX we REALLY want to see the correct bind flag here but the OpenGL + * state tracker can't guarantee that at least for texture buffer objects. 
+ */ + if (!(texture->bind & PIPE_BIND_SAMPLER_VIEW)) + debug_printf("Illegal sampler view creation without bind flag\n"); + + if (view) { + *view = *templ; + view->reference.count = 1; + view->texture = NULL; + pipe_resource_reference(&view->texture, texture); + view->context = pipe; + +#ifdef DEBUG + /* + * This is possibly too lenient, but the primary reason is just + * to catch state trackers which forget to initialize this, so + * it only catches clearly impossible view targets. + */ + if (view->target != texture->target) { + if (view->target == PIPE_TEXTURE_1D) + assert(texture->target == PIPE_TEXTURE_1D_ARRAY); + else if (view->target == PIPE_TEXTURE_1D_ARRAY) + assert(texture->target == PIPE_TEXTURE_1D); + else if (view->target == PIPE_TEXTURE_2D) + assert(texture->target == PIPE_TEXTURE_2D_ARRAY || + texture->target == PIPE_TEXTURE_CUBE || + texture->target == PIPE_TEXTURE_CUBE_ARRAY); + else if (view->target == PIPE_TEXTURE_2D_ARRAY) + assert(texture->target == PIPE_TEXTURE_2D || + texture->target == PIPE_TEXTURE_CUBE || + texture->target == PIPE_TEXTURE_CUBE_ARRAY); + else if (view->target == PIPE_TEXTURE_CUBE) + assert(texture->target == PIPE_TEXTURE_CUBE_ARRAY || + texture->target == PIPE_TEXTURE_2D_ARRAY); + else if (view->target == PIPE_TEXTURE_CUBE_ARRAY) + assert(texture->target == PIPE_TEXTURE_CUBE || + texture->target == PIPE_TEXTURE_2D_ARRAY); + else + assert(0); + } +#endif + } + + return view; +} + + +static void +llvmpipe_sampler_view_destroy(struct pipe_context *pipe, + struct pipe_sampler_view *view) +{ + pipe_resource_reference(&view->texture, NULL); + FREE(view); +} + + +static void +llvmpipe_delete_sampler_state(struct pipe_context *pipe, + void *sampler) +{ + FREE( sampler ); +} + + +static void +prepare_shader_sampling( + struct llvmpipe_context *lp, + unsigned num, + struct pipe_sampler_view **views, + unsigned shader_type, + struct pipe_resource *mapped_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]) +{ + + unsigned i; + uint32_t 
row_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; + const void *addr; + + assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); + if (!num) + return; + + for (i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + struct pipe_sampler_view *view = i < num ? views[i] : NULL; + + if (view) { + struct pipe_resource *tex = view->texture; + struct llvmpipe_resource *lp_tex = llvmpipe_resource(tex); + unsigned width0 = tex->width0; + unsigned num_layers = tex->depth0; + unsigned first_level = 0; + unsigned last_level = 0; + + /* We're referencing the texture's internal data, so save a + * reference to it. + */ + pipe_resource_reference(&mapped_tex[i], tex); + + if (!lp_tex->dt) { + /* regular texture - setup array of mipmap level offsets */ + struct pipe_resource *res = view->texture; + int j; + + if (llvmpipe_resource_is_texture(res)) { + first_level = view->u.tex.first_level; + last_level = view->u.tex.last_level; + assert(first_level <= last_level); + assert(last_level <= res->last_level); + addr = lp_tex->tex_data; + + for (j = first_level; j <= last_level; j++) { + mip_offsets[j] = lp_tex->mip_offsets[j]; + row_stride[j] = lp_tex->row_stride[j]; + img_stride[j] = lp_tex->img_stride[j]; + } + if (view->target == PIPE_TEXTURE_1D_ARRAY || + view->target == PIPE_TEXTURE_2D_ARRAY || + view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { + num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1; + for (j = first_level; j <= last_level; j++) { + mip_offsets[j] += view->u.tex.first_layer * + lp_tex->img_stride[j]; + } + if (view->target == PIPE_TEXTURE_CUBE || + view->target == PIPE_TEXTURE_CUBE_ARRAY) { + assert(num_layers % 6 == 0); + } + assert(view->u.tex.first_layer <= view->u.tex.last_layer); + assert(view->u.tex.last_layer < res->array_size); + } + } + else { + unsigned view_blocksize = util_format_get_blocksize(view->format); + addr = lp_tex->data; + /* 
probably don't really need to fill that out */ + mip_offsets[0] = 0; + row_stride[0] = 0; + img_stride[0] = 0; + + /* everything specified in number of elements here. */ + width0 = view->u.buf.last_element - view->u.buf.first_element + 1; + addr = (uint8_t *)addr + view->u.buf.first_element * + view_blocksize; + assert(view->u.buf.first_element <= view->u.buf.last_element); + assert(view->u.buf.last_element * view_blocksize < res->width0); + } + } + else { + /* display target texture/surface */ + /* + * XXX: Where should this be unmapped? + */ + struct llvmpipe_screen *screen = llvmpipe_screen(tex->screen); + struct sw_winsys *winsys = screen->winsys; + addr = winsys->displaytarget_map(winsys, lp_tex->dt, + PIPE_TRANSFER_READ); + row_stride[0] = lp_tex->row_stride[0]; + img_stride[0] = lp_tex->img_stride[0]; + mip_offsets[0] = 0; + assert(addr); + } + draw_set_mapped_texture(lp->draw, + shader_type, + i, + width0, tex->height0, num_layers, + first_level, last_level, + addr, + row_stride, img_stride, mip_offsets); + } + } +} + + +/** + * Called during state validation when LP_NEW_SAMPLER_VIEW is set. + */ +void +llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp, + unsigned num, + struct pipe_sampler_view **views) +{ + prepare_shader_sampling(lp, num, views, PIPE_SHADER_VERTEX, + lp->mapped_vs_tex); +} + +void +llvmpipe_cleanup_vertex_sampling(struct llvmpipe_context *ctx) +{ + unsigned i; + for (i = 0; i < Elements(ctx->mapped_vs_tex); i++) { + pipe_resource_reference(&ctx->mapped_vs_tex[i], NULL); + } +} + + +/** + * Called during state validation when LP_NEW_SAMPLER_VIEW is set. 
+ */ +void +llvmpipe_prepare_geometry_sampling(struct llvmpipe_context *lp, + unsigned num, + struct pipe_sampler_view **views) +{ + prepare_shader_sampling(lp, num, views, PIPE_SHADER_GEOMETRY, + lp->mapped_gs_tex); +} + +void +llvmpipe_cleanup_geometry_sampling(struct llvmpipe_context *ctx) +{ + unsigned i; + for (i = 0; i < Elements(ctx->mapped_gs_tex); i++) { + pipe_resource_reference(&ctx->mapped_gs_tex[i], NULL); + } +} + +void +llvmpipe_init_sampler_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.create_sampler_state = llvmpipe_create_sampler_state; + + llvmpipe->pipe.bind_sampler_states = llvmpipe_bind_sampler_states; + llvmpipe->pipe.create_sampler_view = llvmpipe_create_sampler_view; + llvmpipe->pipe.set_sampler_views = llvmpipe_set_sampler_views; + llvmpipe->pipe.sampler_view_destroy = llvmpipe_sampler_view_destroy; + llvmpipe->pipe.delete_sampler_state = llvmpipe_delete_sampler_state; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_setup.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_setup.c new file mode 100644 index 000000000..6397b5196 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -0,0 +1,1021 @@ +/************************************************************************** + * + * Copyright 2010 VMware. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/simple_list.h" +#include "os/os_time.h" +#include "gallivm/lp_bld_arit.h" +#include "gallivm/lp_bld_bitarit.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_logic.h" +#include "gallivm/lp_bld_intr.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_type.h" + +#include "lp_perf.h" +#include "lp_debug.h" +#include "lp_flush.h" +#include "lp_screen.h" +#include "lp_context.h" +#include "lp_state.h" +#include "lp_state_fs.h" +#include "lp_state_setup.h" + + +/** Setup shader number (for debugging) */ +static unsigned setup_no = 0; + + +/* currently organized to interpolate full float[4] attributes even + * when some elements are unused. Later, can pack vertex data more + * closely. 
+ */ + + +struct lp_setup_args +{ + /* Function arguments: + */ + LLVMValueRef v0; + LLVMValueRef v1; + LLVMValueRef v2; + LLVMValueRef facing; /* boolean */ + LLVMValueRef a0; + LLVMValueRef dadx; + LLVMValueRef dady; + + /* Derived: + */ + LLVMValueRef x0_center; + LLVMValueRef y0_center; + LLVMValueRef dy20_ooa; + LLVMValueRef dy01_ooa; + LLVMValueRef dx20_ooa; + LLVMValueRef dx01_ooa; + struct lp_build_context bld; +}; + + +static void +store_coef(struct gallivm_state *gallivm, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef dadx, + LLVMValueRef dady) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef idx = lp_build_const_int32(gallivm, slot); + + LLVMBuildStore(builder, + a0, + LLVMBuildGEP(builder, args->a0, &idx, 1, "")); + + LLVMBuildStore(builder, + dadx, + LLVMBuildGEP(builder, args->dadx, &idx, 1, "")); + + LLVMBuildStore(builder, + dady, + LLVMBuildGEP(builder, args->dady, &idx, 1, "")); +} + + + +static void +emit_constant_coef4(struct gallivm_state *gallivm, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef vert) +{ + store_coef(gallivm, args, slot, vert, args->bld.zero, args->bld.zero); +} + + + +/** + * Setup the fragment input attribute with the front-facing value. + * \param frontface is the triangle front facing? 
+ */ +static void +emit_facing_coef(struct gallivm_state *gallivm, + struct lp_setup_args *args, + unsigned slot ) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef float_type = LLVMFloatTypeInContext(gallivm->context); + LLVMValueRef a0_0 = args->facing; + LLVMValueRef a0_0f = LLVMBuildSIToFP(builder, a0_0, float_type, ""); + LLVMValueRef a0, face_val; + const unsigned char swizzles[4] = { PIPE_SWIZZLE_RED, PIPE_SWIZZLE_ZERO, + PIPE_SWIZZLE_ZERO, PIPE_SWIZZLE_ZERO }; + /* Our face val is either 1 or 0 so we do + * face = (val * 2) - 1 + * to make it 1 or -1 + */ + face_val = + LLVMBuildFAdd(builder, + LLVMBuildFMul(builder, a0_0f, + lp_build_const_float(gallivm, 2.0), + ""), + lp_build_const_float(gallivm, -1.0), + "facing"); + face_val = lp_build_broadcast_scalar(&args->bld, face_val); + a0 = lp_build_swizzle_aos(&args->bld, face_val, swizzles); + + store_coef(gallivm, args, slot, a0, args->bld.zero, args->bld.zero); +} + + +static LLVMValueRef +vert_attrib(struct gallivm_state *gallivm, + LLVMValueRef vert, + int attr, + int elem, + const char *name) +{ + LLVMBuilderRef b = gallivm->builder; + LLVMValueRef idx[2]; + idx[0] = lp_build_const_int32(gallivm, attr); + idx[1] = lp_build_const_int32(gallivm, elem); + return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name); +} + + +static void +lp_twoside(struct gallivm_state *gallivm, + struct lp_setup_args *args, + const struct lp_setup_variant_key *key, + int bcolor_slot, + LLVMValueRef attribv[3]) +{ + LLVMBuilderRef b = gallivm->builder; + LLVMValueRef a0_back, a1_back, a2_back; + LLVMValueRef idx2 = lp_build_const_int32(gallivm, bcolor_slot); + + LLVMValueRef facing = args->facing; + LLVMValueRef front_facing = LLVMBuildICmp(b, LLVMIntEQ, facing, + lp_build_const_int32(gallivm, 0), ""); /** need i1 for if condition */ + + a0_back = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx2, 1, ""), "v0a_back"); + a1_back = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx2, 1, ""), "v1a_back"); + a2_back 
= LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx2, 1, ""), "v2a_back"); + + /* Possibly swap the front and back attrib values, + * + * Prefer select to if so we don't have to worry about phis or + * allocas. + */ + attribv[0] = LLVMBuildSelect(b, front_facing, a0_back, attribv[0], ""); + attribv[1] = LLVMBuildSelect(b, front_facing, a1_back, attribv[1], ""); + attribv[2] = LLVMBuildSelect(b, front_facing, a2_back, attribv[2], ""); + +} + +static void +lp_do_offset_tri(struct gallivm_state *gallivm, + struct lp_setup_args *args, + const struct lp_setup_variant_key *key, + LLVMValueRef inv_det, + LLVMValueRef dxyz01, + LLVMValueRef dxyz20, + LLVMValueRef attribv[3]) +{ + LLVMBuilderRef b = gallivm->builder; + struct lp_build_context flt_scalar_bld; + struct lp_build_context int_scalar_bld; + struct lp_build_context *bld = &args->bld; + LLVMValueRef zoffset, mult; + LLVMValueRef z0_new, z1_new, z2_new; + LLVMValueRef dzdxdzdy, dzdx, dzdy, dzxyz20, dyzzx01, dyzzx01_dzxyz20, dzx01_dyz20; + LLVMValueRef z0z1, z0z1z2; + LLVMValueRef max, max_value, res12; + LLVMValueRef shuffles[4]; + LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef onei = lp_build_const_int32(gallivm, 1); + LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0); + LLVMValueRef twoi = lp_build_const_int32(gallivm, 2); + LLVMValueRef threei = lp_build_const_int32(gallivm, 3); + + /* (res12) = cross(e,f).xy */ + shuffles[0] = twoi; + shuffles[1] = zeroi; + shuffles[2] = onei; + shuffles[3] = twoi; + dzxyz20 = LLVMBuildShuffleVector(b, dxyz20, dxyz20, LLVMConstVector(shuffles, 4), ""); + + shuffles[0] = onei; + shuffles[1] = twoi; + shuffles[2] = twoi; + shuffles[3] = zeroi; + dyzzx01 = LLVMBuildShuffleVector(b, dxyz01, dxyz01, LLVMConstVector(shuffles, 4), ""); + + dyzzx01_dzxyz20 = LLVMBuildFMul(b, dzxyz20, dyzzx01, "dyzzx01_dzxyz20"); + + shuffles[0] = twoi; + shuffles[1] = threei; + shuffles[2] = LLVMGetUndef(shuf_type); + shuffles[3] = LLVMGetUndef(shuf_type); + 
dzx01_dyz20 = LLVMBuildShuffleVector(b, dyzzx01_dzxyz20, dyzzx01_dzxyz20, + LLVMConstVector(shuffles, 4), ""); + + res12 = LLVMBuildFSub(b, dyzzx01_dzxyz20, dzx01_dyz20, "res12"); + + /* dzdx = fabsf(res1 * inv_det), dydx = fabsf(res2 * inv_det)*/ + dzdxdzdy = LLVMBuildFMul(b, res12, inv_det, "dzdxdzdy"); + dzdxdzdy = lp_build_abs(bld, dzdxdzdy); + + dzdx = LLVMBuildExtractElement(b, dzdxdzdy, zeroi, ""); + dzdy = LLVMBuildExtractElement(b, dzdxdzdy, onei, ""); + + /* mult = MAX2(dzdx, dzdy) * pgon_offset_scale */ + max = LLVMBuildFCmp(b, LLVMRealUGT, dzdx, dzdy, ""); + max_value = LLVMBuildSelect(b, max, dzdx, dzdy, "max"); + + mult = LLVMBuildFMul(b, max_value, + lp_build_const_float(gallivm, key->pgon_offset_scale), ""); + + lp_build_context_init(&flt_scalar_bld, gallivm, lp_type_float_vec(32, 32)); + + if (key->floating_point_depth) { + /* + * bias = pgon_offset_units * 2^(exponent(max(z0, z1, z2)) - mantissa_bits) + + * MAX2(dzdx, dzdy) * pgon_offset_scale + * + * NOTE: Assumes IEEE float32. + */ + LLVMValueRef c23_shifted, exp_mask, bias, exp; + LLVMValueRef maxz_value, maxz0z1_value; + + lp_build_context_init(&int_scalar_bld, gallivm, lp_type_int_vec(32, 32)); + + c23_shifted = lp_build_const_int32(gallivm, 23 << 23); + exp_mask = lp_build_const_int32(gallivm, 0xff << 23); + + maxz0z1_value = lp_build_max(&flt_scalar_bld, + LLVMBuildExtractElement(b, attribv[0], twoi, ""), + LLVMBuildExtractElement(b, attribv[1], twoi, "")); + + maxz_value = lp_build_max(&flt_scalar_bld, + LLVMBuildExtractElement(b, attribv[2], twoi, ""), + maxz0z1_value); + + exp = LLVMBuildBitCast(b, maxz_value, int_scalar_bld.vec_type, ""); + exp = lp_build_and(&int_scalar_bld, exp, exp_mask); + exp = lp_build_sub(&int_scalar_bld, exp, c23_shifted); + /* Clamping to zero means mrd will be zero for very small numbers, + * but specs do not indicate this should be prevented by clamping + * mrd to smallest normal number instead. 
*/ + exp = lp_build_max(&int_scalar_bld, exp, int_scalar_bld.zero); + exp = LLVMBuildBitCast(b, exp, flt_scalar_bld.vec_type, ""); + + bias = LLVMBuildFMul(b, exp, + lp_build_const_float(gallivm, key->pgon_offset_units), + "bias"); + + zoffset = LLVMBuildFAdd(b, bias, mult, "zoffset"); + } else { + /* + * bias = pgon_offset_units + MAX2(dzdx, dzdy) * pgon_offset_scale + */ + zoffset = LLVMBuildFAdd(b, + lp_build_const_float(gallivm, key->pgon_offset_units), + mult, "zoffset"); + } + + if (key->pgon_offset_clamp > 0) { + zoffset = lp_build_min(&flt_scalar_bld, + lp_build_const_float(gallivm, key->pgon_offset_clamp), + zoffset); + } + else if (key->pgon_offset_clamp < 0) { + zoffset = lp_build_max(&flt_scalar_bld, + lp_build_const_float(gallivm, key->pgon_offset_clamp), + zoffset); + } + + /* yuck */ + shuffles[0] = twoi; + shuffles[1] = lp_build_const_int32(gallivm, 6); + shuffles[2] = LLVMGetUndef(shuf_type); + shuffles[3] = LLVMGetUndef(shuf_type); + z0z1 = LLVMBuildShuffleVector(b, attribv[0], attribv[1], LLVMConstVector(shuffles, 4), ""); + shuffles[0] = zeroi; + shuffles[1] = onei; + shuffles[2] = lp_build_const_int32(gallivm, 6); + shuffles[3] = LLVMGetUndef(shuf_type); + z0z1z2 = LLVMBuildShuffleVector(b, z0z1, attribv[2], LLVMConstVector(shuffles, 4), ""); + zoffset = lp_build_broadcast_scalar(bld, zoffset); + + /* clamp and do offset */ + /* + * FIXME I suspect the clamp (is that even right to always clamp to fixed + * 0.0/1.0?) should really be per fragment? 
+ */ + z0z1z2 = lp_build_clamp(bld, LLVMBuildFAdd(b, z0z1z2, zoffset, ""), bld->zero, bld->one); + + /* insert into args->a0.z, a1.z, a2.z: + */ + z0_new = LLVMBuildExtractElement(b, z0z1z2, zeroi, ""); + z1_new = LLVMBuildExtractElement(b, z0z1z2, onei, ""); + z2_new = LLVMBuildExtractElement(b, z0z1z2, twoi, ""); + attribv[0] = LLVMBuildInsertElement(b, attribv[0], z0_new, twoi, ""); + attribv[1] = LLVMBuildInsertElement(b, attribv[1], z1_new, twoi, ""); + attribv[2] = LLVMBuildInsertElement(b, attribv[2], z2_new, twoi, ""); +} + +static void +load_attribute(struct gallivm_state *gallivm, + struct lp_setup_args *args, + const struct lp_setup_variant_key *key, + unsigned vert_attr, + LLVMValueRef attribv[3]) +{ + LLVMBuilderRef b = gallivm->builder; + LLVMValueRef idx = lp_build_const_int32(gallivm, vert_attr); + + /* Load the vertex data + */ + attribv[0] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a"); + attribv[1] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a"); + attribv[2] = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a"); + + + /* Potentially modify it according to twoside, etc: + */ + if (key->twoside) { + if (vert_attr == key->color_slot && key->bcolor_slot >= 0) + lp_twoside(gallivm, args, key, key->bcolor_slot, attribv); + else if (vert_attr == key->spec_slot && key->bspec_slot >= 0) + lp_twoside(gallivm, args, key, key->bspec_slot, attribv); + } +} + +/* + * FIXME: interpolation is always done wrt fb origin (0/0). + * However, if some (small) tri is far away from the origin and gradients + * are large, this can lead to HUGE errors, since the a0 value calculated + * here can get very large (with the actual values inside the triangle way + * smaller), leading to complete loss of accuracy. 
This could be prevented + * by using some point inside (or at corner) of the tri as interpolation + * origin, or just use barycentric interpolation (which GL suggests and is + * what real hw does - you can get the barycentric coordinates from the + * edge functions in rasterization in principle (though we skip these + * sometimes completely in case of tris covering a block fully, + * which obviously wouldn't work)). + */ +static void +emit_coef4( struct gallivm_state *gallivm, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef a0, + LLVMValueRef a1, + LLVMValueRef a2) +{ + LLVMBuilderRef b = gallivm->builder; + LLVMValueRef attr_0; + LLVMValueRef dy20_ooa = args->dy20_ooa; + LLVMValueRef dy01_ooa = args->dy01_ooa; + LLVMValueRef dx20_ooa = args->dx20_ooa; + LLVMValueRef dx01_ooa = args->dx01_ooa; + LLVMValueRef x0_center = args->x0_center; + LLVMValueRef y0_center = args->y0_center; + LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01"); + LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20"); + + /* Calculate dadx (vec4f) + */ + LLVMValueRef da01_dy20_ooa = LLVMBuildFMul(b, da01, dy20_ooa, "da01_dy20_ooa"); + LLVMValueRef da20_dy01_ooa = LLVMBuildFMul(b, da20, dy01_ooa, "da20_dy01_ooa"); + LLVMValueRef dadx = LLVMBuildFSub(b, da01_dy20_ooa, da20_dy01_ooa, "dadx"); + + /* Calculate dady (vec4f) + */ + LLVMValueRef da01_dx20_ooa = LLVMBuildFMul(b, da01, dx20_ooa, "da01_dx20_ooa"); + LLVMValueRef da20_dx01_ooa = LLVMBuildFMul(b, da20, dx01_ooa, "da20_dx01_ooa"); + LLVMValueRef dady = LLVMBuildFSub(b, da20_dx01_ooa, da01_dx20_ooa, "dady"); + + /* Calculate a0 - the attribute value at the origin + */ + LLVMValueRef dadx_x0 = LLVMBuildFMul(b, dadx, x0_center, "dadx_x0"); + LLVMValueRef dady_y0 = LLVMBuildFMul(b, dady, y0_center, "dady_y0"); + LLVMValueRef attr_v0 = LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0"); + attr_0 = LLVMBuildFSub(b, a0, attr_v0, "attr_0"); + + store_coef(gallivm, args, slot, attr_0, dadx, dady); +} + + +static void +emit_linear_coef( 
struct gallivm_state *gallivm, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef attribv[3]) +{ + /* nothing to do anymore */ + emit_coef4(gallivm, + args, slot, + attribv[0], + attribv[1], + attribv[2]); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void +apply_perspective_corr( struct gallivm_state *gallivm, + struct lp_setup_args *args, + unsigned slot, + LLVMValueRef attribv[3]) +{ + LLVMBuilderRef b = gallivm->builder; + + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + LLVMValueRef v0_oow = lp_build_broadcast_scalar(&args->bld, + vert_attrib(gallivm, args->v0, 0, 3, "v0_oow")); + LLVMValueRef v1_oow = lp_build_broadcast_scalar(&args->bld, + vert_attrib(gallivm, args->v1, 0, 3, "v1_oow")); + LLVMValueRef v2_oow = lp_build_broadcast_scalar(&args->bld, + vert_attrib(gallivm, args->v2, 0, 3, "v2_oow")); + + attribv[0] = LLVMBuildFMul(b, attribv[0], v0_oow, "v0_oow_v0a"); + attribv[1] = LLVMBuildFMul(b, attribv[1], v1_oow, "v1_oow_v1a"); + attribv[2] = LLVMBuildFMul(b, attribv[2], v2_oow, "v2_oow_v2a"); +} + + +/** + * Applys cylindrical wrapping to vertex attributes if enabled. + * Input coordinates must be in [0, 1] range, otherwise results are undefined. 
+ * + * @param cyl_wrap TGSI_CYLINDRICAL_WRAP_x flags + */ +static void +emit_apply_cyl_wrap(struct gallivm_state *gallivm, + struct lp_setup_args *args, + uint cyl_wrap, + LLVMValueRef attribv[3]) + +{ + LLVMBuilderRef builder = gallivm->builder; + struct lp_type type = args->bld.type; + LLVMTypeRef float_vec_type = args->bld.vec_type; + LLVMValueRef pos_half; + LLVMValueRef neg_half; + LLVMValueRef cyl_mask; + LLVMValueRef offset; + LLVMValueRef delta; + LLVMValueRef one; + + if (!cyl_wrap) + return; + + /* Constants */ + pos_half = lp_build_const_vec(gallivm, type, +0.5f); + neg_half = lp_build_const_vec(gallivm, type, -0.5f); + cyl_mask = lp_build_const_mask_aos(gallivm, type, cyl_wrap, 4); + + one = lp_build_const_vec(gallivm, type, 1.0f); + one = LLVMBuildBitCast(builder, one, lp_build_int_vec_type(gallivm, type), ""); + one = LLVMBuildAnd(builder, one, cyl_mask, ""); + + /* Edge v0 -> v1 */ + delta = LLVMBuildFSub(builder, attribv[1], attribv[0], ""); + + offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, ""); + + offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, ""); + + /* Edge v1 -> v2 */ + delta = LLVMBuildFSub(builder, attribv[2], attribv[1], ""); + + offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[1] = LLVMBuildFAdd(builder, attribv[1], offset, ""); + + offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = 
LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, ""); + + /* Edge v2 -> v0 */ + delta = LLVMBuildFSub(builder, attribv[0], attribv[2], ""); + + offset = lp_build_compare(gallivm, type, PIPE_FUNC_GREATER, delta, pos_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[2] = LLVMBuildFAdd(builder, attribv[2], offset, ""); + + offset = lp_build_compare(gallivm, type, PIPE_FUNC_LESS, delta, neg_half); + offset = LLVMBuildAnd(builder, offset, one, ""); + offset = LLVMBuildBitCast(builder, offset, float_vec_type, ""); + attribv[0] = LLVMBuildFAdd(builder, attribv[0], offset, ""); +} + + +/** + * Compute the inputs-> dadx, dady, a0 values. + */ +static void +emit_tri_coef( struct gallivm_state *gallivm, + const struct lp_setup_variant_key *key, + struct lp_setup_args *args) +{ + unsigned slot; + + LLVMValueRef attribs[3]; + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < key->num_inputs; slot++) { + switch (key->inputs[slot].interp) { + case LP_INTERP_CONSTANT: + load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs); + if (key->flatshade_first) { + emit_constant_coef4(gallivm, args, slot+1, attribs[0]); + } + else { + emit_constant_coef4(gallivm, args, slot+1, attribs[2]); + } + break; + + case LP_INTERP_LINEAR: + load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs); + emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs); + emit_linear_coef(gallivm, args, slot+1, attribs); + break; + + case LP_INTERP_PERSPECTIVE: + load_attribute(gallivm, args, key, key->inputs[slot].src_index, attribs); + emit_apply_cyl_wrap(gallivm, args, key->inputs[slot].cyl_wrap, attribs); + apply_perspective_corr(gallivm, args, slot+1, attribs); + emit_linear_coef(gallivm, args, slot+1, attribs); + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel 
interpolators will pick up the coeffs from + * slot 0. + */ + break; + + case LP_INTERP_FACING: + emit_facing_coef(gallivm, args, slot+1); + break; + + default: + assert(0); + } + } +} + + +/* XXX: generic code: + */ +static void +set_noalias(LLVMBuilderRef builder, + LLVMValueRef function, + const LLVMTypeRef *arg_types, + int nr_args) +{ + int i; + for(i = 0; i < nr_args; ++i) + if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) + LLVMAddAttribute(LLVMGetParam(function, i), + LLVMNoAliasAttribute); +} + +static void +init_args(struct gallivm_state *gallivm, + const struct lp_setup_variant_key *key, + struct lp_setup_args *args) +{ + LLVMBuilderRef b = gallivm->builder; + LLVMTypeRef shuf_type = LLVMInt32TypeInContext(gallivm->context); + LLVMValueRef onef = lp_build_const_float(gallivm, 1.0); + LLVMValueRef onei = lp_build_const_int32(gallivm, 1); + LLVMValueRef zeroi = lp_build_const_int32(gallivm, 0); + LLVMValueRef pixel_center, xy0_center, dxy01, dxy20, dyx20; + LLVMValueRef e, f, ef, ooa; + LLVMValueRef shuffles[4], shuf10; + LLVMValueRef attr_pos[3]; + struct lp_type typef4 = lp_type_float_vec(32, 128); + struct lp_build_context bld; + + lp_build_context_init(&bld, gallivm, typef4); + args->bld = bld; + + /* The internal position input is in slot zero: + */ + load_attribute(gallivm, args, key, 0, attr_pos); + + pixel_center = lp_build_const_vec(gallivm, typef4, + key->pixel_center_half ? 
0.5 : 0.0); + + /* + * xy are first two elems in v0a/v1a/v2a but just use vec4 arit + * also offset_tri uses actually xyz in them + */ + xy0_center = LLVMBuildFSub(b, attr_pos[0], pixel_center, "xy0_center" ); + + dxy01 = LLVMBuildFSub(b, attr_pos[0], attr_pos[1], "dxy01"); + dxy20 = LLVMBuildFSub(b, attr_pos[2], attr_pos[0], "dxy20"); + + shuffles[0] = onei; + shuffles[1] = zeroi; + shuffles[2] = LLVMGetUndef(shuf_type); + shuffles[3] = LLVMGetUndef(shuf_type); + shuf10 = LLVMConstVector(shuffles, 4); + + dyx20 = LLVMBuildShuffleVector(b, dxy20, dxy20, shuf10, ""); + + ef = LLVMBuildFMul(b, dxy01, dyx20, "ef"); + e = LLVMBuildExtractElement(b, ef, zeroi, ""); + f = LLVMBuildExtractElement(b, ef, onei, ""); + + ooa = LLVMBuildFDiv(b, onef, LLVMBuildFSub(b, e, f, ""), "ooa"); + + ooa = lp_build_broadcast_scalar(&bld, ooa); + + /* tri offset calc shares a lot of arithmetic, do it here */ + if (key->pgon_offset_scale != 0.0f || key->pgon_offset_units != 0.0f) { + lp_do_offset_tri(gallivm, args, key, ooa, dxy01, dxy20, attr_pos); + } + + dxy20 = LLVMBuildFMul(b, dxy20, ooa, ""); + dxy01 = LLVMBuildFMul(b, dxy01, ooa, ""); + + args->dy20_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy20, onei); + args->dy01_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy01, onei); + + args->dx20_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy20, zeroi); + args->dx01_ooa = lp_build_extract_broadcast(gallivm, typef4, typef4, dxy01, zeroi); + + args->x0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, zeroi); + args->y0_center = lp_build_extract_broadcast(gallivm, typef4, typef4, xy0_center, onei); + + emit_linear_coef(gallivm, args, 0, attr_pos); +} + +/** + * Generate the runtime callable function for the coefficient calculation. 
+ * + */ +static struct lp_setup_variant * +generate_setup_variant(struct lp_setup_variant_key *key, + struct llvmpipe_context *lp) +{ + struct lp_setup_variant *variant = NULL; + struct gallivm_state *gallivm; + struct lp_setup_args args; + char func_name[64]; + LLVMTypeRef vec4f_type; + LLVMTypeRef func_type; + LLVMTypeRef arg_types[7]; + LLVMBasicBlockRef block; + LLVMBuilderRef builder; + int64_t t0 = 0, t1; + + if (0) + goto fail; + + variant = CALLOC_STRUCT(lp_setup_variant); + if (variant == NULL) + goto fail; + + variant->no = setup_no++; + + util_snprintf(func_name, sizeof(func_name), "setup_variant_%u", + variant->no); + + variant->gallivm = gallivm = gallivm_create(func_name, lp->context); + if (!variant->gallivm) { + goto fail; + } + + builder = gallivm->builder; + + if (LP_DEBUG & DEBUG_COUNTERS) { + t0 = os_time_get(); + } + + memcpy(&variant->key, key, key->size); + variant->list_item_global.base = variant; + + /* Currently always deal with full 4-wide vertex attributes from + * the vertices. 
+ */ + + vec4f_type = LLVMVectorType(LLVMFloatTypeInContext(gallivm->context), 4); + + arg_types[0] = LLVMPointerType(vec4f_type, 0); /* v0 */ + arg_types[1] = LLVMPointerType(vec4f_type, 0); /* v1 */ + arg_types[2] = LLVMPointerType(vec4f_type, 0); /* v2 */ + arg_types[3] = LLVMInt32TypeInContext(gallivm->context); /* facing */ + arg_types[4] = LLVMPointerType(vec4f_type, 0); /* a0, aligned */ + arg_types[5] = LLVMPointerType(vec4f_type, 0); /* dadx, aligned */ + arg_types[6] = LLVMPointerType(vec4f_type, 0); /* dady, aligned */ + + func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), + arg_types, Elements(arg_types), 0); + + variant->function = LLVMAddFunction(gallivm->module, func_name, func_type); + if (!variant->function) + goto fail; + + LLVMSetFunctionCallConv(variant->function, LLVMCCallConv); + + args.v0 = LLVMGetParam(variant->function, 0); + args.v1 = LLVMGetParam(variant->function, 1); + args.v2 = LLVMGetParam(variant->function, 2); + args.facing = LLVMGetParam(variant->function, 3); + args.a0 = LLVMGetParam(variant->function, 4); + args.dadx = LLVMGetParam(variant->function, 5); + args.dady = LLVMGetParam(variant->function, 6); + + lp_build_name(args.v0, "in_v0"); + lp_build_name(args.v1, "in_v1"); + lp_build_name(args.v2, "in_v2"); + lp_build_name(args.facing, "in_facing"); + lp_build_name(args.a0, "out_a0"); + lp_build_name(args.dadx, "out_dadx"); + lp_build_name(args.dady, "out_dady"); + + /* + * Function body + */ + block = LLVMAppendBasicBlockInContext(gallivm->context, + variant->function, "entry"); + LLVMPositionBuilderAtEnd(builder, block); + + set_noalias(builder, variant->function, arg_types, Elements(arg_types)); + init_args(gallivm, &variant->key, &args); + emit_tri_coef(gallivm, &variant->key, &args); + + LLVMBuildRetVoid(builder); + + gallivm_verify_function(gallivm, variant->function); + + gallivm_compile_module(gallivm); + + variant->jit_function = (lp_jit_setup_triangle) + gallivm_jit_function(gallivm, 
variant->function); + if (!variant->jit_function) + goto fail; + + gallivm_free_ir(variant->gallivm); + + /* + * Update timing information: + */ + if (LP_DEBUG & DEBUG_COUNTERS) { + t1 = os_time_get(); + LP_COUNT_ADD(llvm_compile_time, t1 - t0); + LP_COUNT_ADD(nr_llvm_compiles, 1); + } + + return variant; + +fail: + if (variant) { + if (variant->gallivm) { + gallivm_destroy(variant->gallivm); + } + FREE(variant); + } + + return NULL; +} + + + +static void +lp_make_setup_variant_key(struct llvmpipe_context *lp, + struct lp_setup_variant_key *key) +{ + struct lp_fragment_shader *fs = lp->fs; + unsigned i; + + assert(sizeof key->inputs[0] == sizeof(uint)); + + key->num_inputs = fs->info.base.num_inputs; + key->flatshade_first = lp->rasterizer->flatshade_first; + key->pixel_center_half = lp->rasterizer->half_pixel_center; + key->twoside = lp->rasterizer->light_twoside; + key->size = Offset(struct lp_setup_variant_key, + inputs[key->num_inputs]); + + key->color_slot = lp->color_slot [0]; + key->bcolor_slot = lp->bcolor_slot[0]; + key->spec_slot = lp->color_slot [1]; + key->bspec_slot = lp->bcolor_slot[1]; + assert(key->color_slot == lp->color_slot [0]); + assert(key->bcolor_slot == lp->bcolor_slot[0]); + assert(key->spec_slot == lp->color_slot [1]); + assert(key->bspec_slot == lp->bcolor_slot[1]); + + /* + * If depth is floating point, depth bias is calculated with respect + * to the primitive's maximum Z value. Retain the original depth bias + * value until that stage. 
+ */ + key->floating_point_depth = lp->floating_point_depth; + + if (key->floating_point_depth) { + key->pgon_offset_units = (float) lp->rasterizer->offset_units; + } else { + key->pgon_offset_units = + (float) (lp->rasterizer->offset_units * lp->mrd); + } + + key->pgon_offset_scale = lp->rasterizer->offset_scale; + key->pgon_offset_clamp = lp->rasterizer->offset_clamp; + key->pad = 0; + memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]); + for (i = 0; i < key->num_inputs; i++) { + if (key->inputs[i].interp == LP_INTERP_COLOR) { + if (lp->rasterizer->flatshade) + key->inputs[i].interp = LP_INTERP_CONSTANT; + else + key->inputs[i].interp = LP_INTERP_PERSPECTIVE; + } + } + +} + + +static void +remove_setup_variant(struct llvmpipe_context *lp, + struct lp_setup_variant *variant) +{ + if (gallivm_debug & GALLIVM_DEBUG_IR) { + debug_printf("llvmpipe: del setup_variant #%u total %u\n", + variant->no, lp->nr_setup_variants); + } + + if (variant->gallivm) { + gallivm_destroy(variant->gallivm); + } + + remove_from_list(&variant->list_item_global); + lp->nr_setup_variants--; + FREE(variant); +} + + + +/* When the number of setup variants exceeds a threshold, cull a + * fraction (currently a quarter) of them. + */ +static void +cull_setup_variants(struct llvmpipe_context *lp) +{ + struct pipe_context *pipe = &lp->pipe; + int i; + + /* + * XXX: we need to flush the context until we have some sort of reference + * counting in fragment shaders as they may still be binned + * Flushing alone might not be sufficient we need to wait on it too. + */ + llvmpipe_finish(pipe, __FUNCTION__); + + for (i = 0; i < LP_MAX_SETUP_VARIANTS / 4; i++) { + struct lp_setup_variant_list_item *item; + if (is_empty_list(&lp->setup_variants_list)) { + break; + } + item = last_elem(&lp->setup_variants_list); + assert(item); + assert(item->base); + remove_setup_variant(lp, item->base); + } +} + + +/** + * Update fragment/vertex shader linkage state. 
This is called just + * prior to drawing something when some fragment-related state has + * changed. + */ +void +llvmpipe_update_setup(struct llvmpipe_context *lp) +{ + struct lp_setup_variant_key *key = &lp->setup_variant.key; + struct lp_setup_variant *variant = NULL; + struct lp_setup_variant_list_item *li; + + lp_make_setup_variant_key(lp, key); + + foreach(li, &lp->setup_variants_list) { + if(li->base->key.size == key->size && + memcmp(&li->base->key, key, key->size) == 0) { + variant = li->base; + break; + } + } + + if (variant) { + move_to_head(&lp->setup_variants_list, &variant->list_item_global); + } + else { + if (lp->nr_setup_variants >= LP_MAX_SETUP_VARIANTS) { + cull_setup_variants(lp); + } + + variant = generate_setup_variant(key, lp); + if (variant) { + insert_at_head(&lp->setup_variants_list, &variant->list_item_global); + lp->nr_setup_variants++; + } + } + + lp_setup_set_setup_variant(lp->setup, variant); +} + +void +lp_delete_setup_variants(struct llvmpipe_context *lp) +{ + struct lp_setup_variant_list_item *li; + li = first_elem(&lp->setup_variants_list); + while(!at_end(&lp->setup_variants_list, li)) { + struct lp_setup_variant_list_item *next = next_elem(li); + remove_setup_variant(lp, li->base); + li = next; + } +} + +void +lp_dump_setup_coef(const struct lp_setup_variant_key *key, + const float (*sa0)[4], + const float (*sdadx)[4], + const float (*sdady)[4]) +{ + int i, slot; + + for (i = 0; i < TGSI_NUM_CHANNELS; i++) { + float a0 = sa0 [0][i]; + float dadx = sdadx[0][i]; + float dady = sdady[0][i]; + + debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n", + "xyzw"[i], a0, dadx, dady); + } + + for (slot = 0; slot < key->num_inputs; slot++) { + unsigned usage_mask = key->inputs[slot].usage_mask; + for (i = 0; i < TGSI_NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) { + float a0 = sa0 [1 + slot][i]; + float dadx = sdadx[1 + slot][i]; + float dady = sdady[1 + slot][i]; + + debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n", + slot, 
"xyzw"[i], a0, dadx, dady); + } + } + } +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_setup.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_setup.h new file mode 100644 index 000000000..82af8350f --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_setup.h @@ -0,0 +1,85 @@ +#ifndef LP_STATE_SETUP_H +#define LP_STATE_SETUP_H + +#include "lp_bld_interp.h" + + +struct llvmpipe_context; +struct lp_setup_variant; + +struct lp_setup_variant_list_item +{ + struct lp_setup_variant *base; + struct lp_setup_variant_list_item *next, *prev; +}; + + +struct lp_setup_variant_key { + unsigned size:16; + unsigned num_inputs:8; + int color_slot:8; + + int bcolor_slot:8; + int spec_slot:8; + int bspec_slot:8; + unsigned flatshade_first:1; + unsigned pixel_center_half:1; + unsigned twoside:1; + unsigned floating_point_depth:1; + unsigned pad:4; + + /* TODO: get those floats out of the key and use a jit_context for setup */ + float pgon_offset_units; + float pgon_offset_scale; + float pgon_offset_clamp; + struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS]; +}; + + +typedef void (*lp_jit_setup_triangle)( const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4], + boolean front_facing, + float (*a0)[4], + float (*dadx)[4], + float (*dady)[4] ); + + + + +/* At this stage, for a given variant key, we create a + * draw_vertex_info struct telling the draw module how to format the + * vertices, and an llvm-generated function which calculates the + * attribute interpolants (a0, dadx, dady) from three of those + * vertices. + */ +struct lp_setup_variant { + struct lp_setup_variant_key key; + + struct lp_setup_variant_list_item list_item_global; + + struct gallivm_state *gallivm; + + /* XXX: this is a pointer to the LLVM IR. Once jit_function is + * generated, we never need to use the IR again - need to find a + * way to release this data without destroying the generated + * assembly. 
+ */ + LLVMValueRef function; + + /* The actual generated setup function: + */ + lp_jit_setup_triangle jit_function; + + unsigned no; +}; + +void lp_delete_setup_variants(struct llvmpipe_context *lp); + +void +lp_dump_setup_coef( const struct lp_setup_variant_key *key, + const float (*sa0)[4], + const float (*sdadx)[4], + const float (*sdady)[4]); + +#endif diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_so.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_so.c new file mode 100644 index 000000000..2af04cdf1 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_so.c @@ -0,0 +1,93 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "lp_context.h" +#include "lp_state.h" +#include "lp_texture.h" + +#include "util/u_memory.h" +#include "draw/draw_context.h" + +static struct pipe_stream_output_target * +llvmpipe_create_so_target(struct pipe_context *pipe, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size) +{ + struct draw_so_target *t; + + t = CALLOC_STRUCT(draw_so_target); + if (!t) + return NULL; + + t->target.context = pipe; + t->target.reference.count = 1; + pipe_resource_reference(&t->target.buffer, buffer); + t->target.buffer_offset = buffer_offset; + t->target.buffer_size = buffer_size; + return &t->target; +} + +static void +llvmpipe_so_target_destroy(struct pipe_context *pipe, + struct pipe_stream_output_target *target) +{ + pipe_resource_reference(&target->buffer, NULL); + FREE(target); +} + +static void +llvmpipe_set_so_targets(struct pipe_context *pipe, + unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offsets) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + int i; + for (i = 0; i < num_targets; i++) { + const boolean append = (offsets[i] == (unsigned)-1); + pipe_so_target_reference((struct pipe_stream_output_target **)&llvmpipe->so_targets[i], targets[i]); + /* If we're not appending then lets set the internal + offset to what was requested */ + if (!append && llvmpipe->so_targets[i]) { + llvmpipe->so_targets[i]->internal_offset = offsets[i]; + } + } + + for (; i < llvmpipe->num_so_targets; i++) { + pipe_so_target_reference((struct pipe_stream_output_target **)&llvmpipe->so_targets[i], NULL); + } + llvmpipe->num_so_targets = num_targets; +} + +void +llvmpipe_init_so_funcs(struct llvmpipe_context *pipe) +{ + pipe->pipe.create_stream_output_target = llvmpipe_create_so_target; + pipe->pipe.stream_output_target_destroy = llvmpipe_so_target_destroy; + pipe->pipe.set_stream_output_targets = 
llvmpipe_set_so_targets; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_surface.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_surface.c new file mode 100644 index 000000000..c879ba975 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_surface.c @@ -0,0 +1,91 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keithw@vmware.com> + */ + +#include "pipe/p_state.h" +#include "util/u_inlines.h" +#include "util/u_framebuffer.h" +#include "util/u_surface.h" +#include "lp_context.h" +#include "lp_scene.h" +#include "lp_state.h" +#include "lp_setup.h" + +#include "draw/draw_context.h" + +#include "util/u_format.h" + + +/** + * Set the framebuffer surface info: color buffers, zbuffer, stencil buffer. + */ +void +llvmpipe_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct llvmpipe_context *lp = llvmpipe_context(pipe); + + boolean changed = !util_framebuffer_state_equal(&lp->framebuffer, fb); + + assert(fb->width <= LP_MAX_WIDTH); + assert(fb->height <= LP_MAX_HEIGHT); + + if (changed) { + /* + * If no depth buffer is bound, send the utility function the default + * format for no bound depth (PIPE_FORMAT_NONE). + */ + enum pipe_format depth_format = fb->zsbuf ? + fb->zsbuf->format : PIPE_FORMAT_NONE; + const struct util_format_description *depth_desc = + util_format_description(depth_format); + + util_copy_framebuffer_state(&lp->framebuffer, fb); + + if (LP_PERF & PERF_NO_DEPTH) { + pipe_surface_reference(&lp->framebuffer.zsbuf, NULL); + } + + /* + * Calculate the floating point depth sense and Minimum Resolvable Depth + * value for the llvmpipe module. This is separate from the draw module. + */ + lp->floating_point_depth = + (util_get_depth_format_type(depth_desc) == UTIL_FORMAT_TYPE_FLOAT); + + lp->mrd = util_get_depth_format_mrd(depth_desc); + + /* Tell the draw module how deep the Z/depth buffer is. 
*/ + draw_set_zs_format(lp->draw, depth_format); + + lp_setup_bind_framebuffer( lp->setup, &lp->framebuffer ); + + lp->dirty |= LP_NEW_FRAMEBUFFER; + } +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_vertex.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_vertex.c new file mode 100644 index 000000000..1e93fd867 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_vertex.c @@ -0,0 +1,117 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ **************************************************************************/
+
+/* Authors: Keith Whitwell <keithw@vmware.com>
+ */
+
+
+#include "lp_context.h"
+#include "lp_state.h"
+
+#include "draw/draw_context.h"
+#include "util/u_helpers.h"
+#include "util/u_inlines.h"
+#include "util/u_transfer.h"
+
+
+/**
+ * Create a vertex-elements CSO: a heap copy of the caller's attribute
+ * layout.  Returns NULL on allocation failure (caller must cope).
+ */
+static void *
+llvmpipe_create_vertex_elements_state(struct pipe_context *pipe,
+                                      unsigned count,
+                                      const struct pipe_vertex_element *attribs)
+{
+   struct lp_velems_state *velems;
+   assert(count <= PIPE_MAX_ATTRIBS);
+   velems = (struct lp_velems_state *) MALLOC(sizeof(struct lp_velems_state));
+   if (velems) {
+      velems->count = count;
+      /* copy only the 'count' elements actually supplied */
+      memcpy(velems->velem, attribs, sizeof(*attribs) * count);
+   }
+   return velems;
+}
+
+/**
+ * Bind a vertex-elements CSO and forward the layout to the draw module.
+ * A NULL bind only clears the context pointer and sets the dirty flag.
+ */
+static void
+llvmpipe_bind_vertex_elements_state(struct pipe_context *pipe,
+                                    void *velems)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct lp_velems_state *lp_velems = (struct lp_velems_state *) velems;
+
+   llvmpipe->velems = lp_velems;
+
+   llvmpipe->dirty |= LP_NEW_VERTEX;
+
+   if (velems)
+      draw_set_vertex_elements(llvmpipe->draw, lp_velems->count, lp_velems->velem);
+}
+
+/** Release a CSO allocated by llvmpipe_create_vertex_elements_state(). */
+static void
+llvmpipe_delete_vertex_elements_state(struct pipe_context *pipe, void *velems)
+{
+   FREE( velems );
+}
+
+/**
+ * Set a range of vertex buffers.  util_set_vertex_buffers_count() updates
+ * the context's buffer array and count; the draw module gets the same range.
+ */
+static void
+llvmpipe_set_vertex_buffers(struct pipe_context *pipe,
+                            unsigned start_slot, unsigned count,
+                            const struct pipe_vertex_buffer *buffers)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+
+   assert(count <= PIPE_MAX_ATTRIBS);
+
+   util_set_vertex_buffers_count(llvmpipe->vertex_buffer,
+                                 &llvmpipe->num_vertex_buffers,
+                                 buffers, start_slot, count);
+
+   llvmpipe->dirty |= LP_NEW_VERTEX;
+
+   draw_set_vertex_buffers(llvmpipe->draw, start_slot, count, buffers);
+}
+
+
+/**
+ * Record the current index buffer (copied by value; NULL clears it).
+ */
+static void
+llvmpipe_set_index_buffer(struct pipe_context *pipe,
+                          const struct pipe_index_buffer *ib)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+
+   if (ib)
+      memcpy(&llvmpipe->index_buffer, ib,
sizeof(llvmpipe->index_buffer)); + else + memset(&llvmpipe->index_buffer, 0, sizeof(llvmpipe->index_buffer)); +} + +void +llvmpipe_init_vertex_funcs(struct llvmpipe_context *llvmpipe) +{ + llvmpipe->pipe.create_vertex_elements_state = llvmpipe_create_vertex_elements_state; + llvmpipe->pipe.bind_vertex_elements_state = llvmpipe_bind_vertex_elements_state; + llvmpipe->pipe.delete_vertex_elements_state = llvmpipe_delete_vertex_elements_state; + + llvmpipe->pipe.set_vertex_buffers = llvmpipe_set_vertex_buffers; + llvmpipe->pipe.set_index_buffer = llvmpipe_set_index_buffer; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_vs.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_vs.c new file mode 100644 index 000000000..826ee5b72 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_state_vs.c @@ -0,0 +1,96 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "pipe/p_defines.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_parse.h"
+#include "util/u_memory.h"
+#include "draw/draw_context.h"
+
+#include "lp_context.h"
+#include "lp_debug.h"
+#include "lp_state.h"
+
+
+/**
+ * Create a vertex shader CSO.  Vertex shading is delegated entirely to the
+ * draw module; llvmpipe keeps only the opaque draw_vertex_shader pointer.
+ * Dumps the TGSI tokens when the DEBUG_TGSI flag is set.
+ */
+static void *
+llvmpipe_create_vs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct draw_vertex_shader *vs;
+
+   vs = draw_create_vertex_shader(llvmpipe->draw, templ);
+   if (vs == NULL) {
+      return NULL;
+   }
+
+   if (LP_DEBUG & DEBUG_TGSI) {
+      debug_printf("llvmpipe: Create vertex shader %p:\n", (void *) vs);
+      tgsi_dump(templ->tokens, 0);
+   }
+
+   return vs;
+}
+
+
+/**
+ * Bind a vertex shader CSO.  Early-out if it is already bound; otherwise
+ * hand it to the draw module and mark the VS state dirty.
+ */
+static void
+llvmpipe_bind_vs_state(struct pipe_context *pipe, void *_vs)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct draw_vertex_shader *vs = (struct draw_vertex_shader *)_vs;
+
+   if (llvmpipe->vs == vs)
+      return;
+
+   draw_bind_vertex_shader(llvmpipe->draw, vs);
+
+   llvmpipe->vs = vs;
+
+   llvmpipe->dirty |= LP_NEW_VS;
+}
+
+
+/**
+ * Destroy a vertex shader CSO created by llvmpipe_create_vs_state().
+ * NOTE(review): assumes the CSO is not currently bound — per the gallium
+ * contract the state tracker unbinds before deleting; verify against callers.
+ */
+static void
+llvmpipe_delete_vs_state(struct pipe_context *pipe, void *_vs)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   struct draw_vertex_shader *vs = (struct draw_vertex_shader *)_vs;
+
+   draw_delete_vertex_shader(llvmpipe->draw, vs);
+}
+
+
+
+/** Plug the VS CSO hooks into the pipe_context vtable. */
+void
+llvmpipe_init_vs_funcs(struct llvmpipe_context *llvmpipe)
+{
+   llvmpipe->pipe.create_vs_state = llvmpipe_create_vs_state;
+   llvmpipe->pipe.bind_vs_state = llvmpipe_bind_vs_state;
+   llvmpipe->pipe.delete_vs_state = llvmpipe_delete_vs_state;
+}
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_surface.c
b/lib/mesa/src/gallium/drivers/llvmpipe/lp_surface.c new file mode 100644 index 000000000..96f8ed82c --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_surface.c @@ -0,0 +1,229 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ **************************************************************************/
+
+#include "util/u_rect.h"
+#include "util/u_surface.h"
+#include "lp_context.h"
+#include "lp_flush.h"
+#include "lp_limits.h"
+#include "lp_surface.h"
+#include "lp_texture.h"
+#include "lp_query.h"
+
+
+/**
+ * pipe_context::resource_copy_region hook.  Flushes any rendering still
+ * pending against either resource (blocking), then performs the copy on
+ * the CPU via the shared utility helper.
+ */
+static void
+lp_resource_copy(struct pipe_context *pipe,
+                 struct pipe_resource *dst, unsigned dst_level,
+                 unsigned dstx, unsigned dsty, unsigned dstz,
+                 struct pipe_resource *src, unsigned src_level,
+                 const struct pipe_box *src_box)
+{
+   /* dst will be written: flush for CPU write access */
+   llvmpipe_flush_resource(pipe,
+                           dst, dst_level,
+                           FALSE, /* read_only */
+                           TRUE, /* cpu_access */
+                           FALSE, /* do_not_block */
+                           "blit dest");
+
+   /* src is only read: flush for CPU read access */
+   llvmpipe_flush_resource(pipe,
+                           src, src_level,
+                           TRUE, /* read_only */
+                           TRUE, /* cpu_access */
+                           FALSE, /* do_not_block */
+                           "blit src");
+
+   util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
+                             src, src_level, src_box);
+}
+
+
+/**
+ * pipe_context::blit hook.  Works on a local copy of the blit info and
+ * bails out early when: the render condition says skip, the blit is an
+ * (unimplemented) MSAA color resolve, a plain copy_region suffices, or
+ * the blitter cannot handle the format pair.  Otherwise saves all state
+ * the blitter clobbers and runs the generic util_blitter path.
+ */
+static void lp_blit(struct pipe_context *pipe,
+                    const struct pipe_blit_info *blit_info)
+{
+   struct llvmpipe_context *lp = llvmpipe_context(pipe);
+   struct pipe_blit_info info = *blit_info;
+
+   if (blit_info->render_condition_enable && !llvmpipe_check_render_cond(lp))
+      return;
+
+   /* multisample -> single-sample color resolve is not implemented */
+   if (info.src.resource->nr_samples > 1 &&
+       info.dst.resource->nr_samples <= 1 &&
+       !util_format_is_depth_or_stencil(info.src.resource->format) &&
+       !util_format_is_pure_integer(info.src.resource->format)) {
+      debug_printf("llvmpipe: color resolve unimplemented\n");
+      return;
+   }
+
+   if (util_try_blit_via_copy_region(pipe, &info)) {
+      return; /* done */
+   }
+
+   if (!util_blitter_is_blit_supported(lp->blitter, &info)) {
+      debug_printf("llvmpipe: blit unsupported %s -> %s\n",
+                   util_format_short_name(info.src.resource->format),
+                   util_format_short_name(info.dst.resource->format));
+      return;
+   }
+
+   /* XXX turn off occlusion and streamout queries */
+
+   /* save every piece of state util_blitter will overwrite */
+   util_blitter_save_vertex_buffer_slot(lp->blitter, lp->vertex_buffer);
+   util_blitter_save_vertex_elements(lp->blitter,
(void*)lp->velems);
+   util_blitter_save_vertex_shader(lp->blitter, (void*)lp->vs);
+   util_blitter_save_geometry_shader(lp->blitter, (void*)lp->gs);
+   util_blitter_save_so_targets(lp->blitter, lp->num_so_targets,
+                     (struct pipe_stream_output_target**)lp->so_targets);
+   util_blitter_save_rasterizer(lp->blitter, (void*)lp->rasterizer);
+   util_blitter_save_viewport(lp->blitter, &lp->viewports[0]);
+   util_blitter_save_scissor(lp->blitter, &lp->scissors[0]);
+   util_blitter_save_fragment_shader(lp->blitter, lp->fs);
+   util_blitter_save_blend(lp->blitter, (void*)lp->blend);
+   util_blitter_save_depth_stencil_alpha(lp->blitter, (void*)lp->depth_stencil);
+   util_blitter_save_stencil_ref(lp->blitter, &lp->stencil_ref);
+   /*util_blitter_save_sample_mask(sp->blitter, lp->sample_mask);*/
+   util_blitter_save_framebuffer(lp->blitter, &lp->framebuffer);
+   util_blitter_save_fragment_sampler_states(lp->blitter,
+                     lp->num_samplers[PIPE_SHADER_FRAGMENT],
+                     (void**)lp->samplers[PIPE_SHADER_FRAGMENT]);
+   util_blitter_save_fragment_sampler_views(lp->blitter,
+                     lp->num_sampler_views[PIPE_SHADER_FRAGMENT],
+                     lp->sampler_views[PIPE_SHADER_FRAGMENT]);
+   util_blitter_save_render_condition(lp->blitter, lp->render_cond_query,
+                                      lp->render_cond_cond, lp->render_cond_mode);
+   /* performs the blit and restores all saved state */
+   util_blitter_blit(lp->blitter, &info);
+}
+
+
+/**
+ * pipe_context::flush_resource hook.  Intentionally a no-op: llvmpipe
+ * resources need no layout conversion before external use.
+ */
+static void
+lp_flush_resource(struct pipe_context *ctx, struct pipe_resource *resource)
+{
+}
+
+
+/**
+ * Create a pipe_surface view of a resource.  A missing render-target /
+ * depth-stencil bind flag is only warned about (debug_printf); creation
+ * still proceeds.  Returns NULL on allocation failure.
+ */
+static struct pipe_surface *
+llvmpipe_create_surface(struct pipe_context *pipe,
+                        struct pipe_resource *pt,
+                        const struct pipe_surface *surf_tmpl)
+{
+   struct pipe_surface *ps;
+
+   if (!(pt->bind & (PIPE_BIND_DEPTH_STENCIL | PIPE_BIND_RENDER_TARGET)))
+      debug_printf("Illegal surface creation without bind flag\n");
+
+   ps = CALLOC_STRUCT(pipe_surface);
+   if (ps) {
+      pipe_reference_init(&ps->reference, 1);
+      /* surface holds a reference on the underlying resource */
+      pipe_resource_reference(&ps->texture, pt);
+      ps->context = pipe;
+      ps->format = surf_tmpl->format;
+      if (llvmpipe_resource_is_texture(pt)) {
+         assert(surf_tmpl->u.tex.level <=
pt->last_level); + assert(surf_tmpl->u.tex.first_layer <= surf_tmpl->u.tex.last_layer); + ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level); + ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level); + ps->u.tex.level = surf_tmpl->u.tex.level; + ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer; + ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer; + } + else { + /* setting width as number of elements should get us correct renderbuffer width */ + ps->width = surf_tmpl->u.buf.last_element - surf_tmpl->u.buf.first_element + 1; + ps->height = pt->height0; + ps->u.buf.first_element = surf_tmpl->u.buf.first_element; + ps->u.buf.last_element = surf_tmpl->u.buf.last_element; + assert(ps->u.buf.first_element <= ps->u.buf.last_element); + assert(util_format_get_blocksize(surf_tmpl->format) * + (ps->u.buf.last_element + 1) <= pt->width0); + } + } + return ps; +} + + +static void +llvmpipe_surface_destroy(struct pipe_context *pipe, + struct pipe_surface *surf) +{ + /* Effectively do the texture_update work here - if texture images + * needed post-processing to put them into hardware layout, this is + * where it would happen. For llvmpipe, nothing to do. 
+    */
+   assert(surf->texture);
+   /* drops this surface's reference on the underlying resource */
+   pipe_resource_reference(&surf->texture, NULL);
+   FREE(surf);
+}
+
+
+/**
+ * pipe_context::clear_render_target hook.  Honors the current render
+ * condition, then delegates to the shared software clear helper.
+ */
+static void
+llvmpipe_clear_render_target(struct pipe_context *pipe,
+                             struct pipe_surface *dst,
+                             const union pipe_color_union *color,
+                             unsigned dstx, unsigned dsty,
+                             unsigned width, unsigned height)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+
+   if (!llvmpipe_check_render_cond(llvmpipe))
+      return;
+
+   util_clear_render_target(pipe, dst, color,
+                            dstx, dsty, width, height);
+}
+
+
+/**
+ * pipe_context::clear_depth_stencil hook.  Same render-condition guard
+ * as the color clear; clear_flags selects depth and/or stencil planes.
+ */
+static void
+llvmpipe_clear_depth_stencil(struct pipe_context *pipe,
+                             struct pipe_surface *dst,
+                             unsigned clear_flags,
+                             double depth,
+                             unsigned stencil,
+                             unsigned dstx, unsigned dsty,
+                             unsigned width, unsigned height)
+{
+   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+
+   if (!llvmpipe_check_render_cond(llvmpipe))
+      return;
+
+   util_clear_depth_stencil(pipe, dst, clear_flags,
+                            depth, stencil,
+                            dstx, dsty, width, height);
+}
+
+
+/** Plug the surface/clear/copy/blit hooks into the pipe_context vtable. */
+void
+llvmpipe_init_surface_functions(struct llvmpipe_context *lp)
+{
+   lp->pipe.clear_render_target = llvmpipe_clear_render_target;
+   lp->pipe.clear_depth_stencil = llvmpipe_clear_depth_stencil;
+   lp->pipe.create_surface = llvmpipe_create_surface;
+   lp->pipe.surface_destroy = llvmpipe_surface_destroy;
+   /* These three operate on whole resources rather than surfaces */
+   lp->pipe.resource_copy_region = lp_resource_copy;
+   lp->pipe.blit = lp_blit;
+   lp->pipe.flush_resource = lp_flush_resource;
+}
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_surface.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_surface.h
new file mode 100644
index 000000000..b50dc21f4
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_surface.h
@@ -0,0 +1,42 @@
+/**************************************************************************
+ *
+ * Copyright 2007 VMware, Inc.
+ * All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keithw@vmware.com> + */ + +#ifndef LP_SURFACE_H +#define LP_SURFACE_H + + +struct llvmpipe_context; + + +extern void +llvmpipe_init_surface_functions(struct llvmpipe_context *lp); + + +#endif /* LP_SURFACE_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_test.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test.h new file mode 100644 index 000000000..e1b51c9c9 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test.h @@ -0,0 +1,140 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Shared testing code. 
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+
+#ifndef LP_TEST_H
+#define LP_TEST_H
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <float.h>
+
+#include "gallivm/lp_bld.h"
+
+#include "pipe/p_state.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_dump.h"
+
+#include "gallivm/lp_bld_type.h"
+
+
+/* number of samples used by the random test drivers */
+#define LP_TEST_NUM_SAMPLES 32
+
+
+/* Entry points each test program provides (presumably implemented per
+ * test binary — lp_test_arit.c etc.; verify against the build files). */
+void
+write_tsv_header(FILE *fp);
+
+
+boolean
+test_some(unsigned verbose, FILE *fp,
+          unsigned long n);
+
+boolean
+test_single(unsigned verbose, FILE *fp);
+
+boolean
+test_all(unsigned verbose, FILE *fp);
+
+
+/*
+ * Cycle counter for benchmarking: MSVC intrinsic, GCC inline asm on x86,
+ * otherwise a constant 0 (timings become meaningless but code still builds).
+ */
+#if defined(PIPE_CC_MSVC)
+
+unsigned __int64 __rdtsc();
+#pragma intrinsic(__rdtsc)
+#define rdtsc() __rdtsc()
+
+#elif defined(PIPE_CC_GCC) && (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
+
+static inline uint64_t
+rdtsc(void)
+{
+   uint32_t hi, lo;
+   __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+   return ((uint64_t)lo) | (((uint64_t)hi) << 32);
+}
+
+#else
+
+#define rdtsc() 0
+
+#endif
+
+
+
+float
+random_float(void);
+
+
+void
+dump_type(FILE *fp, struct lp_type type);
+
+
+/* scalar element accessors: read/write/randomize element 'index' of a
+ * vector laid out according to 'type' */
+double
+read_elem(struct lp_type type, const void *src, unsigned index);
+
+
+void
+write_elem(struct lp_type type, void *dst, unsigned index, double src);
+
+
+void
+random_elem(struct lp_type type, void *dst, unsigned index);
+
+
+/* whole-vector variants of the above */
+void
+read_vec(struct lp_type type, const void *src, double *dst);
+
+
+void
+write_vec(struct lp_type type, void *dst, const double *src);
+
+
+void
+random_vec(struct lp_type type, void *dst);
+
+
+/* vector comparison; the _with_eps form allows per-element tolerance */
+boolean
+compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps);
+
+
+boolean
+compare_vec(struct lp_type type, const void *res, const void *ref);
+
+
+void
+dump_vec(FILE *fp, struct lp_type type, const void *src);
+
+
+#endif /* !LP_TEST_H */
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_arit.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_arit.c
new file mode 100644
index 000000000..290c523f0
--- /dev/null
+++ 
b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_arit.c @@ -0,0 +1,484 @@ +/************************************************************************** + * + * Copyright 2011 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <limits.h> +#include <stdio.h> +#include <stdlib.h> + +#include "util/u_pointer.h" +#include "util/u_memory.h" +#include "util/u_math.h" +#include "util/u_cpu_detect.h" + +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_arit.h" + +#include "lp_test.h" + + +void +write_tsv_header(FILE *fp) +{ + fprintf(fp, + "result\t" + "format\n"); + + fflush(fp); +} + + +typedef void (*unary_func_t)(float *out, const float *in); + + +/** + * Describe a test case of one unary function. 
+ */ +struct unary_test_t +{ + /* + * Test name -- name of the mathematical function under test. + */ + + const char *name; + + LLVMValueRef + (*builder)(struct lp_build_context *bld, LLVMValueRef a); + + /* + * Reference (pure-C) function. + */ + float + (*ref)(float a); + + /* + * Test values. + */ + const float *values; + unsigned num_values; + + /* + * Required precision in bits. + */ + double precision; +}; + + +static float negf(float x) +{ + return -x; +} + + +static float sgnf(float x) +{ + if (x > 0.0f) { + return 1.0f; + } + if (x < 0.0f) { + return -1.0f; + } + return 0.0f; +} + + +const float exp2_values[] = { + -INFINITY, + -60, + -4, + -2, + -1, + -1e-007, + 0, + 1e-007, + 0.01, + 0.1, + 0.9, + 0.99, + 1, + 2, + 4, + 60, + INFINITY, + NAN +}; + + +const float log2_values[] = { +#if 0 + /* + * Smallest denormalized number; meant just for experimentation, but not + * validation. + */ + 1.4012984643248171e-45, +#endif + -INFINITY, + 0, + 1e-007, + 0.1, + 0.5, + 0.99, + 1, + 1.01, + 1.1, + 1.9, + 1.99, + 2, + 4, + 100000, + 1e+018, + INFINITY, + NAN +}; + + +static float rcpf(float x) +{ + return 1.0/x; +} + + +const float rcp_values[] = { + -0.0, 0.0, + -1.0, 1.0, + -1e-007, 1e-007, + -4.0, 4.0, + -1e+035, -100000, + 100000, 1e+035, + 5.88e-39f, // denormal +#if (__STDC_VERSION__ >= 199901L) + INFINITY, -INFINITY, +#endif +}; + + +static float rsqrtf(float x) +{ + return 1.0/(float)sqrt(x); +} + + +const float rsqrt_values[] = { + // http://msdn.microsoft.com/en-us/library/windows/desktop/bb147346.aspx + 0.0, // must yield infinity + 1.0, // must yield 1.0 + 1e-007, 4.0, + 100000, 1e+035, + 5.88e-39f, // denormal +#if (__STDC_VERSION__ >= 199901L) + INFINITY, +#endif +}; + + +const float sincos_values[] = { + -INFINITY, + -5*M_PI/4, + -4*M_PI/4, + -4*M_PI/4, + -3*M_PI/4, + -2*M_PI/4, + -1*M_PI/4, + 1*M_PI/4, + 2*M_PI/4, + 3*M_PI/4, + 4*M_PI/4, + 5*M_PI/4, + INFINITY, + NAN +}; + +const float round_values[] = { + -10.0, -1, 0.0, 12.0, + -1.49, -0.25, 
1.25, 2.51, + -0.99, -0.01, 0.01, 0.99, + 1.401298464324817e-45f, // smallest denormal + -1.401298464324817e-45f, + 1.62981451e-08f, + -1.62981451e-08f, + 1.62981451e15f, // large number not representable as 32bit int + -1.62981451e15f, + FLT_EPSILON, + -FLT_EPSILON, + 1.0f - 0.5f*FLT_EPSILON, + -1.0f + FLT_EPSILON, + FLT_MAX, + -FLT_MAX +}; + +static float fractf(float x) +{ + x -= floorf(x); + if (x >= 1.0f) { + // clamp to the largest number smaller than one + x = 1.0f - 0.5f*FLT_EPSILON; + } + return x; +} + + +const float fract_values[] = { + // http://en.wikipedia.org/wiki/IEEE_754-1985#Examples + 0.0f, + -0.0f, + 1.0f, + -1.0f, + 0.5f, + -0.5f, + 1.401298464324817e-45f, // smallest denormal + -1.401298464324817e-45f, + 5.88e-39f, // middle denormal + 1.18e-38f, // largest denormal + -1.18e-38f, + -1.62981451e-08f, + FLT_EPSILON, + -FLT_EPSILON, + 1.0f - 0.5f*FLT_EPSILON, + -1.0f + FLT_EPSILON, + FLT_MAX, + -FLT_MAX +}; + + +/* + * Unary test cases. + */ + +static const struct unary_test_t +unary_tests[] = { + {"neg", &lp_build_negate, &negf, exp2_values, Elements(exp2_values), 20.0 }, + {"exp2", &lp_build_exp2, &exp2f, exp2_values, Elements(exp2_values), 20.0 }, + {"log2", &lp_build_log2_safe, &log2f, log2_values, Elements(log2_values), 20.0 }, + {"exp", &lp_build_exp, &expf, exp2_values, Elements(exp2_values), 18.0 }, + {"log", &lp_build_log_safe, &logf, log2_values, Elements(log2_values), 20.0 }, + {"rcp", &lp_build_rcp, &rcpf, rcp_values, Elements(rcp_values), 20.0 }, + {"rsqrt", &lp_build_rsqrt, &rsqrtf, rsqrt_values, Elements(rsqrt_values), 20.0 }, + {"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 }, + {"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 }, + {"sgn", &lp_build_sgn, &sgnf, exp2_values, Elements(exp2_values), 20.0 }, + {"round", &lp_build_round, &roundf, round_values, Elements(round_values), 24.0 }, + {"trunc", &lp_build_trunc, &truncf, round_values, Elements(round_values), 24.0 }, + 
{"floor", &lp_build_floor, &floorf, round_values, Elements(round_values), 24.0 }, + {"ceil", &lp_build_ceil, &ceilf, round_values, Elements(round_values), 24.0 }, + {"fract", &lp_build_fract_safe, &fractf, fract_values, Elements(fract_values), 24.0 }, +}; + + +/* + * Build LLVM function that exercises the unary operator builder. + */ +static LLVMValueRef +build_unary_test_func(struct gallivm_state *gallivm, + const struct unary_test_t *test) +{ + struct lp_type type = lp_type_float_vec(32, lp_native_vector_width); + LLVMContextRef context = gallivm->context; + LLVMModuleRef module = gallivm->module; + LLVMTypeRef vf32t = lp_build_vec_type(gallivm, type); + LLVMTypeRef args[2] = { LLVMPointerType(vf32t, 0), LLVMPointerType(vf32t, 0) }; + LLVMValueRef func = LLVMAddFunction(module, test->name, + LLVMFunctionType(LLVMVoidTypeInContext(context), + args, Elements(args), 0)); + LLVMValueRef arg0 = LLVMGetParam(func, 0); + LLVMValueRef arg1 = LLVMGetParam(func, 1); + LLVMBuilderRef builder = gallivm->builder; + LLVMBasicBlockRef block = LLVMAppendBasicBlockInContext(context, func, "entry"); + LLVMValueRef ret; + + struct lp_build_context bld; + + lp_build_context_init(&bld, gallivm, type); + + LLVMSetFunctionCallConv(func, LLVMCCallConv); + + LLVMPositionBuilderAtEnd(builder, block); + + arg1 = LLVMBuildLoad(builder, arg1, ""); + + ret = test->builder(&bld, arg1); + + LLVMBuildStore(builder, ret, arg0); + + LLVMBuildRetVoid(builder); + + gallivm_verify_function(gallivm, func); + + return func; +} + + +/* + * Flush denorms to zero. + */ +static float +flush_denorm_to_zero(float val) +{ + /* + * If we have a denorm manually set it to (+-)0. + * This is because the reference may or may not do the right thing + * otherwise because we want the result according to treating all + * denormals as zero (FTZ/DAZ). 
Not using fpclassify because + * a) some compilers are stuck at c89 (msvc) + * b) not sure it reliably works with non-standard ftz/daz mode + * And, right now we only disable denorms with jited code on x86/sse + * (albeit this should be classified as a bug) so to get results which + * match we must only flush them to zero here in that case too. + */ + union fi fi_val; + + fi_val.f = val; + +#if defined(PIPE_ARCH_SSE) + if (util_cpu_caps.has_sse) { + if ((fi_val.ui & 0x7f800000) == 0) { + fi_val.ui &= 0xff800000; + } + } +#endif + + return fi_val.f; +} + +/* + * Test one LLVM unary arithmetic builder function. + */ +static boolean +test_unary(unsigned verbose, FILE *fp, const struct unary_test_t *test) +{ + struct gallivm_state *gallivm; + LLVMValueRef test_func; + unary_func_t test_func_jit; + boolean success = TRUE; + int i, j; + int length = lp_native_vector_width / 32; + float *in, *out; + + in = align_malloc(length * 4, length * 4); + out = align_malloc(length * 4, length * 4); + + /* random NaNs or 0s could wreak havoc */ + for (i = 0; i < length; i++) { + in[i] = 1.0; + } + + gallivm = gallivm_create("test_module", LLVMGetGlobalContext()); + + test_func = build_unary_test_func(gallivm, test); + + gallivm_compile_module(gallivm); + + test_func_jit = (unary_func_t) gallivm_jit_function(gallivm, test_func); + + gallivm_free_ir(gallivm); + + for (j = 0; j < (test->num_values + length - 1) / length; j++) { + int num_vals = ((j + 1) * length <= test->num_values) ? length : + test->num_values % length; + + for (i = 0; i < num_vals; ++i) { + in[i] = test->values[i+j*length]; + } + + test_func_jit(out, in); + for (i = 0; i < num_vals; ++i) { + float testval, ref; + double error, precision; + bool pass; + + testval = flush_denorm_to_zero(in[i]); + ref = flush_denorm_to_zero(test->ref(testval)); + + if (util_inf_sign(ref) && util_inf_sign(out[i]) == util_inf_sign(ref)) { + error = 0; + } else { + error = fabs(out[i] - ref); + } + precision = error ? 
-log2(error/fabs(ref)) : FLT_MANT_DIG;
+
+         pass = precision >= test->precision;
+
+         /* NaN reference values are skipped before any failure is
+          * recorded, so they can never fail the test */
+         if (isnan(ref)) {
+            continue;
+         }
+
+         if (!pass || verbose) {
+            printf("%s(%.9g): ref = %.9g, out = %.9g, precision = %f bits, %s\n",
+                   test->name, in[i], ref, out[i], precision,
+                   pass ? "PASS" : "FAIL");
+            fflush(stdout);
+         }
+
+         if (!pass) {
+            success = FALSE;
+         }
+      }
+   }
+
+   gallivm_destroy(gallivm);
+
+   align_free(in);
+   align_free(out);
+
+   return success;
+}
+
+
+/**
+ * Run every entry of the unary_tests table; TRUE only if all pass.
+ */
+boolean
+test_all(unsigned verbose, FILE *fp)
+{
+   boolean success = TRUE;
+   int i;
+
+   for (i = 0; i < Elements(unary_tests); ++i) {
+      if (!test_unary(verbose, fp, &unary_tests[i])) {
+         success = FALSE;
+      }
+   }
+
+   return success;
+}
+
+
+boolean
+test_some(unsigned verbose, FILE *fp,
+          unsigned long n)
+{
+   /*
+    * Not randomly generated test cases, so test all.
+    */
+
+   return test_all(verbose, fp);
+}
+
+
+/** No single-case mode for the arithmetic tests; trivially succeeds. */
+boolean
+test_single(unsigned verbose, FILE *fp)
+{
+   return TRUE;
+}
diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_blend.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_blend.c
new file mode 100644
index 000000000..37420b024
--- /dev/null
+++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -0,0 +1,737 @@
+/**************************************************************************
+ *
+ * Copyright 2009 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * Unit tests for blend LLVM IR generation + * + * @author Jose Fonseca <jfonseca@vmware.com> + * + * Blend computation code derived from code written by + * @author Brian Paul <brian@vmware.com> + */ + +#include "util/u_memory.h" + +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_debug.h" +#include "lp_bld_blend.h" +#include "lp_test.h" + + +typedef void (*blend_test_ptr_t)(const void *src, const void *src1, + const void *dst, const void *con, void *res); + + +void +write_tsv_header(FILE *fp) +{ + fprintf(fp, + "result\t" + "cycles_per_channel\t" + "type\t" + "sep_func\t" + "sep_src_factor\t" + "sep_dst_factor\t" + "rgb_func\t" + "rgb_src_factor\t" + "rgb_dst_factor\t" + "alpha_func\t" + "alpha_src_factor\t" + "alpha_dst_factor\n"); + + fflush(fp); +} + + +static void +write_tsv_row(FILE *fp, + const struct pipe_blend_state *blend, + struct lp_type type, + double cycles, + boolean success) +{ + fprintf(fp, "%s\t", success ? "pass" : "fail"); + + fprintf(fp, "%.1f\t", cycles / type.length); + + fprintf(fp, "%s%u%sx%u\t", + type.floating ? "f" : (type.fixed ? "h" : (type.sign ? "s" : "u")), + type.width, + type.norm ? "n" : "", + type.length); + + fprintf(fp, + "%s\t%s\t%s\t", + blend->rt[0].rgb_func != blend->rt[0].alpha_func ? "true" : "false", + blend->rt[0].rgb_src_factor != blend->rt[0].alpha_src_factor ? 
"true" : "false", + blend->rt[0].rgb_dst_factor != blend->rt[0].alpha_dst_factor ? "true" : "false"); + + fprintf(fp, + "%s\t%s\t%s\t%s\t%s\t%s\n", + util_dump_blend_func(blend->rt[0].rgb_func, TRUE), + util_dump_blend_factor(blend->rt[0].rgb_src_factor, TRUE), + util_dump_blend_factor(blend->rt[0].rgb_dst_factor, TRUE), + util_dump_blend_func(blend->rt[0].alpha_func, TRUE), + util_dump_blend_factor(blend->rt[0].alpha_src_factor, TRUE), + util_dump_blend_factor(blend->rt[0].alpha_dst_factor, TRUE)); + + fflush(fp); +} + + +static void +dump_blend_type(FILE *fp, + const struct pipe_blend_state *blend, + struct lp_type type) +{ + fprintf(fp, " type=%s%u%sx%u", + type.floating ? "f" : (type.fixed ? "h" : (type.sign ? "s" : "u")), + type.width, + type.norm ? "n" : "", + type.length); + + fprintf(fp, + " %s=%s %s=%s %s=%s %s=%s %s=%s %s=%s", + "rgb_func", util_dump_blend_func(blend->rt[0].rgb_func, TRUE), + "rgb_src_factor", util_dump_blend_factor(blend->rt[0].rgb_src_factor, TRUE), + "rgb_dst_factor", util_dump_blend_factor(blend->rt[0].rgb_dst_factor, TRUE), + "alpha_func", util_dump_blend_func(blend->rt[0].alpha_func, TRUE), + "alpha_src_factor", util_dump_blend_factor(blend->rt[0].alpha_src_factor, TRUE), + "alpha_dst_factor", util_dump_blend_factor(blend->rt[0].alpha_dst_factor, TRUE)); + + fprintf(fp, " ...\n"); + fflush(fp); +} + + +static LLVMValueRef +add_blend_test(struct gallivm_state *gallivm, + const struct pipe_blend_state *blend, + struct lp_type type) +{ + LLVMModuleRef module = gallivm->module; + LLVMContextRef context = gallivm->context; + LLVMTypeRef vec_type; + LLVMTypeRef args[5]; + LLVMValueRef func; + LLVMValueRef src_ptr; + LLVMValueRef src1_ptr; + LLVMValueRef dst_ptr; + LLVMValueRef const_ptr; + LLVMValueRef res_ptr; + LLVMBasicBlockRef block; + LLVMBuilderRef builder; + const enum pipe_format format = PIPE_FORMAT_R8G8B8A8_UNORM; + const unsigned rt = 0; + const unsigned char swizzle[4] = { 0, 1, 2, 3 }; + LLVMValueRef src; + LLVMValueRef src1; 
+ LLVMValueRef dst; + LLVMValueRef con; + LLVMValueRef res; + + vec_type = lp_build_vec_type(gallivm, type); + + args[4] = args[3] = args[2] = args[1] = args[0] = LLVMPointerType(vec_type, 0); + func = LLVMAddFunction(module, "test", LLVMFunctionType(LLVMVoidTypeInContext(context), args, 5, 0)); + LLVMSetFunctionCallConv(func, LLVMCCallConv); + src_ptr = LLVMGetParam(func, 0); + src1_ptr = LLVMGetParam(func, 1); + dst_ptr = LLVMGetParam(func, 2); + const_ptr = LLVMGetParam(func, 3); + res_ptr = LLVMGetParam(func, 4); + + block = LLVMAppendBasicBlockInContext(context, func, "entry"); + builder = gallivm->builder; + LLVMPositionBuilderAtEnd(builder, block); + + src = LLVMBuildLoad(builder, src_ptr, "src"); + src1 = LLVMBuildLoad(builder, src1_ptr, "src1"); + dst = LLVMBuildLoad(builder, dst_ptr, "dst"); + con = LLVMBuildLoad(builder, const_ptr, "const"); + + res = lp_build_blend_aos(gallivm, blend, format, type, rt, src, NULL, + src1, NULL, dst, NULL, con, NULL, swizzle, 4); + + lp_build_name(res, "res"); + + LLVMBuildStore(builder, res, res_ptr); + + LLVMBuildRetVoid(builder);; + + gallivm_verify_function(gallivm, func); + + return func; +} + + +static void +compute_blend_ref_term(unsigned rgb_factor, + unsigned alpha_factor, + const double *factor, + const double *src, + const double *src1, + const double *dst, + const double *con, + double *term) +{ + double temp; + + switch (rgb_factor) { + case PIPE_BLENDFACTOR_ONE: + term[0] = factor[0]; /* R */ + term[1] = factor[1]; /* G */ + term[2] = factor[2]; /* B */ + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term[0] = factor[0] * src[0]; /* R */ + term[1] = factor[1] * src[1]; /* G */ + term[2] = factor[2] * src[2]; /* B */ + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term[0] = factor[0] * src[3]; /* R */ + term[1] = factor[1] * src[3]; /* G */ + term[2] = factor[2] * src[3]; /* B */ + break; + case PIPE_BLENDFACTOR_DST_COLOR: + term[0] = factor[0] * dst[0]; /* R */ + term[1] = factor[1] * dst[1]; /* G */ + term[2] = 
factor[2] * dst[2]; /* B */ + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term[0] = factor[0] * dst[3]; /* R */ + term[1] = factor[1] * dst[3]; /* G */ + term[2] = factor[2] * dst[3]; /* B */ + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + temp = MIN2(src[3], 1.0f - dst[3]); + term[0] = factor[0] * temp; /* R */ + term[1] = factor[1] * temp; /* G */ + term[2] = factor[2] * temp; /* B */ + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term[0] = factor[0] * con[0]; /* R */ + term[1] = factor[1] * con[1]; /* G */ + term[2] = factor[2] * con[2]; /* B */ + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term[0] = factor[0] * con[3]; /* R */ + term[1] = factor[1] * con[3]; /* G */ + term[2] = factor[2] * con[3]; /* B */ + break; + case PIPE_BLENDFACTOR_SRC1_COLOR: + term[0] = factor[0] * src1[0]; /* R */ + term[1] = factor[1] * src1[1]; /* G */ + term[2] = factor[2] * src1[2]; /* B */ + break; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + term[0] = factor[0] * src1[3]; /* R */ + term[1] = factor[1] * src1[3]; /* G */ + term[2] = factor[2] * src1[3]; /* B */ + break; + case PIPE_BLENDFACTOR_ZERO: + term[0] = 0.0f; /* R */ + term[1] = 0.0f; /* G */ + term[2] = 0.0f; /* B */ + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + term[0] = factor[0] * (1.0f - src[0]); /* R */ + term[1] = factor[1] * (1.0f - src[1]); /* G */ + term[2] = factor[2] * (1.0f - src[2]); /* B */ + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + term[0] = factor[0] * (1.0f - src[3]); /* R */ + term[1] = factor[1] * (1.0f - src[3]); /* G */ + term[2] = factor[2] * (1.0f - src[3]); /* B */ + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + term[0] = factor[0] * (1.0f - dst[3]); /* R */ + term[1] = factor[1] * (1.0f - dst[3]); /* G */ + term[2] = factor[2] * (1.0f - dst[3]); /* B */ + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + term[0] = factor[0] * (1.0f - dst[0]); /* R */ + term[1] = factor[1] * (1.0f - dst[1]); /* G */ + term[2] = factor[2] * (1.0f - dst[2]); /* B */ + break; + case 
PIPE_BLENDFACTOR_INV_CONST_COLOR: + term[0] = factor[0] * (1.0f - con[0]); /* R */ + term[1] = factor[1] * (1.0f - con[1]); /* G */ + term[2] = factor[2] * (1.0f - con[2]); /* B */ + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + term[0] = factor[0] * (1.0f - con[3]); /* R */ + term[1] = factor[1] * (1.0f - con[3]); /* G */ + term[2] = factor[2] * (1.0f - con[3]); /* B */ + break; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + term[0] = factor[0] * (1.0f - src1[0]); /* R */ + term[1] = factor[1] * (1.0f - src1[1]); /* G */ + term[2] = factor[2] * (1.0f - src1[2]); /* B */ + break; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + term[0] = factor[0] * (1.0f - src1[3]); /* R */ + term[1] = factor[1] * (1.0f - src1[3]); /* G */ + term[2] = factor[2] * (1.0f - src1[3]); /* B */ + break; + default: + assert(0); + } + + /* + * Compute src/first term A + */ + switch (alpha_factor) { + case PIPE_BLENDFACTOR_ONE: + term[3] = factor[3]; /* A */ + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_SRC_ALPHA: + term[3] = factor[3] * src[3]; /* A */ + break; + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_DST_ALPHA: + term[3] = factor[3] * dst[3]; /* A */ + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + term[3] = src[3]; /* A */ + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_CONST_ALPHA: + term[3] = factor[3] * con[3]; /* A */ + break; + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + term[3] = factor[3] * src1[3]; /* A */ + break; + case PIPE_BLENDFACTOR_ZERO: + term[3] = 0.0f; /* A */ + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + term[3] = factor[3] * (1.0f - src[3]); /* A */ + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + term[3] = factor[3] * (1.0f - dst[3]); /* A */ + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + term[3] = factor[3] * (1.0f - con[3]); + break; + case 
PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + term[3] = factor[3] * (1.0f - src1[3]); /* A */ + break; + default: + assert(0); + } +} + + +static void +compute_blend_ref(const struct pipe_blend_state *blend, + const double *src, + const double *src1, + const double *dst, + const double *con, + double *res) +{ + double src_term[4]; + double dst_term[4]; + + compute_blend_ref_term(blend->rt[0].rgb_src_factor, blend->rt[0].alpha_src_factor, + src, src, src1, dst, con, src_term); + compute_blend_ref_term(blend->rt[0].rgb_dst_factor, blend->rt[0].alpha_dst_factor, + dst, src, src1, dst, con, dst_term); + + /* + * Combine RGB terms + */ + switch (blend->rt[0].rgb_func) { + case PIPE_BLEND_ADD: + res[0] = src_term[0] + dst_term[0]; /* R */ + res[1] = src_term[1] + dst_term[1]; /* G */ + res[2] = src_term[2] + dst_term[2]; /* B */ + break; + case PIPE_BLEND_SUBTRACT: + res[0] = src_term[0] - dst_term[0]; /* R */ + res[1] = src_term[1] - dst_term[1]; /* G */ + res[2] = src_term[2] - dst_term[2]; /* B */ + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + res[0] = dst_term[0] - src_term[0]; /* R */ + res[1] = dst_term[1] - src_term[1]; /* G */ + res[2] = dst_term[2] - src_term[2]; /* B */ + break; + case PIPE_BLEND_MIN: + res[0] = MIN2(src_term[0], dst_term[0]); /* R */ + res[1] = MIN2(src_term[1], dst_term[1]); /* G */ + res[2] = MIN2(src_term[2], dst_term[2]); /* B */ + break; + case PIPE_BLEND_MAX: + res[0] = MAX2(src_term[0], dst_term[0]); /* R */ + res[1] = MAX2(src_term[1], dst_term[1]); /* G */ + res[2] = MAX2(src_term[2], dst_term[2]); /* B */ + break; + default: + assert(0); + } + + /* + * Combine A terms + */ + switch (blend->rt[0].alpha_func) { + case PIPE_BLEND_ADD: + res[3] = src_term[3] + dst_term[3]; /* A */ + break; + case PIPE_BLEND_SUBTRACT: + res[3] = src_term[3] - dst_term[3]; /* A */ + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + res[3] = dst_term[3] - src_term[3]; /* A */ + break; + case PIPE_BLEND_MIN: + res[3] = 
MIN2(src_term[3], dst_term[3]); /* A */ + break; + case PIPE_BLEND_MAX: + res[3] = MAX2(src_term[3], dst_term[3]); /* A */ + break; + default: + assert(0); + } +} + + +PIPE_ALIGN_STACK +static boolean +test_one(unsigned verbose, + FILE *fp, + const struct pipe_blend_state *blend, + struct lp_type type) +{ + struct gallivm_state *gallivm; + LLVMValueRef func = NULL; + blend_test_ptr_t blend_test_ptr; + boolean success; + const unsigned n = LP_TEST_NUM_SAMPLES; + int64_t cycles[LP_TEST_NUM_SAMPLES]; + double cycles_avg = 0.0; + unsigned i, j; + const unsigned stride = lp_type_width(type)/8; + + if(verbose >= 1) + dump_blend_type(stdout, blend, type); + + gallivm = gallivm_create("test_module", LLVMGetGlobalContext()); + + func = add_blend_test(gallivm, blend, type); + + gallivm_compile_module(gallivm); + + blend_test_ptr = (blend_test_ptr_t)gallivm_jit_function(gallivm, func); + + gallivm_free_ir(gallivm); + + success = TRUE; + + { + uint8_t *src, *src1, *dst, *con, *res, *ref; + src = align_malloc(stride, stride); + src1 = align_malloc(stride, stride); + dst = align_malloc(stride, stride); + con = align_malloc(stride, stride); + res = align_malloc(stride, stride); + ref = align_malloc(stride, stride); + + for(i = 0; i < n && success; ++i) { + int64_t start_counter = 0; + int64_t end_counter = 0; + + random_vec(type, src); + random_vec(type, src1); + random_vec(type, dst); + random_vec(type, con); + + { + double fsrc[LP_MAX_VECTOR_LENGTH]; + double fsrc1[LP_MAX_VECTOR_LENGTH]; + double fdst[LP_MAX_VECTOR_LENGTH]; + double fcon[LP_MAX_VECTOR_LENGTH]; + double fref[LP_MAX_VECTOR_LENGTH]; + + read_vec(type, src, fsrc); + read_vec(type, src1, fsrc1); + read_vec(type, dst, fdst); + read_vec(type, con, fcon); + + for(j = 0; j < type.length; j += 4) + compute_blend_ref(blend, fsrc + j, fsrc1 + j, fdst + j, fcon + j, fref + j); + + write_vec(type, ref, fref); + } + + start_counter = rdtsc(); + blend_test_ptr(src, src1, dst, con, res); + end_counter = rdtsc(); + + cycles[i] = 
end_counter - start_counter; + + if(!compare_vec(type, res, ref)) { + success = FALSE; + + if(verbose < 1) + dump_blend_type(stderr, blend, type); + fprintf(stderr, "MISMATCH\n"); + + fprintf(stderr, " Src: "); + dump_vec(stderr, type, src); + fprintf(stderr, "\n"); + + fprintf(stderr, " Src1: "); + dump_vec(stderr, type, src1); + fprintf(stderr, "\n"); + + fprintf(stderr, " Dst: "); + dump_vec(stderr, type, dst); + fprintf(stderr, "\n"); + + fprintf(stderr, " Con: "); + dump_vec(stderr, type, con); + fprintf(stderr, "\n"); + + fprintf(stderr, " Res: "); + dump_vec(stderr, type, res); + fprintf(stderr, "\n"); + + fprintf(stderr, " Ref: "); + dump_vec(stderr, type, ref); + fprintf(stderr, "\n"); + } + } + align_free(src); + align_free(src1); + align_free(dst); + align_free(con); + align_free(res); + align_free(ref); + } + + /* + * Unfortunately the output of cycle counter is not very reliable as it comes + * -- sometimes we get outliers (due IRQs perhaps?) which are + * better removed to avoid random or biased data. 
+ */ + { + double sum = 0.0, sum2 = 0.0; + double avg, std; + unsigned m; + + for(i = 0; i < n; ++i) { + sum += cycles[i]; + sum2 += cycles[i]*cycles[i]; + } + + avg = sum/n; + std = sqrtf((sum2 - n*avg*avg)/n); + + m = 0; + sum = 0.0; + for(i = 0; i < n; ++i) { + if(fabs(cycles[i] - avg) <= 4.0*std) { + sum += cycles[i]; + ++m; + } + } + + cycles_avg = sum/m; + + } + + if(fp) + write_tsv_row(fp, blend, type, cycles_avg, success); + + gallivm_destroy(gallivm); + + return success; +} + + +const unsigned +blend_factors[] = { + PIPE_BLENDFACTOR_ZERO, + PIPE_BLENDFACTOR_ONE, + PIPE_BLENDFACTOR_SRC_COLOR, + PIPE_BLENDFACTOR_SRC_ALPHA, + PIPE_BLENDFACTOR_DST_COLOR, + PIPE_BLENDFACTOR_DST_ALPHA, + PIPE_BLENDFACTOR_CONST_COLOR, + PIPE_BLENDFACTOR_CONST_ALPHA, + PIPE_BLENDFACTOR_SRC1_COLOR, + PIPE_BLENDFACTOR_SRC1_ALPHA, + PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE, + PIPE_BLENDFACTOR_INV_SRC_COLOR, + PIPE_BLENDFACTOR_INV_SRC_ALPHA, + PIPE_BLENDFACTOR_INV_DST_COLOR, + PIPE_BLENDFACTOR_INV_DST_ALPHA, + PIPE_BLENDFACTOR_INV_CONST_COLOR, + PIPE_BLENDFACTOR_INV_CONST_ALPHA, + PIPE_BLENDFACTOR_INV_SRC1_COLOR, + PIPE_BLENDFACTOR_INV_SRC1_ALPHA, +}; + + +const unsigned +blend_funcs[] = { + PIPE_BLEND_ADD, + PIPE_BLEND_SUBTRACT, + PIPE_BLEND_REVERSE_SUBTRACT, + PIPE_BLEND_MIN, + PIPE_BLEND_MAX +}; + + +const struct lp_type blend_types[] = { + /* float, fixed, sign, norm, width, len */ + { TRUE, FALSE, TRUE, FALSE, 32, 4 }, /* f32 x 4 */ + { FALSE, FALSE, FALSE, TRUE, 8, 16 }, /* u8n x 16 */ +}; + + +const unsigned num_funcs = sizeof(blend_funcs)/sizeof(blend_funcs[0]); +const unsigned num_factors = sizeof(blend_factors)/sizeof(blend_factors[0]); +const unsigned num_types = sizeof(blend_types)/sizeof(blend_types[0]); + + +boolean +test_all(unsigned verbose, FILE *fp) +{ + const unsigned *rgb_func; + const unsigned *rgb_src_factor; + const unsigned *rgb_dst_factor; + const unsigned *alpha_func; + const unsigned *alpha_src_factor; + const unsigned *alpha_dst_factor; + struct pipe_blend_state 
blend; + const struct lp_type *type; + boolean success = TRUE; + + for(rgb_func = blend_funcs; rgb_func < &blend_funcs[num_funcs]; ++rgb_func) { + for(alpha_func = blend_funcs; alpha_func < &blend_funcs[num_funcs]; ++alpha_func) { + for(rgb_src_factor = blend_factors; rgb_src_factor < &blend_factors[num_factors]; ++rgb_src_factor) { + for(rgb_dst_factor = blend_factors; rgb_dst_factor <= rgb_src_factor; ++rgb_dst_factor) { + for(alpha_src_factor = blend_factors; alpha_src_factor < &blend_factors[num_factors]; ++alpha_src_factor) { + for(alpha_dst_factor = blend_factors; alpha_dst_factor <= alpha_src_factor; ++alpha_dst_factor) { + for(type = blend_types; type < &blend_types[num_types]; ++type) { + + if(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE || + *alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) + continue; + + memset(&blend, 0, sizeof blend); + blend.rt[0].blend_enable = 1; + blend.rt[0].rgb_func = *rgb_func; + blend.rt[0].rgb_src_factor = *rgb_src_factor; + blend.rt[0].rgb_dst_factor = *rgb_dst_factor; + blend.rt[0].alpha_func = *alpha_func; + blend.rt[0].alpha_src_factor = *alpha_src_factor; + blend.rt[0].alpha_dst_factor = *alpha_dst_factor; + blend.rt[0].colormask = PIPE_MASK_RGBA; + + if(!test_one(verbose, fp, &blend, *type)) + success = FALSE; + + } + } + } + } + } + } + } + + return success; +} + + +boolean +test_some(unsigned verbose, FILE *fp, + unsigned long n) +{ + const unsigned *rgb_func; + const unsigned *rgb_src_factor; + const unsigned *rgb_dst_factor; + const unsigned *alpha_func; + const unsigned *alpha_src_factor; + const unsigned *alpha_dst_factor; + struct pipe_blend_state blend; + const struct lp_type *type; + unsigned long i; + boolean success = TRUE; + + for(i = 0; i < n; ++i) { + rgb_func = &blend_funcs[rand() % num_funcs]; + alpha_func = &blend_funcs[rand() % num_funcs]; + rgb_src_factor = &blend_factors[rand() % num_factors]; + alpha_src_factor = &blend_factors[rand() % num_factors]; + + do { + rgb_dst_factor = 
&blend_factors[rand() % num_factors]; + } while(*rgb_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE); + + do { + alpha_dst_factor = &blend_factors[rand() % num_factors]; + } while(*alpha_dst_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE); + + type = &blend_types[rand() % num_types]; + + memset(&blend, 0, sizeof blend); + blend.rt[0].blend_enable = 1; + blend.rt[0].rgb_func = *rgb_func; + blend.rt[0].rgb_src_factor = *rgb_src_factor; + blend.rt[0].rgb_dst_factor = *rgb_dst_factor; + blend.rt[0].alpha_func = *alpha_func; + blend.rt[0].alpha_src_factor = *alpha_src_factor; + blend.rt[0].alpha_dst_factor = *alpha_dst_factor; + blend.rt[0].colormask = PIPE_MASK_RGBA; + + if(!test_one(verbose, fp, &blend, *type)) + success = FALSE; + } + + return success; +} + + +boolean +test_single(unsigned verbose, FILE *fp) +{ + printf("no test_single()"); + return TRUE; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_conv.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_conv.c new file mode 100644 index 000000000..8290da400 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_conv.c @@ -0,0 +1,453 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * Unit tests for type conversion. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + + +#include "util/u_pointer.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_conv.h" +#include "gallivm/lp_bld_debug.h" +#include "lp_test.h" + + +typedef void (*conv_test_ptr_t)(const void *src, const void *dst); + + +void +write_tsv_header(FILE *fp) +{ + fprintf(fp, + "result\t" + "cycles_per_channel\t" + "src_type\t" + "dst_type\n"); + + fflush(fp); +} + + +static void +write_tsv_row(FILE *fp, + struct lp_type src_type, + struct lp_type dst_type, + double cycles, + boolean success) +{ + fprintf(fp, "%s\t", success ? 
"pass" : "fail"); + + fprintf(fp, "%.1f\t", cycles / MAX2(src_type.length, dst_type.length)); + + dump_type(fp, src_type); + fprintf(fp, "\t"); + + dump_type(fp, dst_type); + fprintf(fp, "\n"); + + fflush(fp); +} + + +static void +dump_conv_types(FILE *fp, + struct lp_type src_type, + struct lp_type dst_type) +{ + fprintf(fp, "src_type="); + dump_type(fp, src_type); + + fprintf(fp, " dst_type="); + dump_type(fp, dst_type); + + fprintf(fp, " ...\n"); + fflush(fp); +} + + +static LLVMValueRef +add_conv_test(struct gallivm_state *gallivm, + struct lp_type src_type, unsigned num_srcs, + struct lp_type dst_type, unsigned num_dsts) +{ + LLVMModuleRef module = gallivm->module; + LLVMContextRef context = gallivm->context; + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef args[2]; + LLVMValueRef func; + LLVMValueRef src_ptr; + LLVMValueRef dst_ptr; + LLVMBasicBlockRef block; + LLVMValueRef src[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef dst[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + args[0] = LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0); + args[1] = LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0); + + func = LLVMAddFunction(module, "test", + LLVMFunctionType(LLVMVoidTypeInContext(context), + args, 2, 0)); + LLVMSetFunctionCallConv(func, LLVMCCallConv); + src_ptr = LLVMGetParam(func, 0); + dst_ptr = LLVMGetParam(func, 1); + + block = LLVMAppendBasicBlockInContext(context, func, "entry"); + LLVMPositionBuilderAtEnd(builder, block); + + for(i = 0; i < num_srcs; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32TypeInContext(context), i, 0); + LLVMValueRef ptr = LLVMBuildGEP(builder, src_ptr, &index, 1, ""); + src[i] = LLVMBuildLoad(builder, ptr, ""); + } + + lp_build_conv(gallivm, src_type, dst_type, src, num_srcs, dst, num_dsts); + + for(i = 0; i < num_dsts; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32TypeInContext(context), i, 0); + LLVMValueRef ptr = LLVMBuildGEP(builder, dst_ptr, &index, 1, ""); + LLVMBuildStore(builder, dst[i], ptr); + } + 
+ LLVMBuildRetVoid(builder);; + + gallivm_verify_function(gallivm, func); + + return func; +} + + +PIPE_ALIGN_STACK +static boolean +test_one(unsigned verbose, + FILE *fp, + struct lp_type src_type, + struct lp_type dst_type) +{ + struct gallivm_state *gallivm; + LLVMValueRef func = NULL; + conv_test_ptr_t conv_test_ptr; + boolean success; + const unsigned n = LP_TEST_NUM_SAMPLES; + int64_t cycles[LP_TEST_NUM_SAMPLES]; + double cycles_avg = 0.0; + unsigned num_srcs; + unsigned num_dsts; + double eps; + unsigned i, j; + + if ((src_type.width >= dst_type.width && src_type.length > dst_type.length) || + (src_type.width <= dst_type.width && src_type.length < dst_type.length)) { + return TRUE; + } + + /* Known failures + * - fixed point 32 -> float 32 + * - float 32 -> signed normalised integer 32 + */ + if ((src_type.floating && !dst_type.floating && dst_type.sign && dst_type.norm && src_type.width == dst_type.width) || + (!src_type.floating && dst_type.floating && src_type.fixed && src_type.width == dst_type.width)) { + return TRUE; + } + + /* Known failures + * - fixed point 32 -> float 32 + * - float 32 -> signed normalised integer 32 + */ + if ((src_type.floating && !dst_type.floating && dst_type.sign && dst_type.norm && src_type.width == dst_type.width) || + (!src_type.floating && dst_type.floating && src_type.fixed && src_type.width == dst_type.width)) { + return TRUE; + } + + if(verbose >= 1) + dump_conv_types(stderr, src_type, dst_type); + + if (src_type.length > dst_type.length) { + num_srcs = 1; + num_dsts = src_type.length/dst_type.length; + } + else if (src_type.length < dst_type.length) { + num_dsts = 1; + num_srcs = dst_type.length/src_type.length; + } + else { + num_dsts = 1; + num_srcs = 1; + } + + /* We must not loose or gain channels. 
Only precision */ + assert(src_type.length * num_srcs == dst_type.length * num_dsts); + + eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type)); + + gallivm = gallivm_create("test_module", LLVMGetGlobalContext()); + + func = add_conv_test(gallivm, src_type, num_srcs, dst_type, num_dsts); + + gallivm_compile_module(gallivm); + + conv_test_ptr = (conv_test_ptr_t)gallivm_jit_function(gallivm, func); + + gallivm_free_ir(gallivm); + + success = TRUE; + for(i = 0; i < n && success; ++i) { + unsigned src_stride = src_type.length*src_type.width/8; + unsigned dst_stride = dst_type.length*dst_type.width/8; + PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t src[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; + PIPE_ALIGN_VAR(LP_MIN_VECTOR_ALIGN) uint8_t dst[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; + double fref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; + uint8_t ref[LP_MAX_VECTOR_LENGTH*LP_MAX_VECTOR_LENGTH]; + int64_t start_counter = 0; + int64_t end_counter = 0; + + for(j = 0; j < num_srcs; ++j) { + random_vec(src_type, src + j*src_stride); + read_vec(src_type, src + j*src_stride, fref + j*src_type.length); + } + + for(j = 0; j < num_dsts; ++j) { + write_vec(dst_type, ref + j*dst_stride, fref + j*dst_type.length); + } + + start_counter = rdtsc(); + conv_test_ptr(src, dst); + end_counter = rdtsc(); + + cycles[i] = end_counter - start_counter; + + for(j = 0; j < num_dsts; ++j) { + if(!compare_vec_with_eps(dst_type, dst + j*dst_stride, ref + j*dst_stride, eps)) + success = FALSE; + } + + if (!success || verbose >= 3) { + if(verbose < 1) + dump_conv_types(stderr, src_type, dst_type); + if (success) { + fprintf(stderr, "PASS\n"); + } + else { + fprintf(stderr, "MISMATCH\n"); + } + + for(j = 0; j < num_srcs; ++j) { + fprintf(stderr, " Src%u: ", j); + dump_vec(stderr, src_type, src + j*src_stride); + fprintf(stderr, "\n"); + } + +#if 1 + fprintf(stderr, " Ref: "); + for(j = 0; j < src_type.length*num_srcs; ++j) + fprintf(stderr, " %f", fref[j]); + fprintf(stderr, "\n"); +#endif + 
+ for(j = 0; j < num_dsts; ++j) { + fprintf(stderr, " Dst%u: ", j); + dump_vec(stderr, dst_type, dst + j*dst_stride); + fprintf(stderr, "\n"); + + fprintf(stderr, " Ref%u: ", j); + dump_vec(stderr, dst_type, ref + j*dst_stride); + fprintf(stderr, "\n"); + } + } + } + + /* + * Unfortunately the output of cycle counter is not very reliable as it comes + * -- sometimes we get outliers (due IRQs perhaps?) which are + * better removed to avoid random or biased data. + */ + { + double sum = 0.0, sum2 = 0.0; + double avg, std; + unsigned m; + + for(i = 0; i < n; ++i) { + sum += cycles[i]; + sum2 += cycles[i]*cycles[i]; + } + + avg = sum/n; + std = sqrtf((sum2 - n*avg*avg)/n); + + m = 0; + sum = 0.0; + for(i = 0; i < n; ++i) { + if(fabs(cycles[i] - avg) <= 4.0*std) { + sum += cycles[i]; + ++m; + } + } + + cycles_avg = sum/m; + + } + + if(fp) + write_tsv_row(fp, src_type, dst_type, cycles_avg, success); + + gallivm_destroy(gallivm); + + return success; +} + + +const struct lp_type conv_types[] = { + /* float, fixed, sign, norm, width, len */ + + /* Float */ + { TRUE, FALSE, TRUE, TRUE, 32, 4 }, + { TRUE, FALSE, TRUE, FALSE, 32, 4 }, + { TRUE, FALSE, FALSE, TRUE, 32, 4 }, + { TRUE, FALSE, FALSE, FALSE, 32, 4 }, + + { TRUE, FALSE, TRUE, TRUE, 32, 8 }, + { TRUE, FALSE, TRUE, FALSE, 32, 8 }, + { TRUE, FALSE, FALSE, TRUE, 32, 8 }, + { TRUE, FALSE, FALSE, FALSE, 32, 8 }, + + /* Fixed */ + { FALSE, TRUE, TRUE, TRUE, 32, 4 }, + { FALSE, TRUE, TRUE, FALSE, 32, 4 }, + { FALSE, TRUE, FALSE, TRUE, 32, 4 }, + { FALSE, TRUE, FALSE, FALSE, 32, 4 }, + + { FALSE, TRUE, TRUE, TRUE, 32, 8 }, + { FALSE, TRUE, TRUE, FALSE, 32, 8 }, + { FALSE, TRUE, FALSE, TRUE, 32, 8 }, + { FALSE, TRUE, FALSE, FALSE, 32, 8 }, + + /* Integer */ + { FALSE, FALSE, TRUE, TRUE, 32, 4 }, + { FALSE, FALSE, TRUE, FALSE, 32, 4 }, + { FALSE, FALSE, FALSE, TRUE, 32, 4 }, + { FALSE, FALSE, FALSE, FALSE, 32, 4 }, + + { FALSE, FALSE, TRUE, TRUE, 32, 8 }, + { FALSE, FALSE, TRUE, FALSE, 32, 8 }, + { FALSE, FALSE, FALSE, TRUE, 
32, 8 }, + { FALSE, FALSE, FALSE, FALSE, 32, 8 }, + + { FALSE, FALSE, TRUE, TRUE, 16, 8 }, + { FALSE, FALSE, TRUE, FALSE, 16, 8 }, + { FALSE, FALSE, FALSE, TRUE, 16, 8 }, + { FALSE, FALSE, FALSE, FALSE, 16, 8 }, + + { FALSE, FALSE, TRUE, TRUE, 8, 16 }, + { FALSE, FALSE, TRUE, FALSE, 8, 16 }, + { FALSE, FALSE, FALSE, TRUE, 8, 16 }, + { FALSE, FALSE, FALSE, FALSE, 8, 16 }, + + { FALSE, FALSE, TRUE, TRUE, 8, 4 }, + { FALSE, FALSE, TRUE, FALSE, 8, 4 }, + { FALSE, FALSE, FALSE, TRUE, 8, 4 }, + { FALSE, FALSE, FALSE, FALSE, 8, 4 }, + + { FALSE, FALSE, FALSE, TRUE, 8, 8 }, +}; + + +const unsigned num_types = sizeof(conv_types)/sizeof(conv_types[0]); + + +boolean +test_all(unsigned verbose, FILE *fp) +{ + const struct lp_type *src_type; + const struct lp_type *dst_type; + boolean success = TRUE; + int error_count = 0; + + for(src_type = conv_types; src_type < &conv_types[num_types]; ++src_type) { + for(dst_type = conv_types; dst_type < &conv_types[num_types]; ++dst_type) { + + if(src_type == dst_type) + continue; + + if(!test_one(verbose, fp, *src_type, *dst_type)){ + success = FALSE; + ++error_count; + } + } + } + + fprintf(stderr, "%d failures\n", error_count); + + return success; +} + + +boolean +test_some(unsigned verbose, FILE *fp, + unsigned long n) +{ + const struct lp_type *src_type; + const struct lp_type *dst_type; + unsigned long i; + boolean success = TRUE; + + for(i = 0; i < n; ++i) { + src_type = &conv_types[rand() % num_types]; + + do { + dst_type = &conv_types[rand() % num_types]; + } while (src_type == dst_type || src_type->norm != dst_type->norm); + + if(!test_one(verbose, fp, *src_type, *dst_type)) + success = FALSE; + } + + return success; +} + + +boolean +test_single(unsigned verbose, FILE *fp) +{ + /* float, fixed, sign, norm, width, len */ + struct lp_type f32x4_type = + { TRUE, FALSE, TRUE, TRUE, 32, 4 }; + struct lp_type ub8x4_type = + { FALSE, FALSE, FALSE, TRUE, 8, 16 }; + + boolean success; + + success = test_one(verbose, fp, f32x4_type, 
ub8x4_type); + + return success; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_format.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_format.c new file mode 100644 index 000000000..d9abd1ae3 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_format.c @@ -0,0 +1,384 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include <stdlib.h> +#include <stdio.h> +#include <float.h> + +#include "util/u_memory.h" +#include "util/u_pointer.h" +#include "util/u_string.h" +#include "util/u_format.h" +#include "util/u_format_tests.h" +#include "util/u_format_s3tc.h" + +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_format.h" +#include "gallivm/lp_bld_init.h" + +#include "lp_test.h" + + +void +write_tsv_header(FILE *fp) +{ + fprintf(fp, + "result\t" + "format\n"); + + fflush(fp); +} + + +static void +write_tsv_row(FILE *fp, + const struct util_format_description *desc, + boolean success) +{ + fprintf(fp, "%s\t", success ? "pass" : "fail"); + + fprintf(fp, "%s\n", desc->name); + + fflush(fp); +} + + +typedef void +(*fetch_ptr_t)(void *unpacked, const void *packed, + unsigned i, unsigned j); + + +static LLVMValueRef +add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, + const struct util_format_description *desc, + struct lp_type type) +{ + char name[256]; + LLVMContextRef context = gallivm->context; + LLVMModuleRef module = gallivm->module; + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef args[4]; + LLVMValueRef func; + LLVMValueRef packed_ptr; + LLVMValueRef offset = LLVMConstNull(LLVMInt32TypeInContext(context)); + LLVMValueRef rgba_ptr; + LLVMValueRef i; + LLVMValueRef j; + LLVMBasicBlockRef block; + LLVMValueRef rgba; + + util_snprintf(name, sizeof name, "fetch_%s_%s", desc->short_name, + type.floating ? 
"float" : "unorm8"); + + args[0] = LLVMPointerType(lp_build_vec_type(gallivm, type), 0); + args[1] = LLVMPointerType(LLVMInt8TypeInContext(context), 0); + args[3] = args[2] = LLVMInt32TypeInContext(context); + + func = LLVMAddFunction(module, name, + LLVMFunctionType(LLVMVoidTypeInContext(context), + args, Elements(args), 0)); + LLVMSetFunctionCallConv(func, LLVMCCallConv); + rgba_ptr = LLVMGetParam(func, 0); + packed_ptr = LLVMGetParam(func, 1); + i = LLVMGetParam(func, 2); + j = LLVMGetParam(func, 3); + + block = LLVMAppendBasicBlockInContext(context, func, "entry"); + LLVMPositionBuilderAtEnd(builder, block); + + rgba = lp_build_fetch_rgba_aos(gallivm, desc, type, TRUE, + packed_ptr, offset, i, j); + + LLVMBuildStore(builder, rgba, rgba_ptr); + + LLVMBuildRetVoid(builder); + + gallivm_verify_function(gallivm, func); + + return func; +} + + +PIPE_ALIGN_STACK +static boolean +test_format_float(unsigned verbose, FILE *fp, + const struct util_format_description *desc) +{ + struct gallivm_state *gallivm; + LLVMValueRef fetch = NULL; + fetch_ptr_t fetch_ptr; + PIPE_ALIGN_VAR(16) uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES]; + PIPE_ALIGN_VAR(16) float unpacked[4]; + boolean first = TRUE; + boolean success = TRUE; + unsigned i, j, k, l; + + gallivm = gallivm_create("test_module_float", LLVMGetGlobalContext()); + + fetch = add_fetch_rgba_test(gallivm, verbose, desc, lp_float32_vec4_type()); + + gallivm_compile_module(gallivm); + + fetch_ptr = (fetch_ptr_t) gallivm_jit_function(gallivm, fetch); + + gallivm_free_ir(gallivm); + + for (l = 0; l < util_format_nr_test_cases; ++l) { + const struct util_format_test_case *test = &util_format_test_cases[l]; + + if (test->format == desc->format) { + + if (first) { + printf("Testing %s (float) ...\n", + desc->name); + fflush(stdout); + first = FALSE; + } + + /* To ensure it's 16-byte aligned */ + memcpy(packed, test->packed, sizeof packed); + + for (i = 0; i < desc->block.height; ++i) { + for (j = 0; j < desc->block.width; ++j) { + 
boolean match = TRUE; + + memset(unpacked, 0, sizeof unpacked); + + fetch_ptr(unpacked, packed, j, i); + + for(k = 0; k < 4; ++k) { + if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) { + match = FALSE; + } + + if (util_is_double_nan(test->unpacked[i][j][k]) != util_is_nan(unpacked[k])) { + match = FALSE; + } + + if (!util_is_double_inf_or_nan(test->unpacked[i][j][k]) && + fabs((float)test->unpacked[i][j][k] - unpacked[k]) > FLT_EPSILON) { + match = FALSE; + } + } + + if (!match) { + printf("FAILED\n"); + printf(" Packed: %02x %02x %02x %02x\n", + test->packed[0], test->packed[1], test->packed[2], test->packed[3]); + printf(" Unpacked (%u,%u): %.9g %.9g %.9g %.9g obtained\n", + j, i, + unpacked[0], unpacked[1], unpacked[2], unpacked[3]); + printf(" %.9g %.9g %.9g %.9g expected\n", + test->unpacked[i][j][0], + test->unpacked[i][j][1], + test->unpacked[i][j][2], + test->unpacked[i][j][3]); + fflush(stdout); + success = FALSE; + } + } + } + } + } + + gallivm_destroy(gallivm); + + if(fp) + write_tsv_row(fp, desc, success); + + return success; +} + + +PIPE_ALIGN_STACK +static boolean +test_format_unorm8(unsigned verbose, FILE *fp, + const struct util_format_description *desc) +{ + struct gallivm_state *gallivm; + LLVMValueRef fetch = NULL; + fetch_ptr_t fetch_ptr; + PIPE_ALIGN_VAR(16) uint8_t packed[UTIL_FORMAT_MAX_PACKED_BYTES]; + uint8_t unpacked[4]; + boolean first = TRUE; + boolean success = TRUE; + unsigned i, j, k, l; + + gallivm = gallivm_create("test_module_unorm8", LLVMGetGlobalContext()); + + fetch = add_fetch_rgba_test(gallivm, verbose, desc, lp_unorm8_vec4_type()); + + gallivm_compile_module(gallivm); + + fetch_ptr = (fetch_ptr_t) gallivm_jit_function(gallivm, fetch); + + gallivm_free_ir(gallivm); + + for (l = 0; l < util_format_nr_test_cases; ++l) { + const struct util_format_test_case *test = &util_format_test_cases[l]; + + if (test->format == desc->format) { + + if (first) { + printf("Testing %s (unorm8) ...\n", + 
desc->name); + first = FALSE; + } + + /* To ensure it's 16-byte aligned */ + /* Could skip this and use unaligned lp_build_fetch_rgba_aos */ + memcpy(packed, test->packed, sizeof packed); + + for (i = 0; i < desc->block.height; ++i) { + for (j = 0; j < desc->block.width; ++j) { + boolean match; + + memset(unpacked, 0, sizeof unpacked); + + fetch_ptr(unpacked, packed, j, i); + + match = TRUE; + for(k = 0; k < 4; ++k) { + int error = float_to_ubyte(test->unpacked[i][j][k]) - unpacked[k]; + + if (util_is_double_nan(test->unpacked[i][j][k])) + continue; + + if (error < 0) + error = -error; + + if (error > 1) + match = FALSE; + } + + if (!match) { + printf("FAILED\n"); + printf(" Packed: %02x %02x %02x %02x\n", + test->packed[0], test->packed[1], test->packed[2], test->packed[3]); + printf(" Unpacked (%u,%u): %02x %02x %02x %02x obtained\n", + j, i, + unpacked[0], unpacked[1], unpacked[2], unpacked[3]); + printf(" %02x %02x %02x %02x expected\n", + float_to_ubyte(test->unpacked[i][j][0]), + float_to_ubyte(test->unpacked[i][j][1]), + float_to_ubyte(test->unpacked[i][j][2]), + float_to_ubyte(test->unpacked[i][j][3])); + + success = FALSE; + } + } + } + } + } + + gallivm_destroy(gallivm); + + if(fp) + write_tsv_row(fp, desc, success); + + return success; +} + + + + +static boolean +test_one(unsigned verbose, FILE *fp, + const struct util_format_description *format_desc) +{ + boolean success = TRUE; + + if (!test_format_float(verbose, fp, format_desc)) { + success = FALSE; + } + + if (!test_format_unorm8(verbose, fp, format_desc)) { + success = FALSE; + } + + return success; +} + + +boolean +test_all(unsigned verbose, FILE *fp) +{ + enum pipe_format format; + boolean success = TRUE; + + util_format_s3tc_init(); + + for (format = 1; format < PIPE_FORMAT_COUNT; ++format) { + const struct util_format_description *format_desc; + + format_desc = util_format_description(format); + if (!format_desc) { + continue; + } + + + /* + * TODO: test more + */ + + if 
(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + continue; + } + + if (util_format_is_pure_integer(format)) + continue; + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && + !util_format_s3tc_enabled) { + continue; + } + + if (!test_one(verbose, fp, format_desc)) { + success = FALSE; + } + } + + return success; +} + + +boolean +test_some(unsigned verbose, FILE *fp, + unsigned long n) +{ + return test_all(verbose, fp); +} + + +boolean +test_single(unsigned verbose, FILE *fp) +{ + printf("no test_single()"); + return TRUE; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_main.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_main.c new file mode 100644 index 000000000..d835dbbd6 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_main.c @@ -0,0 +1,418 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * Shared testing code. + * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + + +#include "util/u_cpu_detect.h" +#include "util/u_math.h" + +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_debug.h" +#include "lp_test.h" + + +void +dump_type(FILE *fp, + struct lp_type type) +{ + fprintf(fp, "%s%s%u%sx%u", + type.sign ? (type.floating || type.fixed ? "" : "s") : "u", + type.floating ? "f" : (type.fixed ? "h" : "i"), + type.width, + type.norm ? "n" : "", + type.length); +} + + +double +read_elem(struct lp_type type, const void *src, unsigned index) +{ + double scale = lp_const_scale(type); + double value; + assert(index < type.length); + if (type.floating) { + switch(type.width) { + case 32: + value = *((const float *)src + index); + break; + case 64: + value = *((const double *)src + index); + break; + default: + assert(0); + return 0.0; + } + } + else { + if(type.sign) { + switch(type.width) { + case 8: + value = *((const int8_t *)src + index); + break; + case 16: + value = *((const int16_t *)src + index); + break; + case 32: + value = *((const int32_t *)src + index); + break; + case 64: + value = *((const int64_t *)src + index); + break; + default: + assert(0); + return 0.0; + } + } + else { + switch(type.width) { + case 8: + value = *((const uint8_t *)src + index); + break; + case 16: + value = *((const uint16_t *)src + index); + break; + case 32: + value = *((const uint32_t *)src + index); + break; + case 64: + value = *((const uint64_t *)src + index); + break; + default: + assert(0); + return 0.0; + } + } + } + return value/scale; +} + + +void 
+write_elem(struct lp_type type, void *dst, unsigned index, double value) +{ + assert(index < type.length); + if(!type.sign && value < 0.0) + value = 0.0; + if(type.norm && value < -1.0) + value = -1.0; + if(type.norm && value > 1.0) + value = 1.0; + if (type.floating) { + switch(type.width) { + case 32: + *((float *)dst + index) = (float)(value); + break; + case 64: + *((double *)dst + index) = value; + break; + default: + assert(0); + } + } + else { + double scale = lp_const_scale(type); + value = round(value*scale); + if(type.sign) { + long long lvalue = (long long)value; + lvalue = MIN2(lvalue, ((long long)1 << (type.width - 1)) - 1); + switch(type.width) { + case 8: + *((int8_t *)dst + index) = (int8_t)lvalue; + break; + case 16: + *((int16_t *)dst + index) = (int16_t)lvalue; + break; + case 32: + *((int32_t *)dst + index) = (int32_t)lvalue; + break; + case 64: + *((int64_t *)dst + index) = (int64_t)lvalue; + break; + default: + assert(0); + } + } + else { + unsigned long long lvalue = (long long)value; + lvalue = MIN2(lvalue, ((unsigned long long)1 << type.width) - 1); + switch(type.width) { + case 8: + *((uint8_t *)dst + index) = (uint8_t)lvalue; + break; + case 16: + *((uint16_t *)dst + index) = (uint16_t)lvalue; + break; + case 32: + *((uint32_t *)dst + index) = (uint32_t)lvalue; + break; + case 64: + *((uint64_t *)dst + index) = (uint64_t)lvalue; + break; + default: + assert(0); + } + } + } +} + + +void +random_elem(struct lp_type type, void *dst, unsigned index) +{ + double value; + assert(index < type.length); + value = (double)rand()/(double)RAND_MAX; + if(!type.norm) { + if (type.floating) { + value *= 2.0; + } + else { + unsigned long long mask; + if (type.fixed) + mask = ((unsigned long long)1 << (type.width / 2)) - 1; + else if (type.sign) + mask = ((unsigned long long)1 << (type.width - 1)) - 1; + else + mask = ((unsigned long long)1 << type.width) - 1; + value += (double)(mask & rand()); + } + } + if(!type.sign) + if(rand() & 1) + value = -value; 
+ write_elem(type, dst, index, value); +} + + +void +read_vec(struct lp_type type, const void *src, double *dst) +{ + unsigned i; + for (i = 0; i < type.length; ++i) + dst[i] = read_elem(type, src, i); +} + + +void +write_vec(struct lp_type type, void *dst, const double *src) +{ + unsigned i; + for (i = 0; i < type.length; ++i) + write_elem(type, dst, i, src[i]); +} + + +float +random_float(void) +{ + return (float)((double)rand()/(double)RAND_MAX); +} + + +void +random_vec(struct lp_type type, void *dst) +{ + unsigned i; + for (i = 0; i < type.length; ++i) + random_elem(type, dst, i); +} + + +boolean +compare_vec_with_eps(struct lp_type type, const void *res, const void *ref, double eps) +{ + unsigned i; + eps *= type.floating ? 8.0 : 2.0; + for (i = 0; i < type.length; ++i) { + double res_elem = read_elem(type, res, i); + double ref_elem = read_elem(type, ref, i); + double delta = res_elem - ref_elem; + if (ref_elem < -1.0 || ref_elem > 1.0) { + delta /= ref_elem; + } + delta = fabs(delta); + if (delta >= eps) { + return FALSE; + } + } + + return TRUE; +} + + +boolean +compare_vec(struct lp_type type, const void *res, const void *ref) +{ + double eps = lp_const_eps(type); + return compare_vec_with_eps(type, res, ref, eps); +} + + +void +dump_vec(FILE *fp, struct lp_type type, const void *src) +{ + unsigned i; + for (i = 0; i < type.length; ++i) { + if(i) + fprintf(fp, " "); + if (type.floating) { + double value; + switch(type.width) { + case 32: + value = *((const float *)src + i); + break; + case 64: + value = *((const double *)src + i); + break; + default: + assert(0); + value = 0.0; + } + fprintf(fp, "%f", value); + } + else { + if(type.sign && !type.norm) { + long long value; + const char *format; + switch(type.width) { + case 8: + value = *((const int8_t *)src + i); + format = "%3lli"; + break; + case 16: + value = *((const int16_t *)src + i); + format = "%5lli"; + break; + case 32: + value = *((const int32_t *)src + i); + format = "%10lli"; + break; + case 
64: + value = *((const int64_t *)src + i); + format = "%20lli"; + break; + default: + assert(0); + value = 0.0; + format = "?"; + } + fprintf(fp, format, value); + } + else { + unsigned long long value; + const char *format; + switch(type.width) { + case 8: + value = *((const uint8_t *)src + i); + format = type.norm ? "%2x" : "%4llu"; + break; + case 16: + value = *((const uint16_t *)src + i); + format = type.norm ? "%4x" : "%6llx"; + break; + case 32: + value = *((const uint32_t *)src + i); + format = type.norm ? "%8x" : "%11llx"; + break; + case 64: + value = *((const uint64_t *)src + i); + format = type.norm ? "%16x" : "%21llx"; + break; + default: + assert(0); + value = 0.0; + format = "?"; + } + fprintf(fp, format, value); + } + } + } +} + + +int main(int argc, char **argv) +{ + unsigned verbose = 0; + FILE *fp = NULL; + unsigned long n = 1000; + unsigned i; + boolean success; + boolean single = FALSE; + unsigned fpstate; + + util_cpu_detect(); + fpstate = util_fpstate_get(); + util_fpstate_set_denorms_to_zero(fpstate); + + if (!lp_build_init()) + return 1; + + for(i = 1; i < argc; ++i) { + if(strcmp(argv[i], "-v") == 0) + ++verbose; + else if(strcmp(argv[i], "-s") == 0) + single = TRUE; + else if(strcmp(argv[i], "-o") == 0) + fp = fopen(argv[++i], "wt"); + else + n = atoi(argv[i]); + } + +#ifdef DEBUG + if (verbose >= 2) { + gallivm_debug |= GALLIVM_DEBUG_IR; + gallivm_debug |= GALLIVM_DEBUG_ASM; + } +#endif + + if(fp) { + /* Warm up the caches */ + test_some(0, NULL, 100); + + write_tsv_header(fp); + } + + if (single) + success = test_single(verbose, fp); + else if (n) + success = test_some(verbose, fp, n); + else + success = test_all(verbose, fp); + + if(fp) + fclose(fp); + + return success ? 
0 : 1; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_printf.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_printf.c new file mode 100644 index 000000000..fe4ce0fc5 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_test_printf.c @@ -0,0 +1,139 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include <stdlib.h> +#include <stdio.h> + +#include "util/u_pointer.h" +#include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_assert.h" +#include "gallivm/lp_bld_printf.h" + +#include "lp_test.h" + + +struct printf_test_case { + int foo; +}; + +void +write_tsv_header(FILE *fp) +{ + fprintf(fp, + "result\t" + "format\n"); + + fflush(fp); +} + + + +typedef void (*test_printf_t)(int i); + + +static LLVMValueRef +add_printf_test(struct gallivm_state *gallivm) +{ + LLVMModuleRef module = gallivm->module; + LLVMTypeRef args[1] = { LLVMIntTypeInContext(gallivm->context, 32) }; + LLVMValueRef func = LLVMAddFunction(module, "test_printf", LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context), args, 1, 0)); + LLVMBuilderRef builder = gallivm->builder; + LLVMBasicBlockRef block = LLVMAppendBasicBlockInContext(gallivm->context, func, "entry"); + + LLVMSetFunctionCallConv(func, LLVMCCallConv); + + LLVMPositionBuilderAtEnd(builder, block); + lp_build_printf(gallivm, "hello, world\n"); + lp_build_printf(gallivm, "print 5 6: %d %d\n", LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), 5, 0), + LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), 6, 0)); + + /* Also test lp_build_assert(). This should not fail. 
*/ + lp_build_assert(gallivm, LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), 1, 0), "assert(1)"); + + LLVMBuildRetVoid(builder); + + gallivm_verify_function(gallivm, func); + + return func; +} + + +PIPE_ALIGN_STACK +static boolean +test_printf(unsigned verbose, FILE *fp, + const struct printf_test_case *testcase) +{ + struct gallivm_state *gallivm; + LLVMValueRef test; + test_printf_t test_printf_func; + boolean success = TRUE; + + gallivm = gallivm_create("test_module", LLVMGetGlobalContext()); + + test = add_printf_test(gallivm); + + gallivm_compile_module(gallivm); + + test_printf_func = (test_printf_t) gallivm_jit_function(gallivm, test); + + gallivm_free_ir(gallivm); + + test_printf_func(0); + + gallivm_destroy(gallivm); + + return success; +} + + +boolean +test_all(unsigned verbose, FILE *fp) +{ + boolean success = TRUE; + + test_printf(verbose, fp, NULL); + + return success; +} + + +boolean +test_some(unsigned verbose, FILE *fp, + unsigned long n) +{ + return test_all(verbose, fp); +} + + +boolean +test_single(unsigned verbose, FILE *fp) +{ + printf("no test_single()"); + return TRUE; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_tex_sample.c new file mode 100644 index 000000000..316d1c550 --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_tex_sample.c @@ -0,0 +1,321 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Texture sampling code generation + * + * This file is nothing more than ugly glue between three largely independent + * entities: + * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa) + * - texture sampling code generation (i.e., lp_build_sample_soa) + * - LLVM pipe driver + * + * All interesting code is in the functions mentioned above. There is really + * nothing to see here. 
+ * + * @author Jose Fonseca <jfonseca@vmware.com> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_sample.h" +#include "gallivm/lp_bld_tgsi.h" +#include "lp_jit.h" +#include "lp_tex_sample.h" +#include "lp_state_fs.h" +#include "lp_debug.h" + + +/** + * This provides the bridge between the sampler state store in + * lp_jit_context and lp_jit_texture and the sampler code + * generator. It provides the texture layout information required by + * the texture sampler code generator in terms of the state stored in + * lp_jit_context and lp_jit_texture in runtime. + */ +struct llvmpipe_sampler_dynamic_state +{ + struct lp_sampler_dynamic_state base; + + const struct lp_sampler_static_state *static_state; +}; + + +/** + * This is the bridge between our sampler and the TGSI translator. + */ +struct lp_llvm_sampler_soa +{ + struct lp_build_sampler_soa base; + + struct llvmpipe_sampler_dynamic_state dynamic_state; +}; + + +/** + * Fetch the specified member of the lp_jit_texture structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. 
+ * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +lp_llvm_texture_member(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef context_ptr, + unsigned texture_unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); + + /* context[0] */ + indices[0] = lp_build_const_int32(gallivm, 0); + /* context[0].textures */ + indices[1] = lp_build_const_int32(gallivm, LP_JIT_CTX_TEXTURES); + /* context[0].textures[unit] */ + indices[2] = lp_build_const_int32(gallivm, texture_unit); + /* context[0].textures[unit].member */ + indices[3] = lp_build_const_int32(gallivm, member_index); + + ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.texture%u.%s", texture_unit, member_name); + + return res; +} + + +/** + * Helper macro to instantiate the functions that generate the code to + * fetch the members of lp_jit_texture to fulfill the sampler code + * generator requests. + * + * This complexity is the price we have to pay to keep the texture + * sampler code generator a reusable module without dependencies to + * llvmpipe internals. 
+ */ +#define LP_LLVM_TEXTURE_MEMBER(_name, _index, _emit_load) \ + static LLVMValueRef \ + lp_llvm_texture_##_name( const struct lp_sampler_dynamic_state *base, \ + struct gallivm_state *gallivm, \ + LLVMValueRef context_ptr, \ + unsigned texture_unit) \ + { \ + return lp_llvm_texture_member(base, gallivm, context_ptr, \ + texture_unit, _index, #_name, _emit_load ); \ + } + + +LP_LLVM_TEXTURE_MEMBER(width, LP_JIT_TEXTURE_WIDTH, TRUE) +LP_LLVM_TEXTURE_MEMBER(height, LP_JIT_TEXTURE_HEIGHT, TRUE) +LP_LLVM_TEXTURE_MEMBER(depth, LP_JIT_TEXTURE_DEPTH, TRUE) +LP_LLVM_TEXTURE_MEMBER(first_level, LP_JIT_TEXTURE_FIRST_LEVEL, TRUE) +LP_LLVM_TEXTURE_MEMBER(last_level, LP_JIT_TEXTURE_LAST_LEVEL, TRUE) +LP_LLVM_TEXTURE_MEMBER(base_ptr, LP_JIT_TEXTURE_BASE, TRUE) +LP_LLVM_TEXTURE_MEMBER(row_stride, LP_JIT_TEXTURE_ROW_STRIDE, FALSE) +LP_LLVM_TEXTURE_MEMBER(img_stride, LP_JIT_TEXTURE_IMG_STRIDE, FALSE) +LP_LLVM_TEXTURE_MEMBER(mip_offsets, LP_JIT_TEXTURE_MIP_OFFSETS, FALSE) + + +/** + * Fetch the specified member of the lp_jit_sampler structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. 
+ * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +lp_llvm_sampler_member(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef context_ptr, + unsigned sampler_unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + assert(sampler_unit < PIPE_MAX_SAMPLERS); + + /* context[0] */ + indices[0] = lp_build_const_int32(gallivm, 0); + /* context[0].samplers */ + indices[1] = lp_build_const_int32(gallivm, LP_JIT_CTX_SAMPLERS); + /* context[0].samplers[unit] */ + indices[2] = lp_build_const_int32(gallivm, sampler_unit); + /* context[0].samplers[unit].member */ + indices[3] = lp_build_const_int32(gallivm, member_index); + + ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.sampler%u.%s", sampler_unit, member_name); + + return res; +} + + +#define LP_LLVM_SAMPLER_MEMBER(_name, _index, _emit_load) \ + static LLVMValueRef \ + lp_llvm_sampler_##_name( const struct lp_sampler_dynamic_state *base, \ + struct gallivm_state *gallivm, \ + LLVMValueRef context_ptr, \ + unsigned sampler_unit) \ + { \ + return lp_llvm_sampler_member(base, gallivm, context_ptr, \ + sampler_unit, _index, #_name, _emit_load ); \ + } + + +LP_LLVM_SAMPLER_MEMBER(min_lod, LP_JIT_SAMPLER_MIN_LOD, TRUE) +LP_LLVM_SAMPLER_MEMBER(max_lod, LP_JIT_SAMPLER_MAX_LOD, TRUE) +LP_LLVM_SAMPLER_MEMBER(lod_bias, LP_JIT_SAMPLER_LOD_BIAS, TRUE) +LP_LLVM_SAMPLER_MEMBER(border_color, LP_JIT_SAMPLER_BORDER_COLOR, FALSE) + + +static void +lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) +{ + FREE(sampler); +} + + +/** + * Fetch filtered values from texture. + * The 'texel' parameter returns four vectors corresponding to R, G, B, A. 
+ */ +static void +lp_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, + struct gallivm_state *gallivm, + const struct lp_sampler_params *params) +{ + struct lp_llvm_sampler_soa *sampler = (struct lp_llvm_sampler_soa *)base; + unsigned texture_index = params->texture_index; + unsigned sampler_index = params->sampler_index; + + assert(sampler_index < PIPE_MAX_SAMPLERS); + assert(texture_index < PIPE_MAX_SHADER_SAMPLER_VIEWS); + + if (LP_PERF & PERF_NO_TEX) { + lp_build_sample_nop(gallivm, params->type, params->coords, params->texel); + return; + } + + lp_build_sample_soa(&sampler->dynamic_state.static_state[texture_index].texture_state, + &sampler->dynamic_state.static_state[sampler_index].sampler_state, + &sampler->dynamic_state.base, + gallivm, params); +} + +/** + * Fetch the texture size. + */ +static void +lp_llvm_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, + struct gallivm_state *gallivm, + struct lp_type type, + unsigned texture_unit, + unsigned target, + LLVMValueRef context_ptr, + boolean is_sviewinfo, + enum lp_sampler_lod_property lod_property, + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef *sizes_out) +{ + struct lp_llvm_sampler_soa *sampler = (struct lp_llvm_sampler_soa *)base; + + assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); + + lp_build_size_query_soa(gallivm, + &sampler->dynamic_state.static_state[texture_unit].texture_state, + &sampler->dynamic_state.base, + type, + texture_unit, + target, + context_ptr, + is_sviewinfo, + lod_property, + explicit_lod, + sizes_out); +} + + +struct lp_build_sampler_soa * +lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state) +{ + struct lp_llvm_sampler_soa *sampler; + + sampler = CALLOC_STRUCT(lp_llvm_sampler_soa); + if(!sampler) + return NULL; + + sampler->base.destroy = lp_llvm_sampler_soa_destroy; + sampler->base.emit_tex_sample = lp_llvm_sampler_soa_emit_fetch_texel; + sampler->base.emit_size_query = 
lp_llvm_sampler_soa_emit_size_query; + sampler->dynamic_state.base.width = lp_llvm_texture_width; + sampler->dynamic_state.base.height = lp_llvm_texture_height; + sampler->dynamic_state.base.depth = lp_llvm_texture_depth; + sampler->dynamic_state.base.first_level = lp_llvm_texture_first_level; + sampler->dynamic_state.base.last_level = lp_llvm_texture_last_level; + sampler->dynamic_state.base.base_ptr = lp_llvm_texture_base_ptr; + sampler->dynamic_state.base.row_stride = lp_llvm_texture_row_stride; + sampler->dynamic_state.base.img_stride = lp_llvm_texture_img_stride; + sampler->dynamic_state.base.mip_offsets = lp_llvm_texture_mip_offsets; + sampler->dynamic_state.base.min_lod = lp_llvm_sampler_min_lod; + sampler->dynamic_state.base.max_lod = lp_llvm_sampler_max_lod; + sampler->dynamic_state.base.lod_bias = lp_llvm_sampler_lod_bias; + sampler->dynamic_state.base.border_color = lp_llvm_sampler_border_color; + + sampler->dynamic_state.static_state = static_state; + + return &sampler->base; +} + diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_tex_sample.h new file mode 100644 index 000000000..f4aff226c --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_tex_sample.h @@ -0,0 +1,46 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_TEX_SAMPLE_H +#define LP_TEX_SAMPLE_H + + +#include "gallivm/lp_bld.h" + + +struct lp_sampler_static_state; + + +/** + * Pure-LLVM texture sampling code generator. + * + */ +struct lp_build_sampler_soa * +lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key); + + +#endif /* LP_TEX_SAMPLE_H */ diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_texture.c b/lib/mesa/src/gallium/drivers/llvmpipe/lp_texture.c new file mode 100644 index 000000000..af46342fd --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_texture.c @@ -0,0 +1,815 @@ +/************************************************************************** + * + * Copyright 2006 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + /* + * Authors: + * Keith Whitwell <keithw@vmware.com> + * Michel Dänzer <daenzer@vmware.com> + */ + +#include <stdio.h> + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" + +#include "util/u_inlines.h" +#include "util/u_cpu_detect.h" +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/simple_list.h" +#include "util/u_transfer.h" + +#include "lp_context.h" +#include "lp_flush.h" +#include "lp_screen.h" +#include "lp_texture.h" +#include "lp_setup.h" +#include "lp_state.h" +#include "lp_rast.h" + +#include "state_tracker/sw_winsys.h" + + +#ifdef DEBUG +static struct llvmpipe_resource resource_list; +#endif +static unsigned id_counter = 0; + + +/** + * Conventional allocation path for non-display textures: + * Compute strides and allocate data (unless asked not to). + */ +static boolean +llvmpipe_texture_layout(struct llvmpipe_screen *screen, + struct llvmpipe_resource *lpr, + boolean allocate) +{ + struct pipe_resource *pt = &lpr->base; + unsigned level; + unsigned width = pt->width0; + unsigned height = pt->height0; + unsigned depth = pt->depth0; + uint64_t total_size = 0; + unsigned layers = pt->array_size; + /* XXX: + * This alignment here (same for displaytarget) was added for the purpose of + * ARB_map_buffer_alignment. I am not convinced it's needed for non-buffer + * resources. Otherwise we'd want the max of cacheline size and 16 (max size + * of a block for all formats) though this should not be strictly necessary + * neither. In any case it can only affect compressed or 1d textures. 
+ */ + unsigned mip_align = MAX2(64, util_cpu_caps.cacheline); + + assert(LP_MAX_TEXTURE_2D_LEVELS <= LP_MAX_TEXTURE_LEVELS); + assert(LP_MAX_TEXTURE_3D_LEVELS <= LP_MAX_TEXTURE_LEVELS); + + for (level = 0; level <= pt->last_level; level++) { + uint64_t mipsize; + unsigned align_x, align_y, nblocksx, nblocksy, block_size, num_slices; + + /* Row stride and image stride */ + + /* For non-compressed formats we need 4x4 pixel alignment + * so we can read/write LP_RASTER_BLOCK_SIZE when rendering to them. + * We also want cache line size in x direction, + * otherwise same cache line could end up in multiple threads. + * For explicit 1d resources however we reduce this to 4x1 and + * handle specially in render output code (as we need to do special + * handling there for buffers in any case). + */ + if (util_format_is_compressed(pt->format)) + align_x = align_y = 1; + else { + align_x = LP_RASTER_BLOCK_SIZE; + if (llvmpipe_resource_is_1d(&lpr->base)) + align_y = 1; + else + align_y = LP_RASTER_BLOCK_SIZE; + } + + nblocksx = util_format_get_nblocksx(pt->format, + align(width, align_x)); + nblocksy = util_format_get_nblocksy(pt->format, + align(height, align_y)); + block_size = util_format_get_blocksize(pt->format); + + if (util_format_is_compressed(pt->format)) + lpr->row_stride[level] = nblocksx * block_size; + else + lpr->row_stride[level] = align(nblocksx * block_size, util_cpu_caps.cacheline); + + /* if row_stride * height > LP_MAX_TEXTURE_SIZE */ + if ((uint64_t)lpr->row_stride[level] * nblocksy > LP_MAX_TEXTURE_SIZE) { + /* image too large */ + goto fail; + } + + lpr->img_stride[level] = lpr->row_stride[level] * nblocksy; + + /* Number of 3D image slices, cube faces or texture array layers */ + if (lpr->base.target == PIPE_TEXTURE_CUBE) { + assert(layers == 6); + } + + if (lpr->base.target == PIPE_TEXTURE_3D) + num_slices = depth; + else if (lpr->base.target == PIPE_TEXTURE_1D_ARRAY || + lpr->base.target == PIPE_TEXTURE_2D_ARRAY || + lpr->base.target == 
PIPE_TEXTURE_CUBE || + lpr->base.target == PIPE_TEXTURE_CUBE_ARRAY) + num_slices = layers; + else + num_slices = 1; + + /* if img_stride * num_slices_faces > LP_MAX_TEXTURE_SIZE */ + mipsize = (uint64_t)lpr->img_stride[level] * num_slices; + if (mipsize > LP_MAX_TEXTURE_SIZE) { + /* volume too large */ + goto fail; + } + + lpr->mip_offsets[level] = total_size; + + total_size += align((unsigned)mipsize, mip_align); + if (total_size > LP_MAX_TEXTURE_SIZE) { + goto fail; + } + + /* Compute size of next mipmap level */ + width = u_minify(width, 1); + height = u_minify(height, 1); + depth = u_minify(depth, 1); + } + + if (allocate) { + lpr->tex_data = align_malloc(total_size, mip_align); + if (!lpr->tex_data) { + return FALSE; + } + else { + memset(lpr->tex_data, 0, total_size); + } + } + + return TRUE; + +fail: + return FALSE; +} + + +/** + * Check the size of the texture specified by 'res'. + * \return TRUE if OK, FALSE if too large. + */ +static boolean +llvmpipe_can_create_resource(struct pipe_screen *screen, + const struct pipe_resource *res) +{ + struct llvmpipe_resource lpr; + memset(&lpr, 0, sizeof(lpr)); + lpr.base = *res; + return llvmpipe_texture_layout(llvmpipe_screen(screen), &lpr, false); +} + + +static boolean +llvmpipe_displaytarget_layout(struct llvmpipe_screen *screen, + struct llvmpipe_resource *lpr) +{ + struct sw_winsys *winsys = screen->winsys; + + /* Round up the surface size to a multiple of the tile size to + * avoid tile clipping. 
+ */ + const unsigned width = MAX2(1, align(lpr->base.width0, TILE_SIZE)); + const unsigned height = MAX2(1, align(lpr->base.height0, TILE_SIZE)); + + lpr->dt = winsys->displaytarget_create(winsys, + lpr->base.bind, + lpr->base.format, + width, height, + 64, + &lpr->row_stride[0] ); + + if (lpr->dt == NULL) + return FALSE; + + { + void *map = winsys->displaytarget_map(winsys, lpr->dt, + PIPE_TRANSFER_WRITE); + + if (map) + memset(map, 0, height * lpr->row_stride[0]); + + winsys->displaytarget_unmap(winsys, lpr->dt); + } + + return TRUE; +} + + +static struct pipe_resource * +llvmpipe_resource_create(struct pipe_screen *_screen, + const struct pipe_resource *templat) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(_screen); + struct llvmpipe_resource *lpr = CALLOC_STRUCT(llvmpipe_resource); + if (!lpr) + return NULL; + + lpr->base = *templat; + pipe_reference_init(&lpr->base.reference, 1); + lpr->base.screen = &screen->base; + + /* assert(lpr->base.bind); */ + + if (llvmpipe_resource_is_texture(&lpr->base)) { + if (lpr->base.bind & (PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED)) { + /* displayable surface */ + if (!llvmpipe_displaytarget_layout(screen, lpr)) + goto fail; + } + else { + /* texture map */ + if (!llvmpipe_texture_layout(screen, lpr, true)) + goto fail; + } + } + else { + /* other data (vertex buffer, const buffer, etc) */ + const uint bytes = templat->width0; + assert(util_format_get_blocksize(templat->format) == 1); + assert(templat->height0 == 1); + assert(templat->depth0 == 1); + assert(templat->last_level == 0); + /* + * Reserve some extra storage since if we'd render to a buffer we + * read/write always LP_RASTER_BLOCK_SIZE pixels, but the element + * offset doesn't need to be aligned to LP_RASTER_BLOCK_SIZE. 
+ */ + lpr->data = align_malloc(bytes + (LP_RASTER_BLOCK_SIZE - 1) * 4 * sizeof(float), 64); + + /* + * buffers don't really have stride but it's probably safer + * (for code doing same calculations for buffers and textures) + * to put something sane in there. + */ + lpr->row_stride[0] = bytes; + if (!lpr->data) + goto fail; + memset(lpr->data, 0, bytes); + } + + lpr->id = id_counter++; + +#ifdef DEBUG + insert_at_tail(&resource_list, lpr); +#endif + + return &lpr->base; + + fail: + FREE(lpr); + return NULL; +} + + +static void +llvmpipe_resource_destroy(struct pipe_screen *pscreen, + struct pipe_resource *pt) +{ + struct llvmpipe_screen *screen = llvmpipe_screen(pscreen); + struct llvmpipe_resource *lpr = llvmpipe_resource(pt); + + if (lpr->dt) { + /* display target */ + struct sw_winsys *winsys = screen->winsys; + winsys->displaytarget_destroy(winsys, lpr->dt); + } + else if (llvmpipe_resource_is_texture(pt)) { + /* free linear image data */ + if (lpr->tex_data) { + align_free(lpr->tex_data); + lpr->tex_data = NULL; + } + } + else if (!lpr->userBuffer) { + assert(lpr->data); + align_free(lpr->data); + } + +#ifdef DEBUG + if (lpr->next) + remove_from_list(lpr); +#endif + + FREE(lpr); +} + + +/** + * Map a resource for read/write. 
+ */
+void *
+llvmpipe_resource_map(struct pipe_resource *resource,
+                      unsigned level,
+                      unsigned layer,
+                      enum lp_texture_usage tex_usage)
+{
+   struct llvmpipe_resource *lpr = llvmpipe_resource(resource);
+
+   assert(level < LP_MAX_TEXTURE_LEVELS);
+   assert(layer < (u_minify(resource->depth0, level) + resource->array_size - 1));
+   assert(tex_usage == LP_TEX_USAGE_READ ||
+          tex_usage == LP_TEX_USAGE_READ_WRITE ||
+          tex_usage == LP_TEX_USAGE_WRITE_ALL);
+
+   if (lpr->dt) {
+      /* Display target: map the storage through the winsys. */
+      struct llvmpipe_screen *screen = llvmpipe_screen(resource->screen);
+      struct sw_winsys *winsys = screen->winsys;
+      const unsigned dt_usage = (tex_usage == LP_TEX_USAGE_READ)
+         ? PIPE_TRANSFER_READ : PIPE_TRANSFER_READ_WRITE;
+      uint8_t *map;
+
+      /* Display targets only have a single level/layer. */
+      assert(level == 0);
+      assert(layer == 0);
+
+      /* FIXME: keep map count? */
+      map = winsys->displaytarget_map(winsys, lpr->dt, dt_usage);
+
+      /* install this linear image in texture data structure */
+      lpr->tex_data = map;
+
+      return map;
+   }
+
+   if (llvmpipe_resource_is_texture(resource)) {
+      /* Regular texture: address the image/slice inside tex_data. */
+      return llvmpipe_get_texture_image_address(lpr, layer, level);
+   }
+
+   /* Non-texture resource (vertex/const buffer etc): plain data block. */
+   return lpr->data;
+}
+ */
+void
+llvmpipe_resource_unmap(struct pipe_resource *resource,
+                        unsigned level,
+                        unsigned layer)
+{
+   struct llvmpipe_resource *lpr = llvmpipe_resource(resource);
+
+   /* Only display targets need an explicit unmap; malloc-backed
+    * textures and buffers stay mapped for the resource's lifetime.
+    */
+   if (lpr->dt) {
+      /* display target */
+      struct llvmpipe_screen *lp_screen = llvmpipe_screen(resource->screen);
+      struct sw_winsys *winsys = lp_screen->winsys;
+
+      assert(level == 0);
+      assert(layer == 0);
+
+      winsys->displaytarget_unmap(winsys, lpr->dt);
+   }
+}
+
+
+/**
+ * Return the raw data pointer of a non-texture resource (buffer).
+ */
+void *
+llvmpipe_resource_data(struct pipe_resource *resource)
+{
+   struct llvmpipe_resource *lpr = llvmpipe_resource(resource);
+
+   assert(!llvmpipe_resource_is_texture(resource));
+
+   return lpr->data;
+}
+
+
+/**
+ * Wrap a winsys handle (shared displaytarget) in a new llvmpipe resource.
+ * Returns NULL on allocation or winsys failure.
+ */
+static struct pipe_resource *
+llvmpipe_resource_from_handle(struct pipe_screen *screen,
+                              const struct pipe_resource *template,
+                              struct winsys_handle *whandle)
+{
+   struct sw_winsys *winsys = llvmpipe_screen(screen)->winsys;
+   struct llvmpipe_resource *lpr;
+
+   /* XXX Seems like from_handled depth textures doesn't work that well */
+
+   lpr = CALLOC_STRUCT(llvmpipe_resource);
+   if (!lpr) {
+      goto no_lpr;
+   }
+
+   lpr->base = *template;
+   pipe_reference_init(&lpr->base.reference, 1);
+   lpr->base.screen = screen;
+
+   /*
+    * Looks like unaligned displaytargets work just fine,
+    * at least sampler/render ones.
+ */ +#if 0 + assert(lpr->base.width0 == width); + assert(lpr->base.height0 == height); +#endif + + lpr->dt = winsys->displaytarget_from_handle(winsys, + template, + whandle, + &lpr->row_stride[0]); + if (!lpr->dt) { + goto no_dt; + } + + lpr->id = id_counter++; + +#ifdef DEBUG + insert_at_tail(&resource_list, lpr); +#endif + + return &lpr->base; + +no_dt: + FREE(lpr); +no_lpr: + return NULL; +} + + +static boolean +llvmpipe_resource_get_handle(struct pipe_screen *screen, + struct pipe_resource *pt, + struct winsys_handle *whandle) +{ + struct sw_winsys *winsys = llvmpipe_screen(screen)->winsys; + struct llvmpipe_resource *lpr = llvmpipe_resource(pt); + + assert(lpr->dt); + if (!lpr->dt) + return FALSE; + + return winsys->displaytarget_get_handle(winsys, lpr->dt, whandle); +} + + +static void * +llvmpipe_transfer_map( struct pipe_context *pipe, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **transfer ) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); + struct llvmpipe_screen *screen = llvmpipe_screen(pipe->screen); + struct llvmpipe_resource *lpr = llvmpipe_resource(resource); + struct llvmpipe_transfer *lpt; + struct pipe_transfer *pt; + ubyte *map; + enum pipe_format format; + enum lp_texture_usage tex_usage; + const char *mode; + + assert(resource); + assert(level <= resource->last_level); + + /* + * Transfers, like other pipe operations, must happen in order, so flush the + * context if necessary. + */ + if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { + boolean read_only = !(usage & PIPE_TRANSFER_WRITE); + boolean do_not_block = !!(usage & PIPE_TRANSFER_DONTBLOCK); + if (!llvmpipe_flush_resource(pipe, resource, + level, + read_only, + TRUE, /* cpu_access */ + do_not_block, + __FUNCTION__)) { + /* + * It would have blocked, but state tracker requested no to. 
+ */ + assert(do_not_block); + return NULL; + } + } + + /* Check if we're mapping the current constant buffer */ + if ((usage & PIPE_TRANSFER_WRITE) && + (resource->bind & PIPE_BIND_CONSTANT_BUFFER)) { + unsigned i; + for (i = 0; i < Elements(llvmpipe->constants[PIPE_SHADER_FRAGMENT]); ++i) { + if (resource == llvmpipe->constants[PIPE_SHADER_FRAGMENT][i].buffer) { + /* constants may have changed */ + llvmpipe->dirty |= LP_NEW_CONSTANTS; + break; + } + } + } + + lpt = CALLOC_STRUCT(llvmpipe_transfer); + if (!lpt) + return NULL; + pt = &lpt->base; + pipe_resource_reference(&pt->resource, resource); + pt->box = *box; + pt->level = level; + pt->stride = lpr->row_stride[level]; + pt->layer_stride = lpr->img_stride[level]; + pt->usage = usage; + *transfer = pt; + + assert(level < LP_MAX_TEXTURE_LEVELS); + + /* + printf("tex_transfer_map(%d, %d %d x %d of %d x %d, usage %d )\n", + transfer->x, transfer->y, transfer->width, transfer->height, + transfer->texture->width0, + transfer->texture->height0, + transfer->usage); + */ + + if (usage == PIPE_TRANSFER_READ) { + tex_usage = LP_TEX_USAGE_READ; + mode = "read"; + } + else { + tex_usage = LP_TEX_USAGE_READ_WRITE; + mode = "read/write"; + } + + if (0) { + printf("transfer map tex %u mode %s\n", lpr->id, mode); + } + + format = lpr->base.format; + + map = llvmpipe_resource_map(resource, + level, + box->z, + tex_usage); + + + /* May want to do different things here depending on read/write nature + * of the map: + */ + if (usage & PIPE_TRANSFER_WRITE) { + /* Do something to notify sharing contexts of a texture change. 
+ */ + screen->timestamp++; + } + + map += + box->y / util_format_get_blockheight(format) * pt->stride + + box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format); + + return map; +} + + +static void +llvmpipe_transfer_unmap(struct pipe_context *pipe, + struct pipe_transfer *transfer) +{ + assert(transfer->resource); + + llvmpipe_resource_unmap(transfer->resource, + transfer->level, + transfer->box.z); + + /* Effectively do the texture_update work here - if texture images + * needed post-processing to put them into hardware layout, this is + * where it would happen. For llvmpipe, nothing to do. + */ + assert (transfer->resource); + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); +} + +unsigned int +llvmpipe_is_resource_referenced( struct pipe_context *pipe, + struct pipe_resource *presource, + unsigned level) +{ + struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe ); + + /* + * XXX checking only resources with the right bind flags + * is unsafe since with opengl state tracker we can end up + * with resources bound to places they weren't supposed to be + * (buffers bound as sampler views is one possibility here). + */ + if (!(presource->bind & (PIPE_BIND_DEPTH_STENCIL | + PIPE_BIND_RENDER_TARGET | + PIPE_BIND_SAMPLER_VIEW))) + return LP_UNREFERENCED; + + return lp_setup_is_resource_referenced(llvmpipe->setup, presource); +} + + +/** + * Returns the largest possible alignment for a format in llvmpipe + */ +unsigned +llvmpipe_get_format_alignment( enum pipe_format format ) +{ + const struct util_format_description *desc = util_format_description(format); + unsigned size = 0; + unsigned bytes; + unsigned i; + + for (i = 0; i < desc->nr_channels; ++i) { + size += desc->channel[i].size; + } + + bytes = size / 8; + + if (!util_is_power_of_two(bytes)) { + bytes /= desc->nr_channels; + } + + if (bytes % 2 || bytes < 1) { + return 1; + } else { + return bytes; + } +} + + +/** + * Create buffer which wraps user-space data. 
+ */
+struct pipe_resource *
+llvmpipe_user_buffer_create(struct pipe_screen *screen,
+                            void *ptr,
+                            unsigned bytes,
+                            unsigned bind_flags)
+{
+   struct llvmpipe_resource *buffer;
+
+   buffer = CALLOC_STRUCT(llvmpipe_resource);
+   if(!buffer)
+      return NULL;
+
+   pipe_reference_init(&buffer->base.reference, 1);
+   buffer->base.screen = screen;
+   buffer->base.format = PIPE_FORMAT_R8_UNORM; /* ?? */
+   buffer->base.bind = bind_flags;
+   buffer->base.usage = PIPE_USAGE_IMMUTABLE;
+   buffer->base.flags = 0;
+   buffer->base.width0 = bytes;
+   buffer->base.height0 = 1;
+   buffer->base.depth0 = 1;
+   buffer->base.array_size = 1;
+   /* userBuffer marks the data as caller-owned: resource_destroy
+    * will not free it (see llvmpipe_resource_destroy).
+    */
+   buffer->userBuffer = TRUE;
+   buffer->data = ptr;
+
+   return &buffer->base;
+}
+
+
+/**
+ * Compute size (in bytes) needed to store a texture image / mipmap level,
+ * for just one cube face, one array layer or one 3D texture slice
+ */
+static unsigned
+tex_image_face_size(const struct llvmpipe_resource *lpr, unsigned level)
+{
+   return lpr->img_stride[level];
+}
+
+
+/**
+ * Return pointer to a 2D texture image/face/slice.
+ * No tiled/linear conversion is done.
+ */
+ubyte *
+llvmpipe_get_texture_image_address(struct llvmpipe_resource *lpr,
+                                   unsigned face_slice, unsigned level)
+{
+   unsigned offset;
+
+   assert(llvmpipe_resource_is_texture(&lpr->base));
+
+   /* Start of the requested mip level, then step over preceding
+    * faces/layers/slices within that level.
+    */
+   offset = lpr->mip_offsets[level];
+
+   if (face_slice > 0)
+      offset += face_slice * tex_image_face_size(lpr, level);
+
+   return (ubyte *) lpr->tex_data + offset;
+}
+
+
+/**
+ * Return size of resource in bytes
+ */
+unsigned
+llvmpipe_resource_size(const struct pipe_resource *resource)
+{
+   const struct llvmpipe_resource *lpr = llvmpipe_resource_const(resource);
+   unsigned size = 0;
+
+   if (llvmpipe_resource_is_texture(resource)) {
+      /* Note this will always return 0 for displaytarget resources */
+      /* NOTE(review): total_alloc_size does not appear to be assigned
+       * anywhere in this file (llvmpipe_texture_layout computes total_size
+       * but never stores it), so this may report 0 for malloc'ed textures
+       * too — confirm against the rest of the driver.
+       */
+      size = lpr->total_alloc_size;
+   }
+   else {
+      size = resource->width0;
+   }
+   return size;
+}
+
+
+#ifdef DEBUG
+/**
+ * Debug helper: dump every live resource on the global tracking list.
+ */
+void
+llvmpipe_print_resources(void)
+{
+   struct llvmpipe_resource *lpr;
+   unsigned n = 0, total = 0;
+
+   debug_printf("LLVMPIPE: current resources:\n");
+   foreach(lpr, &resource_list) {
+      unsigned size = llvmpipe_resource_size(&lpr->base);
+      debug_printf("resource %u at %p, size %ux%ux%u: %u bytes, refcount %u\n",
+                   lpr->id, (void *) lpr,
+                   lpr->base.width0, lpr->base.height0, lpr->base.depth0,
+                   size, lpr->base.reference.count);
+      total += size;
+      n++;
+   }
+   debug_printf("LLVMPIPE: total size of %u resources: %u\n", n, total);
+}
+#endif
+
+
+/**
+ * Plug the resource entry points into the pipe_screen vtable.
+ */
+void
+llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
+{
+#ifdef DEBUG
+   /* init linked list for tracking resources */
+   {
+      static boolean first_call = TRUE;
+      if (first_call) {
+         memset(&resource_list, 0, sizeof(resource_list));
+         make_empty_list(&resource_list);
+         first_call = FALSE;
+      }
+   }
+#endif
+
+   screen->resource_create = llvmpipe_resource_create;
+   screen->resource_destroy = llvmpipe_resource_destroy;
+   screen->resource_from_handle = llvmpipe_resource_from_handle;
+   screen->resource_get_handle = llvmpipe_resource_get_handle;
+   screen->can_create_resource = llvmpipe_can_create_resource;
+}
+
+
+void
+llvmpipe_init_context_resource_funcs(struct pipe_context *pipe) +{ + pipe->transfer_map = llvmpipe_transfer_map; + pipe->transfer_unmap = llvmpipe_transfer_unmap; + + pipe->transfer_flush_region = u_default_transfer_flush_region; + pipe->transfer_inline_write = u_default_transfer_inline_write; +} diff --git a/lib/mesa/src/gallium/drivers/llvmpipe/lp_texture.h b/lib/mesa/src/gallium/drivers/llvmpipe/lp_texture.h new file mode 100644 index 000000000..3d315bb9a --- /dev/null +++ b/lib/mesa/src/gallium/drivers/llvmpipe/lp_texture.h @@ -0,0 +1,239 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef LP_TEXTURE_H +#define LP_TEXTURE_H + + +#include "pipe/p_state.h" +#include "util/u_debug.h" +#include "lp_limits.h" + + +enum lp_texture_usage +{ + LP_TEX_USAGE_READ = 100, + LP_TEX_USAGE_READ_WRITE, + LP_TEX_USAGE_WRITE_ALL +}; + + +struct pipe_context; +struct pipe_screen; +struct llvmpipe_context; + +struct sw_displaytarget; + + +/** + * llvmpipe subclass of pipe_resource. A texture, drawing surface, + * vertex buffer, const buffer, etc. + * Textures are stored differently than other types of objects such as + * vertex buffers and const buffers. + * The latter are simple malloc'd blocks of memory. + */ +struct llvmpipe_resource +{ + struct pipe_resource base; + + /** Row stride in bytes */ + unsigned row_stride[LP_MAX_TEXTURE_LEVELS]; + /** Image stride (for cube maps, array or 3D textures) in bytes */ + unsigned img_stride[LP_MAX_TEXTURE_LEVELS]; + /** Offset to start of mipmap level, in bytes */ + unsigned mip_offsets[LP_MAX_TEXTURE_LEVELS]; + /** allocated total size (for non-display target texture resources only) */ + unsigned total_alloc_size; + + /** + * Display target, for textures with the PIPE_BIND_DISPLAY_TARGET + * usage. + */ + struct sw_displaytarget *dt; + + /** + * Malloc'ed data for regular textures, or a mapping to dt above. + */ + void *tex_data; + + /** + * Data for non-texture resources. + */ + void *data; + + boolean userBuffer; /** Is this a user-space buffer? 
 */
+   unsigned timestamp;
+
+   unsigned id;  /**< temporary, for debugging */
+
+#ifdef DEBUG
+   /** for linked list */
+   struct llvmpipe_resource *prev, *next;
+#endif
+};
+
+
+/** llvmpipe subclass of pipe_transfer (map bookkeeping). */
+struct llvmpipe_transfer
+{
+   struct pipe_transfer base;
+
+   unsigned long offset;
+};
+
+
+/** cast wrappers */
+static inline struct llvmpipe_resource *
+llvmpipe_resource(struct pipe_resource *pt)
+{
+   return (struct llvmpipe_resource *) pt;
+}
+
+
+static inline const struct llvmpipe_resource *
+llvmpipe_resource_const(const struct pipe_resource *pt)
+{
+   return (const struct llvmpipe_resource *) pt;
+}
+
+
+static inline struct llvmpipe_transfer *
+llvmpipe_transfer(struct pipe_transfer *pt)
+{
+   return (struct llvmpipe_transfer *) pt;
+}
+
+
+void llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen);
+void llvmpipe_init_context_resource_funcs(struct pipe_context *pipe);
+
+
+/**
+ * Is this resource stored as a texture (mip levels / slices) rather
+ * than a plain linear buffer?
+ */
+static inline boolean
+llvmpipe_resource_is_texture(const struct pipe_resource *resource)
+{
+   switch (resource->target) {
+   case PIPE_BUFFER:
+      return FALSE;
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_3D:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return TRUE;
+   default:
+      assert(0);
+      return FALSE;
+   }
+}
+
+
+/**
+ * Is this a one-dimensional resource (buffer or 1D texture/array)?
+ */
+static inline boolean
+llvmpipe_resource_is_1d(const struct pipe_resource *resource)
+{
+   switch (resource->target) {
+   case PIPE_BUFFER:
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      return TRUE;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_3D:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return FALSE;
+   default:
+      assert(0);
+      return FALSE;
+   }
+}
+
+
+/** Byte stride between array layers / cube faces / 3D slices. */
+static inline unsigned
+llvmpipe_layer_stride(struct pipe_resource *resource,
+                      unsigned level)
+{
+   struct llvmpipe_resource *lpr = llvmpipe_resource(resource);
+   assert(level < LP_MAX_TEXTURE_2D_LEVELS);
+   return lpr->img_stride[level];
+}
+
+
+/** Byte stride between rows of a given mip level. */
+static inline unsigned
+llvmpipe_resource_stride(struct pipe_resource *resource,
+                         unsigned level)
+{
+   struct llvmpipe_resource *lpr = llvmpipe_resource(resource);
+   assert(level < LP_MAX_TEXTURE_2D_LEVELS);
+   return lpr->row_stride[level];
+}
+
+
+void *
+llvmpipe_resource_map(struct pipe_resource *resource,
+                      unsigned level,
+                      unsigned layer,
+                      enum lp_texture_usage tex_usage);
+
+void
+llvmpipe_resource_unmap(struct pipe_resource *resource,
+                        unsigned level,
+                        unsigned layer);
+
+
+void *
+llvmpipe_resource_data(struct pipe_resource *resource);
+
+
+unsigned
+llvmpipe_resource_size(const struct pipe_resource *resource);
+
+
+ubyte *
+llvmpipe_get_texture_image_address(struct llvmpipe_resource *lpr,
+                                   unsigned face_slice, unsigned level);
+
+
+extern void
+llvmpipe_print_resources(void);
+
+
+/* Bitmask results for llvmpipe_is_resource_referenced(). */
+#define LP_UNREFERENCED         0
+#define LP_REFERENCED_FOR_READ  (1 << 0)
+#define LP_REFERENCED_FOR_WRITE (1 << 1)
+
+unsigned int
+llvmpipe_is_resource_referenced( struct pipe_context *pipe,
+                                 struct pipe_resource *presource,
+                                 unsigned level);
+
+unsigned
+llvmpipe_get_format_alignment(enum pipe_format format);
+
+#endif /* LP_TEXTURE_H */