authorAnas Nashif <anas.nashif@intel.com>2013-02-11 07:30:29 -0800
committerAnas Nashif <anas.nashif@intel.com>2013-02-11 07:30:29 -0800
commitc4f30fa8253338176ec71f157200b8e2824c0f15 (patch)
tree6b7485eb6f028539ce3dcc40770ee35889eda025
parent1501461b978a770b6fc8883901d6c3d177661667 (diff)
downloadxf86-video-intel-c4f30fa8253338176ec71f157200b8e2824c0f15.tar.gz
xf86-video-intel-c4f30fa8253338176ec71f157200b8e2824c0f15.tar.bz2
xf86-video-intel-c4f30fa8253338176ec71f157200b8e2824c0f15.zip
Imported Upstream version 2.21.2
-rw-r--r--  ChangeLog | 4807
-rw-r--r--  Makefile.am | 1
-rw-r--r--  Makefile.in | 12
-rw-r--r--  NEWS | 290
-rw-r--r--  aclocal.m4 | 52
-rw-r--r--  config.h.in | 12
-rwxr-xr-x  configure | 296
-rw-r--r--  configure.ac | 74
-rw-r--r--  man/Makefile.in | 9
-rw-r--r--  man/intel.man | 18
-rw-r--r--  src/Makefile.in | 9
-rw-r--r--  src/compat-api.h | 54
-rw-r--r--  src/i965_3d.c | 4
-rw-r--r--  src/i965_render.c | 32
-rw-r--r--  src/i965_video.c | 6
-rw-r--r--  src/intel.h | 6
-rw-r--r--  src/intel_batchbuffer.c | 72
-rw-r--r--  src/intel_display.c | 30
-rw-r--r--  src/intel_dri.c | 83
-rw-r--r--  src/intel_driver.c | 51
-rw-r--r--  src/intel_driver.h | 11
-rw-r--r--  src/intel_hwmc.c | 6
-rw-r--r--  src/intel_memory.c | 6
-rw-r--r--  src/intel_module.c | 169
-rw-r--r--  src/intel_options.c | 6
-rw-r--r--  src/intel_options.h | 4
-rw-r--r--  src/intel_uxa.c | 24
-rw-r--r--  src/intel_video.c | 48
-rw-r--r--  src/legacy/Makefile.in | 9
-rw-r--r--  src/legacy/i810/Makefile.in | 9
-rw-r--r--  src/legacy/i810/xvmc/Makefile.in | 9
-rw-r--r--  src/render_program/Makefile.am | 16
-rw-r--r--  src/render_program/Makefile.in | 25
-rw-r--r--  src/sna/Makefile.am | 8
-rw-r--r--  src/sna/Makefile.in | 65
-rw-r--r--  src/sna/atomic.h | 89
-rw-r--r--  src/sna/brw/Makefile.in | 9
-rw-r--r--  src/sna/brw/brw_disasm.c | 43
-rw-r--r--  src/sna/brw/brw_eu.c | 2
-rw-r--r--  src/sna/brw/brw_eu.h | 4
-rw-r--r--  src/sna/brw/brw_eu_emit.c | 132
-rw-r--r--  src/sna/brw/brw_wm.c | 68
-rw-r--r--  src/sna/compiler.h | 2
-rw-r--r--  src/sna/fb/Makefile.in | 9
-rw-r--r--  src/sna/fb/fb.h | 15
-rw-r--r--  src/sna/fb/fbbitmap.c | 41
-rw-r--r--  src/sna/fb/fbblt.c | 6
-rw-r--r--  src/sna/fb/fbpict.c | 12
-rw-r--r--  src/sna/fb/fbpict.h | 20
-rw-r--r--  src/sna/fb/fbpoint.c | 4
-rw-r--r--  src/sna/fb/fbseg.c | 3
-rw-r--r--  src/sna/gen2_render.c | 346
-rw-r--r--  src/sna/gen3_render.c | 842
-rw-r--r--  src/sna/gen4_render.c | 1605
-rw-r--r--  src/sna/gen4_render.h | 73
-rw-r--r--  src/sna/gen4_source.c | 179
-rw-r--r--  src/sna/gen4_source.h | 22
-rw-r--r--  src/sna/gen4_vertex.c | 1543
-rw-r--r--  src/sna/gen4_vertex.h | 16
-rw-r--r--  src/sna/gen5_render.c | 1405
-rw-r--r--  src/sna/gen5_render.h | 80
-rw-r--r--  src/sna/gen6_render.c | 1316
-rw-r--r--  src/sna/gen7_render.c | 1289
-rw-r--r--  src/sna/kgem.c | 1584
-rw-r--r--  src/sna/kgem.h | 187
-rw-r--r--  src/sna/kgem_debug.c | 43
-rw-r--r--  src/sna/kgem_debug.h | 2
-rw-r--r--  src/sna/kgem_debug_gen5.c | 21
-rw-r--r--  src/sna/kgem_debug_gen6.c | 12
-rw-r--r--  src/sna/sna.h | 111
-rw-r--r--  src/sna/sna_accel.c | 1910
-rw-r--r--  src/sna/sna_blt.c | 439
-rw-r--r--  src/sna/sna_composite.c | 139
-rw-r--r--  src/sna/sna_damage.c | 7
-rw-r--r--  src/sna/sna_damage.h | 29
-rw-r--r--  src/sna/sna_display.c | 381
-rw-r--r--  src/sna/sna_dri.c | 1104
-rw-r--r--  src/sna/sna_driver.c | 217
-rw-r--r--  src/sna/sna_glyphs.c | 82
-rw-r--r--  src/sna/sna_gradient.c | 100
-rw-r--r--  src/sna/sna_io.c | 105
-rw-r--r--  src/sna/sna_render.c | 309
-rw-r--r--  src/sna/sna_render.h | 100
-rw-r--r--  src/sna/sna_render_inline.h | 93
-rw-r--r--  src/sna/sna_threads.c | 306
-rw-r--r--  src/sna/sna_tiling.c | 4
-rw-r--r--  src/sna/sna_trapezoids.c | 1555
-rw-r--r--  src/sna/sna_vertex.c | 37
-rw-r--r--  src/sna/sna_video.c | 217
-rw-r--r--  src/sna/sna_video.h | 6
-rw-r--r--  src/sna/sna_video_hwmc.c | 164
-rw-r--r--  src/sna/sna_video_hwmc.h | 26
-rw-r--r--  src/sna/sna_video_overlay.c | 21
-rw-r--r--  src/sna/sna_video_sprite.c | 82
-rw-r--r--  src/sna/sna_video_textured.c | 19
-rw-r--r--  src/xvmc/Makefile.am | 5
-rw-r--r--  src/xvmc/Makefile.in | 14
-rw-r--r--  src/xvmc/shader/Makefile.in | 9
-rw-r--r--  src/xvmc/shader/mc/Makefile.am | 4
-rw-r--r--  src/xvmc/shader/mc/Makefile.in | 13
-rw-r--r--  src/xvmc/shader/vld/Makefile.am | 4
-rw-r--r--  src/xvmc/shader/vld/Makefile.in | 13
-rw-r--r--  test/Makefile.am | 11
-rw-r--r--  test/Makefile.in | 74
-rw-r--r--  test/lowlevel-blt-bench.c | 135
-rwxr-xr-x  test/mkvsync.sh | 27
-rw-r--r--  test/test.h | 5
-rw-r--r--  test/test_display.c | 17
-rw-r--r--  uxa/Makefile.in | 9
-rw-r--r--  uxa/uxa-accel.c | 2
-rw-r--r--  uxa/uxa-render.c | 2
111 files changed, 16961 insertions, 8303 deletions
diff --git a/ChangeLog b/ChangeLog
index d33c6084c..ca65287f0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,4810 @@
+commit a241949c05f44792f51a5bd1e246a44693cb5b06
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Feb 10 14:20:59 2013 +0000
+
+ 2.21.2 release
+
+commit 0d75b19979b1ac14353765e2bb84c6a466129109
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Feb 10 15:47:53 2013 +0000
+
+ sna: Restore glyphs with xorg-1.12
+
+ That simple and innocuous build fix for xorg-1.13 bizarrely causes
+ missing glyphs with earlier Xorgs.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9fd0d8873a5a5c4f77904cab0b9909ca941b5dae
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Feb 10 14:29:29 2013 +0000
+
+ NEWS: fix bug url
+
+ The dangers of cutting and pasting from git log.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 617fadf3acf7bf75fb203c1e85fd0ddb98b3dbb9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Feb 10 14:20:59 2013 +0000
+
+ 2.21.1 release
+
+commit 3169a4e53cf39cc3d5c18ac6add909aa3a58de7e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Feb 10 11:57:14 2013 +0000
+
+ sna: Reorder some includes so that compat-api.h comes after the headers it wraps
+
+ Fixes the build in cases where the compat-api.h was defining macros to
+ subvert the real functions found in the xorg includes
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 71fbad64c5cfe6832a03815bece4c89d15253e1a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Feb 10 10:54:17 2013 +0000
+
+ configure: Fix typo in checking for libdrm_intel
+
+ The package name is libdrm_intel not libdrm_intel-1, an obvious
+ cut'n'paste error from testing for pixman-1.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3cbdfb54d1fcfed7745111e861e19b7bbac243cc
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Feb 9 19:15:20 2013 +0000
+
+ sna: Backport to squeeze - Xorg-1.6, pixman-0.16, libdrm-2.4.21
+
+ The principle change is to switch to the old Privates API and undo the
+ Region renames.
+
+ The downside is that this ignores the critical bugfixes made to the
+ xserver since xorg-1.6 - but I assume that whoever wants to run the
+ latest hardware on the old xservers is also backporting those stability
+ fixes...
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 42a6b25817985e22e7d462be87fbd97973d96a29
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Feb 9 15:30:58 2013 +0000
+
+ sna: Fix alignment of the base of partial buffers for pre-G33 chipsets
+
+ The older chipsets have much more restrictive alignment rules for the
+ base address of tiled but unfenced objects.
+
+ Bugzilla: https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1120108
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 80044e54634d0836694d5aa6f98ce22fe38d367f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Feb 9 09:57:26 2013 +0000
+
+ sna: Promote to GPU if only partially damaged on the CPU but busy on the GPU
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d18cb72a94fad0ee99ab361c21d643c927d29c35
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 8 22:31:19 2013 +0000
+
+ sna: Randomly perturb 'wedged' to hunt for faults
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ce9f0448367ea6a90490a28150bfdc0a76500129
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 8 16:01:54 2013 +0000
+
+ sna/gen6: Use GT2 settings for both GT2 and GT2+
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ae5399aaf9ef57d33e8fd957e8a96964897c09b3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 8 11:31:21 2013 +0000
+
+ sna: Force the fallback path for unaccelerated randr damage
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c69b4389abc324533a9a311c17a667bf8a1e1673
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Feb 7 22:54:37 2013 +0000
+
+ sna/gen4: Split the have_render flag into separate prefer_gpu hints
+
+ The idea is to implement more fine-grained checks as we may want
+ different heuristics for desktops with GT1s than for mobile GT2s, etc.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
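+ To illustrate the direction, a minimal sketch of such fine-grained hints
+ as a bitmask; the names and values below are illustrative assumptions,
+ not the driver's actual definitions:
+
+     /* illustrative only: per-operation GPU preference hints */
+     #define PREFER_GPU_NONE   0x0
+     #define PREFER_GPU_BLT    0x1  /* blitter acceptable */
+     #define PREFER_GPU_RENDER 0x2  /* 3D pipeline acceptable */
+     #define PREFER_GPU_SPANS  0x4  /* GPU span compositing acceptable */
+
+     /* e.g. a desktop GT1 might favour the blitter only, while a
+      * mobile GT2 could enable the full render pipeline as well */
+     unsigned prefer_gpu = PREFER_GPU_BLT | PREFER_GPU_RENDER;
+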
+commit bec99de812ce6a1bbc2c8e4cfd05f4f74c560ea6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 8 00:53:10 2013 +0000
+
+ sna: Remove the bogus assertions on buffer domains
+
+ Just a few lines earlier we already have the correct assertion that the
+ buffer was not in the GPU domain, so had these two been correct, they
+ would have still been redundant.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8d1d3c6e6102ff20fbff74ec6b3b2e94ee757015
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Feb 7 14:47:07 2013 +0000
+
+ sna: Fixup an invalid assertion
+
+ We may choose to operate inplace on a buffer last used by the CPU if we
+ are discarding all the existing damage.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c405dba367bdca51221bd2464213199783dc18fe
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Feb 7 13:41:42 2013 +0000
+
+ sna: Also assert that the GPU is not wedged before continuing a batch
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit feeff6fcefccdca5335fea55c2fdbf8a4004c175
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Feb 7 13:33:58 2013 +0000
+
+ sna: Force GTT readback if the GPU is wedged
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8a272971d5971a56f57dde00dceb082d0b142c8c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 17:59:10 2013 +0000
+
+ sna: Allow inplace uploads to utilise GTT on LLC machines
+
+ Rather than arbitrarily disable the fallback paths for LLC, allow it to
+ utilise any available GTT buffers for inplace uploads. The best
+ explanation so far is that with the streaming we are trashing the LLC.
+ On other machines, the difference is in the noise.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit bc8a2c30c4f6bb9ce751b6717a3a2feaea0d6d4b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Feb 7 10:42:58 2013 +0000
+
+ sna: Only try the SRC fixup into the buffer if it is CPU mapped
+
+ On one particular machine, this operation is behaving as if it is
+ reading back UC memory during the explicit write-only composite.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 889ed28f52bccdbc54692ea075f95f9635a8d58a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Feb 7 10:42:21 2013 +0000
+
+ sna: Correctly align used buffers to the following page boundary
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
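+ To illustrate the arithmetic, a minimal sketch of rounding a buffer's
+ used size up to the following page boundary; PAGE_SIZE and the helper
+ name here are assumptions for illustration, not the driver's code:
+
+     #include <stdint.h>
+
+     #define PAGE_SIZE 4096
+
+     /* align_to_page(1) == 4096, align_to_page(4096) == 4096 */
+     static uint32_t align_to_page(uint32_t used)
+     {
+             return (used + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+     }
+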
+commit 974b6a97d78dadf09be8a2c4f61020f15d80d558
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 17:02:27 2013 +0000
+
+ sna: Fallback to non-LLC paths after an allocation failure for an LLC buffer
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5c8084ef04cb0a7da064fb1e13c8ef7dae528b1b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 16:39:31 2013 +0000
+
+ intel: Be careful not to match UMS against future generations
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit be241fb25ed0a8d41a642ea811253207f88d0962
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 16:38:12 2013 +0000
+
+ sna: Free the handle after pwrite buffer allocation failure
+
+ Having just allocated the handle, we need to free it if we then fail to
+ allocate memory for the buffer.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
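+ The shape of the fix, sketched with the real DRM_IOCTL_GEM_CLOSE ioctl
+ from <drm.h>/<xf86drm.h>; the surrounding bo/handle/fd names are
+ hypothetical stand-ins for the driver's internals:
+
+     bo = malloc(sizeof(*bo));
+     if (bo == NULL) {
+             /* don't leak the freshly allocated handle */
+             struct drm_gem_close close;
+             memset(&close, 0, sizeof(close));
+             close.handle = handle;
+             (void)drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &close);
+             return NULL;
+     }
+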
+commit 4b3b25f0be33d3af3ccecfb3193fc2d365445fdf
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 16:37:21 2013 +0000
+
+ sna: Flush our caches if we fail to mmap an object
+
+ The likely cause for a mmap failure is that we hold too many objects
+ open or have exhausted our address space. In both cases, we need to trim
+ our caches before continuing.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
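+ A hedged sketch of the retry pattern described above; both helper names
+ are hypothetical stand-ins for the driver's internals:
+
+     ptr = bo_map(kgem, bo);          /* mmap the object */
+     if (ptr == NULL) {
+             /* likely out of open handles or address space:
+              * trim the bo caches and retry once */
+             cleanup_caches(kgem);
+             ptr = bo_map(kgem, bo);
+     }
+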
+commit daba1ae3e7f0532cc53d9a5178778dbaec203052
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 16:17:36 2013 +0000
+
+ sna: Correctly handle failure to CPU map a new allocation
+
+ If we fail to CPU map, we want to fall back to just using pwrite with
+ normal memory.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0adb0b5e1ebcf3ddfeddae99d96912ec4c090832
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 16:02:30 2013 +0000
+
+ sna: Handle mapped buffer allocation failure for LLC
+
+ The presumption was that if we had LLC we would have allocated the
+ buffer by that point - however, it was remotely possible to have fallen
+ through and so we need to handle those cases.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f4cff22afae598f41adf36cd149223d1f7dd6b6e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 15:15:36 2013 +0000
+
+ sna: Relax the buffer size assertion to only be larger than required
+
+ Not all paths request alloc pages; a few just request sufficient pages
+ for the original size. So we can only assert that the bo is at least as
+ large as required.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8bc593c732a2f1ccd1bdabc071c709a44222db61
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 15:11:00 2013 +0000
+
+ sna: Make sure we always replace io buffers before inserting into the cache
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5f72158919098dd5684d1c56d1ba643cc3be2c7d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 15:10:23 2013 +0000
+
+ configure: XvMC support is optional, so make failure to find xcb non-fatal
+
+commit cd6d8f9b9df02934ebfff76cb40410c8ce3887dd
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Feb 6 10:37:50 2013 +0000
+
+ xvmc: Add the complementary XCB_CFLAGS
+
+ After splitting the xvmc dependencies into xcb and non-xcb, we then also
+ have to add the xcb CFLAGS to build libIntelXvMC.la
+
+ Reported-by: Julien Cristau <jcristau@debian.org>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b96ee47ad97943c3dccd40d9570e29002dc3d85f
+Author: Paul Menzel <paulepanter@users.sourceforge.net>
+Date: Sun Feb 3 13:33:08 2013 +0100
+
+ configure.ac: Split out XCB libraries from `XVMCLIB` into `XCB`
+
+ Building the package under Debian Sid/unstable, `dh_shlibdeps` informs
+ that `libI810XvMC.so.1.0.0` does not need to be linked against
+ `libX11-xcb.so.1`, `libxcb-dri2.so.0`, `libxcb-util.so.0` or
+ `libxcb.so.1` [1].
+
+ $ debuild -b -us -uc
+ […]
+ make[1]: Entering directory `/src/xserver-xorg-video-intel'
+ dh_shlibdeps -- --warnings=6
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libX11-xcb.so.1 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libxcb-dri2.so.0 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libxcb-util.so.0 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libxcb.so.1 (it uses none of the library's symbols)
+ make[1]: Leaving directory `/src/xserver-xorg-video-intel'
+ […]
+
+ Moving `x11-xcb`, `xcb-dri2` and `xcb-aux` from `XVMCLIBS` into `XCB`
+ and adding `XCB_LIBS` only to the `LIBADD` variables of `libIntelXvMC`
+ makes the warnings go away and the libraries are still built without any
+ issues.
+
+ make[1]: Entering directory `/src/xserver-xorg-video-intel'
+ dh_shlibdeps -- --warnings=6
+ make[1]: Leaving directory `/src/xserver-xorg-video-intel'
+ dh_installdeb -O--builddirectory=build/
+ dh_xsf_substvars -O--builddirectory=build/
+ dh_gencontrol -O--builddirectory=build/
+ dpkg-gencontrol: warning: Depends field of package xserver-xorg-video-intel-dbg: unknown substitution variable ${shlibs:Depends}
+ dh_md5sums -O--builddirectory=build/
+ dh_builddeb -O--builddirectory=build/
+ dpkg-deb: building package 'xserver-xorg-video-intel' in '../xserver-xorg-video-intel_2.19.0-6.1_i386.deb'.
+ dpkg-deb: building package 'xserver-xorg-video-intel-dbg' in '../xserver-xorg-video-intel-dbg_2.19.0-6.1_i386.deb'.
+ dpkg-genchanges -b >../xserver-xorg-video-intel_2.19.0-6.1_i386.changes
+ dpkg-genchanges: binary-only upload - no source code included
+ dpkg-source --after-build xserver-xorg-video-intel
+ dpkg-buildpackage: binary package(s) to upload (no source included)
+ Now running lintian...
+ W: xserver-xorg-video-intel: hardening-no-relro usr/lib/libI810XvMC.so.1.0.0
+ W: xserver-xorg-video-intel: hardening-no-fortify-functions usr/lib/libI810XvMC.so.1.0.0
+ W: xserver-xorg-video-intel: hardening-no-relro usr/lib/libIntelXvMC.so.1.0.0
+ W: xserver-xorg-video-intel: hardening-no-fortify-functions usr/lib/libIntelXvMC.so.1.0.0
+ W: xserver-xorg-video-intel: hardening-no-relro usr/lib/xorg/modules/drivers/intel_drv.so
+ W: xserver-xorg-video-intel: hardening-no-fortify-functions usr/lib/xorg/modules/drivers/intel_drv.so
+ N: 1 tag overridden (1 warning)
+ Finished running lintian.
+
+ The modules were originally added with the following commit present
+ since tag 2.10.0.
+
+ commit 3e8f2eae3a586aa29be4858698e666e0ec778cea
+ Author: Eric Anholt <eric@anholt.net>
+ Date: Thu Oct 15 13:48:56 2009 -0700
+
+ XVMC: Use XCB DRI2 instead of cargo-culting our own copy of Xlib stuff. (v2)
+
+ [1] https://buildd.debian.org/status/fetch.php?pkg=xserver-xorg-video-intel&arch=i386&ver=2%3A2.19.0-6&stamp=1347825458
+
+ Signed-off-by: Paul Menzel <paulepanter@users.sourceforge.net>
+
+commit 93770c709aa7d3719b7c717040b16c8f82d5c207
+Author: Paul Menzel <paulepanter@users.sourceforge.net>
+Date: Tue Jan 22 10:47:22 2013 +0100
+
+ NEWS: Fix a typo: a*n* inadvertent
+
+commit a8cfddd280b5220f23565b21c91f3f7dd10bbe91
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Feb 5 22:06:03 2013 +0000
+
+ sna: Tidy buffer allocation size assertions
+
+ Rather than perilously update a local variable with the allocated size,
+ just use the size of the bo in the assertion that it is large enough to
+ satisfy the allocation request.
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 82dc91e8c24a1fbbf03dcf89a3955319b3399ea0
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Feb 5 21:50:43 2013 +0000
+
+ test: Add a very basic blt benchmark
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9c80a0337ec12b6baab5aab380503e672e925677
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Feb 5 14:56:10 2013 +0000
+
+ sna: ValleyView uses the same scanline registers as SandyBridge
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4c45e3fe456d211afc6ba69878b413a72ef5d0bf
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Feb 5 14:45:39 2013 +0000
+
+ intel: add more ValleyView PCI IDs
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c6101d9d71a86a579ff9771d456b234a38bd80b7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Feb 5 11:02:30 2013 +0000
+
+ man: Fix a typo s/debuging/debugging/
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f33c90f7ada238683433d05492434120d06ea1fc
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 1 19:34:56 2013 +0000
+
+ NEWS: Trivial typo s/utilile/utilise/
+
+commit 6346c844525c2b3a82c16fe10485b901a2b5ddbc
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Feb 5 10:17:45 2013 +0000
+
+ sna/gen4: Remove old single-thread SF w/a
+
+ The alternative of disabling GPU spans seems to be far more effective.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1565917f10d9fb3c7e2e7e273173c38c364b9861
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Feb 5 10:11:14 2013 +0000
+
+ sna/gen4: Disable non-rectilinear GPU span compositing
+
+ This seems to be the primary victim of the render corruption, so disable
+ until the root cause is fixed.
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=55500
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 37bc822190f36be7b021167ba4d306bbcd97957b
+Author: Damien Lespiau <damien.lespiau@intel.com>
+Date: Fri Jan 18 14:13:08 2013 +0000
+
+ build: Make generation of gen code depend on intel-gen4asm
+
+ This way, when a new intel-gen4asm is available (because one just hacked
+ on it and has installed a new version for instance) the shaders will be
+ recompiled. This helps catch regressions by testing that the latest
+ changes in the assembler haven't broken too many things.
+
+ Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 18f8d2291fbb53ac993b926c247ca981e1e5207b
+Author: Damien Lespiau <damien.lespiau@intel.com>
+Date: Fri Jan 18 14:13:07 2013 +0000
+
+ build: Use $(AM_V_GEN) to silence the assembly of gen programs
+
+ Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit eea535b7e2a35ec4dfa50550b674d4212676d2ee
+Author: Damien Lespiau <damien.lespiau@intel.com>
+Date: Fri Jan 18 14:13:06 2013 +0000
+
+ build: Make autoreconf honour ACLOCAL_FLAGS
+
+ When running autoreconf, it's possible to give flags to the underlying
+ aclocal by declaring a ACLOCAL_AMFLAGS variable in the top level
+ Makefile.am.
+
+ Putting ${ACLOCAL_FLAGS} there allows the user to set up an environment
+ variable before running autogen.sh to pull in the right directories
+ to look for m4 macros, say an up-to-date version of the xorg-util macros.
+
+ Signed-off-by: Damien Lespiau <damien.lespiau@intel.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9640640ab02d5de630e903116c1b104752f8b604
+Author: Paul Menzel <paulepanter@users.sourceforge.net>
+Date: Sat Feb 2 11:44:54 2013 +0100
+
+ configure.ac: Do not include `xext` and `xfixes` in `XVMCLIB`
+
+ Building the package under Debian Sid/unstable, `dh_shlibdeps` informs
+ that `libIntelXvMC.so.1.0.0` does not need to be linked against
+ `libXext.so.6` or `libXfixes.so.3` [1].
+
+ $ debuild -b -us -uc
+ […]
+ make[1]: Entering directory `/build/buildd-xserver-xorg-video-intel_2.19.0-6-i386-9thLfo/xserver-xorg-video-intel-2.19.0'
+ dh_shlibdeps -- --warnings=6
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libIntelXvMC.so.1.0.0 should not be linked against libXext.so.6 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libIntelXvMC.so.1.0.0 should not be linked against libXfixes.so.3 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libXext.so.6 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libXfixes.so.3 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libX11-xcb.so.1 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libxcb-dri2.so.0 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libxcb-util.so.0 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 should not be linked against libxcb.so.1 (it uses none of the library's symbols)
+ dpkg-shlibdeps: warning: package could avoid a useless dependency if debian/xserver-xorg-video-intel/usr/lib/libIntelXvMC.so.1.0.0 debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 were not linked against libXext.so.6 (they use none of the library's symbols)
+ dpkg-shlibdeps: warning: package could avoid a useless dependency if debian/xserver-xorg-video-intel/usr/lib/libIntelXvMC.so.1.0.0 debian/xserver-xorg-video-intel/usr/lib/libI810XvMC.so.1.0.0 were not linked against libXfixes.so.3 (they use none of the library's symbols)
+ make[1]: Leaving directory `/build/buildd-xserver-xorg-video-intel_2.19.0-6-i386-9thLfo/xserver-xorg-video-intel-2.19.0'
+ dh_installdeb -a -O--builddirectory=build/
+ […]
+
+ Not populating `XVMCLIB` with `xext` and `xfixes` makes the warning go
+ away and the libraries are still built without any issues.
+
+ [1] https://buildd.debian.org/status/fetch.php?pkg=xserver-xorg-video-intel&arch=i386&ver=2%3A2.19.0-6&stamp=1347825458
+
+ Signed-off-by: Paul Menzel <paulepanter@users.sourceforge.net>
+
+commit 9807bba950078d86a25b91064ecfebaa0ee459e3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 1 18:25:48 2013 +0000
+
+ sna: Drop bogus refcnt assertion during kgem_bo_retire()
+
+ As we may call it kgem_bo_sync(), during preparation of the upload
+ buffer which in turn may operate on an object straight out of the snoop
+ cache and hence not yet referenced (or in some cases, ever).
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a5561f13498066922b54af04cc71549322ce0e3b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 1 18:05:35 2013 +0000
+
+ sna: Do not add the INPLACE hint if we have the ASYNC hint set
+
+ If the caller is preparing to use the GPU to render into the CPU bo,
+ it will request an ASYNC migration. In those cases, we do not want to
+ substitute it with an INPLACE operation.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d3ff1cb9d7f788002337b1e6c4c81c58112b85b1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 1 13:46:33 2013 +0000
+
+ 2.21.0 release
+
+commit 008f8230a7c47f1249eb51e53b3abf158f2a42bf
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 1 01:54:52 2013 +0000
+
+ sna: Assert that if we have GPU damage we have a GPU bo
+
+ Scatter the asserts around the migration points to catch where this
+ invariant may be untrue.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit cf0576f87102b1535268691e7e29661b0f9ee73b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Feb 1 00:19:21 2013 +0000
+
+ sna/video: Correct computation of planar frame size
+
+ The total frame size is less than 3 times the subsampled chroma planes
+ due to the additional alignment bytes.
+
+ Bugzilla: https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1104180
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
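+ For a planar format such as YV12, the total is the sum of three
+ individually aligned planes rather than a simple multiple of one plane.
+ A sketch with illustrative alignment values, not the driver's exact
+ rules:
+
+     #define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))
+
+     uint32_t luma_pitch   = ALIGN(width, 4);      /* full-res Y plane */
+     uint32_t chroma_pitch = ALIGN(width / 2, 4);  /* half-res U and V */
+     uint32_t frame_size   = luma_pitch * height +
+                             2 * chroma_pitch * (height / 2);
+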
+commit 268285d9a64fc47fe81fe5bfbfbd1890dad53e1e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 21:57:41 2013 +0000
+
+ sna/gen3+: Flush vertex threads before touching global state
+
+ We need to be careful not just when finishing the current vbo to
+ synchronize with the sharing threads, but also before we emit the batch
+ state that no other thread will try and do the same.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1239e012ae6d4f00ce73f32d7244905a601170ea
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 19:18:17 2013 +0000
+
+ sna: Make sure the needs_flush is always accompanied by a tracking request
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9712f49fddc8be939f77c25fcb907873af44619f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 18:08:05 2013 +0000
+
+ sna: Remove stale assertion
+
+ Now the reset is meant to re-establish 'rq' if the bo was busy.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit cd7df0004cf6e423d2ae6c0cf83a84e0031161b4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 17:32:57 2013 +0000
+
+ sna: Pass width/height to composite for rotated displays
+
+ This is essential to handle displays that are too large to be rendered
+ normally via the 3D pipeline and so that the bounds of the fixup region
+ are known.
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=60124
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 38376b56cfe0dfc603bce48e37432622ef9a0135
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 17:29:10 2013 +0000
+
+ sna: Remember to move scanouts to the scanout cache after retiring
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0a08de1f02577aef0da289108270c1b35e5d9703
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 16:39:47 2013 +0000
+
+ sna: After removing the bo from a batch, check whether it is still busy
+
+ If we transfer a bo to the current batch, then subsequently discard it,
+ we lose the information about its current active state. Try to recover
+ this information, by querying the kernel and adding it to the flushing
+ list if necessary.
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
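+ The kernel query itself uses the real DRM_IOCTL_I915_GEM_BUSY ioctl from
+ <i915_drm.h>; the flushing-list bookkeeping below is a hypothetical
+ sketch of the driver's side:
+
+     struct drm_i915_gem_busy busy;
+
+     memset(&busy, 0, sizeof(busy));
+     busy.handle = bo->handle;
+     if (drmIoctl(fd, DRM_IOCTL_I915_GEM_BUSY, &busy) == 0 && busy.busy) {
+             /* still active on the GPU: track it for later retirement */
+             list_add(&bo->request, &kgem->flushing);   /* hypothetical */
+     }
+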
+commit fff0686342f8ec3b3f3510340e073defdf2fb73f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 12:40:21 2013 +0000
+
+ sna/traps: Thread the fallback rectilinear compositor
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 839542d219bd919c99398d514c1d194d18b78eff
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 12:08:52 2013 +0000
+
+ sna/traps: Allow inplace compositing for non-GPU buffers and rectilinear traps
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e329e04b10c88afb40f2fd8fdad5b24b9f7dfc15
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 11:33:37 2013 +0000
+
+ sna/traps: Translate the extents for the rasterization threads
+
+ The single-threaded code used the pre-computed width/height and only
+ required the origin from the bounds. However, the threads need to
+ allocate memory for themselves based on the computed bounds, and so it
+ helps if those bounds are then correct (rather than only the top-left
+ being in local space with the bottom-right in global coordinates).
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 033f75e5bd94e226e719f87ed4e0091845384679
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 01:38:01 2013 +0000
+
+ sna: Stage retirement through the flushing list
+
+ If the kernel replies that a bo is still busy, stage its retirement
+ through the flushing list to be certain that we never stall on a
+ subsequent write.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5f5711e62cc4c8ca15782376c4047174299e2db0
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 01:21:08 2013 +0000
+
+ sna: Disable dangerous assertions that depend upon external state
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 42529336fd92d39a5a5eceb07f2838d4be50fa8e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 31 00:58:51 2013 +0000
+
+ sna: Prevent falling back to swrast if source is on the GPU
+
+ Currently if the dst is wholly contained within the CPU, then we try to
+ continue to operate on the GPU. However, if we have FORCE_GPU set, it
+ means that one of the sources for the operation resides on the GPU, and
+ that would require a readback in order to perform the operation on the
+ CPU. Hence, if we try to use a CPU bo and fail, convert back to using
+ the GPU bo if forced.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c2d06c407e1c2cbbf3f7f6c4989710a799cd43d0
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 21:17:42 2013 +0000
+
+ sna: Improve DBG output for damaged slave outputs
+
+ After computing the intersection of the damage with the slave, give the
+ region extents.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8867aa6a46c33fd2abf3b3f0b1d6115bad6c8017
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 21:15:55 2013 +0000
+
+ sna/dri: Handle change of BackBuffer across a pending flip
+
+ If we encounter a delayed flip with a different back buffer than the
+ current, simply update the info rather than bug out.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a31fd03bd4c87c48dc3ca15e3082e29348224b8c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 17:26:28 2013 +0000
+
+ sna: Add a bunch of assertions to make sure we do not misplace scanouts
+
+ As scanouts are uncached, they need to be treated carefully and
+ decontaminated before being placed in the general cache. So double check
+ that no bo in those caches is still marked as a scanout.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6f1b862282ddb4545987fb9f0a45b528b7b7b5ee
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 15:44:53 2013 +0000
+
+ sna: Pass the correct WRITE hint when migrating for rendering into the CPU bo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5011ed2e729d46fe3cff5454e15a0fd16441f7e1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 15:44:22 2013 +0000
+
+ sna: Only discard the clear hint when writing inplace to the GPU pixmap
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6312f58014c0bb4afa56855be1e9becc3e3cc3d7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 15:43:05 2013 +0000
+
+ sna: Don't force a migration from CPU rendering for a DRI2 flushed pixmap
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 60a3b370aea0cf9ffb4947a73984c877b4695d4e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 15:41:51 2013 +0000
+
+ sna: Retire the bo after a set-domain(CPU,0)
+
+ Having relaxed the earlier assertion because the kernel is wrong, we can
+ now retire for READ-READ optimisations.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 78ad5a742f40c2311bfe90997aebedeb998464e5
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 15:40:06 2013 +0000
+
+ sna: Relax assertion that the kernel considers the bo idle when we call retire
+
+ All the callers have explicitly changed the domain upon the bo before
+ calling kgem_bo_retire(), so we still get the occasional sporadic
+ failure as kgem_busy() reports true. Kill the assertion for now.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 83bcd310d279758542e366348f808d7ca0f6d0bb
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 13:18:21 2013 +0000
+
+ sna: Prefer to use snooped buffers for readbacks
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 496f3ff04453524639a52a3b9dfcb8e198e5e597
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 12:21:33 2013 +0000
+
+ uxa: Harden against failures to submit batchbuffers
+
+ If we fail to submit a batchbuffer, the driver is broken and likely to
+ continue to fail to render. Give up, and fallback to swrast so that the
+ session remains usable.
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=59771
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 04d48fee713e7bbc9cdf4f09855f6663a4bdc59f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 11:46:20 2013 +0000
+
+ sna: Fix errors found from asserts in a66c5f9ed51e
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit bc8b191ef6f5030d17a3b6497d1fd7556756c1ff
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 09:04:10 2013 +0000
+
+ sna: Return early if the Drawable box exactly matches one CRTC
+
+ If we are trying to find the best coverage, then by definition if the
+ drawable is an exact match for one CRTC, we can stop looking.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit de28027ffc649920268ae6fdd64146f08310e8a4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 08:42:48 2013 +0000
+
+ sna/dri: Make sure we discard the existing mappings when swapping GPU bo
+
+ If the GPU bo is currently mapped to the Pixmap, we need to be sure to
+ invalidate that mapping if we swap the GPU bo (for SwapBuffers). If we
+ forget, we leave a dangling pointer to chase.
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=60042
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit cf9b9ac3186299ab2418c55e73e19c81e5f615a4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 08:40:53 2013 +0000
+
+ sna: Only discard the mapping prior to the actual read when uploading
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a66c5f9ed51e1dcfc2ab03339795b73617629196
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 30 08:22:00 2013 +0000
+
+ sna: Before replacing the devPrivate.ptr assert it is not already mapped
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3fdd28419adee7145d3925cff2704143a324e9d3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 29 22:26:15 2013 +0000
+
+ sna: Only migrate the sample box if using the BLT engine for a composite
+
+ Modify the presumption that if we are using a core operation on a shadow
+ pixmap, then we are likely to continue migrating that pixmap back and
+ forth.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0c3b0f11d718d915e502582e9fadd5c0577640db
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 29 22:24:30 2013 +0000
+
+ sna: Verify that we always add the SHM CPU bo to the flush list when using
+
+ As we need to synchronize that bo before the next reply, we need to keep
+ track of it whenever it is active on the GPU.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f743cd5734ca502aa8bdb0e1327fe84d6ce82755
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 29 18:04:40 2013 +0000
+
+ sna: Avoid promoting SHM CPU bo to GPU to maintain coherence with SHM clients
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9383c5efe9ace34970abddc5e3c84c32505b537f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 29 17:24:24 2013 +0000
+
+ sna/gen3+: Fix a DBG for composite_boxes()
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b02a1ea5573b6f0b58a037dd4788c04c296f7ff3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 29 09:28:33 2013 +0000
+
+ sna: Add GT1/GT2 thread counts for Haswell
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1dc2d9ede5c7f330ebadf85d987559c8a6cb1c6b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 28 23:14:57 2013 +0000
+
+ sna: Add some more paranoia that we correctly map before fallbacks
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 63c71bcd96202e6da44d6776d119a82f0c06d386
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 27 23:17:13 2013 +0000
+
+ sna: Fix typo in vertex count for threaded source span emitter
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b0d26ca9312695d05c29503a3f892e7f2c5816dd
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 27 21:07:03 2013 +0000
+
+ sna: Replace the forced vertex finish with just a wait
+
+ When completing a batch mid-operation, we need to wait upon the other
+ threads to complete their writes so that memory is coherent before
+ submitting the work to the GPU. This was achieved by forcing the finish,
+ but all we need from that is the wait; making the wait explicit makes
+ the handling of threads clearer and removes the unnecessary vbo refresh.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b0c3170c1092d01b4937f352a3962854785ee549
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 27 19:09:38 2013 +0000
+
+ sna: Add the pixmap to the flushing list when creating for inplace CPU writes
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 73f574945f2cac14f9bafa6395e2c4dbb16fcf5d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 27 16:02:52 2013 +0000
+
+ sna: Disable all signals in the render threads
+
+ X uses them (SIGIO especially) for input handling, and gets rightfully
+ confused if it finds itself in a different thread.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9a7bf70365980809d0f02190f2f620a957ff1ba8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Jan 26 23:03:33 2013 +0000
+
+ sna: Enable threaded rasterisation for non-antialiased geometry
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8178cff5718e69e14d3953a7f754d7585a06838f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Jan 26 14:41:04 2013 +0000
+
+ sna: Begin sketching out a threaded rasteriser for spans
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8ffb3f50b3b4601401da76e2848e059ab63231f4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 25 10:45:39 2013 +0000
+
+ sna: Spawn threads to rasterize trapezoids through pixman
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0ec2f3a8bac96acc55c8fdb432b97d026abaafb4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 24 23:10:39 2013 +0000
+
+ sna: Spawn threads to composite trapezoids inplace
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 427b7311fe1b66d54518bae45e9fa149bda8a6e8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 24 22:25:46 2013 +0000
+
+ sna: Perform the last threaded composite operation directly
+
+ The point of the refactor was to execute the last stage of the composite
+ in the master thread, so do so.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 326dcd75f2202b1af29e986f5efb6b1e133217cb
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 24 20:58:53 2013 +0000
+
+ sna: Parse cpuinfo to determine the actual number of physical cores/caches
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
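+ A rough, Linux-specific sketch of the idea: read the "cpu cores" field
+ of /proc/cpuinfo (real code would also de-duplicate physical ids and
+ inspect the cache topology); the helper name is illustrative:
+
+     #include <stdio.h>
+
+     static int physical_cores(void)
+     {
+             FILE *file = fopen("/proc/cpuinfo", "r");
+             char line[256];
+             int cores = 0;
+
+             if (file) {
+                     while (fgets(line, sizeof(line), file))
+                             if (sscanf(line, "cpu cores : %d", &cores) == 1)
+                                     break;
+                     fclose(file);
+             }
+             return cores > 0 ? cores : 1;   /* fall back to 1 core */
+     }
+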
+commit f597b647180c1e7bf83693060f244926191b7462
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 24 18:45:35 2013 +0000
+
+ sna: Tidy construction of data for threaded composite
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1643c97f8f7b49738b649b5f7d1e574d689d167e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 24 18:24:02 2013 +0000
+
+ sna: Use threads for simple mask generation
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d60128c55e8f5f69476d42c20f2fd62ccc0f411e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 24 15:41:29 2013 +0000
+
+ sna/dri: Compensate clipExtents for drawable offset
+
+ The clipExtents is in screen coordinates whereas we just want to confirm
+ that the maximum pixel to be copied lies within the DRI2 buffer, which is
+ relative to the drawable.
+
+ Reported-by: Matthieu Baerts <matttbe@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=59806
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 264b3b72500c5af74d124a214347d45c9cb90a1d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 24 15:06:12 2013 +0000
+
+ sna: Refactor to use a common fbComposite fallback
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8ecfbea9d1f83b2de62bee0f58299e7a90c741d1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 24 14:46:03 2013 +0000
+
+ sna: Experiment with a threaded renderer for fallback compositing
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 778dba90cfc4e801a975bd661c56a565ce60524b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 23 21:32:29 2013 +0000
+
+ sna/dri: Don't contribute missed frames to the target_msc
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 50b41cb485ffb38e6bf705a3a62840bb78af669b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 23 21:16:49 2013 +0000
+
+ sna/dri: Only reject DRI2 buffers that are too small for the request blit
+
+ The goal is to reject stale DRI2 buffers that are smaller than the
+ target due to not-yet-handled ConfigureNotify, but not to reject
+ blitting from windows that are larger than the frontbuffer.
+
+ Fixes a regression from the overzealous
+ commit b27ecf3059bc066ef59f2a71c1d8d8f0ffec7191
+ Author: Chris Wilson <chris@chris-wilson.co.uk>
+ Date: Mon Nov 12 14:06:06 2012 +0000
+
+ sna/dri: Prevent scheduling a swap on stale buffers
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 98b312e579385e6e4adf6bf0abe20f8ca84592af
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 23 20:51:35 2013 +0000
+
+ sna/dri: Stop feeding I915_TILING_Y to mesa i915c
+
+ Only i915g handles Y-tiling, and we can't differentiate between the two
+ types of clients.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 31796400915a06fc789088b7dcfcecd6ea91e195
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 23 19:37:23 2013 +0000
+
+ sna: Clean up WAIT_FOR_EVENT on gen2/3
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ea8148b24d48db4f46205817db8a55dd6ea1a4b3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 23 17:47:12 2013 +0000
+
+ sna/dri: Prefer to use the BLT ring for vsync'ed copies on IVB+
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3c3a87a2d4261cbd66602812637328a04787f510
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 23 17:35:50 2013 +0000
+
+ sna/gen6: Correct the event definition for secondary pipes for MI_WAIT_FOR_EVENT
+
+ It helps to wait upon the event we program and enable.
+
+ References: https://bugzilla.kernel.org/show_bug.cgi
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 88753c5a8c6c9acf086d81828260adf330eebb1a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 23 17:35:50 2013 +0000
+
+ sna/gen7: Correct the event definition for secondary pipes for MI_WAIT_FOR_EVENT
+
+ It helps to wait upon the event we program and enable.
+
+ References: https://bugzilla.kernel.org/show_bug.cgi
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2d92d8ec562cb1e6b9dca28074adca670734233c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 22 09:24:04 2013 +0000
+
+ sna: Extend rectangular PolyLines to cover corner pixels on ccw paths
+
+ Reported-by: Joe Peterson <joe@skyrush.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=55484
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c8817e24a9d97110a961c3803290e38ff5cbfc9a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 22 09:06:50 2013 +0000
+
+ sna/gen7: Fix inversion of bool return code from CA pass
+
+ As we inverted the predicate, we no longer restored the original
+ operation after performing a CA pass - glyphs would randomly become
+ white.
+
+ Reported-by: Jiri Slaby<jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 10f549332e315cfe2cc86aadab94a95ae6757c34
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 21 16:48:34 2013 +0000
+
+ sna: Free a non-reusable bo if it expires on the flushing list
+
+ Still not sure just how the bo ends up there, but as there seems to be
+ the occasional malingerer, just free it.
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d7f0df27edb20b052ad39beb26a0b1924f432618
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 21 16:34:09 2013 +0000
+
+ sna: Use the maximum backlight value if we fail to read the current value
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 46a3a68e60a1d0a598ec8ece81088a4e6491de55
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 21 16:29:30 2013 +0000
+
+ sna: Assert that if marked as a scanout it is indeed bound.
+
+ On further review, the invariant must have been violated earlier, so
+ make the assert earlier.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0507d55dd1bc8fedae524a410a9e7b53f1dad920
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 21 16:24:49 2013 +0000
+
+ sna: Only add bound scanouts to the scanout list
+
+ If we never used the bo as an actual scanout it will never have been
+ moved to the uncached domain and so we can return it back to the system
+ cache.
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5a0bc67ba57cf698e100df617474669ed5d036d6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 21 11:41:38 2013 +0000
+
+ sna: New execbuffer flags for lut-handle and fast-relocs are upstream
+
+ Now the flags are upstream, we can rely on runtime tests as the
+ interface is now frozen.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 208ca91a31182e8ddad36e6a735c725362cbd071
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 18:02:41 2013 +0000
+
+ sna/gen7: Place the vsync commands in the same cacheline
+
+ Do as told; both the LRI and WAIT_FOR_EVENT need to be in the same
+ cacheline for an unspecified reason.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9a3e3abfe9b624af2354c5a69778aee3024fe46c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 17:48:31 2013 +0000
+
+ sna/gen7: Offset start/end scanlines by one
+
+ The hardware needs to be programmed with the line before the desired
+ scanline, wrapping around as required.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e6a64f872bfd026aa1ba1bd44b1298918c819849
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 16:59:58 2013 +0000
+
+ sna/gen3+: Remove bogus assertion that the vbo in included before finish
+
+ If we are carrying over a nearly full vbo from one batch to the next, we
+ may indeed finish it prior to writing any new primitives and so the
+ assert is truly bogus.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5de919336fc1ba1c4116e18ba0560cdb7b0589f0
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 16:36:17 2013 +0000
+
+ sna/gen6: Tweak programming scanline values
+
+ The documentation says that both start/end scanline need to be the line
+ before the desired value, and so to program the first scanline we need
+ to set it to the last scanline. The docs also say that the lower 3 bits
+ are ignored, so tweak the values programmed accordingly, with an extra
+ check that the window is not reduced to 0.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
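+ A sketch of the adjustment described above for both scanline fixes;
+ register encoding details are omitted and the variable names are
+ illustrative, not the driver's code:
+
+     /* the hw wants the line *before* the desired scanline, wrapping */
+     start = start ? start - 1 : vtotal - 1;
+     end   = end   ? end   - 1 : vtotal - 1;
+
+     /* the lower 3 bits are ignored, so widen the window and make
+      * sure it has not collapsed to nothing */
+     start &= ~7;
+     end   |= 7;
+     if (start >= end)
+             return false;   /* degenerate window: skip the vsync wait */
+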
+commit 2f9ac4e8a17e9d60bbb55c46929c37e92181d804
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 15:53:32 2013 +0000
+
+ sna/gen3+: And restore non-CA compositing state after the CA pass
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 650c9d5ce80afc1d4c8d9f77f6679f085fa4dc9d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 14:58:42 2013 +0000
+
+ sna/gen3+: Reset vertex relocation state after discarding the batch
+
+ Fixes a regression from commit a6ecb6d31d8c543f38fca0be6b0ec82e59dcd8d2
+ Author: Chris Wilson <chris@chris-wilson.co.uk>
+ Date: Wed Jan 16 09:14:40 2013 +0000
+
+ sna: Discard the batch if we are discarding the only buffer in it
+
+ as we may keep a stale relocation for the vertex buffer alive if we
+ attempt to clear the bo using the render engine before discarding it.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 492952e0d6362a046a666956afdf8f9bc0f2b7e7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 14:55:06 2013 +0000
+
+ sna/gen3+: Handle flushing vbo for CA glyphs
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b52c921204df6b2486717fcef05b4a1993aa1071
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 14:02:07 2013 +0000
+
+ sna: Adapt error detection and handling for invalid batchbuffers
+
+ Allow the DDX to continue even if the kernel rejects our batchbuffers by
+ disabling hw acceleration - just extends the existing hang detection to
+ also handle the driver producing garbage.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8215a278f20d34819536edbda05a108a860fefb9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 12:36:07 2013 +0000
+
+ sna/gen3: Always close the vertices for a batch, even if the vbo is empty
+
+ In the case where we emit a no-op, we may not attempt to finish binding
+ the vbo as it is considered empty. This leaves a stray relocation for
+ the next batch, and also causes it to believe that it has a vbo bound
+ already.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a88a9b9a59fa2d5fd427fa6e1f74fb9844379264
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 12:06:09 2013 +0000
+
+ 2.20.19 release
+
+commit 7822bbacbece6fcb2e12863cd6c7a53ab614c37c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 11:43:49 2013 +0000
+
+ test: Add script to generate source file for testing vsync
+
+ Courtesy of an original script by Mark Schreiber,
+ https://bugs.freedesktop.org/show_bug.cgi?id=59606
+
+commit 9329d8755981989ccbe66df6085fbab7c809a2c6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 10:14:21 2013 +0000
+
+ sna: Make DEBUG_SYNC a configure option
+
+ As it is advisable to combine the synchronous rendering debug option
+ with other debugging options, it is more convenient to make it into a
+ configure option: --enable-debug=sync
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c9263f192e2f85dd961bc1c4e9ca8180db874517
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 20 01:39:12 2013 +0000
+
+ sna: Apply DEBUG_SYNC prior to emitting error report
+
+ This is handy for the case where the batch triggers a GPU hang rather
+ than being rejected by the kernel.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 42ab789cce8423d99864776c6d5ba759c4129b54
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 18 13:56:53 2013 +0000
+
+ sna: Clear the non-intersecting damage after skipping the slave update
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 828a3a80aa3f0692e7be2831d58bccf02e2c481d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 18 13:16:23 2013 +0000
+
+ uxa: Clip dirty region to slave pixmap before appending damage
+
+ Fixes regression from
+
+ commit c789d06cf8a0debc67058d7be1483f5b542e2baa
+ Author: Dave Airlie <airlied@redhat.com>
+ Date: Mon Jan 7 13:57:21 2013 +1000
+
+ intel: fixup damage posting to be done correctly around slave pixmap
+
+ which causes the entire slave scanout to be read back from uncached
+ memory every time a pixel is modified.
+
+ Reported-by: Stephen Liang <inteldriver@angrywalls.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=59539
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
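+ The shape of the fix, using the server's region/damage API
+ (RegionIntersect, RegionNotEmpty and DamageRegionAppend are real Xorg
+ calls; slave_extents and the dirty-tracking names are illustrative):
+
+     RegionRec damage;
+
+     RegionNull(&damage);
+     RegionIntersect(&damage, DamageRegion(dirty->damage), &slave_extents);
+     if (RegionNotEmpty(&damage))
+             DamageRegionAppend(&dirty->slave_dst->drawable, &damage);
+     RegionUninit(&damage);
+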
+commit e17eaf540b614cdcb8f7349dd01852c3afc5ab05
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 18 13:09:36 2013 +0000
+
+ sna: Replace double negative '!RegionNotEmpty' with the equivalent RegionNil
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2de43a0164ba5364ffd7cb48f0bccc9873e87332
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 18 12:01:54 2013 +0000
+
+ sna: Skip an empty slave update
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 38de17f80d780bf219fc3c4018ad9cc8808ba50f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 18 10:16:42 2013 +0000
+
+ sna: Remove bogus assertion invalidated by 'read-read' sync
+
+ If we perform a read-read synchronisation, the kernel may still believe
+ that the bo is busy as it remains on the active lists being read by the
+ GPU.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9f68ac60ae37cc72503ec40691d1ae43a476f8e7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 17 20:00:34 2013 +0000
+
+ sna/dri: Explicitly flag sync copies for the backends
+
+ As gen6/7 need to prevent ring switching and perform a rendercopy if we
+ need to perform a vsync'ed copy.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1ee00c408d8142cfaf4202393c2364c9ae73cb6e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 17 13:09:47 2013 +0000
+
+ sna/trapezoids: Fix horizontal offset for inplace operation
+
+ Remember that for an inplace operation we are not dealing with an a8
+ mask, but rather an x8r8g8b8 surface and so we need to step accordingly.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0d749f93ea52161e59da1adca1a22e96ba293551
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 17 12:28:18 2013 +0000
+
+ sna: Drop the MOVE_WHOLE_HINT for PutImage
+
+ It is not as clearly beneficial as for GetImage, as for example toolkits
+ may only push the shadows around a window.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit dc643ef753bcfb69685f1eb10828d0c8f830c30e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 17 12:27:55 2013 +0000
+
+ sna: Apply read-only synchronization hints for move-to-cpu
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 18035a21e147788bea03ab2175ca03ae951701ce
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 17 11:52:10 2013 +0000
+
+ sna: Remove the confusion of the pixmap->undamaged
+
+ This was to track a pixmap that had been used for migration (i.e. had in
+ the past been used for mixed rendering). It is no longer used so remove
+ it.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 46141d277f326ae78f7b0e927a500e0eb1987f1b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 17 10:16:24 2013 +0000
+
+ sna: Consider fill style for XPolyRectangle
+
+ The rectangle outline is not always solid...
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d5c8d38afaba04281157bafe212e93f010ae00f5
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 17 10:10:54 2013 +0000
+
+ sna: Refactor to remove a goto from sna_put_zpixmap_blt()
+
+ The complexity of the function has been moved to move-to-cpu so we can
+ take further advantage of the simplified logic in put_zpixmap to clean
+ up the code by removing an unwanted goto.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9552438caa4d295c99a9b8821cf2644739861c6a
+Author: Colin Walters <walters@verbum.org>
+Date: Wed Jan 4 17:37:06 2012 -0500
+
+ autogen.sh: Implement GNOME Build API
+
+ http://people.gnome.org/~walters/docs/build-api.txt
+
+ Signed-off-by: Adam Jackson <ajax@redhat.com>
+
+commit 87d773249af18ae8722aacb7306b0eee51a90dbc
+Author: Adam Jackson <ajax@redhat.com>
+Date: Wed Jan 16 13:18:23 2013 -0500
+
+ configure: Drop AM_MAINTAINER_MODE
+
+ Signed-off-by: Adam Jackson <ajax@redhat.com>
+
+commit dbf1cfec9cd4e9efe7650f2940c92b4e51214288
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 16 12:20:48 2013 +0000
+
+ 2.20.18 release
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 47caffc50b5cdd288ad868fa9a697f0d4e2d28dc
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 16 10:49:24 2013 +0000
+
+ sna: Restrict upload buffers to reduce sampler TLB misses
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ab36300a22222086b94857f356612106ffbeb480
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 16 09:17:59 2013 +0000
+
+ sna: Correct DBG to refer to the actual tiling mode forced
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a6ecb6d31d8c543f38fca0be6b0ec82e59dcd8d2
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 16 09:14:40 2013 +0000
+
+ sna: Discard the batch if we are discarding the only buffer in it
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 26db2438e34feb8f28444bf7418869b4ecd870da
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 16 09:00:21 2013 +0000
+
+ sna: Fix computation of large object sizes to prevent overflow
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 54c1d97d5ab325874e1c7b2639e58111d7a6b93f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 16 09:00:04 2013 +0000
+
+ sna: Add DBG for when we add the inplace hint
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 588c5aa6bca441d7c9305fe2fcf268e89b6b617d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 22:21:56 2013 +0000
+
+ sna: Revert use of a separate CAN_CREATE_SMALL flag
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit af85ffdec7047efa452d6bab3a0ee3889dd4f046
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 20:37:11 2013 +0000
+
+ sna: Avoid serialising on a move-to-cpu for an async operation
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d70be85dc723168a481c1955444afd951c4817bf
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 20:16:45 2013 +0000
+
+ sna: Assert that we never try to mix INPLACE / ASYNC hints for move-to-cpu
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1287c3a24c277cb42930d8af2943b9f7b016f31d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 18:59:15 2013 +0000
+
+ sna: Specialise sna_get_image_blt for clears to avoid sync readback
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit da4972eec57e662b98a7abced6338ceb8a533a48
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 18:34:07 2013 +0000
+
+ sna/trapezoids: Avoid the multiply for an opaque source
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7f968c8c991cff751459939bdb42e14255f529b7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 18:41:00 2013 +0000
+
+ sna: Add DBG to use_shm_bo()
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit af63fab5047a43716c5df875ddc50f7c877f8a83
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 18:21:11 2013 +0000
+
+ sna: Hint that a copy from a SHM bo will likely be the last in a batch
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1be436409222c00ff66c6d747487b77f1037b27a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 18:20:29 2013 +0000
+
+ sna: Pass the async hint for the upload into the GPU
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2113f7f440dd2f10e80f0bb3bd5cd155f7e19098
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 09:33:03 2013 +0000
+
+ sna: Free the SHM pixmaps after b266ae6f6f
+
+ Since b266ae6f6f protected the static allocations from being reaped in
+ the normal course of events, we need to penetrate those defenses in
+ order to finally free the SHM mappings.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 441c481630a5cf09a7eb26d5db80b1e60cb2b10f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 01:26:19 2013 +0000
+
+ sna: Mark uploads with async hints when appropriate
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6abd442279fd32d1ce9b33a72eabbeb922316151
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 15 00:15:23 2013 +0000
+
+ sna: Avoid allocating an active CPU bo unnecessarily
+
+ If we will not write back the GPU damage to the bo as we intend to
+ overwrite it for the next operation, we can forgo allocating the active
+ CPU bo and skip the synchronisation overhead.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f235c74cd661970c76e152777e9a2c314a368a56
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 14 15:49:42 2013 +0000
+
+ sna: Tweak consideration of last-cpu placement for inplace regions
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 70c5e41b519e44e620948d683d3b1111494d2f48
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 14 15:03:59 2013 +0000
+
+ sna: Limit temporary userptr uploads to large busy targets or LLC machines
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit cf860da1c78244036c59edf934b312cc1367e8aa
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 14 12:50:54 2013 +0000
+
+ sna: Apply PutImage optimisations to move-to-cpu
+
+ We can replace the custom heuristics for PutImage by applying them to
+ the common path, where hopefully they are equally valid.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e4ad4477815abe31b1a2323673da86a6def2f246
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 14 13:12:46 2013 +0000
+
+ sna: Use userptr to accelerate GetImage
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3cc04a8e24f02248b6382c9bc354ea15c42b17b6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 13 17:34:03 2013 +0000
+
+ sna: Initialize src_bo to detect allocation failure
+
+ sna_accel.c: In function 'sna_put_image':
+ sna_accel.c:3730:18: warning: 'src_bo' may be used uninitialized in this
+ function [-Wmaybe-uninitialized]
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3f04b0b98d7f861ff58b82c99d33b7eacfcda5f7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 13 17:31:15 2013 +0000
+
+ sna: Check size against aperture before attempting to perform the GTT mapping
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7a7db06c62228acc6d1c03e800c7afa84e886f5a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 13 13:45:18 2013 +0000
+
+ sna: Add a compile flag for measuring impact of userptr uploads
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit bcc212dc7a939505a678f97f6700eee99204249f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 13 13:36:09 2013 +0000
+
+ sna: Use the pixmap size (not drawable) to determine replacement
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 32f43f618d9b11ea44b3e01a95ac3f239a731ad2
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 13 13:23:24 2013 +0000
+
+ sna: Allow large image uploads to utilize temporary mappings
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit bf2b2e2f91208412c8b74a95859def501514be43
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 13 12:24:44 2013 +0000
+
+ sna: Allow creation of a CPU map for pixmaps if needed
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b266ae6f6f8fb4c494ece532ae4621055e66beb2
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 13 11:30:07 2013 +0000
+
+ sna: Relax limitation on not mapping GPU bo with shadow pointers
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a2d82161436e489f23637d793c737bc6950a62b8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 13 10:17:33 2013 +0000
+
+ sna: Correct a few assertions after enabling read-only mappings
+
+ As these do not flush the active state if we have read-read mappings, we
+ need to be careful with our asserts concerning the busy flag.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ab01fd696e1137ddfb9a85ae68c15c05900f0e8e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Jan 12 09:17:03 2013 +0000
+
+ sna: Experiment with a CPU mapping for certain fallbacks
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 03d392cd1d87e17129c42e4d822d3d1749edb02e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Jan 12 08:51:52 2013 +0000
+
+ sna: Tweak max object sizes to take account of aperture restrictions
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d111c464bfbae57bb7141872810c88b88f30c087
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Jan 12 08:15:13 2013 +0000
+
+ sna: After a size check, double check the batch before flushing
+
+ As we may fail the size check with an empty batch and a pair of large
+ bo, we need to check before submitting that batch in order to not run
+ afoul of our internal sanity checks.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ec77a07b41f1062b941774f3782b51d21e7824dd
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 11 11:40:57 2013 +0000
+
+ sna/dri: Prefer to preserve the ring of the destination bo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 42f1026e11527cb62b4522b44e71a4e72582a876
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 11 11:40:16 2013 +0000
+
+ sna: Reorder struct kgem_bo to move related data into the same cacheline
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit aead71051ed757e7565d395c858bf8ab8f0b0ff6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 11 01:30:43 2013 +0000
+
+ sna: Disable memcpy_to_tiled_x() uploads on 32-bit systems
+
+ It's far too slow due to the register-starved instruction set producing
+ atrocious code and the extra overhead in the kernel for managing memory
+ mappings.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 220970b1a484e283e2bbb44f79df613ce1ee1146
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 19:43:05 2013 +0000
+
+ sna: Also prefer to use the GPU for uploads into a tiled bo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 672e59851c427c63f43cde7dfd1688a72100e3b3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 19:35:29 2013 +0000
+
+ sna: Prefer userptr if copying to a tiled bo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 441ef916ae6569c88b3d6abaf7fea4d69be49d76
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 19:14:21 2013 +0000
+
+ intel: Throttle harder
+
+ Filling the rings is a very unpleasant user experience, so cap the
+ number of batches we allow to be inflight at any one time.
+
+ Interestingly, as also found with SNA, throttling can improve
+ performance by reducing RSS. However, typically throughput is improved
+ (at the expense of latency) by oversubscribing work to the GPU, and a
+ 10-20% slowdown is commonplace for cairo-traces. Notably, x11perf is
+ less affected and in particular application level benchmarks show no
+ change.
+
+ Note that this exposes another bug in libdrm-intel 2.4.40 on gen2/3.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a37d56f338c5fae832d5eeea1283b6dbde827678
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 16:28:24 2013 +0000
+
+ sna: Use some surplus bits to back our temporary pixman_image_t
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 09ea1f4402b3bd0e411b90eb5575b3ff066d7356
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 16:26:24 2013 +0000
+
+ sna: Prefer to use the GPU for copies from SHM onto tiled destinations
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c63147a3c33fd26f5c04a8648881659b4a90df06
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 15:15:15 2013 +0000
+
+ sna: Allow CPU bo to copy to GPU bo if the device is idle.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2933e7595838c28081810d4959ca1e005a0419e1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 13:07:19 2013 +0000
+
+ sna: Ignore the last pixmap cpu setting if overwriting all damage
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 934ea64f7ff080b00d00c50ba94f63247d7bb130
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 13:06:06 2013 +0000
+
+ sna: With a GPU bo and a shm source, do not fall all the way back
+
+ The normal source upload into GPU bo knows a few more tricks that we may
+ want to apply first before copying into the shadow of the GPU bo.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8a8edfe4076ee08558c76eddbb68426e4563888c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 03:31:37 2013 +0000
+
+ sna: Make sure all outputs are disabled if no CompatOutput is defined
+
+ If we have to fall back and the configuration is wonky, make sure that
+ all known outputs are disabled as we take over the console.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5449e16c0c2b6ca5af4acf42703164b9d2b2d822
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 10 02:54:41 2013 +0000
+
+ sna: Open-code xf86CompatOutput() to avoid invalid pointers
+
+ config->compat_output needs to be sanitized during device initialization
+ or we may dereference an invalid xf86OutputPtr.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8881a14200580db731ca6902b289b08989aaa61e
+Author: Mickaël THOMAS <mickael9@gmail.com>
+Date: Mon Jan 7 20:47:51 2013 +0100
+
+ Set initial value for backlight_active_level
+
+ If the "Backlight" option is set, backlight_active_level is not set which
+ results in a default value of 0, causing a black screen upon starting Xorg.
+
+commit b8c9598294eaa16e0d1578ad98896f6ec5ba37cf
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Jan 7 13:57:21 2013 +1000
+
+ sna: fixup damage posting to be done correctly around slave pixmap
+
+ Copied from commit c789d06cf8a0debc67058d7be1483f5b542e2baa
+ Author: Dave Airlie <airlied@redhat.com>
+ Date: Mon Jan 7 13:57:21 2013 +1000
+
+ This fixes the damage posting to happen in the correct ordering,
+ not sure if this fixes anything, but it should make things more consistent.
+
+commit c789d06cf8a0debc67058d7be1483f5b542e2baa
+Author: Dave Airlie <airlied@redhat.com>
+Date: Mon Jan 7 13:57:21 2013 +1000
+
+ intel: fixup damage posting to be done correctly around slave pixmap
+
+ This fixes the damage posting to happen in the correct ordering,
+ not sure if this fixes anything, but it should make things more consistent.
+
+ Signed-off-by: Dave Airlie <airlied@redhat.com>
+
+commit 5891c89ff2be277d1a833d4bc092b65184c1f3d6
+Author: Dave Airlie <airlied@redhat.com>
+Date: Mon Jan 7 13:54:47 2013 +1000
+
+ intel: drop pointless error printf in the slave pixmap sync code.
+
+ This is left over and spams the logs, so get rid of it.
+
+ Signed-off-by: Dave Airlie <airlied@redhat.com>
+
+commit 27550e81482229007fa9e0e9769fdd20f3616b23
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 6 17:29:19 2013 +0000
+
+ sna/dri: Transfer the DRI2 reference to the new TearFree pixmap
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=58814
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1a5e4fb725da2eb25cf7f476290c02e9880a4efc
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 6 17:08:56 2013 +0000
+
+ sna: Only disable upon a failed pageflip after at least one pipe flips
+
+ If we have yet to update a pipe for a pageflip, then the state remains
+ consistent and we can fallback to a blit without disabling any pipes. If
+ we fail after flipping a pipe, then unless we disable an output the
+ state becomes inconsistent (the pipes disagree on what the attached fb
+ is).
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit dd66ba8e5666a1ce7da0ddc226d074f591e1fa22
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 6 16:13:56 2013 +0000
+
+ sna: Try to create userptr with the unsync'ed flag set first
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9051f43fa3c8d011921ac6ff75b763280f26d98f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 6 15:20:14 2013 +0000
+
+ sna/gen4+: Handle solids passed to the general texcoord emitter
+
+ The general texcoord emitter does handle solids (for the case of a
+ transformed mask) and so we need to be careful to set up the
+ VERTEX_ELEMENTS accordingly.
+
+ Fixes regression from
+ commit 2559cfcc4cbc1d0d84b048565cad3bfee61df8da
+ Author: Chris Wilson <chris@chris-wilson.co.uk>
+ Date: Wed Jan 2 10:22:14 2013 +0000
+
+ sna/gen4+: Specialise linear vertex emission
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4af910e8be92e0ca241ce1e93e322c712dcbe340
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 6 13:43:55 2013 +0000
+
+ sna/gen4+: Trim the redundant float from the fill vertices
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3244e4b23397f54ca76876dd76ebea9a0abd357e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Jan 6 13:24:23 2013 +0000
+
+ Revert "sna/gen4+: Backport tight vertex packing for simple renderblits"
+
+ This reverts commit 8ff76fad1fadc5e309f9a12c30f883460a432049 and
+ commit 48e4dc4bd4b2980f0f804f572d0e3fc1bb4bc21e.
+
+ I forgot gen4 and gen5 do not have the 'non-normalized' bit in their
+ sampler states.
+
+commit d3be77f87916e38af717bafaf2000becd5180d76
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Jan 5 18:07:50 2013 +0000
+
+ sna/trapezoids: filter out cancelling edges upon insertion to edge-list
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2b4a2f52c47a24c297312d51f9a8299c9a54a697
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Jan 5 17:21:34 2013 +0000
+
+ sna/trapezoids: filter out zero-length runs
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 59a7b8b32c694735942fd7e42c1382d91004b0b1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 4 18:22:14 2013 +0000
+
+ sna: Clear up the caches after handling a request allocation failure
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3c31a9fc210221ba8e7922bec80c15ec39cab7bc
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Jan 4 18:11:12 2013 +0000
+
+ sna: Embed the pre-allocation of the static request into the device
+
+ So that in the case where we are driving multiple independent screens
+ each having their own device, we do not share the global reserved
+ request in the event of an allocation failure.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b5b3cfb0ad1cc5e66c99035f526946bf41011e13
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 3 23:33:44 2013 +0000
+
+ sna: Flush the batch prior to referencing work from another ring
+
+ In the case where the kernel is inserting semaphores to serialise work
+ between rings, we want to only delay the surface that is coming from the
+ other ring and not interfere with work already queued.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ea2da97773d858001f98adc880f24b9671c51b2f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 3 16:47:14 2013 +0000
+
+ sna: Convert allocation request from bytes to num_pages when shrinking
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2bd6e4dcd43bb0d836f12232050e73ce1510bb0f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 3 16:38:33 2013 +0000
+
+ sna: Add a pair of asserts to validate fls()/cache_bucket()
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f9d2730974a869f15eac599ca865b50a9a9658d9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 3 15:20:45 2013 +0000
+
+ sna: Also recognise __i386__ for fls asm
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 69dde74a003ba0168ceca1558a4cb69097421b92
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Jan 3 15:20:23 2013 +0000
+
+ sna: Fix off-by-one in C version of fls
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit fc702cdf534a4694a64408428e8933497a7fc06e
+Author: Matt Turner <mattst88@gmail.com>
+Date: Wed Jan 2 16:07:54 2013 +0000
+
+ sna: Rewrite __fls without dependence upon x86 assembly
+
+ The asm() prevents SNA from compiling on ia64.
+
+ Fixes https://bugs.gentoo.org/show_bug.cgi?id=448570
+
+commit bc67bdcec832f4302951f2789456666dee2f496c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 2 13:47:51 2013 +0000
+
+ sna/gen6+: Fine tune placement of DRI copies
+
+ Avoid offsetting the overhead of the render copy only to be penalised by
+ the overhead of the semaphore. So compromise.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2559cfcc4cbc1d0d84b048565cad3bfee61df8da
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Jan 2 10:22:14 2013 +0000
+
+ sna/gen4+: Specialise linear vertex emission
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0996ed85fd8bd79f41f28908733b85566f9e2b69
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 22:53:26 2013 +0000
+
+ sna/gen2+: Precompute the affine transformation scale factors
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d36cae801f1dcb06d4f93f2f27cc9b9de73e89c9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 21:03:06 2013 +0000
+
+ sna/gen4+: Tidy special handling of 2s2s vertex elements
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8582c6f0bbe1bf01324b46933ff2f50c65f2a82d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 20:53:12 2013 +0000
+
+ sna/gen6+: Remove vestigial CC viewport state
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 24264af2912f9abae5aff2a6fb5a50383d9e33be
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 20:39:23 2013 +0000
+
+ sna: Fast path inplace addition of solid trapezoids
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e9a9f9b02978cb2d73c38163827eb7141ebed16c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 16:40:28 2013 +0000
+
+ sna: Micro-optimise glyph_valid()
+
+ Note that this requires fixing up the glyph->info if the xserver didn't
+ create a GlyphPicture.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 372c14aae8f4fd2c5865b9d23cd825dcbc33765f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 15:49:12 2013 +0000
+
+ sna: Remove some obsolete Options
+
+ Throttling and delayed-flush are now redundant.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 65924da91da4bb617df1bb0a7c3e9d4aa475b6b1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 11:40:15 2013 +0000
+
+ sna: Tidy compat interfaces
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0a35d9287397031c95ebd9dc53b68e33e7dcf092
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 11:12:02 2013 +0000
+
+ sna/gen2: Always try to use the BLT pipeline first
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c1457fbd8a169ee19c8e625ea4e779180eb4b070
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Jan 1 10:49:27 2013 +0000
+
+ sna/gen2: Tidy a pair of vertex emitters
+
+ Switch to the new inline scaled transforms.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 48a5797c0f227204d0723de0ef34b046964c571e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 31 17:30:40 2012 +0000
+
+ sna/gen4: Tweak single-thread SF w/a for solids
+
+ Allow multiple threads for the rare case of compositing with a solid
+ color.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e4f6ba6b47c41645a40e314f14047ba0b5f93a01
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 31 14:06:36 2012 +0000
+
+ sna/gen6+: Hint that we prefer to use the BLT with uncached scanouts
+
+ Once again balancing the trade-off of faster smaller copies with the BLT
+ versus the faster larger copies on the RENDER ring.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6e87e7ddfe0c21e0fb6b3c2cb940a40aa7d4e061
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 31 14:03:16 2012 +0000
+
+ sna/dri: Use the default choice of backend for copying the region
+
+ Notably, if everything is idle, using the BLT is a win as we can emit
+ them so much faster than a rendercopy, and as the target is uncached we
+ do not benefit as much from the rendercache.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a7988bf77f5a106a48b6e39b6eaf60ef2f8bec11
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 30 14:50:49 2012 +0000
+
+ sna/dri: Fix triple buffering to not penalise missed frames
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 736b89504a32239a0c7dfb5961c1b8292dd744bd
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 30 10:32:18 2012 +0000
+
+ uxa: Align surface allocations to even tile rows
+
+ Align surface sizes to an even number of tile rows to cater for sampler
+ prefetch. If we read beyond the last page we may catch the PTE in a
+ state of flux and trigger a GPU hang. Also detected by enabling invalid
+ PTE access checking.
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=56916
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=55984
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 43336c632beb5d599ec0fc614434b88ef7a26422
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 29 16:47:53 2012 +0000
+
+ sna: Seed the solid color cache with an invalid value to prevent false hits
+
+ After flushing, we *do* need to make sure we cannot hit a false lookup
+ via the last cache.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f6050382095c3bc4f78bc4ff9e9c6086e58d6b28
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 29 16:41:03 2012 +0000
+
+ sna/dri: Gracefully handle failures from pageflip
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1c2ece369177ea6c3fd2f254b2554ceadf5590de
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 29 15:53:23 2012 +0000
+
+ sna/gen4+: Try using the BLT before doing a tiled copy
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 09ca8feb3455c979e799ddf26daae8f2de2813e1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 29 15:42:02 2012 +0000
+
+ sna: Move the primary color cache into the alpha cache
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8c56c9b1da9e078bd5b7ff4ebc5d8b23f593d500
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 29 14:14:41 2012 +0000
+
+ sna: Allow a flush to occur before batching a flush-bo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2f53fb389c001f68134f514e30e25e91de41fb9d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 22:58:02 2012 +0000
+
+ sna: DBG compile fixes
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit dba83dacd2ccbb2ac23b205ce2a872a889fa30bd
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 19:23:36 2012 +0000
+
+ sna/gen3: Use inline transform+scale function
+
+ So as to avoid reading back from the vbo (which may be wc mapped).
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f0fca544b0602bc4ed2f68e8d260e0a3745b4bad
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 18:52:44 2012 +0000
+
+ sna/gen4+: Check for a spare exec slot for an outstanding vbo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c6e850b626f4bb44876c683d596ea38f8f6c30ae
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 17:14:52 2012 +0000
+
+ sna/gen4+: Trim an extraneous coordinate from solid composite emission
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3fdc9923447538ed65bf9ffa189d7290ce804730
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 17:14:52 2012 +0000
+
+ sna/gen4+: Trim an extraneous coordinate from solid span emission
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit fdd6d222bc92b3e385f5d62f5e03dfd86f290e45
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 17:08:00 2012 +0000
+
+ sna/gen4+: Tidy emit_spans_affine()
+
+ gcc produced abysmal code for the inlined emission, so hand unroll it
+ for sanity.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5d222d4d21e6e3af5316728e0da49a014e9fea21
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 17:08:00 2012 +0000
+
+ sna/gen4+: Tidy emit_spans_solid()
+
+ gcc produced abysmal code for the inlined emission, so hand unroll it
+ for sanity.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4528f68eff33a5c2f9c1d884e9b3f7228053e0f4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 16:45:50 2012 +0000
+
+ sna: Only allocate a busy CPU bo for a GPU readback
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 99fdd1a1c6aa52688c2c821a90f86700b7ee34b2
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 16:33:59 2012 +0000
+
+ sna: Mark kgem_bo_retire() as static
+
+ The exported function is not used, so mark it static and strengthen the
+ assertions.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 548d284b8cf8cc2b311efe3287e0ae956738189a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 28 14:49:38 2012 +0000
+
+ sna: Skip copying fbcon if we are already on the scanout
+
+ If we are already the scanout, then there is little point copying to
+ ourselves... This check should be mere paranoia.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 583efd4ba067a0a4319e43ebc18dd81ed9c8db0a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 27 17:59:59 2012 +0000
+
+ sna: Sanity check config->compat_output
+
+ In a headless setup this may be left initialised to -1.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7725df8aa1b3eab97618311e3f24769a318bd804
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 27 14:01:59 2012 +0000
+
+ sna/gen2,3: Remove gen-specific vertex_offset
+
+ Remove the duplication of vertex_offset in favour of the common
+ vertex_offset.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 46af1ff126f3fb1f9470b0cbb19c7c2b09d5b92a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 27 00:40:08 2012 +0000
+
+ sna/gen6+: Tidy up ring preferences
+
+ Remove a few duplicated tests.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit dd5b653aa2c5fe2e062533db35c83a40c1952ea6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 27 09:54:35 2012 +0000
+
+ sna: Do not try and set a 0x0 mode
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 861c2362dd38d7d43fe7ffb181cb197199a1c570
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 26 14:12:42 2012 +0000
+
+ sna/gen6+: Tweak to only consider active ring on destination
+
+ Otherwise we decide to use BLT when hitting the render/sampler cache
+ is preferable for a source bo.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f9b6aa3aaf784f9149e091a646673ddf341cd7ca
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 26 13:05:52 2012 +0000
+
+ sna: Explicitly track self-relocation entries
+
+ Avoid having to walk the full relocation array for the few entries that
+ need to be updated for the batch buffer offset.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 90b1b220ee7a3c543301956b01c54a4a04632db4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 26 12:51:58 2012 +0000
+
+ 2.20.17 release
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 52fd223fc970118cbdcb31f9574414debc905e9c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 21 21:36:30 2012 +0000
+
+ sna/video: Initialise alignment for video ports > 0
+
+ We repeatedly set the alignment value on the first port, rather than
+ once for each.
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3793ccf7804cfc870b46c623dfeefbe0c381c1d4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 21 14:48:07 2012 +0000
+
+ sna: Remove assertions that the pixmap is wholly defined when uploading
+
+ As the user may only write to a portion of a pixmap (thus only creating
+ a small amount of damage) and then attempt to use the whole as a source,
+ we run the risk of triggering an assertion that the whole was defined.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 07dde33a4f51941b4f612823ea6ea7ca01a6efbc
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 21 14:35:32 2012 +0000
+
+ sna: Remove a pair of stale assertions
+
+ For gen2-5, it does not matter what mode the batch is in when we
+ insert the scanline wait. With the more aggressive batch flushing, and
+ relaxed assignment of mode for those generations, we are likely to see
+ that the batch is idle when we go to insert the waits.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit bdd0cca4e1192df0038621925c4e6243ba419a81
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 21 14:20:23 2012 +0000
+
+ sna: Refactor test for a rotation matrix
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 347c5a7b33729f1bedd408d2ef24756d51b66f1d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 21 10:40:47 2012 +0000
+
+ sna/dri: Refactor get_current_msc between blit/flip paths
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8a67d3f808fcc7c8c51553b1703e8312f28b87a1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 21 10:21:06 2012 +0000
+
+ sna/dri: Set the correct current_msc for the no readback path
+
+ If we are asked to render immediately, then in order to pass the tests
+ when comparing it to target, we need to set the current_msc to the
+ ultimate future value, -1.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 48e4dc4bd4b2980f0f804f572d0e3fc1bb4bc21e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 21:54:25 2012 +0000
+
+ sna/gen4: Backport tight vertex packing of renderblits
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 08d2b073692836aa22f65f8ba30db5d14550c03e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 21:30:32 2012 +0000
+
+ sna/gen4: Backport more recent state tracking tweaks
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8ff76fad1fadc5e309f9a12c30f883460a432049
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 20:57:40 2012 +0000
+
+ sna/gen5: Backport tight vertex packing for simple renderblits
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9144c951915a1e0c1899a72161f9f0f1ab9b9ac4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 21 09:44:52 2012 +0000
+
+ sna/dri: Avoid querying the current-msc with swapbuffers wait disabled
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 84c327e17f68c4a56fcb76be1f45ab6d35291b5d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 19:44:46 2012 +0000
+
+ sna/video: Assert that the frame is initialised
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4d750219925cb3199ebc6751cdbd2862dfb4cdfe
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 19:34:41 2012 +0000
+
+ uxa/dri: Correct the destination of the blit after a chained flip is broken
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ca5c028c2b4d9bf02002acd484054fe427ea8d09
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 19:31:44 2012 +0000
+
+ glamor: Release the drawable after passing to glamor_push_pixels
+
+ An unlikely path, but a double prepare instead of a prepare/finish.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit de2de36049e2958a60f63fadffe8f54de8da1e56
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 19:29:31 2012 +0000
+
+ sna: Check the correct variable for a failed allocation
+
+ Having already checked 'dst' and just allocated 'src', that is who we
+ should be checking.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit deb908fda74541fba649349db279715b05d0554e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 19:22:32 2012 +0000
+
+ intel: ODEV_ATTRIB_PATH is no longer printed, so kill the temporary variable
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0f84ecfc3cd7dfe7f43ff99a6498d2ceccd90225
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 12:00:00 2012 +0000
+
+ sna/gen4+: Amalgamate all the gen4-7 vertex buffer emission
+
+ Having reduced all the vb code for these generations to the same set of
+ routines, we can refactor them into a single set of functions.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1f4ede0ef8f8a8d07e11781ad05617ecdfcd3faf
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 19 20:39:10 2012 +0000
+
+ sna: Do not throttle before move-to-cpu
+
+ The idea being that when creating a surface to perform inplace
+ rasterisation, we won't be using the GPU for a while and so give it time
+ to naturally throttle.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5deba2832dc42072d9abaeaa7934bc0e1b28b3ed
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 19 20:03:33 2012 +0000
+
+ sna: Ignore throttling during vertex close
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f91a24fdba517c8e9df5a074db2c789fbf066bb3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 20 09:46:32 2012 +0000
+
+ sna/video: Remove XvMCScreenInitProc
+
+ The symbol disappears without warning in xorg-1.14.
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=58552
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ee99511846a0f10abeeba8d25d8fb5bf59621b02
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 19 18:02:50 2012 +0000
+
+ sna/gen4+: Tweak preference of GPU placement for spans
+
+ If the CPU bo is busy, make sure we do not stall for an inplace
+ operation.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit bfd96b092db5e4e0fc2446752deafd1156cf37b3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 20:54:33 2012 +0000
+
+ sna/video: Fix presentation of cropped sprites
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2df1b1abf0728f28d2803a096f945779cbe7c70b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 16:07:26 2012 +0000
+
+ sna/video: Fix up copying cropped textured video packed data
+
+ Simply ignore the cropping and copy the whole plane rather than
+ complicate the computation of the packed destination pixels.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8d523fa824dcb1987557164d048711c1745de378
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 16:07:26 2012 +0000
+
+ sna/video: Fix up destination offset for copying cropped textured video planes
+
+ Oh fun. Textured video expects the source content to be relative to the
+ origin, whereas overlay video expects the source at the origin.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7bb4573fcc2cf1b8b6bff5d885a2fa81200d2fd7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 15:48:21 2012 +0000
+
+ sna/video: Fix up the image size for copying
+
+ Yikes, setting image.x2 == image.x1 meant no data was copied whilst the
+ video was clipped.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 551b400377ddc5eb1e89b8b5827a42e810c8d23d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 15:14:00 2012 +0000
+
+ sna/video: Amalgamate the computation of source vs dest offsets
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d96a226cc59c641c10153ae3a086a5138c852423
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 14:26:18 2012 +0000
+
+ sna/video: Fix adjustment of drawable vs source origin wrt the clip
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 79cb6304e983514dd754065e65e2381a903f9bd6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 13:49:59 2012 +0000
+
+ sna/xvmc: Clean up to avoid crash'n'burn
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0d26082303f3f4006ce4974d402c560613081b23
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 10:54:28 2012 +0000
+
+ sna: Prefer the GPU once again for PolyPoint
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0e0a2d300633122d6d0f6f82ff110f513b4e64d7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 18 10:27:04 2012 +0000
+
+ sna/gen7: Mark the ring switch before checking bo
+
+ As we may do a batch submission due to the change of mode.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f522fbe7c98ffad86126c3666b2d9f7e616480b8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 17 23:04:25 2012 +0000
+
+ sna: Refine check for an unset context switch
+
+ So it appears that we end up performing a context switch on an empty
+ batch, but one that already has a mode. This is caught later, too late, by
+ assertions. However, we can change the guards slightly to prevent those
+ assertions without altering the code too greatly. And I can then think
+ how to detect where we are setting a mode on the batch but doing no
+ work - which is likely masking a bigger bug.
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6c50cf4809816dbbd93d54f589a79b0dab996180
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 17 22:27:14 2012 +0000
+
+ sna: Untangle the confusion of caching large LLC bo
+
+ We only use a single cache for very large buffers, so we need to be
+ careful that we set the tiling on them. Moreover, we need to take extra
+ care when allocating large CPU bo from that cache to be sure that they
+ are untiled and the flags are true.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e474abea7cf761e78e777db07b41ec99c0b6f59f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 17 15:38:04 2012 +0000
+
+ sna: Promote pinned-batches to run-time detection
+
+ Now that the feature has been committed upstream, we can rely on the
+ runtime detection.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4d7e3a9123cf41d2dd97c0a8a0d461c189064822
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 17 12:34:05 2012 +0000
+
+ uxa: Fix copy'n'paste of false not FALSE
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=58406
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7a4d1136bd09bfd4d2657c0b1b64d553eeb6ed4f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 17 09:41:47 2012 +0000
+
+ sna/video: Pass along the video source offset
+
+ Fortunately nobody had yet noticed that all videos were assumed to play
+ with a matching src/dst origin.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit dfe9d18f9f97a77ceeb410307010424c789c8bd1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 17 01:06:57 2012 +0000
+
+ sna: Limit the default upload buffer size to half the cpu cache
+
+ This seems to help with small slow caches.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5b0572503eab235bc7eff20d369241330c41e630
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 16 23:04:55 2012 +0000
+
+ sna: Enable support for opting out of the kernel CS workaround
+
+ Keeping a set of pinned batches in userspace is considerably faster as
+ we can avoid the blit overhead. However, combining the two approaches
+ yields even greater performance, as fast as without either w/a, and yet
+ stable.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 805f78addf3ffb36c736df680806cf722b18fea9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 16 22:04:54 2012 +0000
+
+ sna: Try to reuse pinned batches by inspecting the kernel busy status
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f1aec676810c4a4c180b342d9a83254e08dd55da
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 16 17:37:32 2012 +0000
+
+ sna: Precompute the base set of batch-flags
+
+ This is to make it easier to extend in future.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c7ac12003bd0c7d85fa47d43ee2734b222d84a61
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 16 15:28:24 2012 +0000
+
+ sna: Only flush at the low fence wm if idle
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4580bbeac0051417cb03f272112b0cfe697e31b3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 16 15:00:21 2012 +0000
+
+ intel: Support debugging through AccelMethod
+
+ Ease debugging by allowing all acceleration or render acceleration to be
+ disabled through AccelMethod:
+
+ Option "AccelMethod" "off" -> disable all acceleration
+ Option "AccelMethod" "blt" -> disable render acceleration (only use BLT)
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 58770b7d6401d2d81f7fee1c8c0e788d44149712
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 16 14:59:03 2012 +0000
+
+ man: Describe Option "AccelMethod"
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 83609af3681fad58af88387077bf7ce0c001a1da
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 16 10:53:26 2012 +0000
+
+ sna: Tweak the idle SHM CopyArea path to also replace a busy GPU bo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6490585f65bde487da7bc41fa5cb1c5a028d0bf4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 23:26:30 2012 +0000
+
+ sna: Do not force use of the GPU for a copy from a SHM pixmap
+
+ As we will undoubtedly flush and sync upon the SHM request very shortly
+ afterwards, we only want to use the GPU for the SHM upload iff it is
+ currently busy.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3a08f091875f2f0f49697ba9852077094b3a704b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 22:53:44 2012 +0000
+
+ sna/gen6+: Tweak prefer-blt-bo
+
+ Split the decision between where it is imperative to use the BLT to
+ avoid TLB misses and the second case where it is merely preferential to
+ switch.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ac9ef1fc606e87b48baa47be22bf828dcfe6659f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 20:49:56 2012 +0000
+
+ sna/gen6+: Keep the bo on its current ring
+
+ Track the most recent ring each bo is executed on, and prefer to keep it
+ on that ring for the next operation.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 15ccb7148d15d776a661c1b8c5b9b2360fcae4ad
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 20:07:56 2012 +0000
+
+ sna/gen6+: Apply the is_scanout to the key not value in the binding cache
+
+ Oops, we never managed to reuse the cached location of the target
+ surface as we entered it into the cache with the wrong key.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit fde25b08922d97ca0d4a69c654bf690edbd53b3d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 18:59:53 2012 +0000
+
+ sna/trapezoids: Add another inline hint
+
+ cell_list_alloc() is only called from one place, and the compiler should
+ already be inlining it - but does not appear to be. Hint harder.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2a21c8b351052be9c32c5669264fb05a8510c957
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 17:56:27 2012 +0000
+
+ sna: Include shm hint in render placement
+
+ The goal is to reduce the preference of rendering to a SHM pixmap - only
+ if it is already active will we consider continuing to use it on the
+ GPU.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a467102a9539c7f4fa8d0700ecdcaba49d77b3f7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 10:00:48 2012 +0000
+
+ 2.20.16 release
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b0f8c823b6cafdfdd064c09d58174f946e290541
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 09:28:04 2012 +0000
+
+ sna/dri: Fallback to a blit after a failed flip
+
+ ...rather than force the exchange.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2c71a8e08abce74b269687d3a6c1edd7f9d643d3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 15 09:27:07 2012 +0000
+
+ sna/dri: Honour TripleBuffer Option
+
+ In case anyone ever wants to disable the default.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6593ad3fecb3d044ee5ca161176d8ecaa0b4126a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 14 23:48:00 2012 +0000
+
+ sna/dri: Store and check size of front/back bo attached to a DRI2 drawable
+
+ So that we can prevent feeding back a stale bo when the DRI2 client
+ tries to swap an old buffer.
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=57212
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9df9585cb00958b42461b3139bb7aec32090a869
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 14 15:37:56 2012 +0000
+
+ sna: Reduce fence watermarks
+
+ Further restrict the amount of fenced bo we try to fit into the batch to
+ make it easier for the kernel to accommodate the request.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0d3ba44e448c152a570cc469d289ab057fa7be5c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 14 12:47:46 2012 +0000
+
+ sna/gen2+: Experiment with not forcing migration to GPU after CPU rasterisation
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d867fd01cb0060342102a79600daf43e3dc44a07
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 14 13:10:54 2012 +0000
+
+ sna/gen3: Don't combine primitives if beginning a ca 2-pass
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3ca503dac2ea6c036e7ebe878b41923541daf225
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 14 12:49:14 2012 +0000
+
+ sna/gen3: Remove stray setting of vertex_start
+
+ It is always done at the beginning of vertex emission.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7f76f100e8033497620ee46548df45afff41064a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 23:12:42 2012 +0000
+
+ sna/gen2: Reorder reuse_source() to avoid NULL dereference for solids
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 044a54c23384756a5dc1895473abf34f7abb3d83
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 23:05:30 2012 +0000
+
+ sna/gen2: Initialise channel->is_affine for solid
+
+ In case we hit a path where we avoid reusing the source for the mask and
+ leave is_affine unset for a solid mask.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 29afd0dc8e893cc4110ee0d70546775dae86ddb3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 22:53:18 2012 +0000
+
+ sna/gen2: Assertions
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4d2abe1e3daac74747535d88dff34b024b87bbe9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 22:09:37 2012 +0000
+
+ sna/gen3: Remove incorrect optimisation of an opaque source for CA
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d428dbf7ad7c246acb1c301b73b9df4a1309de03
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 14:53:50 2012 +0000
+
+ sna/gen2: Program solid mask using the DIFFUSE component
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9e7311516da81ab45484b291ec668503c5ded0bb
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 14:40:25 2012 +0000
+
+ sna/gen2: Align surface sizes to an even tile
+
+ Makes this 855gm much happier.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e646047a563598948206167765eaaf4192cfd77f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 14:23:54 2012 +0000
+
+ sna: Fix up BLT overwrite detection to use target_handle
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4f96439e39a4bf4b127af9ccfdc09d061caff9bd
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 13:15:52 2012 +0000
+
+ sna: Fix typo for 830/845 gen
+
+ Must remember, it's octal not decimal.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f631a56bcb3ff1ce1942b828325a157cef1e0880
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 13 00:55:45 2012 +0000
+
+ sna: Only flush the batch after an actual relocation
+
+ As we may write preparatory instructions into the batch before checking
+ for a flush.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 74bbf20e6e652cba55d6d0bc17066f4112f8548c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 12 21:56:22 2012 +0000
+
+ sna: Improve the initialisation failure path for pinned batches
+
+ Simplify the later checks by always populating the lists with a single,
+ albeit unpinned, bo in the case we fail to create pinned batches.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 52c8c9218c8f28fb049b02214d833912a803f911
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 12 21:19:02 2012 +0000
+
+ sna: Fix the error path in kgem_init_pinned_batches() to use the right iter
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c7f7dd61fd07dbf938fc6ba711de07986d35ce1f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 12 19:43:19 2012 +0000
+
+ sna: Pin some batches to avoid CS incoherence on 830/845
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=26345
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b154d0dc404a152e1283a013a78be06b8d734867
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 12 18:34:54 2012 +0000
+
+ sna/gen2: STIPPLE requires an argument
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9001263b32efde1361555432914d9ac3ee780511
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 12 12:03:40 2012 +0000
+
+ sna/gen3+: Use nearest for unscaled videos
+
+ If the output is unscaled, then we do not require pixel interpolation
+ (and planar formats are exactly subsampled).
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=58185
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2dbe7d91a7f15a3a9ddad696c5088ca98898fca2
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 12 09:50:34 2012 +0000
+
+ sna/gen4: Use the single-threaded SF w/a for spans as well
+
+ Fixes the flickering seen in the fishtank demo, for example.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2502218a9340044bb660be68f059971119077e29
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 11 17:47:09 2012 +0000
+
+ sna/dri: Fix handling of current_msc > target_msc
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 39d6c65f437d7877c1647b7ecf45e76daabc76a6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 11 15:30:54 2012 +0000
+
+ sna/dri: Query current msc before use
+
+ Might be worth caching the last-known-value so we can skip the query for
+ an old swap request.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 162e9a2bfc693db186aa481551cf76b3dc5ee55c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 10 23:11:48 2012 +0000
+
+ sna/dri: Disable name exchanges for SwapBuffers
+
+ The DRI2 protocol is inherently racy. Fortuitously, this can be swept
+ under the carpet by forcing the serialisation between the DRI2 clients
+ by using a blit for the SwapBuffers.
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=58005
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0c2287c735f990a98b39d00f28168d7a5df25aba
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 10 20:09:22 2012 +0000
+
+ sna/dri: Only special case 'divisor && msc-passed' for immediate flipping
+
+ As Jesse pointed out, it is legal for the client to request that the
+ flip be some frame in the future even with no divisor.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2ab29a1688cd313768d928e87e145570f35b4a70
+Author: Jesse Barnes <jbarnes@virtuousgeek.org>
+Date: Mon Dec 10 14:55:32 2012 -0800
+
+ dri2: don't schedule a flip prematurely at ScheduleSwap time
+
+ If divisor is 0 but the current MSC is behind the target, we shouldn't
+ schedule a flip (which will occur at the next vblank) or we'll end up
+ displaying it early and returning the wrong timestamp.
+
+ Preserve the optimization though by allowing us to schedule a flip if
+ both the divisor is 0 and the current MSC is equal to or ahead of the
+ target; this avoids a round trip through the kernel.
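+
+ A minimal sketch of that decision (illustrative names, not the
+ actual driver code):
+
+    #include <stdint.h>
+    #include <stdbool.h>
+
+    /* Flip immediately only when no divisor is requested and the
+     * target MSC has already been reached; otherwise queue a vblank
+     * event so the flip completes on the requested frame. */
+    static bool can_flip_now(uint64_t divisor,
+                             uint64_t current_msc,
+                             uint64_t target_msc)
+    {
+            return divisor == 0 && current_msc >= target_msc;
+    }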
+
+ Reported-by: Mario Kleiner <mario.kleiner@tuebingen.mpg.de>
+ Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
+
+commit 986e13a56a8544d5b32dbcaacbc0ee9cf5d47e27
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 10 17:29:08 2012 +0000
+
+ sna: Try installing a fallback config on VT enter in case full desiredMode fails
+
+ This can happen naturally for a 3-pipe config on Ivybridge or if the
+ outputs are rearranged whilst we slept. Instead of failing to change the
+ display on the VT, install at least a fb on the CompatOutput so that
+ hopefully the DE can take over, or give some control to the user.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8c3b82f207bc8cf697646d3324cb4103da3b7856
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 10 16:20:52 2012 +0000
+
+ sna: Avoid reusing the same 'busy' bit for two different meanings.
+
+ Oops, I thought the 'busy' bit was now unused and apparently forgot it is
+ used to control the periodic flushing...
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d1b479a3404e6b52a23e0443c36d0682cbaf3c2f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 10 11:16:04 2012 +0000
+
+ sna: Compromise and only flush a split batch if writing to scanout
+
+ A compromise between not flushing quickly enough and flushing too often,
+ hopefully.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3e9120d73c6f0c0e06b617da91cc2edce4434bc3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 10 11:05:16 2012 +0000
+
+ sna: Immediately flush a split batch
+
+ If we submit a batch early (for example if the GPU is idle), then submit
+ whatever else the client drew immediately upon completion of its
+ blockhandler. This is required to prevent flashing due to visible delay
+ between the clear at the start of the cycle and then the overdraw later.
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=51718
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit aa8c5d8201006397bb32ed6bc28618f9aa77a68a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 6 22:08:08 2012 +0000
+
+ sna/sprite: Add a DBG to report whether the kernel supports sprites
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 25c0d440dee45c03f5e45b8e0e45071c0c32f507
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 9 12:11:53 2012 +0000
+
+ sna: Move source to CPU prior to referencing for inplace trapezoids
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56825
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3e82fcc8d243b7ffe1a3d3249a5cdb5fd068093f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 9 11:19:13 2012 +0000
+
+ sna/gen4+: Refine test for preferring GPU spans
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c8f622726a4463b419d032b379576cfb3bc492df
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Dec 9 09:26:03 2012 +0000
+
+ sna: Replace remaining kgem_is_idle() with kgem_ring_is_idle()
+
+ Further experimentation...
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4e4e10935d2815fb62aeaedbfffe10aad115ec0b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 8 22:39:32 2012 +0000
+
+ sna: Flush upon change of target if GPU is idle
+
+ The aim is to improve GPU concurrency by keeping it busy. The possible
+ complication is that we incur more overhead due to small batches.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit cef11795f627a393d4254845b0a19eefbf6c782c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 7 17:40:37 2012 +0000
+
+ sna: Convert the ring from BLT/3D to the internal index for kgem_ring_is_idle()
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 52405b2aed492dc7f76fbf082122842f621e7c06
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 7 17:24:42 2012 +0000
+
+ sna: Only inspect the target ring for busyness
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4b7bbb2a23b03bac63f864c33f47fab88dedbf67
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 7 16:43:32 2012 +0000
+
+ sna: Only flush before adding fresh surfaces to the batch
+
+ Previously, before every operation we would look to see if the GPU was
+ idle and we were running under a DRI compositor. If the GPU was idle, we
+ would flush the batch in the hope that we reduce the cost of the context
+ switch and copy from the compositor (by completing the work earlier).
+ However, we would complete the work far too early and as a result
+ would need to flush the batch before every single operation resulting in
+ extra overhead and reduced performance. For example, the gtkperf
+ circles benchmark under gnome-shell/compiz would be 2x slower on
+ Ivybridge.
+
+ Reported-by: Michael Larabel <michael@phoronix.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 65a8c23ca1bc8e2ebd087027a30358704d4bf11c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Dec 7 14:56:18 2012 +0000
+
+ sna: Only flush at the low aperture watermark if idle
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4bfc5e90f54be1b0997dec9e81796d67b376a01f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 8 23:42:10 2012 +0000
+
+ sna: Mark proxies as dirty on first relocation
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1d2fa5731b7ecfe34a8af809e45bcd3b0b70c890
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 6 22:15:09 2012 +0000
+
+ Remove the default log message
+
+ Breaks compilation with xorg-1.13
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7f4d4afa629bd18be89d7270e6178a865cf9586e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 6 17:08:15 2012 +0000
+
+ Fix compilation of UMS probe following 13f47008ec
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d733f7d1f1dc343ac34c4a27ac99d71fc0572bc2
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 6 16:55:00 2012 +0000
+
+ sna/gen4+: Add common glyph-to-dst emitters
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 13f47008ec411609968c40b8ec34dd495f14c50b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Dec 6 14:05:33 2012 +0000
+
+ Refactor the common probe methods for scrn construction
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0040eb84c9187476a75202ebb251dd74354e4fc7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 5 09:25:17 2012 +0000
+
+ sna: Don't disable CPU bo if supported on unknown hw
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 14069f48645867a735ebdccb1e27a62364643c38
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Dec 5 09:24:02 2012 +0000
+
+ sna: Assume that future hardware only gets more flexible
+
+ E.g. that BLT can always write to cacheable memory, inflexible fences
+ are a thing of the past, etc.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b82bfcb54a6af0d1ee17806ef94d9da504cad606
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 4 17:26:47 2012 +0000
+
+ sna/gen6+: Cache the scanout targets separately to avoid overriding PTE caching
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 626dd1324dd2c5b14ca4aff598b5eb1e45550e69
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 4 12:52:50 2012 +0000
+
+ sna/gen4: Workaround render corruption with multiple SF threads
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=57410
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit dbdb8fabfbade3f19fd8af3524468b5e6668bb66
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 4 12:37:23 2012 +0000
+
+ sna/gen4: Special case solids through the general vertex emitter
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a17354d5ce6aeeab3f6e42aba63fce06ad18c526
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 4 12:01:26 2012 +0000
+
+ sna/gen4: Remove unused CC viewport
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b9afb9cb614d2ad44330eed03b3f577a35184a88
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Dec 4 11:14:58 2012 +0000
+
+ sna/gen4: Avoid emitting URB_FENCE across a cache-line
+
+ Old erratum.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 08c30c48bdd0db500498bd9617c15f37bacd8de9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 3 13:23:33 2012 +0000
+
+ sna: Tidy addition of fake GTF modes for panels
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6f675eeaeade4728af566891b2afbe5b44fbdc2e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Dec 3 10:47:35 2012 +0000
+
+ 2.20.15 release
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 37eb7343be1aeeb90a860096756603a577df1a77
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 1 09:40:11 2012 +0000
+
+ sna/gen5: Inspired by gen4, reorder the flushing
+
+ This may not be totally safe, but it is a nicer explanation for random
+ single character corruption.
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=51422
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a8a99428c14c8aed2082853cc60d0f98a1fa2d86
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Dec 1 09:44:49 2012 +0000
+
+ sna/dri: Unknown generations are given the max value and presumed to use i965_dri.so
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 382bb7bf77fca412fdefd7c304f395d1fe9483b5
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 30 16:24:30 2012 +0000
+
+ sna/gen6+: Only apply the BLT w/a for self-copies
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1aee8acacfe5869a072d9f20f3b8290b16683260
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 30 12:17:25 2012 +0000
+
+ sna: Unify gen4 acceleration again
+
+ After disabling render-to-Y, 965g seems just as happy with the new code
+ paths as g4x.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5d6dd9c5a7eeb1f879525430ad89ab74d427e469
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 30 12:12:49 2012 +0000
+
+ Convert generation counter to octal
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 131600020638ef15166361214cd5e1a0c08c2ea6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 30 11:43:56 2012 +0000
+
+ sna: Prevent gen4 from rendering to I915_TILING_Y
+
+ It always seems to end up in a hang...
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ee4d1873ff504c2150b51d13864164b02b85dd53
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 30 09:52:49 2012 +0000
+
+ sna/g4x: And remove one more flush point
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1b6c1a30723b1d13e9bd3df0b59a8d75639c89be
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 30 09:27:57 2012 +0000
+
+ sna: Increase tiling alignment to an even tile
+
+ Seems to help g4x.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6aeb6fdf75fa322d8f5ffe393337d8195d7a6a62
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 30 09:03:53 2012 +0000
+
+ sna/g4x: Remove the flush before the pipelined pointer changes
+
+ This one appears unneeded. Hopefully.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8be2c6695509809c0ab0c5c014e11dc733f73006
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 29 19:56:15 2012 +0000
+
+ sna/g4x: Emit the flush prior to changing pipelined state
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2d5d55702bb8eced32d5b8cb3c0cd125fd99d6dc
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 28 15:46:29 2012 +0000
+
+ sna/gen6+: Override PTE cacheability bits for reads
+
+ This is primarily useful for enabling the render cache for reads from
+ scanouts.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 3ed87945c9e83fefcbda053b616856658bf2ac8e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 28 11:05:40 2012 +0000
+
+ sna/gen5+: Disable max-vbo address
+
+ As we do not use this feature, disable it and save the relocation.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b2c9e9da639a134577485f83e0f66f54e2371b98
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 27 11:32:11 2012 +0000
+
+ sna/blt: Avoid clobbering common state before converting to a BLT
+
+ In case we need to continue on with the render operation, we need to
+ preserve the existing state.
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=57601
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1e06d19a00f5a5a05369deeb3c5ae15b282c0f92
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 26 15:30:09 2012 +0000
+
+ sna: Disable shadow tracking upon regen
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=56608
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=52255
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d21ed3a6aba5ae227cc5ecd164f3c18bc48c69af
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 26 10:34:28 2012 +0000
+
+ sna: Use a single execobject flag to mark read/write domains
+
+ Slight modification to the proposed API to only pass the simplified
+ domain tracking now performed by the kernel.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7a904ce839933d57176e013cdad147533c33ca2f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 26 08:48:31 2012 +0000
+
+ 2.20.14 release
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1367e3f9ef5f606c8927cbde441a2ea41fa6d025
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Nov 25 00:24:45 2012 +0000
+
+ sna: Exploit the alpha-fixup of the BLT for texture format conversions
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 6d6399f97cf7cb91dcf89e9a5cd1243f761e4314
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Nov 25 00:05:44 2012 +0000
+
+ sna: Transform a simple repeat pattern into range of the BLT
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 39f1e228b74e98d3d87157cf093fc56ca31e6b13
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Nov 24 20:16:29 2012 +0000
+
+ sna: Make GPU idle more consistent during wakeup
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 40b6121746c55153de444ccb753df80706ff3a69
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 18:21:41 2012 +0000
+
+ sna/g4x: Refine the w/a for the broken sf shader
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e0b906b09697b5fe7a5be2fdc52abd9b1c73f96d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 18:01:25 2012 +0000
+
+ sna/g4x: Use the render pipeline more often for composite operations
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 54d8968da5ae39bfbcad511322926931bce2bda3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 17:47:49 2012 +0000
+
+ sna/gen4: Revert changes to 965g[m]
+
+ The changes tested on g45/gm45 prove to be highly unstable on 965gm,
+ suggesting a radical difference in the nature of the bugs between the
+ two generations. In theory, g4x has additional features that could be
+ exploited over and above gen4 which may prove interesting in the future.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d560296561f68c3ac841345c0f4ce2c8e7381156
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 16:35:06 2012 +0000
+
+ sna/gen4: Don't force a flush for the dirty target if we do not read back
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4023b2044757a9a67d564be0c8adf4885973a6e3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 15:42:18 2012 +0000
+
+ sna/gen4: Force composite(WHITE, glyph) for building the glyphstring mask
+
+ For reasons that are not apparent, if we don't composite with
+ source/mask for the glyph strings, there appears to be some cache
+ corruption. About as bizarre as the rest of gen4's idiosyncrasies.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f74b62755c6e41097c23cc506984859e556a3415
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 14:59:42 2012 +0000
+
+ sna/gen4: Set composite op before testing for a BLT compatible op
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4c922eb52cadb867a0a15929e5a214c84a5992f3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 14:19:59 2012 +0000
+
+ sna/gen4: Pass the mask channel explicitly rather than through a dummy mask
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2e68efa8ec66b4c89e9816bfa15067b398da5e3e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 14:04:51 2012 +0000
+
+ sna/gen4: Reduce the flush before performing the CA pass
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 43aff6708fe97aa2fae0e30f98fc7cd9d7311b75
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 13:37:44 2012 +0000
+
+ sna/gen4: Update render fill routines to use the dummy mask
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 352910712266202ef017066891ec383fd037fc4a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 13:29:01 2012 +0000
+
+ sna/gen4: Move the flush from inside the spans to emit_state()
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 217f3e835b99002669999f818afa0d5c3a1cc852
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 12:32:14 2012 +0000
+
+ sna/gen4: Backport the more efficient composite box emitters
+
+ Now that we aren't flushing after every single rectangle, we can strive
+ for a little efficiency.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d3145e3f8146e7d864d32aec49c44c04d619e56a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 23 12:28:21 2012 +0000
+
+ sna/gen4: Use a dummy white mask to avoid the flush w/a when compositing
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 65d530b59b957cc5f303ae819baad8075a555ac0
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 22 08:18:38 2012 +0000
+
+ Revert "uxa: Refactor early-exit paths of uxa_try_driver_composite()."
+
+ This reverts commit fa5c573455cf63090dbb6d167d4e5f1cb23daf72 as it
+ causes a SIGSEGV.
+
+ Reported-by: lu hua <huax.lu@intel.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=57389
+
+commit d3a49f36395d737698616fe8ba9da7b74cd2d89a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 18:41:00 2012 +0000
+
+ sna/gen3+: Clear the render.vbo when replacing it for vertex upload
+
+ As we may trigger a flush and a retire when searching for a vertex
+ buffer for the new vertices, we need to be careful to decouple the
+ destroyed vbo in order to avoid a use-after-free when inspecting the
+ state.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1c57a52a1f46e8401429955d8c96fd5095c9012a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 18:17:31 2012 +0000
+
+ sna: Assert that we do not replace a GPU bo with a proxy
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8da12a00ee653510e1f1a6fecb28dbb36faa8400
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 17:28:46 2012 +0000
+
+ sna: Skip uploading uninitialised pixmap data
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0696ea4bd601ed823dbded03eaef6f316df2a5e8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 17:25:33 2012 +0000
+
+ sna: Add the missing assertion for !proxy
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ee72375ecd4f6d6e756bc361fa512b6675309540
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 17:08:53 2012 +0000
+
+ sna: Do not migrate uninitialised pixmaps
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 44dad490140d85a4c0dcb916030c36a838670c01
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 16:22:35 2012 +0000
+
+ sna: Do not dispose of a shadow pixmap
+
+ Fixes regression from 2249e9edc37811c07e2807d6b4def05585b44c22
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9c627a05247690891062a2c0c1c8f7bbc0273104
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 16:03:02 2012 +0000
+
+ sna: Remove the kgem_bo_is_mappable refcnt assertion from freed paths
+
+ A few callers of kgem_bo_is_mappable operate on freed bo, and so need to
+ avoid the assert(bo->refcnt).
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c0c48c7a5aca4d24936efbeaefc7674ada2ef87f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 14:46:45 2012 +0000
+
+ sna: Add a few refcnt assertions
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0884777c33d20dbc329b98ad0db5ffb0df93ac8c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 14:31:44 2012 +0000
+
+ sna: Fix bogus assertion from 03fb9ded43
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2249e9edc37811c07e2807d6b4def05585b44c22
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 13:05:36 2012 +0000
+
+ sna: Dispose of local copy for render sources
+
+ If we transfer the pixmap to the GPU to use as a render source, presume
+ that we will not need to then touch the local copy (at least for a
+ while) and so return that memory to the system.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 736bb0f7058bf05ef48cdfe6a30d880de817aff9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 21 12:16:46 2012 +0000
+
+ sna: Tighten a couple of assertions for damage with use_bo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 03fb9ded43f9bf8b73f99d5b3a8dc592fe22b523
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 20 21:13:07 2012 +0000
+
+ sna: Assert that we do not create a proxy with existent GPU damage
+
+ References: http://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9ab1d1f94e502e5fde87e7c171f3502f8a55f22b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 20 18:42:58 2012 +0000
+
+ sna/dri: Queue a vblank-continuation after flip-completion
+
+ If a vblank request was delayed due to a pending flip, we need to make
+ sure that we then queue it after that flip or else progress ceases.
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56423
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=57156
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7a7a76b359f73a4c4bcda0d88004f4dd5e94a186
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 20 16:05:32 2012 +0000
+
+ sna/dri: Avoid a NULL dereference inside a DBG
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit fa5c573455cf63090dbb6d167d4e5f1cb23daf72
+Author: Eric Anholt <eric@anholt.net>
+Date: Sat Nov 17 13:11:13 2012 -0800
+
+ uxa: Refactor early-exit paths of uxa_try_driver_composite().
+
+ Saves 200b of code at -O2.
+
+commit edefb64d2b1c95b0b678cb222273ab64b2e6db2a
+Author: Eric Anholt <eric@anholt.net>
+Date: Sat Nov 17 13:11:12 2012 -0800
+
+ uxa: Work around uninitialized-value warning.
+
+ The compiler isn't noticing that localDst only diverges from pDst when
+ the _copy variables have also been set.
+
+commit 18b2e2a82724407196001ca853bd83150c66c5bd
+Author: Eric Anholt <eric@anholt.net>
+Date: Sat Nov 17 13:11:10 2012 -0800
+
+ uxa: Add printf attribute to intel_debug_fallback().
+
+ Shuts up a bunch of warnings with xorg's shared warning flags, and
+ should give us more informative warnings in our code.
+
+commit 2d1e267e662505ca0dd318765464a24739dc5bd8
+Author: Eric Anholt <eric@anholt.net>
+Date: Sat Nov 17 13:11:09 2012 -0800
+
+ uxa/dri: Factor out the repeated swap fallback code.
+
+commit cd2f373da7a14e004c999f9f0efaf88c785d3d3f
+Author: Eric Anholt <eric@anholt.net>
+Date: Sat Nov 17 13:11:08 2012 -0800
+
+ configure.ac: Fix bad syntax for test calls
+
+commit b8c01d9bd7ce5656706ebebd16e5a8c5ca0ba487
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 19 15:20:10 2012 +0000
+
+ sna/dri: Add an assert that the cached DRI2 buffer is pinned for DRI
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 84b1a02fa9fde02366e0384044526982e70d0e8d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 19 13:41:28 2012 +0000
+
+ sna/dri: Avoid setting off-delay after a failed flip (use-after-free)
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b4dd1103a55406bcd65f137c668701074a5c41b6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Nov 18 12:21:49 2012 +0000
+
+ sna/gen6+: Tweak prefer-blt to offset latency when in rc6
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0cb8544dc16d4c1e9ae7f1ee74ee26c7501a9ed7
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Nov 18 12:13:46 2012 +0000
+
+ Remove useless indirection of intel_chipsets
+
+ Once upon a time this was used to hide a compiler warning about a
+ pointer mismatch; now the compiler still warns about the cast, making
+ the indirection moot.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8509e474f57533fc6afcf213165f4c8633631fb5
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 16 23:02:44 2012 +0000
+
+ sna/dri: Clear the last-used-cpu flag when performing CopyRegion
+
+ Keeps the internal bookkeeping intact after the small bypass.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 866ed4a26cbbb29ef3845b0aa56383c4d951c65a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 15 15:13:14 2012 +0000
+
+ sna/dri: Add a couple of more asserts to track injection of a rogue bo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d2897cb0136ffec83365c7530ed544b562cac478
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 13 12:43:44 2012 +0000
+
+ sna/gen4,5: Fix the opacity shader to use the right vertex attribute
+
+ Reported-by: Edward Sheldrake <ejsheldrake@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=57054
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 440eaa049756e5266f3bd80e2751f1fd0d5f9890
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 13 12:42:58 2012 +0000
+
+ sna/gen4: Tidy emission of opacity vertex attribute
+
+ Just make it more consistent between the various emitters
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b6d2bb961517623d46aa6944307cb998ee125459
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 13 10:05:56 2012 +0000
+
+ sna/gen4: Do not prefer inplace non-rectilinear spans
+
+ As gen4 requires the per-rectangle vertex flush, emitting spans on the
+ GPU is inefficient and so we prefer to composite the mask instead.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ae293609c7400cd3c753ed3762772264c4741df5
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 13 10:21:29 2012 +0000
+
+ sna/gen4: Always initialise redirect
+
+ Do not assume the caller cleared the composite-op structure for us.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2954f15e2bcb590a90c2cb6077c0843ee25a4413
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 13 09:46:19 2012 +0000
+
+ sna: Specialise the decision for inplace xor uploads
+
+ Fixes a regression from
+
+ commit 0be1d964713ca407f029278a8256d02d925dc9da
+ Author: Chris Wilson <chris@chris-wilson.co.uk>
+ Date: Tue Sep 11 21:48:24 2012 +0100
+
+ sna: Use inplace X tiling for LLC uploads
+
+ which introduced the ability to swizzle into CPU maps, but also
+ convinced the xorg path to do the same - which for large images blows up.
+
+ Reported-by: Michael Laß <bevan@bi-co.net>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=57031
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 66eb0adffa63ef8ece7621ba90dc96af91549612
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 12 14:45:56 2012 +0000
+
+ sna/dri: Apply the can_blit() check for CopyRegion
+
+ CopyRegion() also needs to check for stale bo in case the pixmap
+ dimensions have changed and so may cause out-of-bounds read/writes.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b27ecf3059bc066ef59f2a71c1d8d8f0ffec7191
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 12 14:06:06 2012 +0000
+
+ sna/dri: Prevent scheduling a swap on stale buffers
+
+ If the screen has been reconfigured and the DRI client tries to swap the
+ old buffers (having not processed the invalidate event and retrieved the
+ current names), quietly drop the request. If we try to obey the request,
+ we will end up attaching a back buffer that is the wrong size to the
+ scanout...
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 34aa1e3b2702a55799a5655a3ba10bce4cc2065a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 12 11:22:53 2012 +0000
+
+ sna: Compile against ancient libdrm
+
+ We need to trim the sprite video overlays if the prototypes are not
+ known.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8f1afde57dca27e6542b0b8e7c87750f3d6367bf
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Nov 11 16:16:20 2012 +0000
+
+ 2.20.13 release
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b16219a19f48b52dda91f26fcbbbbeda056589ab
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Nov 11 11:05:35 2012 +0000
+
+ sna: Filter out the full-damage marker when undoing redirection
+
+ ==25902== Invalid read of size 4
+ ==25902== at 0x4980E13: _list_del (intel_list.h:218)
+ ==25902== by 0x4980EAB: list_del (intel_list.h:240)
+ ==25902== by 0x4981F4B: free_list (sna_damage.c:403)
+ ==25902== by 0x4985131: __sna_damage_destroy (sna_damage.c:1467)
+ ==25902== by 0x49A5276: sna_render_composite_redirect_done (sna_render.c:1921)
+ ==25902== by 0x49C68FC: gen2_render_composite_done (gen2_render.c:1136)
+ ==25902== by 0x497F90F: sna_composite (sna_composite.c:567)
+ ==25902== by 0x4994725: glyphs_via_mask (sna_glyphs.c:1139)
+ ==25902== by 0x4995FB7: sna_glyphs (sna_glyphs.c:1688)
+ ==25902== by 0x8150EB4: ??? (in /usr/bin/Xorg)
+ ==25902== by 0x813CA38: CompositeGlyphs (in /usr/bin/Xorg)
+ ==25902== by 0x8146DE1: ??? (in /usr/bin/Xorg)
+ ==25902== Address 0x7c079ac2 is not stack'd, malloc'd or (recently) free'd
+
+ Reported-by: bonbons67@internet.lu
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56785
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 69acbb77e8aad3370d5e8d9a9e067c54872d7082
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Nov 11 10:49:59 2012 +0000
+
+ sna: Fix printing of uninitialised value in DBG
+
+ ==25902== Use of uninitialised value of size 4
+ ==25902== at 0x423098E: _itoa_word (_itoa.c:196)
+ ==25902== by 0x4233F7F: vfprintf (vfprintf.c:1602)
+ ==25902== by 0x42FAFAD: __vsnprintf_chk (vsnprintf_chk.c:65)
+ ==25902== by 0x81DBE8E: Xvscnprintf (in /usr/bin/Xorg)
+ ==25902== by 0x81DC8FB: LogVMessageVerb (in /usr/bin/Xorg)
+ ==25902== by 0x81DCA62: LogVWrite (in /usr/bin/Xorg)
+ ==25902== by 0x81DCA9B: VErrorF (in /usr/bin/Xorg)
+ ==25902== by 0x81DC333: ErrorF (in /usr/bin/Xorg)
+ ==25902== by 0x49B2FA8: trapezoid_span_inplace__x8r8g8b8 (sna_trapezoids.c:5069)
+ ==25902== by 0x49B3407: trapezoid_span_inplace (sna_trapezoids.c:5166)
+ ==25902== by 0x49B4C96: sna_composite_trapezoids (sna_trapezoids.c:5619)
+
+ Reported-by: bonbons67@internet.lu
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56785
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 66e4c8ff40ab8cf722efa4293bb17b0d8f2dfa88
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Nov 11 09:40:09 2012 +0000
+
+ sna: Flush pending rendering before enabling an output
+
+ This is to prevent falling in the trap of the rendering being delayed
+ until the next client renders some new content.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 94dd0b9ee9f55e7c09b8c0ee18939fa69ce66da2
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Nov 10 16:52:09 2012 +0000
+
+ sna/gen2: Fix use of uninitialised redirection
+
+ ==29553== Invalid read of size 4
+ ==29553== at 0x4980E1B: _list_del (intel_list.h:218)
+ ==29553== by 0x4980EB3: list_del (intel_list.h:240)
+ ==29553== by 0x4981F53: free_list (sna_damage.c:403)
+ ==29553== by 0x4985139: __sna_damage_destroy (sna_damage.c:1467)
+ ==29553== by 0x49A527E: sna_render_composite_redirect_done (sna_render.c:1921)
+ ==29553== by 0x49C6904: gen2_render_composite_done (gen2_render.c:1136)
+ ==29553== by 0x497F917: sna_composite (sna_composite.c:567)
+ ==29553== by 0x8150C41: ??? (in /usr/bin/Xorg)
+ ==29553== by 0x8142F13: CompositePicture (in /usr/bin/Xorg)
+ ==29553== by 0x8145F58: ??? (in /usr/bin/Xorg)
+ ==29553== by 0x81436F2: ??? (in /usr/bin/Xorg)
+ ==29553== by 0x807965C: ??? (in /usr/bin/Xorg)
+ ==29553== Address 0x9407e188 is not stack'd, malloc'd or (recently) free'd
+
+ Reported-by: bonbons67@internet.lu
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56785
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0f1c30818c9d782b066147448bbcc9ac95ac834f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Nov 10 16:52:09 2012 +0000
+
+ sna: Fix use of uninitialised value in DBG
+
+ ==29553== Use of uninitialised value of size 4
+ ==29553== at 0x4230964: _itoa_word (_itoa.c:195)
+ ==29553== by 0x4233F7F: vfprintf (vfprintf.c:1602)
+ ==29553== by 0x42FAFAD: __vsnprintf_chk (vsnprintf_chk.c:65)
+ ==29553== by 0x81DBE8E: Xvscnprintf (in /usr/bin/Xorg)
+ ==29553== by 0x81DC8FB: LogVMessageVerb (in /usr/bin/Xorg)
+ ==29553== by 0x81DCA62: LogVWrite (in /usr/bin/Xorg)
+ ==29553== by 0x81DCA9B: VErrorF (in /usr/bin/Xorg)
+ ==29553== by 0x81DC333: ErrorF (in /usr/bin/Xorg)
+ ==29553== by 0x49434F0: kgem_create_buffer (kgem.c:4887)
+ ==29553== by 0x4943B09: kgem_create_buffer_2d (kgem.c:4969)
+ ==29553== by 0x4943E19: kgem_upload_source_image (kgem.c:5021)
+ ==29553== by 0x49A0567: upload (sna_render.c:505)
+ ==29553==
+
+ Reported-by: bonbons67@internet.lu
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=56785
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit cc2b13c9c05e57dc5004d93b56f332ea95f0a4ef
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Nov 10 11:50:15 2012 +0000
+
+ sna: Specify read/write domains for no-relocation fastpath
+
+ On review (read triggering BUGs), we do need to supply the domain tracking
+ of the buffers that are being replaced from the relocation path.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 0c4a2bcc3d63ecc02e3a940e38e9a416b51ad0c8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Nov 10 12:34:52 2012 +0000
+
+ sna: Allow snooped buffers to be retained (and reused) between batches
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f5d79b202dd448e61ab6ffce26fe9cbf9051d770
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Nov 10 10:30:04 2012 +0000
+
+ sna/gen2: Add a modicum of fallback DBG
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=56785
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 27327633138dce159ca2e91fe5eac1565bd45e1c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 9 17:08:01 2012 +0000
+
+ sna/gen4: Only 965gm suffers the !snoop restriction
+
+ So fix up the bogus assertion for g4x
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8d3b5ea135fd8f16da2cbfb98041e32c7001a38f
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 9 15:31:03 2012 +0000
+
+ xvmc: Use DRMINTEL_LIBS instead of hardcoding -ldrm_intel
+
+ Reported-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f040b97b01495aa43f7771ebb8ca5c0d44038bc1
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 8 23:42:10 2012 +0000
+
+ sna: Mark no-reloc write buffers
+
+ If we bypass the relocation processing, we also then bypass the
+ pending-write analysis, so we need to supply those to the kernel
+ ourselves (to maintain gpu-cpu coherency).
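+
+ A rough sketch of the idea, assuming the proposed execbuffer2
+ interface (EXEC_OBJECT_WRITE from i915_drm.h; the helper itself is
+ illustrative):
+
+    #include <stdint.h>
+    #include <i915_drm.h>
+
+    /* With relocations bypassed, the kernel no longer sees which
+     * buffers are written, so flag them explicitly to keep gpu-cpu
+     * domain tracking coherent. */
+    static void mark_no_reloc(struct drm_i915_gem_exec_object2 *obj,
+                              uint64_t presumed_offset, int is_write)
+    {
+            obj->offset = presumed_offset;
+            if (is_write)
+                    obj->flags |= EXEC_OBJECT_WRITE;
+    }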
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 85ba7e96268dbb8da4bb34078333695a451c6570
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 8 15:56:13 2012 +0000
+
+ sna: Experiment with using reloc.handle as an index into the execbuffer
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 93d8dddbb92431d6e2c48a17b71cac9f7047902e
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 8 09:41:21 2012 +0000
+
+ sna: Set the known offset for the batch as well
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 120fa0ef8d04f5e82e5f7a0636033d3d96efa1e8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Nov 7 17:41:20 2012 +0000
+
+ sna: Support a fast no relocation changed path
+
+ x11perf -copywinwin10 on gm45 with c2d L9400:
+ before: 553,000 op/s
+ after: 565,000 op/s
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b7d2fcf47a9569d0944097a8be60ca3be72b42f6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 8 08:55:25 2012 +0000
+
+ Remove reliance on hard-coded DRI name
+
+ This provides for using the existing DDX with future DRI drivers which
+ may break from the traditional names - but only with the help of the
+ user/packager. This scheme needs to be replaced with a robust mechanism
+ for driver loading if AIGLX and co are to be kept.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit cefce9c81585b73db7620e08fcf60c89a8204873
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 6 17:26:34 2012 +0000
+
+ sna: Abandon kernels that do not support execbuffer2
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b2245838c15b54d72557de8facb7cc15d59624ae
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 6 16:32:32 2012 +0000
+
+ sna/gen4: opacity spans require the per-rectangle flush w/a
+
+ Note that this is worsened, but not caused, by:
+
+ commit e1a63de8991a6586b83c06bcb3369208871cf43d
+ Author: Chris Wilson <chris@chris-wilson.co.uk>
+ Date: Fri Nov 2 09:10:32 2012 +0000
+
+ sna/gen4+: Prefer GPU spans if the destination is active
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=55500
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit a0540ebff083974688c863e08203e3d71a297340
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 6 16:00:42 2012 +0000
+
+ sna/gen4: Remove a couple of old, now redundant, w/a flushes
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit aaaa6c356456a4bab595c6e6485893c538064e37
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Nov 6 14:34:51 2012 +0000
+
+ sna/gen4: Flush after pipelined pointer updates (inverted logic!)
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 28bda6707d979bca29dbea04e932819de204d920
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Nov 5 22:30:29 2012 +0000
+
+ sna: Prevent use-after-free during partial move-to-gpu
+
+ As we reuse the input parameter 'box' to hold the array of boxes that
+ need to be migrated, we need to be careful that we do not later confuse
+ it with the original input parameter. Otherwise,
+
+ ==1315== Invalid read of size 2
+ ==1315== at 0x928B091: box_inplace (sna.h:506)
+ ==1315== by 0x9292278: sna_pixmap_move_area_to_gpu (sna_accel.c:2554)
+ ==1315== by 0x9292C14: sna_drawable_use_bo (sna_accel.c:2774)
+ ==1315== by 0x9356C01: gen7_composite_set_target (gen7_render.c:2448)
+ ==1315== by 0x9357AA2: gen7_render_composite (gen7_render.c:2800)
+ ==1315== by 0x92DB12E: glyphs_to_dst (sna_glyphs.c:552)
+ ==1315== by 0x92DEA8D: sna_glyphs (sna_glyphs.c:1664)
+ ==1315== by 0x4F920E: damageGlyphs (in /tmp/Xorg)
+ ==1315== by 0x4F2FF6: ProcRenderCompositeGlyphs (in /tmp/Xorg)
+ ==1315== by 0x437260: Dispatch (in /tmp/Xorg)
+ ==1315== by 0x426466: main (in /tmp/Xorg)
+ ==1315== Address 0xd637054 is 20 bytes inside a block of size 208,464 free'd
+ ==1315== at 0x4C2A2FC: free (in /usr/lib64/valgrind/vgpreload_memcheck-amd64-linux.so)
+ ==1315== by 0x92CCFCD: __sna_damage_destroy (sna_damage.c:1469)
+ ==1315== by 0x928AD74: sna_damage_destroy (sna_damage.h:284)
+ ==1315== by 0x9291CB2: sna_pixmap_move_area_to_gpu (sna_accel.c:2470)
+ ==1315== by 0x9292C14: sna_drawable_use_bo (sna_accel.c:2774)
+ ==1315== by 0x9356C01: gen7_composite_set_target (gen7_render.c:2448)
+ ==1315== by 0x9357AA2: gen7_render_composite (gen7_render.c:2800)
+ ==1315== by 0x92DB12E: glyphs_to_dst (sna_glyphs.c:552)
+ ==1315== by 0x92DEA8D: sna_glyphs (sna_glyphs.c:1664)
+ ==1315== by 0x4F920E: damageGlyphs (in /tmp/Xorg)
+ ==1315== by 0x4F2FF6: ProcRenderCompositeGlyphs (in /tmp/Xorg)
+ ==1315== by 0x437260: Dispatch (in /tmp/Xorg)
+
+ Reported-by: Matti Ruohonen <kiesus@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56591
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e62b0cbf958d1ad95e4522973253a1ae5c1a4da9
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 2 15:54:20 2012 +0000
+
+ sna: Add a small delay during startup if another master is still active
+
+ There exists a race with plymouthd that can cause the drm device to
+ reject us as the rightful master, and so cause X to fail to load. Try
+ waiting for a couple of seconds for whatever it was to close before
+ giving in.
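+
+ A sketch of such a wait (drmSetMaster is the libdrm call; the retry
+ helper and its timings are illustrative):
+
+    #include <unistd.h>
+    #include <xf86drm.h>
+
+    /* Retry becoming DRM master for ~2 seconds to ride out a racing
+     * master (e.g. plymouthd) that is still shutting down. */
+    static int wait_for_master(int fd)
+    {
+            int retries = 20;
+            while (drmSetMaster(fd) && --retries)
+                    usleep(100 * 1000);
+            return retries != 0;
+    }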
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit e1a63de8991a6586b83c06bcb3369208871cf43d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Nov 2 09:10:32 2012 +0000
+
+ sna/gen4+: Prefer GPU spans if the destination is active
+
+ Trying to avoid using the inplace scanline rasteriser for large shapes.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 72bcd8f85c98502b13a67d9c606371afe513584c
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Nov 1 09:30:18 2012 +0000
+
+ sna: Try to reduce ping-pong migration for intermixed render/legacy code paths
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=56591
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4e363906a5ef15e1eb0a387cfb6b3445ac185b9d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 31 22:58:59 2012 +0000
+
+ sna: Set a valid box when checking for GPU bo for BLT composite ops
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit b924831e445615b82a53b10e1849720e933eddfe
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 31 22:55:56 2012 +0000
+
+ sna: Preserve mode if flushing before a scanline wait
+
+ Reported-by: Jiri Slaby <jirislaby@gmail.com>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=47597
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 678f9586807071bef813bb69d451f14d2fcbcc04
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 31 11:26:18 2012 +0000
+
+ sna: assert that the source is not the GTT mapping when uploading
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 783b8048a6d1a9fd0a73ebf7768ae17dc0b21900
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 31 11:16:09 2012 +0000
+
+ sna: Prefer to use the GPU for uploads if continuing on the GPU
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 74c912880c302889f38fe5898c8038a0ba20e5db
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 31 10:57:54 2012 +0000
+
+ sna: Fix a typo in a DBG
+
+ Reported-by: Matti Ruohonen <kiesus@gmail.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f48a821aa73cb40a51baafc6cd2b063f1f91d864
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 31 10:53:47 2012 +0000
+
+ sna: Add a few DBG tracepoints to screen init/fini
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit bf81d552c4be039fbcf3272387828b1a8b3fbdb8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 31 08:50:44 2012 +0000
+
+ sna: Clamp the drawable box to prevent int16 overflow
+
+ And assert that the box is valid when migrating.
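+
+ The clamp itself is simple; a sketch (BoxRec coordinates are int16_t,
+ the helper name is illustrative):
+
+    #include <stdint.h>
+
+    /* Clamp a computed extent into the int16 range of BoxRec
+     * coordinates before using it for migration. */
+    static int16_t clamp_int16(int v)
+    {
+            if (v < INT16_MIN) return INT16_MIN;
+            if (v > INT16_MAX) return INT16_MAX;
+            return (int16_t)v;
+    }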
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=56591
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 31eb704b2ad7c861ec4e61fb9de0e9592fc6d269
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Oct 26 13:57:30 2012 +0100
+
+ sna: Ensure that the trap is clipped if it ends within the boundary pixel
+
+ Reported-and-tested-by: Jiri Slaby <jirislaby@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56395
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ef431b2d35c1bf4d77bbcc73688951d22f6aa135
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Thu Oct 25 10:15:39 2012 +0100
+
+ uxa: Drain the DRM event queue before server regeneration
+
+ Adam Jackson notes that what appeared to be my paranoid ramblings in SNA
+ actually served a purpose - it prevents a server crash following
+ server regen if an indirect client happened to be running at the time
+ (e.g. LIBGL_INDIRECT_ALWAYS=1 glxgears).
+
+ Reported-by: Adam Jackson <ajax@redhat.com>
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit efb8ff16491ecfb4d9c0c6a718684310d949d8d3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 24 22:56:20 2012 +0100
+
+ sna: Add missing ValidatePicture() for flattening alphamaps
+
+ Reported-by: Armands Liepins <armandsl@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56367
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 1a489142c8e6a4828348cc9afbd0f430d3b1e2d8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Oct 23 23:43:50 2012 +0100
+
+ sna: Disable RandR hotplug events if Xinerama is enabled
+
+ Since RandR itself is disabled if Xinerama is enabled, for example with
+ ZaphodHeads, calling RRGetInfo() upon a hotplug event generates an
+ assertion.
+
+ Reported-by: Stephen Liang <inteldriver@angrywalls.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=55260
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit d87c2756db1af6e4af15864ab0f44d1454079236
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Oct 23 15:50:56 2012 +0100
+
+ sna: Beware 16-bit overflow when computing sample areas
+
+ Reported-by: Ognian Tenchev <drJeckyll@Jeckyll.net>
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=56324
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c7f48684cdebc24128a5fa5678614af3deb14b3b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Oct 23 15:17:56 2012 +0100
+
+ sna: Only disallow hw sprite scaling on Haswell
+
+ Earlier chips (Ironlake, Sandybridge and Ivybridge) have integrated
+ sprite scalers.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 5c3ea9cf6900855502fcd56214a1b9e180265ff5
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Oct 22 22:35:17 2012 +0100
+
+ sna: Update DRI buffer if attached to the framebuffer for TearFree flips
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4dfc83359d006a4e410e3280003b49683309afc3
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Oct 22 14:56:01 2012 +0100
+
+ sna: Tidy udev install/remove and add a couple of lines of DBG
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=55260
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 4d9687d49c9869b2e88d408e5f451c9a1f8f3389
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Mon Oct 22 13:41:54 2012 +0100
+
+ sna: Refactor the common code to enable a timer
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit fb729788872ccb429ddde8a9a4281b1933243096
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Oct 21 14:36:48 2012 +0100
+
+ sna: Only query the system time if we are processing timers
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit c0d6a75f02eb97e5c80a4345ae5c68e9a81d49b6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Oct 21 14:32:14 2012 +0100
+
+ sna: Use the FLUSH_TIMER as the only wakeup timer source
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7bc829c39a203c17053eb728412f698a429ad9fe
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Oct 21 14:24:01 2012 +0100
+
+ sna: Remove the unused inactive eviction
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 9fa6e4aa2daee99ff5f6efc11232de22100bac80
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Oct 21 12:48:06 2012 +0100
+
+ intel: Sanity check that the platform probe points to an i915.ko GEM device
+
+ References: https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1069031
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f6eed98fcfea54d50a282ac71ee654645551ae11
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sun Oct 21 10:46:14 2012 +0100
+
+ sna: Mark the to-be-damaged region first, then process afterwards
+
+ Damage is processed in two phases, with the actual Damage being appended
+ before the operation is performed so that a copy can be made before
+ modification (e.g. software cursors).
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 60e4e009f90543bfd57f6a4e51ebc5b32b4af33b
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Oct 20 17:59:45 2012 +0100
+
+ sna: Move the source region to the CPU for a self-copy fallback CopyArea
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7ff8b1ba543508f3b209f2ade7655c3aa34d546d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Oct 20 16:23:26 2012 +0100
+
+ 2.20.12 release
+
+ How embarrassing! My fault for rushing :(
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 695b2ce2d32bde191080129b55f9bf8a9d48ee77
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Oct 20 16:19:21 2012 +0100
+
+ uxa: Fixup drm_intel_bo_disable_reuse() typo
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2083e253b3d1ecc218ab1e523e4026ddd4561112
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Oct 20 16:07:11 2012 +0100
+
+ 2.20.11 release
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit fc0ba65f5efe217f2ab5e920255d2133d7c9e5e8
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Sat Oct 20 09:29:10 2012 +0100
+
+ uxa: Disable bo reuse after binding to a scanout
+
+ On gen6+, bo are expected to be LLC by default. However, as using the bo
+ for the scanout causes it to be moved into the uncached domain, this
+ assumption is then false and we should release the bo back to the system
+ rather than spread the uncached buffers around. The most common
+ allocator of scanouts is for pageflipping which are already non-reusable
+ due to the DRI2 export, so there should actually be little impact.
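+
+ In libdrm_intel terms the fix amounts to something like (sketch; the
+ surrounding pageflip code is elided):
+
+    #include <intel_bufmgr.h>
+
+    /* Once bound to the scanout the bo has left the LLC domain, so
+     * do not return it to the reusable bo cache. */
+    static void scanout_bound(drm_intel_bo *bo)
+    {
+            drm_intel_bo_disable_reuse(bo);
+    }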
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit f4c32af48b0c92a48131090886a6a6b6c45dbe34
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Oct 19 16:29:19 2012 +0100
+
+ sna: Clear the damage along with the BO when forcing the stall for inplace BLT
+
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56180
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 299232bdb69c8c2b6231905e0f45e9cfe74fe09a
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Fri Oct 19 15:02:00 2012 +0100
+
+ sna: Reorder final checks for using the BO and setting the damage pointer
+
+ When we return NULL from sna_drawable_use_bo(), the expectation is that
+ the damage pointer is also NULL. However, one SHM path leaked.
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=56180
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 257abfdabe39629fb458ed65fab11283f7518dc4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 17 23:34:22 2012 +0100
+
+ sna/gen4: Presume we need a flush upon state change similar to gen5+
+
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=55627
+ References: https://bugs.freedesktop.org/show_bug.cgi?id=55500
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 8238c672984e31ae655353d6412e3395a9cdfbe6
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 17 22:16:29 2012 +0100
+
+ sna: secure batches accepted upstream, so simply use runtime detection
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 540666a0a81c7daedbd47830d0932df5e57ec903
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 17 16:59:05 2012 +0100
+
+ sna/overlay: Move bo out of GTT domain after binding to overlay plane
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 891bae4aa91e85542dcbe38f6ee92141e3efc801
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 17 11:29:10 2012 +0100
+
+ sna: Use the secure batches to program scanline waits on gen6+
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 41be80a8cae1eb0e294392e5033511bfdf2895c5
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 17 11:25:52 2012 +0100
+
+ sna: Enable support for SECURE batch buffers
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit ba6c82cd9d8089354b90632ca8edbb35cc09b9c4
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Wed Oct 17 13:54:51 2012 +0100
+
+ sna/dri: Defensively check for GTT mmap failure during fallback
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 7927f9a351ead1a5593bc91e465706bdd889bb8d
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Oct 16 17:56:30 2012 +0100
+
+ sna/gen7: Filter BLEND flags for CA glyphs
+
+ Fixes regression from commit c51aaa731e5cffc892e59730194ad7c98789b02b
+ Author: Chris Wilson <chris@chris-wilson.co.uk>
+ Date: Thu Oct 11 11:36:00 2012 +0100
+
+ sna/gen7: Replace bogus state tracking assertion
+
+ The assumption that we only used the encoded flags for determining
+ the composite state is false for the magic CA pass.
+
+ Reported-by: Oleksij Rempel <bug-track@fisher-privat.net>
+ Reported-by: Eyal Lotem <eyal.lotem@gmail.com>
+ Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=56037
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
+commit 2ad4aa195571fe214ccffa55e123507f1be66243
+Author: Chris Wilson <chris@chris-wilson.co.uk>
+Date: Tue Oct 16 11:59:28 2012 +0100
+
+ sna: Drop fake tiled CPU mapping
+
+ The only path where this is correct already handles it as the special
+ case that it is; everywhere else it is just nonsense.
+
+ Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
+
commit b42d81b63f5b6a571faffaadd42c74adce40128a
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date: Sun Oct 14 09:15:38 2012 +0100
diff --git a/Makefile.am b/Makefile.am
index b3d37b273..5001674ee 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -18,6 +18,7 @@
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
SUBDIRS = man
diff --git a/Makefile.in b/Makefile.in
index 6f2e8f426..edf7e301a 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -225,7 +225,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -264,6 +263,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -322,6 +323,7 @@ target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
+ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
SUBDIRS = man $(am__append_1) src $(am__append_2)
MAINTAINERCLEANFILES = ChangeLog INSTALL
all: config.h
@@ -330,7 +332,7 @@ all: config.h
.SUFFIXES:
am--refresh: Makefile
@:
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -357,9 +359,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
$(SHELL) ./config.status --recheck
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
$(am__cd) $(srcdir) && $(AUTOCONF)
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
$(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
$(am__aclocal_m4_deps):
@@ -370,7 +372,7 @@ config.h: stamp-h1
stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
@rm -f stamp-h1
cd $(top_builddir) && $(SHELL) ./config.status config.h
-$(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(srcdir)/config.h.in: $(am__configure_deps)
($(am__cd) $(top_srcdir) && $(AUTOHEADER))
rm -f stamp-h1
touch $@
diff --git a/NEWS b/NEWS
index be1070043..32977fa19 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,293 @@
+Release 2.21.2 (2013-02-10)
+===========================
+Pass the brown paper bags, I need half a dozen or so. That seemingly
+innocuous build fix to xorg-1.13 happened to have the little side-effect
+of breaking glyph rendering on xorg-1.12 and older on 64-bit machines.
+
+
+Release 2.21.1 (2013-02-10)
+===========================
+A fix for a potential GPU hang on 945gm (GMA3100) and earlier chipsets,
+along with backporting SNA to the packages found in stable distributions
+like Debian 6.0 (Squeeze).
+
+ * Clean up compilation warnings from deblint, thanks to Paul Menzel
+
+ * Minor build improvements by Damien Lespiau.
+
+ * Disable generating span geometry for non-rectilinear spans on gen4
+ in order to work around one class of render corruption.
+
+ * Prevent cache thrashing and severe performance degradation on LLC
+ machines for streaming texture updates. However, note the effect was
+ only observed on one particular laptop.
+
+ * Fix alignment of subsurface proxies for old chipsets.
+ https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1120108
+
+ * Repair build against Xorg-1.6 and contemporary packages.
+
+
+Release 2.21.0 (2013-02-01)
+===========================
+A few new features:
+
+ * Enable render acceleration for Haswell GT1/GT2.
+
+ * Enable multi-threaded rasterisation of trapezoids and fallback composition
+
+ * Utilise a new kernel interface (v3.9) for processing relocations
+
+along with a few older features from the 2.20.x series:
+
+ * PRIME support for hotplug GPUs and hybrid systems
+
+ * Support for IvyBridge GT1 machines, aka HD2500 graphics.
+
+ * Stable 830gm/845g support, at last!
+
+As usual we have a large number of bug fixes since the last release:
+
+ * Prevent a stray relocation being left after a buffer is removed from
+ a batch, leading to GPU hangs.
+
+ * Make the driver more robust against its own failures to submit batches
+ by falling back to software rendering.
+
+ * Fix emission of scanline waits for secondary pipes on gen6/7. Otherwise
+ you may encounter GPU hangs in MI_WAIT_FOR_EVENT.
+
+ * Fix a missing corner pixel when drawing rectangles with PolyLines
+ https://bugs.freedesktop.org/show_bug.cgi?id=55484
+
+ * Don't try to use Y-tiling colour buffers with mesa/i915c as mesa
+ doesn't support them and will fall back to software rendering
+
+ * Ensure that any cached mmaps are invalidated for a SwapBuffers
+ https://bugs.freedesktop.org/show_bug.cgi?id=60042
+
+ * Correctly handle the composition of rotated displays too large for the
+ 3D pipeline
+ https://bugs.freedesktop.org/show_bug.cgi?id=60124
+
+ * Fix the computation of the planar video frame size
+ https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1104180
+
+
+Release 2.20.19 (2013-01-20)
+============================
+A quick release as the last broke USB DisplayLink slave outputs badly. The
+performance of those displays was unusable due to an inadvertent change that
+caused us to flush the entire scanout over the USB for every drawing
+operation.
+
+ * Implement the GNOME Build API. A couple of minor changes to make
+ integrators' and distributors' lives a little easier, or at least more
+ consistent.
+
+ * Correctly offset inplace trapezoids for subwindows, such as the GTK+
+ close button after its background image has been uploaded.
+
+ * Explicitly prevent ring-switching for synchronized rendering to
+ scanouts (for vsync).
+
+ * Clip dirty region to slave pixmaps (otherwise UDL is nigh unusable)
+ https://bugs.freedesktop.org/show_bug.cgi?id=59539
+
+
+Release 2.20.18 (2013-01-16)
+============================
+A bunch of miscellaneous fixes for assertion failures and various
+performance regressions when mixing new methods for offloads, along with
+a couple of improvements for rendering with gen4.
+
+ * Remove use of packed unnormalized texture coordinates on gen4/5 as
+ these GPUs do not support unnormalized coordinates in the sampler.
+
+ * Remove dependency upon x86 asm for cross-building to unsupported
+ architectures.
+ https://bugs.gentoo.org/show_bug.cgi?id=448570
+
+ * Apply damage around PRIME updates in the correct order.
+
+ * Correctly read the initial backlight level when the user
+ overrides UXA's choice of backlight controller.
+
+ * Throttle UXA and prevent it queuing work much faster than the GPU can
+ complete it. This manifested as impossible performance figures and
+ the entire display freezing for several seconds whilst the GPU caught
+ up. One side effect is that it also caused the DDX to consume more
+ memory than was required as it could not recycle buffers quickly
+ enough, and in some cases the fix produces a marked improvement in
+ performance. Also note that on gen2/3 this requires a new libdrm [2.4.41]
+ in order to prevent a bug causing the DDX to fall back to swrast.
+
+Release 2.20.17 (2012-12-26)
+============================
+A minor update to prepare for co-operating with the kernel over managing
+stability on 830gm/845g. On this pair of chipsets, the kernel will perform
+an extra copy of the batchbuffer into reserved memory, which prevents them
+from randomly dying. However, that extra copy does have a noticeable
+impact upon throughput, so we also have a mechanism for userspace to
+opt-out of the kernel workaround and take responsibility for ensuring its
+batches are coherent.
+
+ * Build fixes against xorg-1.14
+ https://bugs.freedesktop.org/show_bug.cgi?id=58552
+ https://bugs.freedesktop.org/show_bug.cgi?id=58406
+
+ * Fix the origin of cropped (textured) video windows (Xv and XvMC)
+ https://bugs.freedesktop.org/show_bug.cgi?id=23033
+
+ * Fix potential corruption when using images larger than ~1GiB
+
+
+Release 2.20.16 (2012-12-15)
+============================
+Rejoice! We have found a trick to make 830gm/845g stable at long last.
+Ever since the switch to GEM and dynamic video memory, those early
+second generation chipsets have been plagued by instability. The lack of
+flushing cachelines from the CPU to GMCH was eventually solved by using
+an undocumented bit, but 830/845 were still hanging under memory pressure.
+These deaths were all due to garbage finding its way into the command
+streamer, and they go away if we take a leaf out of the original driver's
+book and never reuse those pages for anything else. So for the first time
+ever, I have been able to complete running the test suite on an 845g,
+even whilst thrashing the page and buffer caches!
+
+ * Run the SF stage as single-threaded on gen4 to work around a few issues
+ https://bugs.freedesktop.org/show_bug.cgi?id=57410
+
+ * Keep the scanout SURFACE_STATE separate to avoid overriding its
+ memory access control on gen6/7 (i.e. writes to the scanout need to
+ be kept out of the render cache)
+
+ * Tune batch flushing after an operation to an exported surface under a
+ compositor.
+
+ * Make sure the source is on the CPU for inplace composition of trapezoids
+ using the CPU
+ https://bugs.freedesktop.org/show_bug.cgi?id=56825
+
+ * Immediately flush in the block handler after a split batch to reduce
+ latency between the two halves of an operation.
+ https://bugs.freedesktop.org/show_bug.cgi?id=51718
+
+ * Install a fallback config if we fail to install the desired config
+ at VT switch (e.g. at boot, or after resume with 3 incompatible pipes on
+ Ivybridge)
+
+ * Pin batches to avoid CS incoherence on 830/845
+ https://bugs.freedesktop.org/show_bug.cgi?id=26345
+
+
+Release 2.20.15 (2012-12-03)
+============================
+And lo, enabling more of the common acceleration paths for gen4 revealed
+another lurking bug - something is wrong with how we prepare Y-tiling
+surfaces for rendering. For the time being, we can surreptitiously disable
+them for gen4 and avoid hitting GPU hangs.
+
+ * Avoid clobbering the render state after failing to convert the
+ operation to use the blitter.
+ https://bugs.freedesktop.org/show_bug.cgi?id=57601
+
+ * Disable shadow tracking upon server regeneration, and so fix a crash
+ if you restart the server whilst a RandR transform (e.g. rotation) is
+ in effect.
+ https://bugs.freedesktop.org/show_bug.cgi?id=52255
+ https://bugs.freedesktop.org/show_bug.cgi?id=56608
+
+
+Release 2.20.14 (2012-11-26)
+============================
+The highlight of this release is gen4, from 965g to gm45. Quite an old
+bug surfaced in the shader assembly, sparking a chance to review a few
+design choices within that backend and experiment with fresh ways to
+work around the remaining issues.
+
+ * Avoid using inplace XOR'ed uploads for very large buffers
+ https://bugs.freedesktop.org/show_bug.cgi?id=57031
+
+ * Fix the gen4/5 opacity shader
+ https://bugs.freedesktop.org/show_bug.cgi?id=57054
+
+ * Queue a pending vblank request after flip completion
+ https://bugs.freedesktop.org/show_bug.cgi?id=56423
+
+ * Avoid migrating an uninitialised pixmap for use as a render source
+ https://bugs.freedesktop.org/show_bug.cgi?id=47597
+
+ * Improve handling of texture fallbacks for 830/845.
+ https://bugs.freedesktop.org/show_bug.cgi?id=57392
+
+
+Release 2.20.13 (2012-11-11)
+============================
+Nothing but bug fixes. Many thanks to everyone who took the time to
+report their issues, and for their help in improving the driver.
+
+ * Sanity check the platform probe points to our expected i915 device
+ https://bugs.launchpad.net/ubuntu/+source/xserver-xorg-video-intel/+bug/1069031
+
+ * Prevent 16-bit overflow when computing the sample area to upload for
+ sources of render operations
+ https://bugs.freedesktop.org/show_bug.cgi?id=56324
+
+ * Clamp the drawable box for migration to prevent 16-bit overflow
+ https://bugs.freedesktop.org/show_bug.cgi?id=56591
+
+ * Disable RandR hotplug events if Xinerama is enabled and thereby prevent
+ a crash upon hotplug
+ https://bugs.freedesktop.org/show_bug.cgi?id=55260
+
+ * Call ValidatePicture before attempting to flatten the alphamaps
+ https://bugs.freedesktop.org/show_bug.cgi?id=56367
+
+ * Clip the trapezoid correctly if it ends on the boundary pixel
+ https://bugs.freedesktop.org/show_bug.cgi?id=56395
+
+ * Make sure the pipeline choice is propagated to the scanline wait
+ across a batch flush
+ https://bugs.freedesktop.org/show_bug.cgi?id=47597
+
+ * Set the valid drawable box when choosing placement of BLT composite ops
+ https://bugs.freedesktop.org/show_bug.cgi?id=47597
+
+ * Prevent use-after-free when promoting a partial-GPU bo to a full-GPU bo
+ https://bugs.freedesktop.org/show_bug.cgi?id=56591
+
+ * gen4 opacity spans require the per-rectangle workaround
+ https://bugs.freedesktop.org/show_bug.cgi?id=55500
+
+ * Prevent use of invalid damage pointers when redirecting rendering
+ https://bugs.freedesktop.org/show_bug.cgi?id=56785
+
+
+Release 2.20.12 (2012-10-20)
+============================
+More bug reports, more bug fixes! Perhaps the headline feature is
+that with secure batches, coming to a 3.8 kernel near you, we may
+finally have the ability to perform updates to the scanout synchronized
+to the refresh rate on later SandyBridge and IvyBridge chipsets. It comes
+at quite a power cost as we need to keep the GPU out of its power saving
+modes, but it should allow legacy vsync to function at last, and with it
+address a longstanding issue with tearing on SandyBridge+.
+
+ * Fix component-alpha rendering on IvyBridge, for example subpixel
+ antialiased glyphs.
+ https://bugs.freedesktop.org/show_bug.cgi?id=56037
+
+ * Flush before some "pipelined" state changes on gen4. The evidence is
+ that the same flushes as required on gen5+ are also required for gen4.
+ https://bugs.freedesktop.org/show_bug.cgi?id=55627
+
+ * Prevent a potential crash when forcing a stall on a busy CPU bo
+ https://bugs.freedesktop.org/show_bug.cgi?id=56180
+
+[Release 2.20.11 contained a typo causing UXA to fail immediately.]
+
Release 2.20.10 (2012-10-14)
============================
The last couple of weeks have been fairly retrospective, a dive into
diff --git a/aclocal.m4 b/aclocal.m4
index fa97284c0..77dfdcecf 100644
--- a/aclocal.m4
+++ b/aclocal.m4
@@ -1338,7 +1338,14 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
LD="${LD-ld} -m elf_i386_fbsd"
;;
x86_64-*linux*)
- LD="${LD-ld} -m elf_i386"
+ case `/usr/bin/file conftest.o` in
+ *x86-64*)
+ LD="${LD-ld} -m elf32_x86_64"
+ ;;
+ *)
+ LD="${LD-ld} -m elf_i386"
+ ;;
+ esac
;;
ppc64-*linux*|powerpc64-*linux*)
LD="${LD-ld} -m elf32ppclinux"
@@ -1702,7 +1709,8 @@ AC_CACHE_VAL([lt_cv_sys_max_cmd_len], [dnl
;;
*)
lt_cv_sys_max_cmd_len=`(getconf ARG_MAX) 2> /dev/null`
- if test -n "$lt_cv_sys_max_cmd_len"; then
+ if test -n "$lt_cv_sys_max_cmd_len" && \
+ test undefined != "$lt_cv_sys_max_cmd_len"; then
lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
else
@@ -11251,46 +11259,6 @@ fi
rmdir .tst 2>/dev/null
AC_SUBST([am__leading_dot])])
-# Add --enable-maintainer-mode option to configure. -*- Autoconf -*-
-# From Jim Meyering
-
-# Copyright (C) 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005, 2008,
-# 2011 Free Software Foundation, Inc.
-#
-# This file is free software; the Free Software Foundation
-# gives unlimited permission to copy and/or distribute it,
-# with or without modifications, as long as this notice is preserved.
-
-# serial 5
-
-# AM_MAINTAINER_MODE([DEFAULT-MODE])
-# ----------------------------------
-# Control maintainer-specific portions of Makefiles.
-# Default is to disable them, unless `enable' is passed literally.
-# For symmetry, `disable' may be passed as well. Anyway, the user
-# can override the default with the --enable/--disable switch.
-AC_DEFUN([AM_MAINTAINER_MODE],
-[m4_case(m4_default([$1], [disable]),
- [enable], [m4_define([am_maintainer_other], [disable])],
- [disable], [m4_define([am_maintainer_other], [enable])],
- [m4_define([am_maintainer_other], [enable])
- m4_warn([syntax], [unexpected argument to AM@&t@_MAINTAINER_MODE: $1])])
-AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles])
- dnl maintainer-mode's default is 'disable' unless 'enable' is passed
- AC_ARG_ENABLE([maintainer-mode],
-[ --][am_maintainer_other][-maintainer-mode am_maintainer_other make rules and dependencies not useful
- (and sometimes confusing) to the casual installer],
- [USE_MAINTAINER_MODE=$enableval],
- [USE_MAINTAINER_MODE=]m4_if(am_maintainer_other, [enable], [no], [yes]))
- AC_MSG_RESULT([$USE_MAINTAINER_MODE])
- AM_CONDITIONAL([MAINTAINER_MODE], [test $USE_MAINTAINER_MODE = yes])
- MAINT=$MAINTAINER_MODE_TRUE
- AC_SUBST([MAINT])dnl
-]
-)
-
-AU_DEFUN([jm_MAINTAINER_MODE], [AM_MAINTAINER_MODE])
-
# Check to see how 'make' treats includes. -*- Autoconf -*-
# Copyright (C) 2001, 2002, 2003, 2005, 2009 Free Software Foundation, Inc.
diff --git a/config.h.in b/config.h.in
index b02108a49..546254232 100644
--- a/config.h.in
+++ b/config.h.in
@@ -11,6 +11,9 @@
/* Enable pixmap debugging */
#undef DEBUG_PIXMAP
+/* Enable synchronous rendering for debugging */
+#undef DEBUG_SYNC
+
/* Default acceleration method */
#undef DEFAULT_ACCEL_METHOD
@@ -23,6 +26,12 @@
/* Enable pixman glyph cache */
#undef HAS_PIXMAN_GLYPHS
+/* Enable pixman triangle rasterisation */
+#undef HAS_PIXMAN_TRIANGLES
+
+/* Enable if your compiler supports the Intel __sync_* atomic primitives */
+#undef HAVE_ATOMIC_PRIMITIVES
+
/* Define to 1 if you have the <dgaproc.h> header file. */
#undef HAVE_DGAPROC_H
@@ -47,6 +56,9 @@
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
+/* Enable if you have libatomic-ops-dev installed */
+#undef HAVE_LIB_ATOMIC_OPS
+
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
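[Note: the three atomic-related defines above select between compiler
__sync builtins, libatomic-ops and the Solaris <atomic.h> functions. A
minimal sketch of the kind of wrapper the new src/sna/atomic.h presumably
layers on top of them -- the atomic_t/atomic_inc names here are
illustrative, not necessarily the driver's own:]

    #if defined(HAVE_ATOMIC_PRIMITIVES)
    /* GCC/Intel __sync builtins, found by the configure link test */
    typedef struct { int v; } atomic_t;
    #define atomic_inc(x) ((void)__sync_fetch_and_add(&(x)->v, 1))
    #define atomic_dec_and_test(x) (__sync_fetch_and_add(&(x)->v, -1) == 1)
    #elif defined(HAVE_LIB_ATOMIC_OPS)
    /* portable fallback via libatomic-ops-dev */
    #include <atomic_ops.h>
    typedef struct { AO_t v; } atomic_t;
    #define atomic_inc(x) ((void)AO_fetch_and_add1_full(&(x)->v))
    #define atomic_dec_and_test(x) (AO_fetch_and_sub1_full(&(x)->v) == 1)
    #else
    /* Solaris: atomic functions live in <atomic.h> and libc */
    #include <atomic.h>
    typedef struct { unsigned int v; } atomic_t;
    #define atomic_inc(x) atomic_inc_uint(&(x)->v)
    #define atomic_dec_and_test(x) (atomic_dec_uint_nv(&(x)->v) == 0)
    #endif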
diff --git a/configure b/configure
index cd430d053..96a698d92 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for xf86-video-intel 2.20.10.
+# Generated by GNU Autoconf 2.69 for xf86-video-intel 2.21.2.
#
# Report bugs to <https://bugs.freedesktop.org/enter_bug.cgi?product=xorg>.
#
@@ -591,8 +591,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='xf86-video-intel'
PACKAGE_TARNAME='xf86-video-intel'
-PACKAGE_VERSION='2.20.10'
-PACKAGE_STRING='xf86-video-intel 2.20.10'
+PACKAGE_VERSION='2.21.2'
+PACKAGE_STRING='xf86-video-intel 2.21.2'
PACKAGE_BUGREPORT='https://bugs.freedesktop.org/enter_bug.cgi?product=xorg'
PACKAGE_URL=''
@@ -651,6 +651,8 @@ KMS_ONLY_FALSE
KMS_ONLY_TRUE
XVMC_FALSE
XVMC_TRUE
+XCB_LIBS
+XCB_CFLAGS
XVMCLIB_LIBS
XVMCLIB_CFLAGS
DRI2_FALSE
@@ -681,10 +683,10 @@ LIBGLAMOR_LIBS
LIBGLAMOR_CFLAGS
GLAMOR_FALSE
GLAMOR_TRUE
-DRMINTEL_LIBS
-DRMINTEL_CFLAGS
UXA_FALSE
UXA_TRUE
+DRMINTEL_LIBS
+DRMINTEL_CFLAGS
SNA_FALSE
SNA_TRUE
HAVE_X11_FALSE
@@ -773,9 +775,6 @@ CPPFLAGS
LDFLAGS
CFLAGS
CC
-MAINT
-MAINTAINER_MODE_FALSE
-MAINTAINER_MODE_TRUE
am__untar
am__tar
AMTAR
@@ -840,7 +839,6 @@ SHELL'
ac_subst_files=''
ac_user_opts='
enable_option_checking
-enable_maintainer_mode
enable_dependency_tracking
enable_selective_werror
enable_strict_compilation
@@ -907,6 +905,8 @@ PCIACCESS_CFLAGS
PCIACCESS_LIBS
XVMCLIB_CFLAGS
XVMCLIB_LIBS
+XCB_CFLAGS
+XCB_LIBS
VALGRIND_CFLAGS
VALGRIND_LIBS'
@@ -1449,7 +1449,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures xf86-video-intel 2.20.10 to adapt to many kinds of systems.
+\`configure' configures xf86-video-intel 2.21.2 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1520,7 +1520,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of xf86-video-intel 2.20.10:";;
+ short | recursive ) echo "Configuration of xf86-video-intel 2.21.2:";;
esac
cat <<\_ACEOF
@@ -1528,8 +1528,6 @@ Optional Features:
--disable-option-checking ignore unrecognized --enable/--with options
--disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no)
--enable-FEATURE[=ARG] include FEATURE [ARG=yes]
- --enable-maintainer-mode enable make rules and dependencies not useful
- (and sometimes confusing) to the casual installer
--disable-dependency-tracking speeds up one-time build
--enable-dependency-tracking do not reject slow dependency extractors
--disable-selective-werror
@@ -1636,6 +1634,8 @@ Some influential environment variables:
C compiler flags for XVMCLIB, overriding pkg-config
XVMCLIB_LIBS
linker flags for XVMCLIB, overriding pkg-config
+ XCB_CFLAGS C compiler flags for XCB, overriding pkg-config
+ XCB_LIBS linker flags for XCB, overriding pkg-config
VALGRIND_CFLAGS
C compiler flags for VALGRIND, overriding pkg-config
VALGRIND_LIBS
@@ -1707,7 +1707,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-xf86-video-intel configure 2.20.10
+xf86-video-intel configure 2.21.2
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2122,7 +2122,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by xf86-video-intel $as_me 2.20.10, which was
+It was created by xf86-video-intel $as_me 2.21.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -2942,7 +2942,7 @@ fi
# Define the identity of the package.
PACKAGE='xf86-video-intel'
- VERSION='2.20.10'
+ VERSION='2.21.2'
cat >>confdefs.h <<_ACEOF
@@ -2983,29 +2983,6 @@ am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to enable maintainer-specific portions of Makefiles" >&5
-$as_echo_n "checking whether to enable maintainer-specific portions of Makefiles... " >&6; }
- # Check whether --enable-maintainer-mode was given.
-if test "${enable_maintainer_mode+set}" = set; then :
- enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval
-else
- USE_MAINTAINER_MODE=no
-fi
-
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: $USE_MAINTAINER_MODE" >&5
-$as_echo "$USE_MAINTAINER_MODE" >&6; }
- if test $USE_MAINTAINER_MODE = yes; then
- MAINTAINER_MODE_TRUE=
- MAINTAINER_MODE_FALSE='#'
-else
- MAINTAINER_MODE_TRUE='#'
- MAINTAINER_MODE_FALSE=
-fi
-
- MAINT=$MAINTAINER_MODE_TRUE
-
-
-
# Require X.Org macros 1.8 or later for MAN_SUBSTS set by XORG_MANPAGE_SECTIONS
@@ -11514,7 +11491,8 @@ else
;;
*)
lt_cv_sys_max_cmd_len=`(getconf ARG_MAX) 2> /dev/null`
- if test -n "$lt_cv_sys_max_cmd_len"; then
+ if test -n "$lt_cv_sys_max_cmd_len" && \
+ test undefined != "$lt_cv_sys_max_cmd_len"; then
lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \/ 4`
lt_cv_sys_max_cmd_len=`expr $lt_cv_sys_max_cmd_len \* 3`
else
@@ -13050,7 +13028,14 @@ s390*-*linux*|s390*-*tpf*|sparc*-*linux*)
LD="${LD-ld} -m elf_i386_fbsd"
;;
x86_64-*linux*)
- LD="${LD-ld} -m elf_i386"
+ case `/usr/bin/file conftest.o` in
+ *x86-64*)
+ LD="${LD-ld} -m elf32_x86_64"
+ ;;
+ *)
+ LD="${LD-ld} -m elf_i386"
+ ;;
+ esac
;;
ppc64-*linux*|powerpc64-*linux*)
LD="${LD-ld} -m elf32ppclinux"
@@ -18261,6 +18246,72 @@ else
fi
+# Check for atomic intrinsics
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for native atomic primitives" >&5
+$as_echo_n "checking for native atomic primitives... " >&6; }
+if ${intel_cv_atomic_primitives+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+
+ intel_cv_atomic_primitives="none"
+
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int atomic_add(int i) { return __sync_fetch_and_add (&i, 1); }
+int atomic_cmpxchg(int i, int j, int k) { return __sync_val_compare_and_swap (&i, j, k); }
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ intel_cv_atomic_primitives="Intel"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+
+ if test "x$intel_cv_atomic_primitives" = "xnone"; then
+ ac_fn_c_check_header_mongrel "$LINENO" "atomic_ops.h" "ac_cv_header_atomic_ops_h" "$ac_includes_default"
+if test "x$ac_cv_header_atomic_ops_h" = xyes; then :
+ intel_cv_atomic_primitives="libatomic-ops"
+fi
+
+
+ fi
+
+ # atomic functions defined in <atomic.h> & libc on Solaris
+ if test "x$intel_cv_atomic_primitives" = "xnone"; then
+ ac_fn_c_check_func "$LINENO" "atomic_cas_uint" "ac_cv_func_atomic_cas_uint"
+if test "x$ac_cv_func_atomic_cas_uint" = xyes; then :
+ intel_cv_atomic_primitives="Solaris"
+fi
+
+ fi
+
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $intel_cv_atomic_primitives" >&5
+$as_echo "$intel_cv_atomic_primitives" >&6; }
+if test "x$intel_cv_atomic_primitives" = xIntel; then
+
+$as_echo "#define HAVE_ATOMIC_PRIMITIVES 1" >>confdefs.h
+
+fi
+if test "x$intel_cv_atomic_primitives" = "xlibatomic-ops"; then
+
+$as_echo "#define HAVE_LIB_ATOMIC_OPS 1" >>confdefs.h
+
+fi
+
+if test "x$intel_cv_atomic_primitives" = "xnone"; then
+ as_fn_error $? "xf86-video-intel depends upon atomic operations, which were not found for your compiler/cpu. Try compiling with -march=native, or install the libatomic-ops-dev package." "$LINENO" 5
+fi
+
# Check whether --enable-udev was given.
if test "${enable_udev+set}" = set; then :
enableval=$enable_udev; UDEV="$enableval"
@@ -18341,7 +18392,7 @@ else
$as_echo "yes" >&6; }
udev=yes
fi
- if test x$UDEV == xyes -a x$udev != xyes; then
+ if test x$UDEV = xyes -a x$udev != xyes; then
as_fn_error $? "udev support requested but not found (libudev)" "$LINENO" 5
fi
if test x$udev = xyes; then
@@ -18473,7 +18524,7 @@ fi
required_xorg_xserver_version=1.6
-required_pixman_version=0.24
+required_pixman_version=0.16
if pkg-config --exists 'pixman-1 >= 0.27.1'; then
@@ -18481,6 +18532,12 @@ $as_echo "#define HAS_PIXMAN_GLYPHS 1" >>confdefs.h
fi
+if pkg-config --exists 'pixman-1 >= 0.24.0'; then
+
+$as_echo "#define HAS_PIXMAN_TRIANGLES 1" >>confdefs.h
+
+fi
+
# Check whether --enable-sna was given.
if test "${enable_sna+set}" = set; then :
enableval=$enable_sna; SNA="$enableval"
@@ -18507,7 +18564,6 @@ if test "x$SNA" = "xauto" && pkg-config --exists "xorg-server >= 1.10"; then
SNA=yes
fi
if test "x$SNA" != "xno"; then
- required_xorg_xserver_version=1.10
$as_echo "#define USE_SNA 1" >>confdefs.h
@@ -18529,21 +18585,19 @@ $as_echo "$SNA" >&6; }
if test "${enable_uxa+set}" = set; then :
enableval=$enable_uxa; UXA="$enableval"
else
- UXA=yes
+ UXA=auto
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to include UXA support" >&5
$as_echo_n "checking whether to include UXA support... " >&6; }
- if test x$UXA != xno; then
- UXA_TRUE=
- UXA_FALSE='#'
-else
- UXA_TRUE='#'
- UXA_FALSE=
+if test "x$UXA" = "xauto"; then
+ if ! pkg-config --exists 'libdrm_intel >= 2.4.29'; then
+ UXA=no
+ fi
+ if ! pkg-config --exists 'pixman-1 >= 0.24.0'; then
+ UXA=no
+ fi
fi
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $UXA" >&5
-$as_echo "$UXA" >&6; }
if test "x$UXA" != "xno"; then
$as_echo "#define USE_UXA 1" >>confdefs.h
@@ -18639,8 +18693,20 @@ else
$as_echo "yes" >&6; }
fi
+ required_pixman_version=0.24
+ UXA=yes
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $UXA" >&5
+$as_echo "$UXA" >&6; }
+ if test x$UXA != xno; then
+ UXA_TRUE=
+ UXA_FALSE='#'
+else
+ UXA_TRUE='#'
+ UXA_FALSE=
fi
+
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to include GLAMOR support" >&5
$as_echo_n "checking whether to include GLAMOR support... " >&6; }
# Check whether --enable-glamor was given.
@@ -19278,12 +19344,12 @@ if test -n "$DRM_CFLAGS"; then
pkg_cv_DRM_CFLAGS="$DRM_CFLAGS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libdrm >= 2.4.24\""; } >&5
- ($PKG_CONFIG --exists --print-errors "libdrm >= 2.4.24") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libdrm >= 2.4.20\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "libdrm >= 2.4.20") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_DRM_CFLAGS=`$PKG_CONFIG --cflags "libdrm >= 2.4.24" 2>/dev/null`
+ pkg_cv_DRM_CFLAGS=`$PKG_CONFIG --cflags "libdrm >= 2.4.20" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -19295,12 +19361,12 @@ if test -n "$DRM_LIBS"; then
pkg_cv_DRM_LIBS="$DRM_LIBS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libdrm >= 2.4.24\""; } >&5
- ($PKG_CONFIG --exists --print-errors "libdrm >= 2.4.24") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"libdrm >= 2.4.20\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "libdrm >= 2.4.20") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_DRM_LIBS=`$PKG_CONFIG --libs "libdrm >= 2.4.24" 2>/dev/null`
+ pkg_cv_DRM_LIBS=`$PKG_CONFIG --libs "libdrm >= 2.4.20" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -19321,14 +19387,14 @@ else
_pkg_short_errors_supported=no
fi
if test $_pkg_short_errors_supported = yes; then
- DRM_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libdrm >= 2.4.24" 2>&1`
+ DRM_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "libdrm >= 2.4.20" 2>&1`
else
- DRM_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libdrm >= 2.4.24" 2>&1`
+ DRM_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "libdrm >= 2.4.20" 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$DRM_PKG_ERRORS" >&5
- as_fn_error $? "Package requirements (libdrm >= 2.4.24) were not met:
+ as_fn_error $? "Package requirements (libdrm >= 2.4.20) were not met:
$DRM_PKG_ERRORS
@@ -19702,12 +19768,12 @@ if test -n "$XVMCLIB_CFLAGS"; then
pkg_cv_XVMCLIB_CFLAGS="$XVMCLIB_CFLAGS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux\""; } >&5
- ($PKG_CONFIG --exists --print-errors "xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"xvmc dri2proto\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "xvmc dri2proto") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_XVMCLIB_CFLAGS=`$PKG_CONFIG --cflags "xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux" 2>/dev/null`
+ pkg_cv_XVMCLIB_CFLAGS=`$PKG_CONFIG --cflags "xvmc dri2proto" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -19719,12 +19785,12 @@ if test -n "$XVMCLIB_LIBS"; then
pkg_cv_XVMCLIB_LIBS="$XVMCLIB_LIBS"
elif test -n "$PKG_CONFIG"; then
if test -n "$PKG_CONFIG" && \
- { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux\""; } >&5
- ($PKG_CONFIG --exists --print-errors "xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux") 2>&5
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"xvmc dri2proto\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "xvmc dri2proto") 2>&5
ac_status=$?
$as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
test $ac_status = 0; }; then
- pkg_cv_XVMCLIB_LIBS=`$PKG_CONFIG --libs "xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux" 2>/dev/null`
+ pkg_cv_XVMCLIB_LIBS=`$PKG_CONFIG --libs "xvmc dri2proto" 2>/dev/null`
test "x$?" != "x0" && pkg_failed=yes
else
pkg_failed=yes
@@ -19745,9 +19811,9 @@ else
_pkg_short_errors_supported=no
fi
if test $_pkg_short_errors_supported = yes; then
- XVMCLIB_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux" 2>&1`
+ XVMCLIB_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "xvmc dri2proto" 2>&1`
else
- XVMCLIB_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux" 2>&1`
+ XVMCLIB_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "xvmc dri2proto" 2>&1`
fi
# Put the nasty error message in config.log where it belongs
echo "$XVMCLIB_PKG_ERRORS" >&5
@@ -19762,7 +19828,78 @@ else
XVMCLIB_LIBS=$pkg_cv_XVMCLIB_LIBS
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
- XVMC=yes
+
+fi
+
+pkg_failed=no
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for XCB" >&5
+$as_echo_n "checking for XCB... " >&6; }
+
+if test -n "$XCB_CFLAGS"; then
+ pkg_cv_XCB_CFLAGS="$XCB_CFLAGS"
+ elif test -n "$PKG_CONFIG"; then
+ if test -n "$PKG_CONFIG" && \
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"x11-xcb xcb-dri2 xcb-aux\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "x11-xcb xcb-dri2 xcb-aux") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then
+ pkg_cv_XCB_CFLAGS=`$PKG_CONFIG --cflags "x11-xcb xcb-dri2 xcb-aux" 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes
+else
+ pkg_failed=yes
+fi
+ else
+ pkg_failed=untried
+fi
+if test -n "$XCB_LIBS"; then
+ pkg_cv_XCB_LIBS="$XCB_LIBS"
+ elif test -n "$PKG_CONFIG"; then
+ if test -n "$PKG_CONFIG" && \
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: \$PKG_CONFIG --exists --print-errors \"x11-xcb xcb-dri2 xcb-aux\""; } >&5
+ ($PKG_CONFIG --exists --print-errors "x11-xcb xcb-dri2 xcb-aux") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then
+ pkg_cv_XCB_LIBS=`$PKG_CONFIG --libs "x11-xcb xcb-dri2 xcb-aux" 2>/dev/null`
+ test "x$?" != "x0" && pkg_failed=yes
+else
+ pkg_failed=yes
+fi
+ else
+ pkg_failed=untried
+fi
+
+
+
+if test $pkg_failed = yes; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+
+if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then
+ _pkg_short_errors_supported=yes
+else
+ _pkg_short_errors_supported=no
+fi
+ if test $_pkg_short_errors_supported = yes; then
+ XCB_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "x11-xcb xcb-dri2 xcb-aux" 2>&1`
+ else
+ XCB_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "x11-xcb xcb-dri2 xcb-aux" 2>&1`
+ fi
+ # Put the nasty error message in config.log where it belongs
+ echo "$XCB_PKG_ERRORS" >&5
+
+ XVMC=no
+elif test $pkg_failed = untried; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+ XVMC=no
+else
+ XCB_CFLAGS=$pkg_cv_XCB_CFLAGS
+ XCB_LIBS=$pkg_cv_XCB_LIBS
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
fi
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to include XvMC support" >&5
@@ -19818,7 +19955,7 @@ else
DEBUG_FALSE=
fi
- if test x$FULL_DEBUG == xfull; then
+ if test x$DEBUG = xfull; then
FULL_DEBUG_TRUE=
FULL_DEBUG_FALSE='#'
else
@@ -19909,6 +20046,11 @@ $as_echo "#define HAVE_VALGRIND 1" >>confdefs.h
fi
fi
+if test "x$DEBUG" = xsync; then
+
+$as_echo "#define DEBUG_SYNC 1" >>confdefs.h
+
+fi
if test "x$DEBUG" = xmemory; then
$as_echo "#define DEBUG_MEMORY 1" >>confdefs.h
@@ -20055,10 +20197,6 @@ else
am__EXEEXT_FALSE=
fi
-if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then
- as_fn_error $? "conditional \"MAINTAINER_MODE\" was never defined.
-Usually this means the macro was only invoked conditionally." "$LINENO" 5
-fi
if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then
as_fn_error $? "conditional \"AMDEP\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
@@ -20532,7 +20670,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by xf86-video-intel $as_me 2.20.10, which was
+This file was extended by xf86-video-intel $as_me 2.21.2, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -20598,7 +20736,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-xf86-video-intel config.status 2.20.10
+xf86-video-intel config.status 2.21.2
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index 972d9188e..3a4b6dbcb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@
# Initialize Autoconf
AC_PREREQ([2.60])
AC_INIT([xf86-video-intel],
- [2.20.10],
+ [2.21.2],
[https://bugs.freedesktop.org/enter_bug.cgi?product=xorg],
[xf86-video-intel])
AC_CONFIG_SRCDIR([Makefile.am])
@@ -32,7 +32,6 @@ AC_CONFIG_AUX_DIR(.)
# Initialize Automake
AM_INIT_AUTOMAKE([foreign dist-bzip2])
-AM_MAINTAINER_MODE
# Require X.Org macros 1.8 or later for MAN_SUBSTS set by XORG_MANPAGE_SECTIONS
m4_ifndef([XORG_MACROS_VERSION],
@@ -105,6 +104,40 @@ if test x$ASM != "xno"; then
fi
AM_CONDITIONAL(HAVE_GEN4ASM, test x$gen4asm = xyes)
+# Check for atomic intrinsics
+AC_CACHE_CHECK([for native atomic primitives], intel_cv_atomic_primitives,
+[
+ intel_cv_atomic_primitives="none"
+
+ AC_LINK_IFELSE([AC_LANG_PROGRAM([[
+int atomic_add(int i) { return __sync_fetch_and_add (&i, 1); }
+int atomic_cmpxchg(int i, int j, int k) { return __sync_val_compare_and_swap (&i, j, k); }
+ ]],[[]])],
+ [intel_cv_atomic_primitives="Intel"],[])
+
+ if test "x$intel_cv_atomic_primitives" = "xnone"; then
+ AC_CHECK_HEADER([atomic_ops.h], intel_cv_atomic_primitives="libatomic-ops")
+ fi
+
+ # atomic functions defined in <atomic.h> & libc on Solaris
+ if test "x$intel_cv_atomic_primitives" = "xnone"; then
+ AC_CHECK_FUNC([atomic_cas_uint],
+ intel_cv_atomic_primitives="Solaris")
+ fi
+
+])
+if test "x$intel_cv_atomic_primitives" = xIntel; then
+ AC_DEFINE(HAVE_ATOMIC_PRIMITIVES, 1,
+ [Enable if your compiler supports the Intel __sync_* atomic primitives])
+fi
+if test "x$intel_cv_atomic_primitives" = "xlibatomic-ops"; then
+ AC_DEFINE(HAVE_LIB_ATOMIC_OPS, 1, [Enable if you have libatomic-ops-dev installed])
+fi
+
+if test "x$intel_cv_atomic_primitives" = "xnone"; then
+ AC_MSG_ERROR([xf86-video-intel depends upon atomic operations, which were not found for your compiler/cpu. Try compiling with -march=native, or install the libatomic-ops-dev package.])
+fi
+
AC_ARG_ENABLE(udev,
AS_HELP_STRING([--disable-udev],
[Disable udev-based monitor hotplug detection [default=auto]]),
@@ -113,7 +146,7 @@ AC_ARG_ENABLE(udev,
if test x$UDEV != "xno"; then
PKG_CHECK_MODULES(UDEV, [libudev], [udev=yes], [udev=no])
- if test x$UDEV == xyes -a x$udev != xyes; then
+ if test x$UDEV = xyes -a x$udev != xyes; then
AC_MSG_ERROR([udev support requested but not found (libudev)])
fi
if test x$udev = xyes; then
@@ -151,12 +184,16 @@ AC_ARG_ENABLE(ums-only, AS_HELP_STRING([--enable-ums-only],
[UMS_ONLY=no])
required_xorg_xserver_version=1.6
-required_pixman_version=0.24
+required_pixman_version=0.16
if pkg-config --exists 'pixman-1 >= 0.27.1'; then
AC_DEFINE([HAS_PIXMAN_GLYPHS], 1, [Enable pixman glyph cache])
fi
+if pkg-config --exists 'pixman-1 >= 0.24.0'; then
+ AC_DEFINE([HAS_PIXMAN_TRIANGLES], 1, [Enable pixman triangle rasterisation])
+fi
+
AC_ARG_ENABLE(sna,
AS_HELP_STRING([--enable-sna],
[Enable SandyBridge's New Acceleration (SNA) [default=auto]]),
@@ -168,7 +205,6 @@ if test "x$SNA" = "xauto" && pkg-config --exists "xorg-server >= 1.10"; then
SNA=yes
fi
if test "x$SNA" != "xno"; then
- required_xorg_xserver_version=1.10
AC_DEFINE(USE_SNA, 1, [Enable SNA support])
fi
AC_MSG_CHECKING([whether to include SNA support])
@@ -179,14 +215,24 @@ AC_ARG_ENABLE(uxa,
AS_HELP_STRING([--enable-uxa],
[Enable Unified Acceleration Architecture (UXA) [default=yes]]),
[UXA="$enableval"],
- [UXA=yes])
+ [UXA=auto])
AC_MSG_CHECKING([whether to include UXA support])
-AM_CONDITIONAL(UXA, test x$UXA != xno)
-AC_MSG_RESULT([$UXA])
+if test "x$UXA" = "xauto"; then
+ if ! pkg-config --exists 'libdrm_intel >= 2.4.29'; then
+ UXA=no
+ fi
+ if ! pkg-config --exists 'pixman-1 >= 0.24.0'; then
+ UXA=no
+ fi
+fi
if test "x$UXA" != "xno"; then
AC_DEFINE(USE_UXA, 1, [Enable UXA support])
PKG_CHECK_MODULES(DRMINTEL, [libdrm_intel >= 2.4.29])
+ required_pixman_version=0.24
+ UXA=yes
fi
+AC_MSG_RESULT([$UXA])
+AM_CONDITIONAL(UXA, test x$UXA != xno)
AC_MSG_CHECKING([whether to include GLAMOR support])
AC_ARG_ENABLE(glamor,
@@ -314,7 +360,7 @@ XORG_DRIVER_CHECK_EXT(XF86DRI, xextproto x11)
XORG_DRIVER_CHECK_EXT(DPMSExtension, xextproto)
# Obtain compiler/linker options for the driver dependencies
-PKG_CHECK_MODULES(DRM, [libdrm >= 2.4.24]) # libdrm_intel is checked separately
+PKG_CHECK_MODULES(DRM, [libdrm >= 2.4.20]) # libdrm_intel is checked separately
PKG_CHECK_MODULES(DRI, [xf86driproto], , DRI=no)
PKG_CHECK_MODULES(DRI2, [dri2proto >= 2.6],, DRI2=no)
PKG_CHECK_MODULES(PCIACCESS, [pciaccess >= 0.10])
@@ -370,9 +416,8 @@ AM_CONDITIONAL(DRI2, test "x$DRI2" = xyes)
AC_MSG_RESULT([$DRI2])
if test "$XVMC" = yes; then
- PKG_CHECK_MODULES(XVMCLIB,
- [xvmc xext xfixes dri2proto x11-xcb xcb-dri2 xcb-aux],
- [XVMC=yes], [XVMC=no])
+ PKG_CHECK_MODULES(XVMCLIB, [xvmc dri2proto], [], [XVMC=no])
+ PKG_CHECK_MODULES(XCB, [x11-xcb xcb-dri2 xcb-aux], [], [XVMC=no])
fi
AC_MSG_CHECKING([whether to include XvMC support])
AC_MSG_RESULT([$XVMC])
@@ -391,7 +436,7 @@ if test "x$UMS_ONLY" = xyes; then
fi
AM_CONDITIONAL(DEBUG, test x$DEBUG != xno)
-AM_CONDITIONAL(FULL_DEBUG, test x$FULL_DEBUG == xfull)
+AM_CONDITIONAL(FULL_DEBUG, test x$DEBUG = xfull)
if test "x$DEBUG" = xno; then
AC_DEFINE(NDEBUG,1,[Disable internal debugging])
fi
@@ -401,6 +446,9 @@ if test "x$DEBUG" != xno; then
AC_DEFINE([HAVE_VALGRIND], 1, [Use valgrind intrinsics to suppress false warnings])
fi
fi
+if test "x$DEBUG" = xsync; then
+ AC_DEFINE(DEBUG_SYNC,1,[Enable synchronous rendering for debugging])
+fi
if test "x$DEBUG" = xmemory; then
AC_DEFINE(DEBUG_MEMORY,1,[Enable memory debugging])
fi
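[Note: HAS_PIXMAN_TRIANGLES above merely records whether pixman is new
enough (>= 0.24.0) to rasterise triangles itself. A sketch of how such a
guard is typically consumed -- the function and its fallback here are
illustrative, not the driver's actual code:]

    #include <pixman.h>

    /* Composite a8 triangle coverage via pixman when available;
     * otherwise the caller must take its own software path. */
    static void composite_triangles(pixman_image_t *src, pixman_image_t *dst,
                                    int ntri, const pixman_triangle_t *tri)
    {
    #ifdef HAS_PIXMAN_TRIANGLES
            pixman_composite_triangles(PIXMAN_OP_OVER, src, dst, PIXMAN_a8,
                                       0, 0, 0, 0, ntri, tri);
    #else
            (void)src; (void)dst; (void)ntri; (void)tri;
    #endif
    }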
diff --git a/man/Makefile.in b/man/Makefile.in
index 29efd9588..278ae4405 100644
--- a/man/Makefile.in
+++ b/man/Makefile.in
@@ -196,7 +196,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -235,6 +234,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -304,7 +305,7 @@ all: all-am
.SUFFIXES:
.SUFFIXES: .$(DRIVER_MAN_SUFFIX) .man
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -329,9 +330,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
diff --git a/man/intel.man b/man/intel.man
index 0942dc1c0..fbd0230aa 100644
--- a/man/intel.man
+++ b/man/intel.man
@@ -116,6 +116,24 @@ The following driver
.B Options
are supported for the 830M and later chipsets:
.TP
+.BI "Option \*qNoAccel\*q \*q" boolean \*q
+Disable or enable acceleration.
+.IP
+Default: acceleration is enabled.
+.TP
+.BI "Option \*qAccelMethod\*q \*q" string \*q
+Select acceleration method.
+There are a couple of backends available for accelerating the DDX. \*qUXA\*q (Unified
+Acceleration Architecture) is the mature backend that was introduced to support
+the GEM driver model. It is in the process of being superseded by \*qSNA\*q
+(Sandybridge's New Acceleration). Until that process is complete, the ability to
+choose which backend to use remains for backwards compatibility.
+In addition, there are two sub-options to limit the acceleration for
+debugging use. Specify \*qoff\*q to disable all acceleration, or \*qblt\*q to
+disable render acceleration and only use the BLT engine.
+.IP
+Default: use UXA (render acceleration)
+.TP
.BI "Option \*qVideoKey\*q \*q" integer \*q
This is the same as the
.B \*qColorKey\*q
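[Note: a minimal xorg.conf Device section exercising the AccelMethod
option documented above; the Identifier string is arbitrary:]

    Section "Device"
        Identifier "Intel Graphics"
        Driver     "intel"
        # "uxa" is the documented default; "sna" selects the newer
        # backend, "off" and "blt" are the debugging sub-options.
        Option     "AccelMethod" "sna"
    EndSection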
diff --git a/src/Makefile.in b/src/Makefile.in
index 3c5a911f7..e28de984e 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -343,7 +343,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -382,6 +381,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -463,7 +464,7 @@ all: all-recursive
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -488,9 +489,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
install-intel_drv_laLTLIBRARIES: $(intel_drv_la_LTLIBRARIES)
diff --git a/src/compat-api.h b/src/compat-api.h
index 6b7657241..6d147c74d 100644
--- a/src/compat-api.h
+++ b/src/compat-api.h
@@ -28,6 +28,10 @@
#ifndef COMPAT_API_H
#define COMPAT_API_H
+#include <xorg-server.h>
+#include <xorgVersion.h>
+
+#include <picturestr.h>
#ifndef GLYPH_HAS_GLYPH_PICTURE_ACCESSOR
#define GetGlyphPicture(g, s) GlyphPicture((g))[(s)->myNum]
#define SetGlyphPicture(g, s, p) GlyphPicture((g))[(s)->myNum] = p
@@ -103,4 +107,54 @@
#endif
+#ifndef INCLUDE_LEGACY_REGION_DEFINES
+#define RegionCreate(r, s) REGION_CREATE(NULL, r, s)
+#define RegionBreak(r) REGION_BREAK(NULL, r)
+#define RegionSizeof REGION_SZOF
+#define RegionBoxptr REGION_BOXPTR
+#define RegionEnd REGION_END
+#define RegionExtents(r) REGION_EXTENTS(NULL, r)
+#define RegionRects REGION_RECTS
+#define RegionNumRects REGION_NUM_RECTS
+#define RegionContainsRect(r, b) RECT_IN_REGION(NULL, r, b)
+#define RegionContainsPoint(r, x, y, b) POINT_IN_REGION(NULL, r, x, y, b)
+#define RegionCopy(res, r) REGION_COPY(NULL, res, r)
+#define RegionIntersect(res, r1, r2) REGION_INTERSECT(NULL, res, r1, r2)
+#define RegionUnion(res, r1, r2) REGION_UNION(NULL, res, r1, r2)
+#define RegionTranslate(r, x, y) REGION_TRANSLATE(NULL, r, x, y)
+#define RegionUninit(r) REGION_UNINIT(NULL, r)
+#define region_from_bitmap BITMAP_TO_REGION
+#define RegionNil REGION_NIL
+#define RegionNull(r) REGION_NULL(NULL, r)
+#define RegionNotEmpty(r) REGION_NOTEMPTY(NULL, r)
+#define RegionEmpty(r) REGION_EMPTY(NULL, r)
+#define RegionDestroy(r) REGION_DESTROY(NULL, r)
+#else
+#define region_from_bitmap BitmapToRegion
+#endif
+
+#ifndef _X_UNUSED
+#define _X_UNUSED
+#endif
+
+#if HAS_DEVPRIVATEKEYREC
+#define __get_private(p, key) dixGetPrivateAddr(&(p)->devPrivates, &(key))
+#else
+#define __get_private(p, key) dixLookupPrivate(&(p)->devPrivates, &(key))
+typedef int DevPrivateKeyRec;
+static inline void FreePixmap(PixmapPtr pixmap)
+{
+ dixFreePrivates(pixmap->devPrivates);
+ free(pixmap);
+}
+#endif
+
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,9,99,902,0)
+#define SourceValidate(d, x, y, w, h, mode) \
+ if ((d)->pScreen->SourceValidate) (d)->pScreen->SourceValidate(d, x, y, w, h, mode)
+#else
+#define SourceValidate(d, x, y, w, h, mode) \
+ if ((d)->pScreen->SourceValidate) (d)->pScreen->SourceValidate(d, x, y, w, h)
+#endif
+
#endif
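[Note: the SourceValidate wrapper above hides the extra subwindow-mode
argument grown by the screen hook in xorg-1.10; with it, call sites
compile unchanged against both old and new servers. A hypothetical
caller, assuming compat-api.h and the usual dix headers are included:]

    /* The macro expands to the 5- or 6-argument
     * pScreen->SourceValidate call as appropriate. */
    static void validate_copy_source(DrawablePtr src,
                                     int x, int y, int w, int h)
    {
            SourceValidate(src, x, y, w, h, IncludeInferiors);
    }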
diff --git a/src/i965_3d.c b/src/i965_3d.c
index a18db1251..fe2d9aa6b 100644
--- a/src/i965_3d.c
+++ b/src/i965_3d.c
@@ -35,7 +35,7 @@
void
gen6_upload_invariant_states(intel_screen_private *intel)
{
- Bool ivb = INTEL_INFO(intel)->gen >= 70;
+ Bool ivb = INTEL_INFO(intel)->gen >= 070;
OUT_BATCH(BRW_PIPE_CONTROL | (4 - 2));
OUT_BATCH(BRW_PIPE_CONTROL_IS_FLUSH |
@@ -280,7 +280,7 @@ gen7_upload_bypass_states(intel_screen_private *intel)
void
gen6_upload_vs_state(intel_screen_private *intel)
{
- Bool ivb = INTEL_INFO(intel)->gen >= 70;
+ Bool ivb = INTEL_INFO(intel)->gen >= 070;
/* disable VS constant buffer */
OUT_BATCH(GEN6_3DSTATE_CONSTANT_VS | ((ivb ? 7 : 5) - 2));
OUT_BATCH(0);
diff --git a/src/i965_render.c b/src/i965_render.c
index 42b195992..39698b0dc 100644
--- a/src/i965_render.c
+++ b/src/i965_render.c
@@ -1054,7 +1054,7 @@ i965_create_sampler_state(intel_screen_private *intel,
sampler_state_extend_t mask_extend,
drm_intel_bo * border_color_bo)
{
- if (INTEL_INFO(intel)->gen < 70)
+ if (INTEL_INFO(intel)->gen < 070)
return gen4_create_sampler_state(intel, src_filter, src_extend,
mask_filter, mask_extend,
border_color_bo);
@@ -1417,7 +1417,7 @@ i965_set_picture_surface_state(intel_screen_private *intel,
PicturePtr picture, PixmapPtr pixmap,
Bool is_dst)
{
- if (INTEL_INFO(intel)->gen < 70)
+ if (INTEL_INFO(intel)->gen < 070)
return gen4_set_picture_surface_state(intel, picture, pixmap, is_dst);
return gen7_set_picture_surface_state(intel, picture, pixmap, is_dst);
}
@@ -1571,7 +1571,7 @@ static void i965_emit_composite_state(struct intel_screen_private *intel)
}
/* Match Mesa driver setup */
- if (INTEL_INFO(intel)->gen >= 45)
+ if (INTEL_INFO(intel)->gen >= 045)
OUT_BATCH(NEW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
else
OUT_BATCH(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
@@ -1751,7 +1751,7 @@ static Bool i965_composite_check_aperture(intel_screen_private *intel)
render_state->gen6_depth_stencil_bo,
};
- if (INTEL_INFO(intel)->gen >= 60)
+ if (INTEL_INFO(intel)->gen >= 060)
return drm_intel_bufmgr_check_aperture_space(gen6_bo_table,
ARRAY_SIZE(gen6_bo_table)) == 0;
else
@@ -2181,7 +2181,7 @@ static void i965_select_vertex_buffer(struct intel_screen_private *intel)
if (intel->vertex_id & (1 << id))
return;
- if (INTEL_INFO(intel)->gen >= 70)
+ if (INTEL_INFO(intel)->gen >= 070)
modifyenable = GEN7_VB0_ADDRESS_MODIFYENABLE;
/* Set up the pointer to our (single) vertex buffer */
@@ -2190,7 +2190,7 @@ static void i965_select_vertex_buffer(struct intel_screen_private *intel)
/* XXX could use multiple vbo to reduce relocations if
* frequently switching between vertex sizes, like rgb10text.
*/
- if (INTEL_INFO(intel)->gen >= 60) {
+ if (INTEL_INFO(intel)->gen >= 060) {
OUT_BATCH((id << GEN6_VB0_BUFFER_INDEX_SHIFT) |
GEN6_VB0_VERTEXDATA |
modifyenable |
@@ -2201,7 +2201,7 @@ static void i965_select_vertex_buffer(struct intel_screen_private *intel)
(4*intel->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT));
}
OUT_RELOC(intel->vertex_bo, I915_GEM_DOMAIN_VERTEX, 0, 0);
- if (INTEL_INFO(intel)->gen >= 50)
+ if (INTEL_INFO(intel)->gen >= 050)
OUT_RELOC(intel->vertex_bo,
I915_GEM_DOMAIN_VERTEX, 0,
sizeof(intel->vertex_ptr) - 1);
@@ -2252,7 +2252,7 @@ i965_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
if (intel->needs_render_state_emit) {
i965_bind_surfaces(intel);
- if (INTEL_INFO(intel)->gen >= 60)
+ if (INTEL_INFO(intel)->gen >= 060)
gen6_emit_composite_state(intel);
else
i965_emit_composite_state(intel);
@@ -2271,7 +2271,7 @@ i965_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
i965_select_vertex_buffer(intel);
if (intel->vertex_offset == 0) {
- if (INTEL_INFO(intel)->gen >= 70) {
+ if (INTEL_INFO(intel)->gen >= 070) {
OUT_BATCH(BRW_3DPRIMITIVE | (7 - 2));
OUT_BATCH(BRW_3DPRIMITIVE_VERTEX_SEQUENTIAL |
_3DPRIM_RECTLIST);
@@ -2298,7 +2298,7 @@ i965_composite(PixmapPtr dest, int srcX, int srcY, int maskX, int maskY,
w, h);
intel->vertex_index += 3;
- if (INTEL_INFO(intel)->gen < 50) {
+ if (INTEL_INFO(intel)->gen < 050) {
/* XXX OMG! */
i965_vertex_flush(intel);
OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
@@ -2355,7 +2355,7 @@ void gen4_render_state_init(ScrnInfoPtr scrn)
assert(intel->gen4_render_state != NULL);
}
- if (INTEL_INFO(intel)->gen >= 60)
+ if (INTEL_INFO(intel)->gen >= 060)
return gen6_render_state_init(scrn);
render = intel->gen4_render_state;
@@ -2601,7 +2601,7 @@ gen6_composite_cc_state_pointers(intel_screen_private *intel,
cc_bo = render_state->cc_state_bo;
depth_stencil_bo = render_state->gen6_depth_stencil_bo;
}
- if (INTEL_INFO(intel)->gen >= 70) {
+ if (INTEL_INFO(intel)->gen >= 070) {
gen7_upload_cc_state_pointers(intel, render_state->gen6_blend_bo, cc_bo, depth_stencil_bo, blend_offset);
} else {
gen6_upload_cc_state_pointers(intel, render_state->gen6_blend_bo, cc_bo, depth_stencil_bo, blend_offset);
@@ -2619,7 +2619,7 @@ gen6_composite_sampler_state_pointers(intel_screen_private *intel,
intel->gen6_render_state.samplers = bo;
- if (INTEL_INFO(intel)->gen >= 70)
+ if (INTEL_INFO(intel)->gen >= 070)
gen7_upload_sampler_state_pointers(intel, bo);
else
gen6_upload_sampler_state_pointers(intel, bo);
@@ -2628,7 +2628,7 @@ gen6_composite_sampler_state_pointers(intel_screen_private *intel,
static void
gen6_composite_wm_constants(intel_screen_private *intel)
{
- Bool ivb = INTEL_INFO(intel)->gen >= 70;
+ Bool ivb = INTEL_INFO(intel)->gen >= 070;
/* disable WM constant buffer */
OUT_BATCH(GEN6_3DSTATE_CONSTANT_PS | ((ivb ? 7 : 5) - 2));
OUT_BATCH(0);
@@ -2652,7 +2652,7 @@ gen6_composite_sf_state(intel_screen_private *intel,
intel->gen6_render_state.num_sf_outputs = num_sf_outputs;
- if (INTEL_INFO(intel)->gen >= 70)
+ if (INTEL_INFO(intel)->gen >= 070)
gen7_upload_sf_state(intel, num_sf_outputs, 1);
else
gen6_upload_sf_state(intel, num_sf_outputs, 1);
@@ -2839,7 +2839,7 @@ gen6_emit_composite_state(struct intel_screen_private *intel)
sampler_state_extend_t mask_extend = composite_op->mask_extend;
Bool is_affine = composite_op->is_affine;
Bool has_mask = intel->render_mask != NULL;
- Bool ivb = INTEL_INFO(intel)->gen >= 70;
+ Bool ivb = INTEL_INFO(intel)->gen >= 070;
uint32_t src, dst;
drm_intel_bo *ps_sampler_state_bo = render->ps_sampler_state_bo[src_filter][src_extend][mask_filter][mask_extend];
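The change from 70 to 070 above is not cosmetic: a leading zero makes a C integer literal octal, so 070 is decimal 56. This release renumbers the device generations so that each major gen spans eight minor steps (see the IS_GENx() rework in intel_driver.h below), and every comparison in the tree has to switch to the octal spelling to stay consistent. A minimal standalone illustration, not part of the driver:

#include <stdio.h>

int main(void)
{
	/* A leading zero makes a C integer literal octal. */
	printf("70  (decimal) = %d\n", 70);	/* 70 */
	printf("070 (octal)   = %d\n", 070);	/* 56 = 8*7, gen7 */
	printf("045 (octal)   = %d\n", 045);	/* 37 = 8*4 + 5, G4x */
	return 0;
}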
diff --git a/src/i965_video.c b/src/i965_video.c
index 3276788fb..65f60612a 100644
--- a/src/i965_video.c
+++ b/src/i965_video.c
@@ -897,7 +897,7 @@ i965_emit_video_setup(ScrnInfoPtr scrn, drm_intel_bo * surface_state_binding_tab
/* brw_debug (scrn, "before base address modify"); */
/* Match Mesa driver setup */
- if (INTEL_INFO(intel)->gen >= 45)
+ if (INTEL_INFO(intel)->gen >= 045)
OUT_BATCH(NEW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
else
OUT_BATCH(BRW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
@@ -1428,7 +1428,7 @@ gen6_create_vidoe_objects(ScrnInfoPtr scrn)
const uint32_t *packed_ps_kernel, *planar_ps_kernel;
unsigned int packed_ps_size, planar_ps_size;
- if (INTEL_INFO(intel)->gen >= 70) {
+ if (INTEL_INFO(intel)->gen >= 070) {
create_sampler_state = gen7_create_sampler_state;
packed_ps_kernel = &ps_kernel_packed_static_gen7[0][0];
packed_ps_size = sizeof(ps_kernel_packed_static_gen7);
@@ -1787,7 +1787,7 @@ void Gen6DisplayVideoTextured(ScrnInfoPtr scrn,
PixmapPtr,
drm_intel_bo *, uint32_t);
- if (INTEL_INFO(intel)->gen >= 70) {
+ if (INTEL_INFO(intel)->gen >= 070) {
create_dst_surface_state = gen7_create_dst_surface_state;
create_src_surface_state = gen7_create_src_surface_state;
emit_video_setup = gen7_emit_video_setup;
diff --git a/src/intel.h b/src/intel.h
index a5603fee6..d4c9aff21 100644
--- a/src/intel.h
+++ b/src/intel.h
@@ -182,7 +182,7 @@ typedef struct intel_screen_private {
unsigned int batch_emit_start;
/** Number of bytes to be emitted in the current BEGIN_BATCH. */
uint32_t batch_emitting;
- dri_bo *batch_bo;
+ dri_bo *batch_bo, *last_batch_bo[2];
/** Whether we're in a section of code that can't tolerate flushing */
Bool in_batch_atomic;
/** Ending batch_used that was verified by intel_start_batch_atomic() */
@@ -366,6 +366,7 @@ extern Bool intel_mode_pre_init(ScrnInfoPtr pScrn, int fd, int cpp);
extern void intel_mode_init(struct intel_screen_private *intel);
extern void intel_mode_disable_unused_functions(ScrnInfoPtr scrn);
extern void intel_mode_remove_fb(intel_screen_private *intel);
+extern void intel_mode_close(intel_screen_private *intel);
extern void intel_mode_fini(intel_screen_private *intel);
extern int intel_get_pipe_from_crtc_id(drm_intel_bufmgr *bufmgr, xf86CrtcPtr crtc);
@@ -552,6 +553,9 @@ intel_get_transformed_coordinates_3d(int x, int y, PictTransformPtr transform,
float *x_out, float *y_out, float *z_out);
static inline void
+intel_debug_fallback(ScrnInfoPtr scrn, const char *format, ...) _X_ATTRIBUTE_PRINTF(2, 3);
+
+static inline void
intel_debug_fallback(ScrnInfoPtr scrn, const char *format, ...)
{
intel_screen_private *intel = intel_get_screen_private(scrn);
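The prototype added above attaches _X_ATTRIBUTE_PRINTF(2, 3), a wrapper around the compiler's format(printf) attribute, to intel_debug_fallback() so that the format string and variadic arguments are type-checked at every call site; putting the attribute on a forward declaration immediately before the definition is the usual way to annotate a static inline function. A reduced sketch of the same pattern, assuming GCC/Clang attribute semantics:

#include <stdarg.h>
#include <stdio.h>

/* Declare first so the format attribute covers the definition below. */
static inline void debug_msg(int level, const char *fmt, ...)
	__attribute__((format(printf, 2, 3)));

static inline void debug_msg(int level, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	if (level > 0)
		vfprintf(stderr, fmt, ap);
	va_end(ap);
}

int main(void)
{
	debug_msg(1, "gen=%d\n", 070);
	/* debug_msg(1, "gen=%d\n", "oops"); would now warn at compile time */
	return 0;
}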
diff --git a/src/intel_batchbuffer.c b/src/intel_batchbuffer.c
index 46f22bc36..a44a15632 100644
--- a/src/intel_batchbuffer.c
+++ b/src/intel_batchbuffer.c
@@ -67,17 +67,26 @@ void intel_next_vertex(intel_screen_private *intel)
dri_bo_alloc(intel->bufmgr, "vertex", sizeof (intel->vertex_ptr), 4096);
}
-static void intel_next_batch(ScrnInfoPtr scrn)
+static dri_bo *bo_alloc(ScrnInfoPtr scrn)
{
intel_screen_private *intel = intel_get_screen_private(scrn);
-
+ int size = 4 * 4096;
/* The 865 has issues with larger-than-page-sized batch buffers. */
if (IS_I865G(intel))
- intel->batch_bo =
- dri_bo_alloc(intel->bufmgr, "batch", 4096, 4096);
- else
- intel->batch_bo =
- dri_bo_alloc(intel->bufmgr, "batch", 4096 * 4, 4096);
+ size = 4096;
+ return dri_bo_alloc(intel->bufmgr, "batch", size, 4096);
+}
+
+static void intel_next_batch(ScrnInfoPtr scrn, int mode)
+{
+ intel_screen_private *intel = intel_get_screen_private(scrn);
+ dri_bo *tmp;
+
+ drm_intel_gem_bo_clear_relocs(intel->batch_bo, 0);
+
+ tmp = intel->last_batch_bo[mode];
+ intel->last_batch_bo[mode] = intel->batch_bo;
+ intel->batch_bo = tmp;
intel->batch_used = 0;
@@ -95,12 +104,25 @@ void intel_batch_init(ScrnInfoPtr scrn)
intel->batch_emitting = 0;
intel->vertex_id = 0;
- intel_next_batch(scrn);
+ intel->last_batch_bo[0] = bo_alloc(scrn);
+ intel->last_batch_bo[1] = bo_alloc(scrn);
+
+ intel->batch_bo = bo_alloc(scrn);
+ intel->batch_used = 0;
+ intel->last_3d = LAST_3D_OTHER;
}
void intel_batch_teardown(ScrnInfoPtr scrn)
{
intel_screen_private *intel = intel_get_screen_private(scrn);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(intel->last_batch_bo); i++) {
+ if (intel->last_batch_bo[i] != NULL) {
+ dri_bo_unreference(intel->last_batch_bo[i]);
+ intel->last_batch_bo[i] = NULL;
+ }
+ }
if (intel->batch_bo != NULL) {
dri_bo_unreference(intel->batch_bo);
@@ -162,7 +184,7 @@ void intel_batch_emit_flush(ScrnInfoPtr scrn)
assert (!intel->in_batch_atomic);
/* Big hammer, look to the pipelined flushes in future. */
- if ((INTEL_INFO(intel)->gen >= 60)) {
+ if ((INTEL_INFO(intel)->gen >= 060)) {
if (intel->current_batch == BLT_BATCH) {
BEGIN_BATCH_BLT(4);
OUT_BATCH(MI_FLUSH_DW | 2);
@@ -171,7 +193,7 @@ void intel_batch_emit_flush(ScrnInfoPtr scrn)
OUT_BATCH(0);
ADVANCE_BATCH();
} else {
- if ((INTEL_INFO(intel)->gen == 60)) {
+ if ((INTEL_INFO(intel)->gen == 060)) {
/* HW-Workaround for Sandybridge */
intel_emit_post_sync_nonzero_flush(scrn);
} else {
@@ -187,7 +209,7 @@ void intel_batch_emit_flush(ScrnInfoPtr scrn)
}
} else {
flags = MI_WRITE_DIRTY_STATE | MI_INVALIDATE_MAP_CACHE;
- if (INTEL_INFO(intel)->gen >= 40)
+ if (INTEL_INFO(intel)->gen >= 040)
flags = 0;
BEGIN_BATCH(1);
@@ -239,22 +261,21 @@ void intel_batch_submit(ScrnInfoPtr scrn)
}
if (ret != 0) {
- if (ret == -EIO) {
- static int once;
-
- /* The GPU has hung and unlikely to recover by this point. */
- if (!once) {
+ static int once;
+ if (!once) {
+ if (ret == -EIO) {
+ /* The GPU has hung and is unlikely to recover by this point. */
xf86DrvMsg(scrn->scrnIndex, X_ERROR, "Detected a hung GPU, disabling acceleration.\n");
xf86DrvMsg(scrn->scrnIndex, X_ERROR, "When reporting this, please include i915_error_state from debugfs and the full dmesg.\n");
- uxa_set_force_fallback(xf86ScrnToScreen(scrn), TRUE);
- intel->force_fallback = TRUE;
- once = 1;
+ } else {
+ /* The driver is broken. */
+ xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+ "Failed to submit batch buffer, expect rendering corruption: %s.\n",
+ strerror(-ret));
}
- } else {
- xf86DrvMsg(scrn->scrnIndex, X_ERROR,
- "Failed to submit batch buffer, expect rendering corruption "
- "or even a frozen display: %s.\n",
- strerror(-ret));
+ uxa_set_force_fallback(xf86ScrnToScreen(scrn), TRUE);
+ intel->force_fallback = TRUE;
+ once = 1;
}
}
@@ -273,8 +294,7 @@ void intel_batch_submit(ScrnInfoPtr scrn)
if (intel->debug_flush & DEBUG_FLUSH_WAIT)
drm_intel_bo_wait_rendering(intel->batch_bo);
- dri_bo_unreference(intel->batch_bo);
- intel_next_batch(scrn);
+ intel_next_batch(scrn, intel->current_batch == I915_EXEC_BLT);
if (intel->batch_commit_notify)
intel->batch_commit_notify(intel);
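Rather than unreferencing the finished batch buffer and allocating a fresh one on every submit, intel_next_batch() now keeps one spare buffer object per ring (render and BLT), swaps the retiring batch with the spare, and clears its stale relocation entries via drm_intel_gem_bo_clear_relocs(). That avoids a kernel allocation round trip per batch while keeping the two rings from trampling each other's buffers. A minimal sketch of the rotation with the buffer objects reduced to plain structs (the libdrm calls are assumed, not shown):

#include <stdio.h>

struct bo { int id; };

static struct bo *batch, *spare[2];	/* [0] render, [1] blt */

static struct bo *swap_batch(int ring)
{
	struct bo *next = spare[ring];

	/* The retiring batch becomes the spare; in the driver its
	 * relocations are cleared with drm_intel_gem_bo_clear_relocs(). */
	spare[ring] = batch;
	return next;
}

int main(void)
{
	struct bo a = {0}, b = {1}, c = {2};

	batch = &a; spare[0] = &b; spare[1] = &c;
	batch = swap_batch(0);	/* render submit: a and b trade places */
	printf("batch=%d spare[0]=%d\n", batch->id, spare[0]->id);
	return 0;
}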
diff --git a/src/intel_display.c b/src/intel_display.c
index d58e6e0b6..5ee955ee6 100644
--- a/src/intel_display.c
+++ b/src/intel_display.c
@@ -31,6 +31,7 @@
#include <sys/types.h>
#include <sys/stat.h>
+#include <sys/poll.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
@@ -270,6 +271,7 @@ intel_output_backlight_init(xf86OutputPtr output)
intel_output->backlight_iface = str;
intel_output->backlight_max = intel_output_backlight_get_max(output);
if (intel_output->backlight_max > 0) {
+ intel_output->backlight_active_level = intel_output_backlight_get(output);
xf86DrvMsg(output->scrn->scrnIndex, X_CONFIG,
"found backlight control interface %s\n", path);
return;
@@ -493,6 +495,8 @@ intel_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
ErrorF("failed to add fb\n");
return FALSE;
}
+
+ drm_intel_bo_disable_reuse(intel->front_buffer);
}
saved_mode = crtc->mode;
@@ -597,6 +601,8 @@ intel_crtc_shadow_allocate(xf86CrtcPtr crtc, int width, int height)
return NULL;
}
+ drm_intel_bo_disable_reuse(intel_crtc->rotate_bo);
+
intel_crtc->rotate_pitch = rotate_pitch;
return intel_crtc->rotate_bo;
}
@@ -723,6 +729,8 @@ intel_set_scanout_pixmap(xf86CrtcPtr crtc, PixmapPtr ppix)
ErrorF("have front buffer\n");
}
+ drm_intel_bo_disable_reuse(bo);
+
intel_crtc->scanout_pixmap = ppix;
return drmModeAddFB(intel->drmSubFD, ppix->drawable.width,
ppix->drawable.height, ppix->drawable.depth,
@@ -1494,6 +1502,7 @@ intel_xf86crtc_resize(ScrnInfoPtr scrn, int width, int height)
if (ret)
goto fail;
+ drm_intel_bo_disable_reuse(intel->front_buffer);
intel->front_pitch = pitch;
intel->front_tiling = tiling;
@@ -1555,6 +1564,7 @@ intel_do_pageflip(intel_screen_private *intel,
new_front->handle, &new_fb_id))
goto error_out;
+ drm_intel_bo_disable_reuse(new_front);
intel_glamor_flush(intel);
intel_batch_submit(scrn);
@@ -1822,6 +1832,26 @@ intel_mode_remove_fb(intel_screen_private *intel)
}
}
+static Bool has_pending_events(int fd)
+{
+ struct pollfd pfd;
+ pfd.fd = fd;
+ pfd.events = POLLIN;
+ return poll(&pfd, 1, 0) == 1;
+}
+
+void
+intel_mode_close(intel_screen_private *intel)
+{
+ struct intel_mode *mode = intel->modes;
+
+ if (mode == NULL)
+ return;
+
+ while (has_pending_events(mode->fd))
+ drmHandleEvent(mode->fd, &mode->event_context);
+}
+
void
intel_mode_fini(intel_screen_private *intel)
{
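intel_mode_close() drains any vblank or page-flip events still queued on the DRM file descriptor before the screen is torn down, so that no handler fires later against freed state; poll() with a zero timeout makes the check non-blocking. A self-contained sketch of the zero-timeout drain idiom, with an ordinary read() standing in for drmHandleEvent():

#include <poll.h>
#include <unistd.h>

/* Non-blocking: returns 1 only while fd has data ready right now. */
static int fd_has_pending(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	return poll(&pfd, 1, 0) == 1;
}

static void drain(int fd)
{
	char buf[256];

	while (fd_has_pending(fd))
		if (read(fd, buf, sizeof(buf)) <= 0)	/* drmHandleEvent() */
			break;
}

int main(void)
{
	int p[2];

	if (pipe(p) == 0) {
		write(p[1], "x", 1);
		drain(p[0]);
	}
	return 0;
}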
diff --git a/src/intel_dri.c b/src/intel_dri.c
index 867a4653f..f3512034a 100644
--- a/src/intel_dri.c
+++ b/src/intel_dri.c
@@ -451,7 +451,7 @@ I830DRI2CopyRegion(DrawablePtr drawable, RegionPtr pRegion,
/* Wait for the scanline to be outside the region to be copied */
if (scrn->vtSema &&
pixmap_is_scanout(get_drawable_pixmap(dst)) &&
- intel->swapbuffers_wait && INTEL_INFO(intel)->gen < 60) {
+ intel->swapbuffers_wait && INTEL_INFO(intel)->gen < 060) {
BoxPtr box;
BoxRec crtcbox;
int y1, y2;
@@ -485,20 +485,20 @@ I830DRI2CopyRegion(DrawablePtr drawable, RegionPtr pRegion,
* of extra time for the blitter to start up and
* do its job for a full height blit
*/
- if (full_height && INTEL_INFO(intel)->gen < 40)
+ if (full_height && INTEL_INFO(intel)->gen < 040)
y2 -= 2;
if (pipe == 0) {
event = MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW;
load_scan_lines_pipe =
MI_LOAD_SCAN_LINES_DISPLAY_PIPEA;
- if (full_height && INTEL_INFO(intel)->gen >= 40)
+ if (full_height && INTEL_INFO(intel)->gen >= 040)
event = MI_WAIT_FOR_PIPEA_SVBLANK;
} else {
event = MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
load_scan_lines_pipe =
MI_LOAD_SCAN_LINES_DISPLAY_PIPEB;
- if (full_height && INTEL_INFO(intel)->gen >= 40)
+ if (full_height && INTEL_INFO(intel)->gen >= 040)
event = MI_WAIT_FOR_PIPEB_SVBLANK;
}
@@ -547,6 +547,23 @@ I830DRI2CopyRegion(DrawablePtr drawable, RegionPtr pRegion,
intel_batch_submit(scrn);
}
+static void
+I830DRI2FallbackBlitSwap(DrawablePtr drawable,
+ DRI2BufferPtr dst,
+ DRI2BufferPtr src)
+{
+ BoxRec box;
+ RegionRec region;
+
+ box.x1 = 0;
+ box.y1 = 0;
+ box.x2 = drawable->width;
+ box.y2 = drawable->height;
+ REGION_INIT(pScreen, &region, &box, 0);
+
+ I830DRI2CopyRegion(drawable, &region, dst, src);
+}
+
#if DRI2INFOREC_VERSION >= 4
static void I830DRI2ReferenceBuffer(DRI2Buffer2Ptr buffer)
@@ -996,17 +1013,8 @@ void I830DRI2FrameEventHandler(unsigned int frame, unsigned int tv_sec,
/* else fall through to exchange/blit */
case DRI2_SWAP: {
- BoxRec box;
- RegionRec region;
-
- box.x1 = 0;
- box.y1 = 0;
- box.x2 = drawable->width;
- box.y2 = drawable->height;
- REGION_INIT(pScreen, &region, &box, 0);
-
- I830DRI2CopyRegion(drawable,
- &region, swap_info->front, swap_info->back);
+ I830DRI2FallbackBlitSwap(drawable,
+ swap_info->front, swap_info->back);
DRI2SwapComplete(swap_info->client, drawable, frame, tv_sec, tv_usec,
DRI2_BLIT_COMPLETE,
swap_info->client ? swap_info->event_complete : NULL,
@@ -1089,17 +1097,10 @@ void I830DRI2FlipEventHandler(unsigned int frame, unsigned int tv_sec,
i830_dri2_del_frame_event(chain_drawable, chain);
} else if (!can_exchange(chain_drawable, chain->front, chain->back) ||
!I830DRI2ScheduleFlip(intel, chain_drawable, chain)) {
- BoxRec box;
- RegionRec region;
-
- box.x1 = 0;
- box.y1 = 0;
- box.x2 = chain_drawable->width;
- box.y2 = chain_drawable->height;
- REGION_INIT(pScreen, &region, &box, 0);
+ I830DRI2FallbackBlitSwap(chain_drawable,
+ chain->front,
+ chain->back);
- I830DRI2CopyRegion(chain_drawable, &region,
- chain->front, chain->back);
DRI2SwapComplete(chain->client, chain_drawable, frame, tv_sec, tv_usec,
DRI2_BLIT_COMPLETE,
chain->client ? chain->event_complete : NULL,
@@ -1162,8 +1163,6 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
DRI2FrameEventPtr swap_info = NULL;
enum DRI2FrameEventType swap_type = DRI2_SWAP;
CARD64 current_msc;
- BoxRec box;
- RegionRec region;
/* Drawable not displayed... just complete the swap */
if (pipe == -1)
@@ -1231,7 +1230,13 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
* the swap.
*/
if (divisor == 0 || current_msc < *target_msc) {
- if (flip && I830DRI2ScheduleFlip(intel, draw, swap_info))
+ /*
+ * If we can, schedule the flip directly from here rather
+ * than waiting for an event from the kernel for the current
+ * (or a past) MSC.
+ */
+ if (flip && divisor == 0 && current_msc >= *target_msc &&
+ I830DRI2ScheduleFlip(intel, draw, swap_info))
return TRUE;
vbl.request.type =
@@ -1313,14 +1318,7 @@ I830DRI2ScheduleSwap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
return TRUE;
blit_fallback:
- box.x1 = 0;
- box.y1 = 0;
- box.x2 = draw->width;
- box.y2 = draw->height;
- REGION_INIT(pScreen, &region, &box, 0);
-
- I830DRI2CopyRegion(draw, &region, front, back);
-
+ I830DRI2FallbackBlitSwap(draw, front, back);
DRI2SwapComplete(client, draw, 0, 0, 0, DRI2_BLIT_COMPLETE, func, data);
if (swap_info)
i830_dri2_del_frame_event(draw, swap_info);
@@ -1515,6 +1513,17 @@ out_complete:
static int dri2_server_generation;
#endif
+static const char *dri_driver_name(intel_screen_private *intel)
+{
+ const char *s = xf86GetOptValString(intel->Options, OPTION_DRI);
+ Bool dummy;
+
+ if (s == NULL || xf86getBoolValue(&dummy, s))
+ return INTEL_INFO(intel)->gen < 040 ? "i915" : "i965";
+
+ return s;
+}
+
Bool I830DRI2ScreenInit(ScreenPtr screen)
{
ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
@@ -1564,7 +1573,7 @@ Bool I830DRI2ScreenInit(ScreenPtr screen)
intel->deviceName = drmGetDeviceNameFromFd(intel->drmSubFD);
memset(&info, '\0', sizeof(info));
info.fd = intel->drmSubFD;
- info.driverName = INTEL_INFO(intel)->gen < 40 ? "i915" : "i965";
+ info.driverName = dri_driver_name(intel);
info.deviceName = intel->deviceName;
#if DRI2INFOREC_VERSION == 1
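With OPTION_DRI now parsed as a string (see intel_options.c below), dri_driver_name() keeps the old behaviour for boolean-style values, falling back to the gen-based choice between i915 and i965, while any other string is taken verbatim as the DRI driver name. A small sketch of that decision, where parse_bool() stands in for xf86getBoolValue() and recognises only a few spellings for brevity:

#include <stdio.h>
#include <strings.h>

static int parse_bool(const char *s)	/* stand-in for xf86getBoolValue() */
{
	return !strcasecmp(s, "true") || !strcasecmp(s, "false") ||
	       !strcasecmp(s, "on")   || !strcasecmp(s, "off");
}

static const char *dri_name(const char *opt, int gen)
{
	if (opt == NULL || parse_bool(opt))
		return gen < 040 ? "i915" : "i965";
	return opt;	/* e.g. Option "DRI" "i915" forces a driver */
}

int main(void)
{
	printf("%s\n", dri_name(NULL, 030));	/* i915 */
	printf("%s\n", dri_name("true", 075));	/* i965 */
	printf("%s\n", dri_name("i915", 075));	/* i915, forced */
	return 0;
}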
diff --git a/src/intel_driver.c b/src/intel_driver.c
index 65a50088e..780710624 100644
--- a/src/intel_driver.c
+++ b/src/intel_driver.c
@@ -221,11 +221,19 @@ static Bool I830GetEarlyOptions(ScrnInfoPtr scrn)
return TRUE;
}
+static Bool intel_option_cast_string_to_bool(intel_screen_private *intel,
+ int id, Bool val)
+{
+ xf86getBoolValue(&val, xf86GetOptValString(intel->Options, id));
+ return val;
+}
+
static void intel_check_dri_option(ScrnInfoPtr scrn)
{
intel_screen_private *intel = intel_get_screen_private(scrn);
+
intel->directRenderingType = DRI_NONE;
- if (!xf86ReturnOptValBool(intel->Options, OPTION_DRI, TRUE))
+ if (!intel_option_cast_string_to_bool(intel, OPTION_DRI, TRUE))
intel->directRenderingType = DRI_DISABLED;
if (scrn->depth != 16 && scrn->depth != 24 && scrn->depth != 30) {
@@ -317,7 +325,7 @@ static int intel_init_bufmgr(intel_screen_private *intel)
list_init(&intel->batch_pixmaps);
- if ((INTEL_INFO(intel)->gen == 60)) {
+ if ((INTEL_INFO(intel)->gen == 060)) {
intel->wa_scratch_bo =
drm_intel_bo_alloc(intel->bufmgr, "wa scratch",
4096, 4096);
@@ -397,13 +405,14 @@ static Bool can_accelerate_blt(struct intel_screen_private *intel)
if (INTEL_INFO(intel)->gen == -1)
return FALSE;
- if (xf86ReturnOptValBool(intel->Options, OPTION_ACCEL_DISABLE, FALSE)) {
+ if (xf86ReturnOptValBool(intel->Options, OPTION_ACCEL_DISABLE, FALSE) ||
+ !intel_option_cast_string_to_bool(intel, OPTION_ACCEL_METHOD, TRUE)) {
xf86DrvMsg(intel->scrn->scrnIndex, X_CONFIG,
"Disabling hardware acceleration.\n");
return FALSE;
}
- if (INTEL_INFO(intel)->gen == 60) {
+ if (INTEL_INFO(intel)->gen == 060) {
struct pci_device *const device = intel->PciInfo;
/* Sandybridge rev07 locks up easily, even with the
@@ -418,7 +427,7 @@ static Bool can_accelerate_blt(struct intel_screen_private *intel)
}
}
- if (INTEL_INFO(intel)->gen >= 60) {
+ if (INTEL_INFO(intel)->gen >= 060) {
drm_i915_getparam_t gp;
int value;
@@ -579,7 +588,7 @@ static Bool I830PreInit(ScrnInfoPtr scrn, int flags)
intel->has_relaxed_fencing =
xf86ReturnOptValBool(intel->Options,
OPTION_RELAXED_FENCING,
- INTEL_INFO(intel)->gen >= 33);
+ INTEL_INFO(intel)->gen >= 033);
/* And override the user if there is no kernel support */
if (intel->has_relaxed_fencing)
intel->has_relaxed_fencing = has_relaxed_fencing(intel);
@@ -677,7 +686,7 @@ void IntelEmitInvarientState(ScrnInfoPtr scrn)
}
#ifdef INTEL_PIXMAP_SHARING
-static Bool
+static void
redisplay_dirty(ScreenPtr screen, PixmapDirtyUpdatePtr dirty)
{
ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
@@ -686,8 +695,19 @@ redisplay_dirty(ScreenPtr screen, PixmapDirtyUpdatePtr dirty)
int was_blocked;
PixmapRegionInit(&pixregion, dirty->slave_dst->master_pixmap);
+ RegionTranslate(&pixregion, dirty->x, dirty->y);
+ RegionIntersect(&pixregion, &pixregion, DamageRegion(dirty->damage));
+ RegionTranslate(&pixregion, -dirty->x, -dirty->y);
+ was_blocked = RegionNil(&pixregion);
+ DamageRegionAppend(&dirty->slave_dst->drawable, &pixregion);
+ RegionUninit(&pixregion);
+ if (was_blocked)
+ return;
+ PixmapRegionInit(&pixregion, dirty->slave_dst->master_pixmap);
PixmapSyncDirtyHelper(dirty, &pixregion);
+ RegionUninit(&pixregion);
+
intel_batch_submit(scrn);
if (!intel->has_prime_vmap_flush) {
drm_intel_bo *bo = intel_get_pixmap_bo(dirty->slave_dst->master_pixmap);
@@ -695,10 +715,10 @@ redisplay_dirty(ScreenPtr screen, PixmapDirtyUpdatePtr dirty)
drm_intel_bo_map(bo, FALSE);
drm_intel_bo_unmap(bo);
xf86UnblockSIGIO(was_blocked);
- }
- DamageRegionAppend(&dirty->slave_dst->drawable, &pixregion);
- RegionUninit(&pixregion);
- return 0;
+ }
+
+ DamageRegionProcessPending(&dirty->slave_dst->drawable);
+ return;
}
static void
@@ -710,7 +730,6 @@ intel_dirty_update(ScreenPtr screen)
if (xorg_list_is_empty(&screen->pixmap_dirty_list))
return;
- ErrorF("list is not empty\n");
xorg_list_for_each_entry(ent, &screen->pixmap_dirty_list, ent) {
region = DamageRegion(ent->damage);
if (RegionNotEmpty(region)) {
@@ -921,7 +940,7 @@ I830ScreenInit(SCREEN_INIT_ARGS_DECL)
intel_batch_init(scrn);
- if (INTEL_INFO(intel)->gen >= 40)
+ if (INTEL_INFO(intel)->gen >= 040)
gen4_render_state_init(scrn);
miClearVisualTypes();
@@ -1014,7 +1033,7 @@ I830ScreenInit(SCREEN_INIT_ARGS_DECL)
xf86DPMSInit(screen, xf86DPMSSet, 0);
#ifdef INTEL_XVMC
- if (INTEL_INFO(intel)->gen >= 40)
+ if (INTEL_INFO(intel)->gen >= 040)
intel->XvMCEnabled = TRUE;
from = ((intel->directRenderingType == DRI_DRI2) &&
xf86GetOptValBool(intel->Options, OPTION_XVMC,
@@ -1139,6 +1158,8 @@ static Bool I830CloseScreen(CLOSE_SCREEN_ARGS_DECL)
I830UeventFini(scrn);
#endif
+ intel_mode_close(intel);
+
DeleteCallback(&FlushCallback, intel_flush_callback, scrn);
intel_glamor_close_screen(screen);
@@ -1174,7 +1195,7 @@ static Bool I830CloseScreen(CLOSE_SCREEN_ARGS_DECL)
intel_batch_teardown(scrn);
- if (INTEL_INFO(intel)->gen >= 40)
+ if (INTEL_INFO(intel)->gen >= 040)
gen4_render_state_cleanup(scrn);
xf86_cursors_fini(screen);
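redisplay_dirty() used to push the entire slave pixmap on every call; it now translates the pixmap region into damage coordinates, intersects it with the accumulated damage, translates back, and returns early when nothing overlaps, so only genuinely dirty rows are copied and flushed. A reduced sketch of the translate/intersect test using single rectangles in place of full regions:

#include <stdio.h>

struct box { int x1, y1, x2, y2; };

static struct box intersect(struct box a, struct box b)
{
	struct box r = {
		a.x1 > b.x1 ? a.x1 : b.x1, a.y1 > b.y1 ? a.y1 : b.y1,
		a.x2 < b.x2 ? a.x2 : b.x2, a.y2 < b.y2 ? a.y2 : b.y2,
	};

	if (r.x1 >= r.x2 || r.y1 >= r.y2)
		r = (struct box){0, 0, 0, 0};	/* empty: skip the sync */
	return r;
}

int main(void)
{
	struct box pixmap = {0, 0, 1024, 768};	/* dirty->x/y assumed 0 */
	struct box damage = {100, 100, 200, 200};
	struct box r = intersect(pixmap, damage);

	printf("update %dx%d\n", r.x2 - r.x1, r.y2 - r.y1);	/* 100x100 */
	return 0;
}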
diff --git a/src/intel_driver.h b/src/intel_driver.h
index b7190620d..c98025bac 100644
--- a/src/intel_driver.h
+++ b/src/intel_driver.h
@@ -230,6 +230,9 @@
#define PCI_CHIP_HASWELL_CRW_S_GT2_PLUS 0x0D3A
#define PCI_CHIP_VALLEYVIEW_PO 0x0f30
+#define PCI_CHIP_VALLEYVIEW_1 0x0f31
+#define PCI_CHIP_VALLEYVIEW_2 0x0f32
+#define PCI_CHIP_VALLEYVIEW_3 0x0f33
#endif
@@ -249,7 +252,7 @@
#define CHIP_REVISION(p) (p)->revision
#define INTEL_INFO(intel) ((intel)->info)
-#define IS_GENx(intel, X) (INTEL_INFO(intel)->gen >= 10*(X) && INTEL_INFO(intel)->gen < 10*((X)+1))
+#define IS_GENx(intel, X) (INTEL_INFO(intel)->gen >= 8*(X) && INTEL_INFO(intel)->gen < 8*((X)+1))
#define IS_GEN1(intel) IS_GENx(intel, 1)
#define IS_GEN2(intel) IS_GENx(intel, 2)
#define IS_GEN3(intel) IS_GENx(intel, 3)
@@ -257,7 +260,7 @@
#define IS_GEN5(intel) IS_GENx(intel, 5)
#define IS_GEN6(intel) IS_GENx(intel, 6)
#define IS_GEN7(intel) IS_GENx(intel, 7)
-#define IS_HSW(intel) (INTEL_INFO(intel)->gen == 75)
+#define IS_HSW(intel) (INTEL_INFO(intel)->gen == 075)
/* Some chips have specific errata (or limits) that we need to workaround. */
#define IS_I830(intel) (DEVICE_ID((intel)->PciInfo) == PCI_CHIP_I830_M)
@@ -270,8 +273,8 @@
#define IS_965_Q(pI810) (DEVICE_ID(pI810->PciInfo) == PCI_CHIP_I965_Q)
/* supports Y tiled surfaces (pre-965 Mesa isn't ready yet) */
-#define SUPPORTS_YTILING(pI810) (INTEL_INFO(intel)->gen >= 40)
-#define HAS_BLT(pI810) (INTEL_INFO(intel)->gen >= 60)
+#define SUPPORTS_YTILING(pI810) (INTEL_INFO(intel)->gen >= 040)
+#define HAS_BLT(pI810) (INTEL_INFO(intel)->gen >= 060)
struct intel_device_info {
int gen;
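This IS_GENx() rework is what forces the octal literals throughout the patch: with eight minor steps per generation, 070 through 077 are all gen7, which lets Ivybridge (070), the new ValleyView IDs (071) and Haswell (075) share IS_GEN7 while IS_HSW remains an exact match on 075. A standalone check of the arithmetic:

#include <assert.h>

#define IS_GENx(gen, X) ((gen) >= 8*(X) && (gen) < 8*((X)+1))

int main(void)
{
	assert(IS_GENx(070, 7));	/* Ivybridge, decimal 56 */
	assert(IS_GENx(071, 7));	/* ValleyView, decimal 57 */
	assert(IS_GENx(075, 7));	/* Haswell, decimal 61 */
	assert(IS_GENx(045, 4));	/* G4x, decimal 37 */
	assert(!IS_GENx(075, 6));	/* and Haswell is not gen6 */
	return 0;
}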
diff --git a/src/intel_hwmc.c b/src/intel_hwmc.c
index af8bd8134..25978d22a 100644
--- a/src/intel_hwmc.c
+++ b/src/intel_hwmc.c
@@ -75,11 +75,11 @@ static int create_context(ScrnInfoPtr scrn, XvMCContextPtr pContext,
contextRec->type = XVMC_I915_MPEG2_MC;
contextRec->i915.use_phys_addr = 0;
} else {
- if (INTEL_INFO(intel)->gen >= 45)
+ if (INTEL_INFO(intel)->gen >= 045)
contextRec->type = XVMC_I965_MPEG2_VLD;
else
contextRec->type = XVMC_I965_MPEG2_MC;
- contextRec->i965.is_g4x = INTEL_INFO(intel)->gen == 45;
+ contextRec->i965.is_g4x = INTEL_INFO(intel)->gen == 045;
contextRec->i965.is_965_q = IS_965_Q(intel);
contextRec->i965.is_igdng = IS_GEN5(intel);
}
@@ -227,7 +227,7 @@ Bool intel_xvmc_adaptor_init(ScreenPtr pScreen)
name = "i915_xvmc",
pAdapt->num_surfaces = ARRAY_SIZE(surface_info_i915);
pAdapt->surfaces = surface_info_i915;
- } else if (INTEL_INFO(intel)->gen >= 45) {
+ } else if (INTEL_INFO(intel)->gen >= 045) {
name = "xvmc_vld",
pAdapt->num_surfaces = ARRAY_SIZE(surface_info_vld);
pAdapt->surfaces = surface_info_vld;
diff --git a/src/intel_memory.c b/src/intel_memory.c
index f08ebdd01..e51fa33a9 100644
--- a/src/intel_memory.c
+++ b/src/intel_memory.c
@@ -94,7 +94,7 @@ unsigned long intel_get_fence_size(intel_screen_private *intel, unsigned long si
unsigned long i;
unsigned long start;
- if (INTEL_INFO(intel)->gen >= 40 || intel->has_relaxed_fencing) {
+ if (INTEL_INFO(intel)->gen >= 040 || intel->has_relaxed_fencing) {
/* The 965 can have fences at any page boundary. */
return ALIGN(size, 4096);
} else {
@@ -127,7 +127,7 @@ intel_get_fence_pitch(intel_screen_private *intel, unsigned long pitch,
return pitch;
/* 965+ is flexible */
- if (INTEL_INFO(intel)->gen >= 40)
+ if (INTEL_INFO(intel)->gen >= 040)
return ALIGN(pitch, tile_width);
/* Pre-965 needs power of two tile width */
@@ -173,7 +173,7 @@ static inline int intel_pad_drawable_width(int width)
static size_t
agp_aperture_size(struct pci_device *dev, int gen)
{
- return dev->regions[gen < 30 ? 0 : 2].size;
+ return dev->regions[gen < 030 ? 0 : 2].size;
}
static void intel_set_gem_max_sizes(ScrnInfoPtr scrn)
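intel_get_fence_size() needs only page alignment on gen4 and later (or with relaxed fencing), since 965-class hardware can place fences at any page boundary; older parts instead round the object up to a power-of-two fence region, which the loop elided from this hunk computes. A sketch of the two policies, with the pre-965 branch written as an assumption about that elided loop:

#include <stdio.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((unsigned long)(a) - 1))

static unsigned long fence_size(int gen, unsigned long size)
{
	unsigned long i;

	if (gen >= 040)
		return ALIGN(size, 4096);	/* any page boundary */

	/* Assumed pre-965 behaviour: next power of two. */
	for (i = 4096; i < size; i <<= 1)
		;
	return i;
}

int main(void)
{
	printf("%lu %lu\n", fence_size(040, 150000), fence_size(030, 150000));
	/* 151552 (37 pages) vs 262144 (256 KiB) */
	return 0;
}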
diff --git a/src/intel_module.c b/src/intel_module.c
index e6ca964d6..141f77afe 100644
--- a/src/intel_module.c
+++ b/src/intel_module.c
@@ -56,62 +56,62 @@ static const struct intel_device_info intel_generic_info = {
};
static const struct intel_device_info intel_i81x_info = {
- .gen = 10,
+ .gen = 010,
};
static const struct intel_device_info intel_i830_info = {
- .gen = 20,
+ .gen = 020,
};
static const struct intel_device_info intel_i845_info = {
- .gen = 20,
+ .gen = 020,
};
static const struct intel_device_info intel_i855_info = {
- .gen = 21,
+ .gen = 021,
};
static const struct intel_device_info intel_i865_info = {
- .gen = 22,
+ .gen = 022,
};
static const struct intel_device_info intel_i915_info = {
- .gen = 30,
+ .gen = 030,
};
static const struct intel_device_info intel_i945_info = {
- .gen = 31,
+ .gen = 031,
};
static const struct intel_device_info intel_g33_info = {
- .gen = 33,
+ .gen = 033,
};
static const struct intel_device_info intel_i965_info = {
- .gen = 40,
+ .gen = 040,
};
static const struct intel_device_info intel_g4x_info = {
- .gen = 45,
+ .gen = 045,
};
static const struct intel_device_info intel_ironlake_info = {
- .gen = 50,
+ .gen = 050,
};
static const struct intel_device_info intel_sandybridge_info = {
- .gen = 60,
+ .gen = 060,
};
static const struct intel_device_info intel_ivybridge_info = {
- .gen = 70,
+ .gen = 070,
};
static const struct intel_device_info intel_valleyview_info = {
- .gen = 70,
+ .gen = 071,
};
static const struct intel_device_info intel_haswell_info = {
- .gen = 75,
+ .gen = 075,
};
-static const SymTabRec _intel_chipsets[] = {
+static const SymTabRec intel_chipsets[] = {
{PCI_CHIP_I810, "i810"},
{PCI_CHIP_I810_DC100, "i810-dc100"},
{PCI_CHIP_I810_E, "i810e"},
@@ -199,9 +199,7 @@ static const SymTabRec _intel_chipsets[] = {
{PCI_CHIP_VALLEYVIEW_PO, "ValleyView PO board" },
{-1, NULL}
};
-#define NUM_CHIPSETS (sizeof(_intel_chipsets) / sizeof(_intel_chipsets[0]))
-
-static SymTabRec *intel_chipsets = (SymTabRec *) _intel_chipsets;
+#define NUM_CHIPSETS (sizeof(intel_chipsets) / sizeof(intel_chipsets[0]))
#define INTEL_DEVICE_MATCH(d,i) \
{ 0x8086, (d), PCI_MATCH_ANY, PCI_MATCH_ANY, 0x3 << 16, 0xff << 16, (intptr_t)(i) }
@@ -308,6 +306,9 @@ static const struct pci_id_match intel_device_match[] = {
INTEL_DEVICE_MATCH (PCI_CHIP_HASWELL_CRW_S_GT2_PLUS, &intel_haswell_info ),
INTEL_DEVICE_MATCH (PCI_CHIP_VALLEYVIEW_PO, &intel_valleyview_info ),
+ INTEL_DEVICE_MATCH (PCI_CHIP_VALLEYVIEW_1, &intel_valleyview_info ),
+ INTEL_DEVICE_MATCH (PCI_CHIP_VALLEYVIEW_2, &intel_valleyview_info ),
+ INTEL_DEVICE_MATCH (PCI_CHIP_VALLEYVIEW_3, &intel_valleyview_info ),
INTEL_DEVICE_MATCH (PCI_MATCH_ANY, &intel_generic_info ),
#endif
@@ -383,7 +384,7 @@ static Bool intel_driver_func(ScrnInfoPtr pScrn,
}
}
-static Bool has_kernel_mode_setting(struct pci_device *dev)
+static Bool has_kernel_mode_setting(const struct pci_device *dev)
{
char id[20];
int ret, fd;
@@ -418,7 +419,6 @@ static Bool has_kernel_mode_setting(struct pci_device *dev)
if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
ret = FALSE;
}
-
close(fd);
}
@@ -465,50 +465,15 @@ static enum accel_method { UXA, SNA } get_accel_method(void)
}
#endif
-/*
- * intel_pci_probe --
- *
- * Look through the PCI bus to find cards that are intel boards.
- * Setup the dispatch table for the rest of the driver functions.
- *
- */
-static Bool intel_pci_probe(DriverPtr driver,
- int entity_num,
- struct pci_device *device,
- intptr_t match_data)
+static Bool
+intel_scrn_create(DriverPtr driver,
+ int entity_num,
+ intptr_t match_data,
+ unsigned flags)
{
ScrnInfoPtr scrn;
- PciChipsets intel_pci_chipsets[NUM_CHIPSETS];
- unsigned i;
-
- if (!has_kernel_mode_setting(device)) {
-#if KMS_ONLY
- return FALSE;
-#else
- switch (DEVICE_ID(device)) {
- case PCI_CHIP_I810:
- case PCI_CHIP_I810_DC100:
- case PCI_CHIP_I810_E:
- case PCI_CHIP_I815:
- break;
- default:
- return FALSE;
- }
-#endif
- }
- for (i = 0; i < NUM_CHIPSETS; i++) {
- intel_pci_chipsets[i].numChipset = intel_chipsets[i].token;
- intel_pci_chipsets[i].PCIid = intel_chipsets[i].token;
-#if XORG_VERSION_CURRENT < XORG_VERSION_NUMERIC(1,6,99,0,0)
- intel_pci_chipsets[i].resList = RES_SHARED_VGA;
-#else
- intel_pci_chipsets[i].dummy = NULL;
-#endif
- }
-
- scrn = xf86ConfigPciEntity(NULL, 0, entity_num, intel_pci_chipsets,
- NULL, NULL, NULL, NULL, NULL);
+ scrn = xf86AllocateScreen(driver, flags);
if (scrn == NULL)
return FALSE;
@@ -518,14 +483,13 @@ static Bool intel_pci_probe(DriverPtr driver,
scrn->driverPrivate = (void *)(match_data | 1);
scrn->Probe = NULL;
+ if (xf86IsEntitySharable(entity_num))
+ xf86SetEntityShared(entity_num);
+ xf86AddEntityToScreen(scrn, entity_num);
+
#if !KMS_ONLY
- switch (DEVICE_ID(device)) {
- case PCI_CHIP_I810:
- case PCI_CHIP_I810_DC100:
- case PCI_CHIP_I810_E:
- case PCI_CHIP_I815:
+ if ((unsigned)((struct intel_device_info *)match_data)->gen < 020)
return lg_i810_init(scrn);
- }
#endif
#if !UMS_ONLY
@@ -533,7 +497,6 @@ static Bool intel_pci_probe(DriverPtr driver,
#if USE_SNA
case SNA: return sna_init_scrn(scrn, entity_num);
#endif
-
#if USE_UXA
case UXA: return intel_init_scrn(scrn);
#endif
@@ -545,6 +508,37 @@ static Bool intel_pci_probe(DriverPtr driver,
return FALSE;
}
+/*
+ * intel_pci_probe --
+ *
+ * Look through the PCI bus to find cards that are intel boards.
+ * Setup the dispatch table for the rest of the driver functions.
+ *
+ */
+static Bool intel_pci_probe(DriverPtr driver,
+ int entity_num,
+ struct pci_device *device,
+ intptr_t match_data)
+{
+ if (!has_kernel_mode_setting(device)) {
+#if KMS_ONLY
+ return FALSE;
+#else
+ switch (DEVICE_ID(device)) {
+ case PCI_CHIP_I810:
+ case PCI_CHIP_I810_DC100:
+ case PCI_CHIP_I810_E:
+ case PCI_CHIP_I815:
+ break;
+ default:
+ return FALSE;
+ }
+#endif
+ }
+
+ return intel_scrn_create(driver, entity_num, match_data, 0);
+}
+
#ifdef XSERVER_PLATFORM_BUS
static Bool
intel_platform_probe(DriverPtr driver,
@@ -552,13 +546,14 @@ intel_platform_probe(DriverPtr driver,
struct xf86_platform_device *dev,
intptr_t match_data)
{
- ScrnInfoPtr scrn = NULL;
- char *path = xf86_get_platform_device_attrib(dev, ODEV_ATTRIB_PATH);
unsigned scrn_flags = 0;
if (!dev->pdev)
return FALSE;
+ if (!has_kernel_mode_setting(dev->pdev))
+ return FALSE;
+
/* Allow ourselves to act as a slaved output if not primary */
if (flags & PLATFORM_PROBE_GPU_SCREEN) {
flags &= ~PLATFORM_PROBE_GPU_SCREEN;
@@ -569,37 +564,7 @@ intel_platform_probe(DriverPtr driver,
if (flags)
return FALSE;
- scrn = xf86AllocateScreen(driver, scrn_flags);
- if (scrn == NULL)
- return FALSE;
-
- scrn->driverVersion = INTEL_VERSION;
- scrn->driverName = INTEL_DRIVER_NAME;
- scrn->name = INTEL_NAME;
- scrn->driverPrivate = (void *)(match_data | 1);
- scrn->Probe = NULL;
-
- if (xf86IsEntitySharable(entity_num))
- xf86SetEntityShared(entity_num);
- xf86AddEntityToScreen(scrn, entity_num);
-
- xf86DrvMsg(scrn->scrnIndex, X_INFO,
- "using device path '%s'\n", path ? path : "Default device");
-
-#if !UMS_ONLY
- switch (get_accel_method()) {
-#if USE_SNA
- case SNA: return sna_init_scrn(scrn, entity_num);
-#endif
-#if USE_UXA
- case UXA: return intel_init_scrn(scrn);
-#endif
-
- default: break;
- }
-#endif
-
- return FALSE;
+ return intel_scrn_create(driver, entity_num, match_data, scrn_flags);
}
#endif
diff --git a/src/intel_options.c b/src/intel_options.c
index dcab9e729..fda2e8b0f 100644
--- a/src/intel_options.c
+++ b/src/intel_options.c
@@ -8,12 +8,13 @@ const OptionInfoRec intel_options[] = {
{OPTION_ACCEL_DISABLE, "NoAccel", OPTV_BOOLEAN, {0}, 0},
{OPTION_ACCEL_METHOD, "AccelMethod", OPTV_STRING, {0}, 0},
{OPTION_BACKLIGHT, "Backlight", OPTV_STRING, {0}, 0},
- {OPTION_DRI, "DRI", OPTV_BOOLEAN, {0}, 1},
+ {OPTION_DRI, "DRI", OPTV_STRING, {0}, 0},
{OPTION_COLOR_KEY, "ColorKey", OPTV_INTEGER, {0}, 0},
{OPTION_VIDEO_KEY, "VideoKey", OPTV_INTEGER, {0}, 0},
{OPTION_TILING_2D, "Tiling", OPTV_BOOLEAN, {0}, 1},
{OPTION_TILING_FB, "LinearFramebuffer", OPTV_BOOLEAN, {0}, 0},
{OPTION_SWAPBUFFERS_WAIT, "SwapbuffersWait", OPTV_BOOLEAN, {0}, 1},
+ {OPTION_TRIPLE_BUFFER, "TripleBuffer", OPTV_BOOLEAN, {0}, 1},
{OPTION_PREFER_OVERLAY, "XvPreferOverlay", OPTV_BOOLEAN, {0}, 0},
{OPTION_HOTPLUG, "HotPlug", OPTV_BOOLEAN, {0}, 1},
{OPTION_RELAXED_FENCING,"RelaxedFencing", OPTV_BOOLEAN, {0}, 1},
@@ -21,9 +22,7 @@ const OptionInfoRec intel_options[] = {
{OPTION_XVMC, "XvMC", OPTV_BOOLEAN, {0}, 1},
#endif
#ifdef USE_SNA
- {OPTION_THROTTLE, "Throttle", OPTV_BOOLEAN, {0}, 1},
{OPTION_ZAPHOD, "ZaphodHeads", OPTV_STRING, {0}, 0},
- {OPTION_DELAYED_FLUSH, "DelayedFlush", OPTV_BOOLEAN, {0}, 1},
{OPTION_TEAR_FREE, "TearFree", OPTV_BOOLEAN, {0}, 0},
{OPTION_CRTC_PIXMAPS, "PerCrtcPixmaps", OPTV_BOOLEAN, {0}, 0},
#endif
@@ -33,7 +32,6 @@ const OptionInfoRec intel_options[] = {
{OPTION_DEBUG_FLUSH_CACHES, "DebugFlushCaches", OPTV_BOOLEAN, {0}, 0},
{OPTION_DEBUG_WAIT, "DebugWait", OPTV_BOOLEAN, {0}, 0},
{OPTION_BUFFER_CACHE, "BufferCache", OPTV_BOOLEAN, {0}, 1},
- {OPTION_TRIPLE_BUFFER, "TripleBuffer", OPTV_BOOLEAN, {0}, 1},
#endif
{-1, NULL, OPTV_NONE, {0}, 0}
};
diff --git a/src/intel_options.h b/src/intel_options.h
index 3b5262a55..8fa7a8fb8 100644
--- a/src/intel_options.h
+++ b/src/intel_options.h
@@ -20,6 +20,7 @@ enum intel_options {
OPTION_TILING_2D,
OPTION_TILING_FB,
OPTION_SWAPBUFFERS_WAIT,
+ OPTION_TRIPLE_BUFFER,
OPTION_PREFER_OVERLAY,
OPTION_HOTPLUG,
OPTION_RELAXED_FENCING,
@@ -28,9 +29,7 @@ enum intel_options {
#define INTEL_XVMC 1
#endif
#ifdef USE_SNA
- OPTION_THROTTLE,
OPTION_ZAPHOD,
- OPTION_DELAYED_FLUSH,
OPTION_TEAR_FREE,
OPTION_CRTC_PIXMAPS,
#endif
@@ -40,7 +39,6 @@ enum intel_options {
OPTION_DEBUG_FLUSH_CACHES,
OPTION_DEBUG_WAIT,
OPTION_BUFFER_CACHE,
- OPTION_TRIPLE_BUFFER,
#endif
NUM_OPTIONS,
};
diff --git a/src/intel_uxa.c b/src/intel_uxa.c
index 6d202c776..2f141735d 100644
--- a/src/intel_uxa.c
+++ b/src/intel_uxa.c
@@ -170,7 +170,7 @@ intel_uxa_pixmap_compute_size(PixmapPtr pixmap,
pitch = (w * pixmap->drawable.bitsPerPixel + 7) / 8;
pitch = ALIGN(pitch, 64);
size = pitch * ALIGN (h, 2);
- if (INTEL_INFO(intel)->gen < 40) {
+ if (INTEL_INFO(intel)->gen < 040) {
/* Gen 2/3 has a maximum stride for tiling of
* 8192 bytes.
*/
@@ -209,7 +209,7 @@ intel_uxa_pixmap_compute_size(PixmapPtr pixmap,
tile_height = 8;
else
tile_height = 32;
- aligned_h = ALIGN(h, tile_height);
+ aligned_h = ALIGN(h, 2*tile_height);
*stride = intel_get_fence_pitch(intel,
ALIGN(pitch, 512),
@@ -331,7 +331,7 @@ static void intel_uxa_solid(PixmapPtr pixmap, int x1, int y1, int x2, int y2)
cmd |=
XY_COLOR_BLT_WRITE_ALPHA | XY_COLOR_BLT_WRITE_RGB;
- if (INTEL_INFO(intel)->gen >= 40 && intel_pixmap_tiled(pixmap)) {
+ if (INTEL_INFO(intel)->gen >= 040 && intel_pixmap_tiled(pixmap)) {
assert((pitch % 512) == 0);
pitch >>= 2;
cmd |= XY_COLOR_BLT_TILED;
@@ -470,7 +470,7 @@ intel_uxa_copy(PixmapPtr dest, int src_x1, int src_y1, int dst_x1,
XY_SRC_COPY_BLT_WRITE_ALPHA |
XY_SRC_COPY_BLT_WRITE_RGB;
- if (INTEL_INFO(intel)->gen >= 40) {
+ if (INTEL_INFO(intel)->gen >= 040) {
if (intel_pixmap_tiled(dest)) {
assert((dst_pitch % 512) == 0);
dst_pitch >>= 2;
@@ -1281,7 +1281,7 @@ intel_limits_init(intel_screen_private *intel)
* the front, which will have an appropriate pitch/offset already set up,
* so UXA doesn't need to worry.
*/
- if (INTEL_INFO(intel)->gen >= 40) {
+ if (INTEL_INFO(intel)->gen >= 040) {
intel->accel_pixmap_offset_alignment = 4 * 2;
intel->accel_max_x = 8192;
intel->accel_max_y = 8192;
@@ -1292,6 +1292,17 @@ intel_limits_init(intel_screen_private *intel)
}
}
+static Bool intel_option_accel_blt(intel_screen_private *intel)
+{
+ const char *s;
+
+ s = xf86GetOptValString(intel->Options, OPTION_ACCEL_METHOD);
+ if (s == NULL)
+ return FALSE;
+
+ return strcasecmp(s, "blt") == 0;
+}
+
Bool intel_uxa_init(ScreenPtr screen)
{
ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
@@ -1338,7 +1349,8 @@ Bool intel_uxa_init(ScreenPtr screen)
intel->uxa_driver->done_copy = intel_uxa_done;
/* Composite */
- if (IS_GEN2(intel)) {
+ if (intel_option_accel_blt(intel)) {
+ } else if (IS_GEN2(intel)) {
intel->uxa_driver->check_composite = i830_check_composite;
intel->uxa_driver->check_composite_target = i830_check_composite_target;
intel->uxa_driver->check_composite_texture = i830_check_composite_texture;
diff --git a/src/intel_video.c b/src/intel_video.c
index 09782aa5d..6cce18240 100644
--- a/src/intel_video.c
+++ b/src/intel_video.c
@@ -353,7 +353,7 @@ void I830InitVideo(ScreenPtr screen)
* supported hardware.
*/
if (scrn->bitsPerPixel >= 16 &&
- INTEL_INFO(intel)->gen >= 30) {
+ INTEL_INFO(intel)->gen >= 030) {
texturedAdaptor = I830SetupImageVideoTextured(screen);
if (texturedAdaptor != NULL) {
xf86DrvMsg(scrn->scrnIndex, X_INFO,
@@ -436,7 +436,7 @@ static XF86VideoAdaptorPtr I830SetupImageVideoOverlay(ScreenPtr screen)
adapt->pPortPrivates[0].ptr = (pointer) (adaptor_priv);
adapt->nAttributes = NUM_ATTRIBUTES;
- if (INTEL_INFO(intel)->gen >= 30)
+ if (INTEL_INFO(intel)->gen >= 030)
adapt->nAttributes += GAMMA_ATTRIBUTES; /* has gamma */
adapt->pAttributes =
xnfalloc(sizeof(XF86AttributeRec) * adapt->nAttributes);
@@ -445,7 +445,7 @@ static XF86VideoAdaptorPtr I830SetupImageVideoOverlay(ScreenPtr screen)
memcpy((char *)att, (char *)Attributes,
sizeof(XF86AttributeRec) * NUM_ATTRIBUTES);
att += NUM_ATTRIBUTES;
- if (INTEL_INFO(intel)->gen >= 30) {
+ if (INTEL_INFO(intel)->gen >= 030) {
memcpy((char *)att, (char *)GammaAttributes,
sizeof(XF86AttributeRec) * GAMMA_ATTRIBUTES);
}
@@ -495,7 +495,7 @@ static XF86VideoAdaptorPtr I830SetupImageVideoOverlay(ScreenPtr screen)
/* Allow the pipe to be switched from pipe A to B when in clone mode */
xvPipe = MAKE_ATOM("XV_PIPE");
- if (INTEL_INFO(intel)->gen >= 30) {
+ if (INTEL_INFO(intel)->gen >= 030) {
xvGamma0 = MAKE_ATOM("XV_GAMMA0");
xvGamma1 = MAKE_ATOM("XV_GAMMA1");
xvGamma2 = MAKE_ATOM("XV_GAMMA2");
@@ -681,17 +681,17 @@ I830SetPortAttributeOverlay(ScrnInfoPtr scrn,
adaptor_priv->desired_crtc = NULL;
else
adaptor_priv->desired_crtc = xf86_config->crtc[value];
- } else if (attribute == xvGamma0 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma0 && (INTEL_INFO(intel)->gen >= 030)) {
adaptor_priv->gamma0 = value;
- } else if (attribute == xvGamma1 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma1 && (INTEL_INFO(intel)->gen >= 030)) {
adaptor_priv->gamma1 = value;
- } else if (attribute == xvGamma2 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma2 && (INTEL_INFO(intel)->gen >= 030)) {
adaptor_priv->gamma2 = value;
- } else if (attribute == xvGamma3 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma3 && (INTEL_INFO(intel)->gen >= 030)) {
adaptor_priv->gamma3 = value;
- } else if (attribute == xvGamma4 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma4 && (INTEL_INFO(intel)->gen >= 030)) {
adaptor_priv->gamma4 = value;
- } else if (attribute == xvGamma5 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma5 && (INTEL_INFO(intel)->gen >= 030)) {
adaptor_priv->gamma5 = value;
} else if (attribute == xvColorKey) {
adaptor_priv->colorKey = value;
@@ -704,7 +704,7 @@ I830SetPortAttributeOverlay(ScrnInfoPtr scrn,
attribute == xvGamma2 ||
attribute == xvGamma3 ||
attribute == xvGamma4 ||
- attribute == xvGamma5) && (INTEL_INFO(intel)->gen >= 30)) {
+ attribute == xvGamma5) && (INTEL_INFO(intel)->gen >= 030)) {
OVERLAY_DEBUG("GAMMA\n");
}
@@ -739,17 +739,17 @@ I830GetPortAttribute(ScrnInfoPtr scrn,
if (c == xf86_config->num_crtc)
c = -1;
*value = c;
- } else if (attribute == xvGamma0 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma0 && (INTEL_INFO(intel)->gen >= 030)) {
*value = adaptor_priv->gamma0;
- } else if (attribute == xvGamma1 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma1 && (INTEL_INFO(intel)->gen >= 030)) {
*value = adaptor_priv->gamma1;
- } else if (attribute == xvGamma2 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma2 && (INTEL_INFO(intel)->gen >= 030)) {
*value = adaptor_priv->gamma2;
- } else if (attribute == xvGamma3 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma3 && (INTEL_INFO(intel)->gen >= 030)) {
*value = adaptor_priv->gamma3;
- } else if (attribute == xvGamma4 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma4 && (INTEL_INFO(intel)->gen >= 030)) {
*value = adaptor_priv->gamma4;
- } else if (attribute == xvGamma5 && (INTEL_INFO(intel)->gen >= 30)) {
+ } else if (attribute == xvGamma5 && (INTEL_INFO(intel)->gen >= 030)) {
*value = adaptor_priv->gamma5;
} else if (attribute == xvColorKey) {
*value = adaptor_priv->colorKey;
@@ -1313,18 +1313,18 @@ intel_wait_for_scanline(ScrnInfoPtr scrn, PixmapPtr pixmap,
* of extra time for the blitter to start up and
* do its job for a full height blit
*/
- if (full_height && INTEL_INFO(intel)->gen < 40)
+ if (full_height && INTEL_INFO(intel)->gen < 040)
y2 -= 2;
if (pipe == 0) {
pipe = MI_LOAD_SCAN_LINES_DISPLAY_PIPEA;
event = MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW;
- if (full_height && INTEL_INFO(intel)->gen >= 40)
+ if (full_height && INTEL_INFO(intel)->gen >= 040)
event = MI_WAIT_FOR_PIPEA_SVBLANK;
} else {
pipe = MI_LOAD_SCAN_LINES_DISPLAY_PIPEB;
event = MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
- if (full_height && INTEL_INFO(intel)->gen >= 40)
+ if (full_height && INTEL_INFO(intel)->gen >= 040)
event = MI_WAIT_FOR_PIPEB_SVBLANK;
}
@@ -1381,7 +1381,7 @@ intel_setup_dst_params(ScrnInfoPtr scrn, intel_adaptor_private *adaptor_priv, sh
if (adaptor_priv->textured) {
pitchAlign = 4;
} else {
- if (INTEL_INFO(intel)->gen >= 40)
+ if (INTEL_INFO(intel)->gen >= 040)
/* Actually the alignment is 64 bytes, too. But the
* stride must be at least 512 bytes. Take the easy fix
* and align on 512 bytes unconditionally. */
@@ -1561,16 +1561,16 @@ I830PutImageTextured(ScrnInfoPtr scrn,
return BadAlloc;
}
- if (crtc && adaptor_priv->SyncToVblank != 0 && INTEL_INFO(intel)->gen < 60) {
+ if (crtc && adaptor_priv->SyncToVblank != 0 && INTEL_INFO(intel)->gen < 060) {
intel_wait_for_scanline(scrn, pixmap, crtc, clipBoxes);
}
- if (INTEL_INFO(intel)->gen >= 60) {
+ if (INTEL_INFO(intel)->gen >= 060) {
Gen6DisplayVideoTextured(scrn, adaptor_priv, id, clipBoxes,
width, height, dstPitch, dstPitch2,
src_w, src_h,
drw_w, drw_h, pixmap);
- } else if (INTEL_INFO(intel)->gen >= 40) {
+ } else if (INTEL_INFO(intel)->gen >= 040) {
I965DisplayVideoTextured(scrn, adaptor_priv, id, clipBoxes,
width, height, dstPitch, dstPitch2,
src_w, src_h,
diff --git a/src/legacy/Makefile.in b/src/legacy/Makefile.in
index a086138e1..928cf6401 100644
--- a/src/legacy/Makefile.in
+++ b/src/legacy/Makefile.in
@@ -218,7 +218,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -257,6 +256,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -324,7 +325,7 @@ EXTRA_DIST = README
all: all-recursive
.SUFFIXES:
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -349,9 +350,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
diff --git a/src/legacy/i810/Makefile.in b/src/legacy/i810/Makefile.in
index 8f339e3fc..c992feb5f 100644
--- a/src/legacy/i810/Makefile.in
+++ b/src/legacy/i810/Makefile.in
@@ -237,7 +237,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -276,6 +275,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -349,7 +350,7 @@ all: all-recursive
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -374,9 +375,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
diff --git a/src/legacy/i810/xvmc/Makefile.in b/src/legacy/i810/xvmc/Makefile.in
index 6e03d10c7..830d67767 100644
--- a/src/legacy/i810/xvmc/Makefile.in
+++ b/src/legacy/i810/xvmc/Makefile.in
@@ -206,7 +206,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -245,6 +244,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -314,7 +315,7 @@ all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -339,9 +340,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
install-libLTLIBRARIES: $(lib_LTLIBRARIES)
diff --git a/src/render_program/Makefile.am b/src/render_program/Makefile.am
index 8b82b2e0e..1298625fd 100644
--- a/src/render_program/Makefile.am
+++ b/src/render_program/Makefile.am
@@ -191,21 +191,21 @@ if HAVE_GEN4ASM
SUFFIXES = .g4a .g4b .g5a .g5b .g6a .g6b .g7a .g7b
.g4a.g4b:
- m4 -I$(srcdir) -s $< > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
+ $(AM_V_GEN)m4 -I$(srcdir) -s $< > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
.g5a.g5b:
- m4 -I$(srcdir) -s $< > $*.g5m && @INTEL_GEN4ASM@ -g 5 -o $@ $*.g5m && rm $*.g5m
+ $(AM_V_GEN)m4 -I$(srcdir) -s $< > $*.g5m && @INTEL_GEN4ASM@ -g 5 -o $@ $*.g5m && rm $*.g5m
.g6a.g6b:
- m4 -I$(srcdir) -s $< > $*.g6m && @INTEL_GEN4ASM@ -g 6 -o $@ $*.g6m && rm $*.g6m
+ $(AM_V_GEN)m4 -I$(srcdir) -s $< > $*.g6m && @INTEL_GEN4ASM@ -g 6 -o $@ $*.g6m && rm $*.g6m
.g7a.g7b:
- m4 -I$(srcdir) -s $< > $*.g7m && @INTEL_GEN4ASM@ -g 7 -o $@ $*.g7m && rm $*.g7m
+ $(AM_V_GEN)m4 -I$(srcdir) -s $< > $*.g7m && @INTEL_GEN4ASM@ -g 7 -o $@ $*.g7m && rm $*.g7m
-$(INTEL_G4B): $(INTEL_G4I)
-$(INTEL_G5B): $(INTEL_G4I)
-$(INTEL_G6B): $(INTEL_G4I) $(INTEL_G6I)
-$(INTEL_G7B): $(INTEL_G4I) $(INTEL_G6I)
+$(INTEL_G4B): $(INTEL_GEN4ASM) $(INTEL_G4I)
+$(INTEL_G5B): $(INTEL_GEN4ASM) $(INTEL_G4I)
+$(INTEL_G6B): $(INTEL_GEN4ASM) $(INTEL_G4I) $(INTEL_G6I)
+$(INTEL_G7B): $(INTEL_GEN4ASM) $(INTEL_G4I) $(INTEL_G6I)
BUILT_SOURCES= $(INTEL_G4B) $(INTEL_G5B) $(INTEL_G6B) $(INTEL_G7B)
diff --git a/src/render_program/Makefile.in b/src/render_program/Makefile.in
index c079921f3..c941f240c 100644
--- a/src/render_program/Makefile.in
+++ b/src/render_program/Makefile.in
@@ -143,7 +143,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -182,6 +181,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -435,7 +436,7 @@ all: $(BUILT_SOURCES)
.SUFFIXES:
.SUFFIXES: .g4a .g4b .g5a .g5b .g6a .g6b .g7a .g7b
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -460,9 +461,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
@@ -627,21 +628,21 @@ uninstall-am:
uninstall uninstall-am
@HAVE_GEN4ASM_TRUE@.g4a.g4b:
-@HAVE_GEN4ASM_TRUE@ m4 -I$(srcdir) -s $< > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
+@HAVE_GEN4ASM_TRUE@ $(AM_V_GEN)m4 -I$(srcdir) -s $< > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
@HAVE_GEN4ASM_TRUE@.g5a.g5b:
-@HAVE_GEN4ASM_TRUE@ m4 -I$(srcdir) -s $< > $*.g5m && @INTEL_GEN4ASM@ -g 5 -o $@ $*.g5m && rm $*.g5m
+@HAVE_GEN4ASM_TRUE@ $(AM_V_GEN)m4 -I$(srcdir) -s $< > $*.g5m && @INTEL_GEN4ASM@ -g 5 -o $@ $*.g5m && rm $*.g5m
@HAVE_GEN4ASM_TRUE@.g6a.g6b:
-@HAVE_GEN4ASM_TRUE@ m4 -I$(srcdir) -s $< > $*.g6m && @INTEL_GEN4ASM@ -g 6 -o $@ $*.g6m && rm $*.g6m
+@HAVE_GEN4ASM_TRUE@ $(AM_V_GEN)m4 -I$(srcdir) -s $< > $*.g6m && @INTEL_GEN4ASM@ -g 6 -o $@ $*.g6m && rm $*.g6m
@HAVE_GEN4ASM_TRUE@.g7a.g7b:
-@HAVE_GEN4ASM_TRUE@ m4 -I$(srcdir) -s $< > $*.g7m && @INTEL_GEN4ASM@ -g 7 -o $@ $*.g7m && rm $*.g7m
+@HAVE_GEN4ASM_TRUE@ $(AM_V_GEN)m4 -I$(srcdir) -s $< > $*.g7m && @INTEL_GEN4ASM@ -g 7 -o $@ $*.g7m && rm $*.g7m
-@HAVE_GEN4ASM_TRUE@$(INTEL_G4B): $(INTEL_G4I)
-@HAVE_GEN4ASM_TRUE@$(INTEL_G5B): $(INTEL_G4I)
-@HAVE_GEN4ASM_TRUE@$(INTEL_G6B): $(INTEL_G4I) $(INTEL_G6I)
-@HAVE_GEN4ASM_TRUE@$(INTEL_G7B): $(INTEL_G4I) $(INTEL_G6I)
+@HAVE_GEN4ASM_TRUE@$(INTEL_G4B): $(INTEL_GEN4ASM) $(INTEL_G4I)
+@HAVE_GEN4ASM_TRUE@$(INTEL_G5B): $(INTEL_GEN4ASM) $(INTEL_G4I)
+@HAVE_GEN4ASM_TRUE@$(INTEL_G6B): $(INTEL_GEN4ASM) $(INTEL_G4I) $(INTEL_G6I)
+@HAVE_GEN4ASM_TRUE@$(INTEL_G7B): $(INTEL_GEN4ASM) $(INTEL_G4I) $(INTEL_G6I)
@HAVE_GEN4ASM_TRUE@clean-local:
@HAVE_GEN4ASM_TRUE@ -rm -f $(INTEL_G4B) $(INTEL_G4B_GEN5)
diff --git a/src/sna/Makefile.am b/src/sna/Makefile.am
index 306996b57..c74c904dc 100644
--- a/src/sna/Makefile.am
+++ b/src/sna/Makefile.am
@@ -34,9 +34,11 @@ AM_CFLAGS += @VALGRIND_CFLAGS@
endif
noinst_LTLIBRARIES = libsna.la
+libsna_la_LDFLAGS = -pthread
libsna_la_LIBADD = @UDEV_LIBS@ -lm @DRM_LIBS@ brw/libbrw.la fb/libfb.la
libsna_la_SOURCES = \
+ atomic.h \
blt.c \
compiler.h \
kgem.c \
@@ -62,6 +64,8 @@ libsna_la_SOURCES = \
sna_trapezoids.c \
sna_tiling.c \
sna_transform.c \
+ sna_threads.c \
+ sna_vertex.c \
sna_video.c \
sna_video.h \
sna_video_overlay.c \
@@ -73,6 +77,10 @@ libsna_la_SOURCES = \
gen3_render.h \
gen4_render.c \
gen4_render.h \
+ gen4_source.c \
+ gen4_source.h \
+ gen4_vertex.c \
+ gen4_vertex.h \
gen5_render.c \
gen5_render.h \
gen6_render.c \
diff --git a/src/sna/Makefile.in b/src/sna/Makefile.in
index 978d36e1d..7b80b60be 100644
--- a/src/sna/Makefile.in
+++ b/src/sna/Makefile.in
@@ -106,18 +106,19 @@ am__DEPENDENCIES_1 =
@DRI2_TRUE@am__DEPENDENCIES_2 = $(am__DEPENDENCIES_1)
libsna_la_DEPENDENCIES = brw/libbrw.la fb/libfb.la \
$(am__DEPENDENCIES_2)
-am__libsna_la_SOURCES_DIST = blt.c compiler.h kgem.c kgem.h rop.h \
- sna.h sna_accel.c sna_blt.c sna_composite.c sna_damage.c \
+am__libsna_la_SOURCES_DIST = atomic.h blt.c compiler.h kgem.c kgem.h \
+ rop.h sna.h sna_accel.c sna_blt.c sna_composite.c sna_damage.c \
sna_damage.h sna_display.c sna_driver.c sna_glyphs.c \
sna_gradient.c sna_io.c sna_module.h sna_render.c sna_render.h \
sna_render_inline.h sna_reg.h sna_stream.c sna_trapezoids.c \
- sna_tiling.c sna_transform.c sna_video.c sna_video.h \
- sna_video_overlay.c sna_video_sprite.c sna_video_textured.c \
- gen2_render.c gen2_render.h gen3_render.c gen3_render.h \
- gen4_render.c gen4_render.h gen5_render.c gen5_render.h \
- gen6_render.c gen6_render.h gen7_render.c gen7_render.h \
- sna_dri.c sna_video_hwmc.h sna_video_hwmc.c kgem_debug.c \
- kgem_debug.h kgem_debug_gen2.c kgem_debug_gen3.c \
+ sna_tiling.c sna_transform.c sna_threads.c sna_vertex.c \
+ sna_video.c sna_video.h sna_video_overlay.c sna_video_sprite.c \
+ sna_video_textured.c gen2_render.c gen2_render.h gen3_render.c \
+ gen3_render.h gen4_render.c gen4_render.h gen4_source.c \
+ gen4_source.h gen4_vertex.c gen4_vertex.h gen5_render.c \
+ gen5_render.h gen6_render.c gen6_render.h gen7_render.c \
+ gen7_render.h sna_dri.c sna_video_hwmc.h sna_video_hwmc.c \
+ kgem_debug.c kgem_debug.h kgem_debug_gen2.c kgem_debug_gen3.c \
kgem_debug_gen4.c kgem_debug_gen5.c kgem_debug_gen6.c \
kgem_debug_gen7.c
@DRI2_TRUE@am__objects_1 = sna_dri.lo
@@ -130,14 +131,18 @@ am_libsna_la_OBJECTS = blt.lo kgem.lo sna_accel.lo sna_blt.lo \
sna_composite.lo sna_damage.lo sna_display.lo sna_driver.lo \
sna_glyphs.lo sna_gradient.lo sna_io.lo sna_render.lo \
sna_stream.lo sna_trapezoids.lo sna_tiling.lo sna_transform.lo \
- sna_video.lo sna_video_overlay.lo sna_video_sprite.lo \
- sna_video_textured.lo gen2_render.lo gen3_render.lo \
- gen4_render.lo gen5_render.lo gen6_render.lo gen7_render.lo \
- $(am__objects_1) $(am__objects_2) $(am__objects_3)
+ sna_threads.lo sna_vertex.lo sna_video.lo sna_video_overlay.lo \
+ sna_video_sprite.lo sna_video_textured.lo gen2_render.lo \
+ gen3_render.lo gen4_render.lo gen4_source.lo gen4_vertex.lo \
+ gen5_render.lo gen6_render.lo gen7_render.lo $(am__objects_1) \
+ $(am__objects_2) $(am__objects_3)
libsna_la_OBJECTS = $(am_libsna_la_OBJECTS)
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
+libsna_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+ $(libsna_la_LDFLAGS) $(LDFLAGS) -o $@
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__depfiles_maybe = depfiles
@@ -281,7 +286,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -320,6 +324,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -383,24 +389,27 @@ AM_CFLAGS = @CWARNFLAGS@ -I$(top_srcdir)/src \
-I$(top_srcdir)/src/render_program @XORG_CFLAGS@ @UDEV_CFLAGS@ \
@DRM_CFLAGS@ $(NULL) $(am__append_1) $(am__append_2)
noinst_LTLIBRARIES = libsna.la
+libsna_la_LDFLAGS = -pthread
libsna_la_LIBADD = @UDEV_LIBS@ -lm @DRM_LIBS@ brw/libbrw.la \
fb/libfb.la $(am__append_4)
-libsna_la_SOURCES = blt.c compiler.h kgem.c kgem.h rop.h sna.h \
- sna_accel.c sna_blt.c sna_composite.c sna_damage.c \
+libsna_la_SOURCES = atomic.h blt.c compiler.h kgem.c kgem.h rop.h \
+ sna.h sna_accel.c sna_blt.c sna_composite.c sna_damage.c \
sna_damage.h sna_display.c sna_driver.c sna_glyphs.c \
sna_gradient.c sna_io.c sna_module.h sna_render.c sna_render.h \
sna_render_inline.h sna_reg.h sna_stream.c sna_trapezoids.c \
- sna_tiling.c sna_transform.c sna_video.c sna_video.h \
- sna_video_overlay.c sna_video_sprite.c sna_video_textured.c \
- gen2_render.c gen2_render.h gen3_render.c gen3_render.h \
- gen4_render.c gen4_render.h gen5_render.c gen5_render.h \
- gen6_render.c gen6_render.h gen7_render.c gen7_render.h \
- $(NULL) $(am__append_3) $(am__append_5) $(am__append_6)
+ sna_tiling.c sna_transform.c sna_threads.c sna_vertex.c \
+ sna_video.c sna_video.h sna_video_overlay.c sna_video_sprite.c \
+ sna_video_textured.c gen2_render.c gen2_render.h gen3_render.c \
+ gen3_render.h gen4_render.c gen4_render.h gen4_source.c \
+ gen4_source.h gen4_vertex.c gen4_vertex.h gen5_render.c \
+ gen5_render.h gen6_render.c gen6_render.h gen7_render.c \
+ gen7_render.h $(NULL) $(am__append_3) $(am__append_5) \
+ $(am__append_6)
all: all-recursive
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -425,9 +434,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
@@ -440,7 +449,7 @@ clean-noinstLTLIBRARIES:
rm -f "$${dir}/so_locations"; \
done
libsna.la: $(libsna_la_OBJECTS) $(libsna_la_DEPENDENCIES) $(EXTRA_libsna_la_DEPENDENCIES)
- $(AM_V_CCLD)$(LINK) $(libsna_la_OBJECTS) $(libsna_la_LIBADD) $(LIBS)
+ $(AM_V_CCLD)$(libsna_la_LINK) $(libsna_la_OBJECTS) $(libsna_la_LIBADD) $(LIBS)
mostlyclean-compile:
-rm -f *.$(OBJEXT)
@@ -452,6 +461,8 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen2_render.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen3_render.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen4_render.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen4_source.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen4_vertex.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen5_render.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen6_render.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/gen7_render.Plo@am__quote@
@@ -475,9 +486,11 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_io.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_render.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_stream.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_threads.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_tiling.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_transform.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_trapezoids.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_vertex.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_video.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_video_hwmc.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sna_video_overlay.Plo@am__quote@
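
The Makefile.in changes above track two new sources, sna_threads.c and sna_vertex.c, and set libsna_la_LDFLAGS = -pthread; defining a per-target LDFLAGS is also what makes Automake generate the dedicated libsna_la_LINK rule seen in this hunk. A minimal sketch of the kind of pthread worker that motivates the new link flag — names here are illustrative, not SNA's actual sna_threads.c API:

    #include <pthread.h>

    struct job { void (*fn)(void *); void *arg; };

    static void *run_job(void *data)
    {
        struct job *job = data;
        job->fn(job->arg);
        return NULL;
    }

    /* Spawn a helper thread for one job and wait for it; if thread
     * creation fails, run the job inline on the caller's thread. */
    static void spawn_and_wait(struct job *job)
    {
        pthread_t tid;
        if (pthread_create(&tid, NULL, run_job, job) == 0)
            pthread_join(tid, NULL);
        else
            job->fn(job->arg);
    }
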
diff --git a/src/sna/atomic.h b/src/sna/atomic.h
new file mode 100644
index 000000000..306dc6db8
--- /dev/null
+++ b/src/sna/atomic.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ * Chris Wilson <chris@chris-wilson.co.uk>
+ *
+ */
+
+#ifndef ATOMIC_H
+#define ATOMIC_H
+
+#if HAVE_ATOMIC_PRIMITIVES
+
+#define HAS_ATOMIC_OPS 1
+
+typedef struct {
+ int atomic;
+} atomic_t;
+
+# define atomic_read(x) ((x)->atomic)
+# define atomic_set(x, val) ((x)->atomic = (val))
+# define atomic_inc(x) ((void) __sync_fetch_and_add (&(x)->atomic, 1))
+# define atomic_dec_and_test(x) (__sync_fetch_and_add (&(x)->atomic, -1) == 1)
+# define atomic_add(x, v) ((void) __sync_add_and_fetch(&(x)->atomic, (v)))
+# define atomic_dec(x, v) ((void) __sync_sub_and_fetch(&(x)->atomic, (v)))
+# define atomic_cmpxchg(x, oldv, newv) __sync_val_compare_and_swap (&(x)->atomic, oldv, newv)
+
+#endif
+
+#if HAVE_LIB_ATOMIC_OPS
+#include <atomic_ops.h>
+
+#define HAS_ATOMIC_OPS 1
+
+typedef struct {
+ AO_t atomic;
+} atomic_t;
+
+# define atomic_read(x) AO_load_full(&(x)->atomic)
+# define atomic_set(x, val) AO_store_full(&(x)->atomic, (val))
+# define atomic_inc(x) ((void) AO_fetch_and_add1_full(&(x)->atomic))
+# define atomic_add(x, v) ((void) AO_fetch_and_add_full(&(x)->atomic, (v)))
+# define atomic_dec(x, v) ((void) AO_fetch_and_add_full(&(x)->atomic, -(v)))
+# define atomic_dec_and_test(x) (AO_fetch_and_sub1_full(&(x)->atomic) == 1)
+# define atomic_cmpxchg(x, oldv, newv) AO_compare_and_swap_full(&(x)->atomic, oldv, newv)
+
+#endif
+
+#if defined(__sun) && !defined(HAS_ATOMIC_OPS) /* Solaris & OpenSolaris */
+
+#include <sys/atomic.h>
+#define HAS_ATOMIC_OPS 1
+
+typedef struct { uint_t atomic; } atomic_t;
+
+# define atomic_read(x) (int) ((x)->atomic)
+# define atomic_set(x, val) ((x)->atomic = (uint_t)(val))
+# define atomic_inc(x) (atomic_inc_uint (&(x)->atomic))
+# define atomic_dec_and_test(x) (atomic_dec_uint_nv(&(x)->atomic) == 1)
+# define atomic_add(x, v) (atomic_add_int(&(x)->atomic, (v)))
+# define atomic_dec(x, v) (atomic_add_int(&(x)->atomic, -(v)))
+# define atomic_cmpxchg(x, oldv, newv) atomic_cas_uint (&(x)->atomic, oldv, newv)
+
+#endif
+
+#if ! HAS_ATOMIC_OPS
+#error xf86-video-intel requires atomic operations, please define them for your CPU/compiler.
+#endif
+
+#endif
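
Each of the three backends above (GCC __sync builtins, libatomic_ops, and Solaris <sys/atomic.h>) defines the same small API — atomic_read/set/inc/dec/add, atomic_dec_and_test and atomic_cmpxchg — so callers can reference-count objects without knowing which backend configure picked. A minimal usage sketch, with an illustrative object type:

    #include <stdlib.h>
    #include "atomic.h"

    struct object {
        atomic_t refcnt;
        /* payload */
    };

    static void object_ref(struct object *obj)
    {
        atomic_inc(&obj->refcnt);
    }

    static void object_unref(struct object *obj)
    {
        /* atomic_dec_and_test() is true when the old count was 1,
         * i.e. this call dropped the last reference. */
        if (atomic_dec_and_test(&obj->refcnt))
            free(obj);
    }
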
diff --git a/src/sna/brw/Makefile.in b/src/sna/brw/Makefile.in
index 1c8fbf3dd..8d70764e8 100644
--- a/src/sna/brw/Makefile.in
+++ b/src/sna/brw/Makefile.in
@@ -203,7 +203,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -242,6 +241,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -331,7 +332,7 @@ all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -356,9 +357,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
diff --git a/src/sna/brw/brw_disasm.c b/src/sna/brw/brw_disasm.c
index e6da17454..ea6155c8e 100644
--- a/src/sna/brw/brw_disasm.c
+++ b/src/sna/brw/brw_disasm.c
@@ -875,16 +875,17 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
string(file, ")");
}
- if (inst->header.opcode == BRW_OPCODE_SEND && gen < 60)
+ if (inst->header.opcode == BRW_OPCODE_SEND && gen < 060)
format(file, " %d", inst->header.destreg__conditionalmod);
if (opcode[inst->header.opcode].ndst > 0) {
pad(file, 16);
dest(file, inst);
- } else if (gen >= 60 && (inst->header.opcode == BRW_OPCODE_IF ||
- inst->header.opcode == BRW_OPCODE_ELSE ||
- inst->header.opcode == BRW_OPCODE_ENDIF ||
- inst->header.opcode == BRW_OPCODE_WHILE)) {
+ } else if (gen >= 060 &&
+ (inst->header.opcode == BRW_OPCODE_IF ||
+ inst->header.opcode == BRW_OPCODE_ELSE ||
+ inst->header.opcode == BRW_OPCODE_ENDIF ||
+ inst->header.opcode == BRW_OPCODE_WHILE)) {
format(file, " %d", inst->bits1.branch_gen6.jump_count);
}
@@ -901,9 +902,9 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
inst->header.opcode == BRW_OPCODE_SENDC) {
enum brw_message_target target;
- if (gen >= 60)
+ if (gen >= 060)
target = inst->header.destreg__conditionalmod;
- else if (gen >= 50)
+ else if (gen >= 050)
target = inst->bits2.send_gen5.sfid;
else
target = inst->bits3.generic.msg_target;
@@ -912,7 +913,7 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
pad (file, 16);
space = 0;
- if (gen >= 60) {
+ if (gen >= 060) {
control (file, "target function", target_function_gen6,
target, &space);
} else {
@@ -934,19 +935,19 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
inst->bits3.math.precision, &space);
break;
case BRW_SFID_SAMPLER:
- if (gen >= 70) {
+ if (gen >= 070) {
format (file, " (%d, %d, %d, %d)",
inst->bits3.sampler_gen7.binding_table_index,
inst->bits3.sampler_gen7.sampler,
inst->bits3.sampler_gen7.msg_type,
inst->bits3.sampler_gen7.simd_mode);
- } else if (gen >= 50) {
+ } else if (gen >= 050) {
format (file, " (%d, %d, %d, %d)",
inst->bits3.sampler_gen5.binding_table_index,
inst->bits3.sampler_gen5.sampler,
inst->bits3.sampler_gen5.msg_type,
inst->bits3.sampler_gen5.simd_mode);
- } else if (gen >= 45) {
+ } else if (gen >= 045) {
format (file, " (%d, %d)",
inst->bits3.sampler_g4x.binding_table_index,
inst->bits3.sampler_g4x.sampler);
@@ -961,13 +962,13 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
}
break;
case BRW_SFID_DATAPORT_READ:
- if (gen >= 60) {
+ if (gen >= 060) {
format (file, " (%d, %d, %d, %d)",
inst->bits3.gen6_dp.binding_table_index,
inst->bits3.gen6_dp.msg_control,
inst->bits3.gen6_dp.msg_type,
inst->bits3.gen6_dp.send_commit_msg);
- } else if (gen >= 45) {
+ } else if (gen >= 045) {
format (file, " (%d, %d, %d)",
inst->bits3.dp_read_gen5.binding_table_index,
inst->bits3.dp_read_gen5.msg_control,
@@ -981,7 +982,7 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
break;
case BRW_SFID_DATAPORT_WRITE:
- if (gen >= 70) {
+ if (gen >= 070) {
format (file, " (");
control (file, "DP rc message type",
@@ -992,7 +993,7 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
inst->bits3.gen7_dp.binding_table_index,
inst->bits3.gen7_dp.msg_control,
inst->bits3.gen7_dp.msg_type);
- } else if (gen >= 60) {
+ } else if (gen >= 060) {
format (file, " (");
control (file, "DP rc message type",
@@ -1015,14 +1016,14 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
break;
case BRW_SFID_URB:
- if (gen >= 50) {
+ if (gen >= 050) {
format (file, " %d", inst->bits3.urb_gen5.offset);
} else {
format (file, " %d", inst->bits3.urb.offset);
}
space = 1;
- if (gen >= 50) {
+ if (gen >= 050) {
control (file, "urb opcode", urb_opcode,
inst->bits3.urb_gen5.opcode, &space);
}
@@ -1051,7 +1052,7 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
}
if (space)
string (file, " ");
- if (gen >= 50) {
+ if (gen >= 050) {
format (file, "mlen %d",
inst->bits3.generic_gen5.msg_length);
format (file, " rlen %d",
@@ -1068,13 +1069,13 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
string(file, "{");
space = 1;
control(file, "access mode", access_mode, inst->header.access_mode, &space);
- if (gen >= 60)
+ if (gen >= 060)
control(file, "write enable control", wectrl, inst->header.mask_control, &space);
else
control(file, "mask control", mask_ctrl, inst->header.mask_control, &space);
control(file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
- if (gen >= 60)
+ if (gen >= 060)
qtr_ctrl(file, inst);
else {
if (inst->header.compression_control == BRW_COMPRESSION_COMPRESSED &&
@@ -1089,7 +1090,7 @@ void brw_disasm(FILE *file, const struct brw_instruction *inst, int gen)
}
control(file, "thread control", thread_ctrl, inst->header.thread_control, &space);
- if (gen >= 60)
+ if (gen >= 060)
control(file, "acc write control", accwr, inst->header.acc_wr_control, &space);
if (inst->header.opcode == BRW_OPCODE_SEND ||
inst->header.opcode == BRW_OPCODE_SENDC)
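
The systematic change through brw_disasm.c — and through every file below — replaces decimal gen literals with octal ones: the generation is encoded as one octal digit of major revision and one of minor, so 045 is gen4.5 (G4x), 050 gen5, 060 gen6 and 070 gen7, and half-generation steps order correctly under ordinary integer comparison. A small worked example, assuming that reading of the encoding:

    /* gen = (major << 3) | minor, conveniently written in octal */
    #define GEN(major, minor) (((major) << 3) | (minor))

    static const int gen45 = GEN(4, 5);   /* 045 == 37: gen4.5 (G4x) */
    static const int gen6  = GEN(6, 0);   /* 060 == 48: gen6 */

    /* 045 < 050 < 060 < 070, so a test such as "gen >= 045"
     * admits G4x and every later generation. */
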
diff --git a/src/sna/brw/brw_eu.c b/src/sna/brw/brw_eu.c
index 7c32ea191..9bd8ba5dc 100644
--- a/src/sna/brw/brw_eu.c
+++ b/src/sna/brw/brw_eu.c
@@ -79,7 +79,7 @@ void brw_set_compression_control(struct brw_compile *p,
{
p->compressed = (compression_control == BRW_COMPRESSION_COMPRESSED);
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
/* Since we don't use the 32-wide support in gen6, we translate
* the pre-gen6 compression control here.
*/
diff --git a/src/sna/brw/brw_eu.h b/src/sna/brw/brw_eu.h
index 65e66d5ec..24ab599ad 100644
--- a/src/sna/brw/brw_eu.h
+++ b/src/sna/brw/brw_eu.h
@@ -1862,7 +1862,7 @@ static inline void brw_set_saturate(struct brw_compile *p, unsigned value)
static inline void brw_set_acc_write_control(struct brw_compile *p, unsigned value)
{
- if (p->gen >= 60)
+ if (p->gen >= 060)
p->current->header.acc_wr_control = value;
}
@@ -1938,7 +1938,7 @@ static inline void brw_##OP(struct brw_compile *p, \
rnd = brw_next_insn(p, BRW_OPCODE_##OP); \
brw_set_dest(p, rnd, dest); \
brw_set_src0(p, rnd, src); \
- if (p->gen < 60) { \
+ if (p->gen < 060) { \
/* turn on round-increments */ \
rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R; \
add = brw_ADD(p, dest, dest, brw_imm_f(1.0f)); \
diff --git a/src/sna/brw/brw_eu_emit.c b/src/sna/brw/brw_eu_emit.c
index 3f01ae7b7..5c0b30654 100644
--- a/src/sna/brw/brw_eu_emit.c
+++ b/src/sna/brw/brw_eu_emit.c
@@ -61,7 +61,7 @@ gen6_resolve_implied_move(struct brw_compile *p,
struct brw_reg *src,
unsigned msg_reg_nr)
{
- if (p->gen < 60)
+ if (p->gen < 060)
return;
if (src->file == BRW_MESSAGE_REGISTER_FILE)
@@ -88,7 +88,7 @@ gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
* Since we're pretending to have 16 MRFs anyway, we may as well use the
* registers required for messages with EOT.
*/
- if (p->gen >= 70 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
+ if (p->gen >= 070 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
reg->file = BRW_GENERAL_REGISTER_FILE;
reg->nr += 111;
}
@@ -378,13 +378,13 @@ brw_set_message_descriptor(struct brw_compile *p,
{
brw_set_src1(p, inst, brw_imm_d(0));
- if (p->gen >= 50) {
+ if (p->gen >= 050) {
inst->bits3.generic_gen5.header_present = header_present;
inst->bits3.generic_gen5.response_length = response_length;
inst->bits3.generic_gen5.msg_length = msg_length;
inst->bits3.generic_gen5.end_of_thread = end_of_thread;
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
/* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
inst->header.destreg__conditionalmod = sfid;
} else {
@@ -439,7 +439,7 @@ static void brw_set_math_message(struct brw_compile *p,
brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
msg_length, response_length,
false, false);
- if (p->gen == 50) {
+ if (p->gen == 050) {
insn->bits3.math_gen5.function = function;
insn->bits3.math_gen5.int_type = integer_type;
insn->bits3.math_gen5.precision = low_precision;
@@ -485,7 +485,7 @@ static void brw_set_urb_message(struct brw_compile *p,
{
brw_set_message_descriptor(p, insn, BRW_SFID_URB,
msg_length, response_length, true, end_of_thread);
- if (p->gen >= 70) {
+ if (p->gen >= 070) {
insn->bits3.urb_gen7.opcode = 0; /* URB_WRITE_HWORD */
insn->bits3.urb_gen7.offset = offset;
assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
@@ -493,7 +493,7 @@ static void brw_set_urb_message(struct brw_compile *p,
/* per_slot_offset = 0 makes it ignore offsets in message header */
insn->bits3.urb_gen7.per_slot_offset = 0;
insn->bits3.urb_gen7.complete = complete;
- } else if (p->gen >= 50) {
+ } else if (p->gen >= 050) {
insn->bits3.urb_gen5.opcode = 0; /* URB_WRITE */
insn->bits3.urb_gen5.offset = offset;
insn->bits3.urb_gen5.swizzle_control = swizzle_control;
@@ -525,13 +525,13 @@ brw_set_dp_write_message(struct brw_compile *p,
{
unsigned sfid;
- if (p->gen >= 70) {
+ if (p->gen >= 070) {
/* Use the Render Cache for RT writes; otherwise use the Data Cache */
if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
else
sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
- } else if (p->gen >= 60) {
+ } else if (p->gen >= 060) {
/* Use the render cache for all write messages. */
sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
} else {
@@ -542,18 +542,18 @@ brw_set_dp_write_message(struct brw_compile *p,
msg_length, response_length,
header_present, end_of_thread);
- if (p->gen >= 70) {
+ if (p->gen >= 070) {
insn->bits3.gen7_dp.binding_table_index = binding_table_index;
insn->bits3.gen7_dp.msg_control = msg_control;
insn->bits3.gen7_dp.last_render_target = last_render_target;
insn->bits3.gen7_dp.msg_type = msg_type;
- } else if (p->gen >= 60) {
+ } else if (p->gen >= 060) {
insn->bits3.gen6_dp.binding_table_index = binding_table_index;
insn->bits3.gen6_dp.msg_control = msg_control;
insn->bits3.gen6_dp.last_render_target = last_render_target;
insn->bits3.gen6_dp.msg_type = msg_type;
insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
- } else if (p->gen >= 50) {
+ } else if (p->gen >= 050) {
insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
insn->bits3.dp_write_gen5.msg_control = msg_control;
insn->bits3.dp_write_gen5.last_render_target = last_render_target;
@@ -580,9 +580,9 @@ brw_set_dp_read_message(struct brw_compile *p,
{
unsigned sfid;
- if (p->gen >= 70) {
+ if (p->gen >= 070) {
sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
- } else if (p->gen >= 60) {
+ } else if (p->gen >= 060) {
if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
else
@@ -595,23 +595,23 @@ brw_set_dp_read_message(struct brw_compile *p,
msg_length, response_length,
true, false);
- if (p->gen >= 70) {
+ if (p->gen >= 070) {
insn->bits3.gen7_dp.binding_table_index = binding_table_index;
insn->bits3.gen7_dp.msg_control = msg_control;
insn->bits3.gen7_dp.last_render_target = 0;
insn->bits3.gen7_dp.msg_type = msg_type;
- } else if (p->gen >= 60) {
+ } else if (p->gen >= 060) {
insn->bits3.gen6_dp.binding_table_index = binding_table_index;
insn->bits3.gen6_dp.msg_control = msg_control;
insn->bits3.gen6_dp.last_render_target = 0;
insn->bits3.gen6_dp.msg_type = msg_type;
insn->bits3.gen6_dp.send_commit_msg = 0;
- } else if (p->gen >= 50) {
+ } else if (p->gen >= 050) {
insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
insn->bits3.dp_read_gen5.msg_control = msg_control;
insn->bits3.dp_read_gen5.msg_type = msg_type;
insn->bits3.dp_read_gen5.target_cache = target_cache;
- } else if (p->gen >= 45) {
+ } else if (p->gen >= 045) {
insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
insn->bits3.dp_read_g4x.msg_control = msg_control; /*8:10*/
insn->bits3.dp_read_g4x.msg_type = msg_type; /*11:13*/
@@ -638,17 +638,17 @@ static void brw_set_sampler_message(struct brw_compile *p,
msg_length, response_length,
header_present, false);
- if (p->gen >= 70) {
+ if (p->gen >= 070) {
insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
insn->bits3.sampler_gen7.sampler = sampler;
insn->bits3.sampler_gen7.msg_type = msg_type;
insn->bits3.sampler_gen7.simd_mode = simd_mode;
- } else if (p->gen >= 50) {
+ } else if (p->gen >= 050) {
insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
insn->bits3.sampler_gen5.sampler = sampler;
insn->bits3.sampler_gen5.msg_type = msg_type;
insn->bits3.sampler_gen5.simd_mode = simd_mode;
- } else if (p->gen >= 45) {
+ } else if (p->gen >= 045) {
insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
insn->bits3.sampler_g4x.sampler = sampler;
insn->bits3.sampler_g4x.msg_type = msg_type;
@@ -706,11 +706,11 @@ brw_IF(struct brw_compile *p, unsigned execute_size)
insn = brw_next_insn(p, BRW_OPCODE_IF);
/* Override the defaults for this instruction: */
- if (p->gen < 60) {
+ if (p->gen < 060) {
brw_set_dest(p, insn, brw_ip_reg());
brw_set_src0(p, insn, brw_ip_reg());
brw_set_src1(p, insn, brw_imm_d(0x0));
- } else if (p->gen < 70) {
+ } else if (p->gen < 070) {
brw_set_dest(p, insn, brw_imm_w(0));
insn->bits1.branch_gen6.jump_count = 0;
brw_set_src0(p, insn, __retype_d(brw_null_reg()));
@@ -827,7 +827,7 @@ patch_IF_ELSE(struct brw_compile *p,
/* Jump count is for 64bit data chunk each, so one 128bit instruction
* requires 2 chunks.
*/
- if (p->gen >= 50)
+ if (p->gen >= 050)
br = 2;
assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
@@ -835,7 +835,7 @@ patch_IF_ELSE(struct brw_compile *p,
if (else_inst == NULL) {
/* Patch IF -> ENDIF */
- if (p->gen < 60) {
+ if (p->gen < 060) {
/* Turn it into an IFF, which means no mask stack operations for
* all-false and jumping past the ENDIF.
*/
@@ -843,7 +843,7 @@ patch_IF_ELSE(struct brw_compile *p,
if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
if_inst->bits3.if_else.pop_count = 0;
if_inst->bits3.if_else.pad0 = 0;
- } else if (p->gen < 70) {
+ } else if (p->gen < 070) {
/* As of gen6, there is no IFF and IF must point to the ENDIF. */
if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
} else {
@@ -854,23 +854,23 @@ patch_IF_ELSE(struct brw_compile *p,
else_inst->header.execution_size = if_inst->header.execution_size;
/* Patch IF -> ELSE */
- if (p->gen < 60) {
+ if (p->gen < 060) {
if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
if_inst->bits3.if_else.pop_count = 0;
if_inst->bits3.if_else.pad0 = 0;
- } else if (p->gen <= 70) {
+ } else if (p->gen <= 070) {
if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
}
/* Patch ELSE -> ENDIF */
- if (p->gen < 60) {
+ if (p->gen < 060) {
/* BRW_OPCODE_ELSE pre-gen6 should point just past the
* matching ENDIF.
*/
else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
else_inst->bits3.if_else.pop_count = 1;
else_inst->bits3.if_else.pad0 = 0;
- } else if (p->gen < 70) {
+ } else if (p->gen < 070) {
/* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
} else {
@@ -890,11 +890,11 @@ brw_ELSE(struct brw_compile *p)
insn = brw_next_insn(p, BRW_OPCODE_ELSE);
- if (p->gen < 60) {
+ if (p->gen < 060) {
brw_set_dest(p, insn, brw_ip_reg());
brw_set_src0(p, insn, brw_ip_reg());
brw_set_src1(p, insn, brw_imm_d(0x0));
- } else if (p->gen < 70) {
+ } else if (p->gen < 070) {
brw_set_dest(p, insn, brw_imm_w(0));
insn->bits1.branch_gen6.jump_count = 0;
brw_set_src0(p, insn, __retype_d(brw_null_reg()));
@@ -938,11 +938,11 @@ brw_ENDIF(struct brw_compile *p)
insn = brw_next_insn(p, BRW_OPCODE_ENDIF);
- if (p->gen < 60) {
+ if (p->gen < 060) {
brw_set_dest(p, insn, __retype_ud(brw_vec4_grf(0,0)));
brw_set_src0(p, insn, __retype_ud(brw_vec4_grf(0,0)));
brw_set_src1(p, insn, brw_imm_d(0x0));
- } else if (p->gen < 70) {
+ } else if (p->gen < 070) {
brw_set_dest(p, insn, brw_imm_w(0));
brw_set_src0(p, insn, __retype_d(brw_null_reg()));
brw_set_src1(p, insn, __retype_d(brw_null_reg()));
@@ -957,11 +957,11 @@ brw_ENDIF(struct brw_compile *p)
insn->header.thread_control = BRW_THREAD_SWITCH;
/* Also pop item off the stack in the endif instruction: */
- if (p->gen < 60) {
+ if (p->gen < 060) {
insn->bits3.if_else.jump_count = 0;
insn->bits3.if_else.pop_count = 1;
insn->bits3.if_else.pad0 = 0;
- } else if (p->gen < 70) {
+ } else if (p->gen < 070) {
insn->bits1.branch_gen6.jump_count = 2;
} else {
insn->bits3.break_cont.jip = 2;
@@ -974,7 +974,7 @@ struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
struct brw_instruction *insn;
insn = brw_next_insn(p, BRW_OPCODE_BREAK);
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_set_dest(p, insn, __retype_d(brw_null_reg()));
brw_set_src0(p, insn, __retype_d(brw_null_reg()));
brw_set_src1(p, insn, brw_imm_d(0x0));
@@ -1041,7 +1041,7 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
*/
struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size)
{
- if (p->gen >= 60 || p->single_program_flow) {
+ if (p->gen >= 060 || p->single_program_flow) {
return &p->store[p->nr_insn];
} else {
struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_DO);
@@ -1068,10 +1068,10 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
struct brw_instruction *insn;
unsigned br = 1;
- if (p->gen >= 50)
+ if (p->gen >= 050)
br = 2;
- if (p->gen >= 70) {
+ if (p->gen >= 070) {
insn = brw_next_insn(p, BRW_OPCODE_WHILE);
brw_set_dest(p, insn, __retype_d(brw_null_reg()));
@@ -1080,7 +1080,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
insn->bits3.break_cont.jip = br * (do_insn - insn);
insn->header.execution_size = BRW_EXECUTE_8;
- } else if (p->gen >= 60) {
+ } else if (p->gen >= 060) {
insn = brw_next_insn(p, BRW_OPCODE_WHILE);
brw_set_dest(p, insn, brw_imm_w(0));
@@ -1126,7 +1126,7 @@ void brw_land_fwd_jump(struct brw_compile *p,
struct brw_instruction *landing = &p->store[p->nr_insn];
unsigned jmpi = 1;
- if (p->gen >= 50)
+ if (p->gen >= 050)
jmpi = 2;
assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
@@ -1195,7 +1195,7 @@ void brw_math(struct brw_compile *p,
unsigned data_type,
unsigned precision)
{
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
struct brw_instruction *insn = brw_next_insn(p, BRW_OPCODE_MATH);
assert(dest.file == BRW_GENERAL_REGISTER_FILE);
@@ -1294,7 +1294,7 @@ void brw_math_16(struct brw_compile *p,
{
struct brw_instruction *insn;
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
insn = brw_next_insn(p, BRW_OPCODE_MATH);
/* Math is the same ISA format as other opcodes, except that CondModifier
@@ -1362,7 +1362,7 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
uint32_t msg_control, msg_type;
int mlen;
- if (p->gen >= 60)
+ if (p->gen >= 060)
offset /= 16;
mrf = __retype_ud(mrf);
@@ -1418,7 +1418,7 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
* protection. Our use of DP writes is all about register
* spilling within a thread.
*/
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
dest = __retype_uw(vec16(brw_null_reg()));
send_commit_msg = 0;
} else {
@@ -1427,13 +1427,13 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
}
brw_set_dest(p, insn, dest);
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_set_src0(p, insn, mrf);
} else {
brw_set_src0(p, insn, brw_null_reg());
}
- if (p->gen >= 60)
+ if (p->gen >= 060)
msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
else
msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
@@ -1470,7 +1470,7 @@ brw_oword_block_read_scratch(struct brw_compile *p,
uint32_t msg_control;
int rlen;
- if (p->gen >= 60)
+ if (p->gen >= 060)
offset /= 16;
mrf = __retype_ud(mrf);
@@ -1507,7 +1507,7 @@ brw_oword_block_read_scratch(struct brw_compile *p,
insn->header.destreg__conditionalmod = mrf.nr;
brw_set_dest(p, insn, dest); /* UW? */
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_set_src0(p, insn, mrf);
} else {
brw_set_src0(p, insn, brw_null_reg());
@@ -1538,7 +1538,7 @@ void brw_oword_block_read(struct brw_compile *p,
struct brw_instruction *insn;
/* On newer hardware, offset is in units of owords. */
- if (p->gen >= 60)
+ if (p->gen >= 060)
offset /= 16;
mrf = __retype_ud(mrf);
@@ -1562,7 +1562,7 @@ void brw_oword_block_read(struct brw_compile *p,
dest = __retype_uw(vec8(dest));
brw_set_dest(p, insn, dest);
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_set_src0(p, insn, mrf);
} else {
brw_set_src0(p, insn, brw_null_reg());
@@ -1634,7 +1634,7 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
struct brw_instruction *insn;
unsigned msg_reg_nr = 1;
- if (p->gen >= 60)
+ if (p->gen >= 060)
location /= 16;
/* Setup MRF[1] with location/offset into const buffer */
@@ -1655,7 +1655,7 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
insn->header.mask_control = BRW_MASK_DISABLE;
brw_set_dest(p, insn, dest);
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_set_src0(p, insn, brw_message_reg(msg_reg_nr));
} else {
brw_set_src0(p, insn, brw_null_reg());
@@ -1710,9 +1710,9 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
brw_set_dest(p, insn, dest);
brw_set_src0(p, insn, src);
- if (p->gen >= 60)
+ if (p->gen >= 060)
msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
- else if (p->gen >= 45)
+ else if (p->gen >= 045)
msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
else
msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
@@ -1747,7 +1747,7 @@ void brw_fb_WRITE(struct brw_compile *p,
else
dest = __retype_uw(vec8(brw_null_reg()));
- if (p->gen >= 60 && binding_table_index == 0) {
+ if (p->gen >= 060 && binding_table_index == 0) {
insn = brw_next_insn(p, BRW_OPCODE_SENDC);
} else {
insn = brw_next_insn(p, BRW_OPCODE_SEND);
@@ -1756,7 +1756,7 @@ void brw_fb_WRITE(struct brw_compile *p,
insn->header.predicate_control = 0;
insn->header.compression_control = BRW_COMPRESSION_NONE;
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
/* headerless version, just submit color payload */
src0 = brw_message_reg(msg_reg_nr);
@@ -1802,7 +1802,7 @@ void brw_SAMPLE(struct brw_compile *p,
{
assert(writemask);
- if (p->gen < 50 || writemask != WRITEMASK_XYZW) {
+ if (p->gen < 050 || writemask != WRITEMASK_XYZW) {
struct brw_reg m1 = brw_message_reg(msg_reg_nr);
writemask = ~writemask & WRITEMASK_XYZW;
@@ -1828,7 +1828,7 @@ void brw_SAMPLE(struct brw_compile *p,
insn = brw_next_insn(p, BRW_OPCODE_SEND);
insn->header.predicate_control = 0; /* XXX */
insn->header.compression_control = BRW_COMPRESSION_NONE;
- if (p->gen < 60)
+ if (p->gen < 060)
insn->header.destreg__conditionalmod = msg_reg_nr;
brw_set_dest(p, insn, dest);
@@ -1865,7 +1865,7 @@ void brw_urb_WRITE(struct brw_compile *p,
gen6_resolve_implied_move(p, &src0, msg_reg_nr);
- if (p->gen >= 70) {
+ if (p->gen >= 070) {
/* Enable Channel Masks in the URB_WRITE_HWORD message header */
brw_push_insn_state(p);
brw_set_access_mode(p, BRW_ALIGN_1);
@@ -1883,7 +1883,7 @@ void brw_urb_WRITE(struct brw_compile *p,
brw_set_src0(p, insn, src0);
brw_set_src1(p, insn, brw_imm_d(0));
- if (p->gen <= 60)
+ if (p->gen <= 060)
insn->header.destreg__conditionalmod = msg_reg_nr;
brw_set_urb_message(p,
@@ -1931,7 +1931,7 @@ brw_find_loop_end(struct brw_compile *p, int start)
struct brw_instruction *insn = &p->store[ip];
if (insn->header.opcode == BRW_OPCODE_WHILE) {
- int jip = p->gen <= 70 ? insn->bits1.branch_gen6.jump_count
+ int jip = p->gen <= 070 ? insn->bits1.branch_gen6.jump_count
: insn->bits3.break_cont.jip;
if (ip + jip / br <= start)
return ip;
@@ -1950,7 +1950,7 @@ brw_set_uip_jip(struct brw_compile *p)
int ip;
int br = 2;
- if (p->gen <= 60)
+ if (p->gen <= 060)
return;
for (ip = 0; ip < p->nr_insn; ip++) {
@@ -1961,7 +1961,7 @@ brw_set_uip_jip(struct brw_compile *p)
insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
/* Gen7 UIP points to WHILE; Gen6 points just after it */
insn->bits3.break_cont.uip =
- br * (brw_find_loop_end(p, ip) - ip + (p->gen <= 70 ? 1 : 0));
+ br * (brw_find_loop_end(p, ip) - ip + (p->gen <= 070 ? 1 : 0));
break;
case BRW_OPCODE_CONTINUE:
insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
@@ -1991,7 +1991,7 @@ void brw_ff_sync(struct brw_compile *p,
brw_set_src0(p, insn, src0);
brw_set_src1(p, insn, brw_imm_d(0));
- if (p->gen < 60)
+ if (p->gen < 060)
insn->header.destreg__conditionalmod = msg_reg_nr;
brw_set_ff_sync_message(p,
diff --git a/src/sna/brw/brw_wm.c b/src/sna/brw/brw_wm.c
index f54e55efe..e8dc6ac47 100644
--- a/src/sna/brw/brw_wm.c
+++ b/src/sna/brw/brw_wm.c
@@ -41,15 +41,15 @@ static void brw_wm_affine_st(struct brw_compile *p, int dw,
if (dw == 16) {
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- uv = p->gen >= 60 ? 6 : 3;
+ uv = p->gen >= 060 ? 6 : 3;
} else {
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
- uv = p->gen >= 60 ? 4 : 3;
+ uv = p->gen >= 060 ? 4 : 3;
}
uv += 2*channel;
msg++;
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_PLN(p,
brw_message_reg(msg),
brw_vec1_grf(uv, 0),
@@ -96,7 +96,7 @@ static int brw_wm_sample(struct brw_compile *p, int dw,
int len;
len = dw == 16 ? 4 : 2;
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
header = false;
src0 = brw_message_reg(++msg);
} else {
@@ -125,7 +125,7 @@ static int brw_wm_sample__alpha(struct brw_compile *p, int dw,
rlen = 2;
}
- if (p->gen >= 60)
+ if (p->gen >= 060)
src0 = brw_message_reg(msg);
else
src0 = brw_vec8_grf(0, 0);
@@ -182,7 +182,7 @@ static void brw_fb_write(struct brw_compile *p, int dw)
msg_len = 4;
}
- if (p->gen < 60) {
+ if (p->gen < 060) {
brw_push_insn_state(p);
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
brw_set_mask_control(p, BRW_MASK_DISABLE);
@@ -197,7 +197,7 @@ static void brw_fb_write(struct brw_compile *p, int dw)
insn->header.predicate_control = 0;
insn->header.compression_control = BRW_COMPRESSION_NONE;
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
src0 = brw_message_reg(2);
header = false;
@@ -219,7 +219,7 @@ static void brw_wm_write(struct brw_compile *p, int dw, int src)
{
int n;
- if (dw == 8 && p->gen >= 60) {
+ if (dw == 8 && p->gen >= 060) {
/* XXX pixel execution mask? */
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
@@ -233,11 +233,11 @@ static void brw_wm_write(struct brw_compile *p, int dw, int src)
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
for (n = 0; n < 4; n++) {
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_MOV(p,
brw_message_reg(2 + 2*n),
brw_vec8_grf(src + 2*n, 0));
- } else if (p->gen >= 45 && dw == 16) {
+ } else if (p->gen >= 045 && dw == 16) {
brw_MOV(p,
brw_message_reg(2 + n + BRW_MRF_COMPR4),
brw_vec8_grf(src + 2*n, 0));
@@ -265,7 +265,7 @@ static void brw_wm_write__mask(struct brw_compile *p, int dw,
{
int n;
- if (dw == 8 && p->gen >= 60) {
+ if (dw == 8 && p->gen >= 060) {
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
brw_MUL(p,
@@ -291,12 +291,12 @@ static void brw_wm_write__mask(struct brw_compile *p, int dw,
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
for (n = 0; n < 4; n++) {
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_MUL(p,
brw_message_reg(2 + 2*n),
brw_vec8_grf(src + 2*n, 0),
brw_vec8_grf(mask, 0));
- } else if (p->gen >= 45 && dw == 16) {
+ } else if (p->gen >= 045 && dw == 16) {
brw_MUL(p,
brw_message_reg(2 + n + BRW_MRF_COMPR4),
brw_vec8_grf(src + 2*n, 0),
@@ -327,7 +327,7 @@ static void brw_wm_write__opacity(struct brw_compile *p, int dw,
{
int n;
- if (dw == 8 && p->gen >= 60) {
+ if (dw == 8 && p->gen >= 060) {
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
brw_MUL(p,
@@ -353,12 +353,12 @@ static void brw_wm_write__opacity(struct brw_compile *p, int dw,
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
for (n = 0; n < 4; n++) {
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_MUL(p,
brw_message_reg(2 + 2*n),
brw_vec8_grf(src + 2*n, 0),
brw_vec1_grf(mask, 3));
- } else if (p->gen >= 45 && dw == 16) {
+ } else if (p->gen >= 045 && dw == 16) {
brw_MUL(p,
brw_message_reg(2 + n + BRW_MRF_COMPR4),
brw_vec8_grf(src + 2*n, 0),
@@ -389,7 +389,7 @@ static void brw_wm_write__mask_ca(struct brw_compile *p, int dw,
{
int n;
- if (dw == 8 && p->gen >= 60) {
+ if (dw == 8 && p->gen >= 060) {
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
brw_MUL(p,
@@ -415,12 +415,12 @@ static void brw_wm_write__mask_ca(struct brw_compile *p, int dw,
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
for (n = 0; n < 4; n++) {
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
brw_MUL(p,
brw_message_reg(2 + 2*n),
brw_vec8_grf(src + 2*n, 0),
brw_vec8_grf(mask + 2*n, 0));
- } else if (p->gen >= 45 && dw == 16) {
+ } else if (p->gen >= 045 && dw == 16) {
brw_MUL(p,
brw_message_reg(2 + n + BRW_MRF_COMPR4),
brw_vec8_grf(src + 2*n, 0),
@@ -449,7 +449,7 @@ done:
bool
brw_wm_kernel__affine(struct brw_compile *p, int dispatch)
{
- if (p->gen < 60)
+ if (p->gen < 060)
brw_wm_xy(p, dispatch);
brw_wm_write(p, dispatch, brw_wm_affine(p, dispatch, 0, 1, 12));
@@ -461,7 +461,7 @@ brw_wm_kernel__affine_mask(struct brw_compile *p, int dispatch)
{
int src, mask;
- if (p->gen < 60)
+ if (p->gen < 060)
brw_wm_xy(p, dispatch);
src = brw_wm_affine(p, dispatch, 0, 1, 12);
@@ -476,7 +476,7 @@ brw_wm_kernel__affine_mask_ca(struct brw_compile *p, int dispatch)
{
int src, mask;
- if (p->gen < 60)
+ if (p->gen < 060)
brw_wm_xy(p, dispatch);
src = brw_wm_affine(p, dispatch, 0, 1, 12);
@@ -491,7 +491,7 @@ brw_wm_kernel__affine_mask_sa(struct brw_compile *p, int dispatch)
{
int src, mask;
- if (p->gen < 60)
+ if (p->gen < 060)
brw_wm_xy(p, dispatch);
src = brw_wm_affine__alpha(p, dispatch, 0, 1, 12);
@@ -510,15 +510,15 @@ static void brw_wm_projective_st(struct brw_compile *p, int dw,
if (dw == 16) {
brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
- uv = p->gen >= 60 ? 6 : 3;
+ uv = p->gen >= 060 ? 6 : 3;
} else {
brw_set_compression_control(p, BRW_COMPRESSION_NONE);
- uv = p->gen >= 60 ? 4 : 3;
+ uv = p->gen >= 060 ? 4 : 3;
}
uv += 2*channel;
msg++;
- if (p->gen >= 60) {
+ if (p->gen >= 060) {
/* First compute 1/z */
brw_PLN(p,
brw_message_reg(msg),
@@ -594,7 +594,7 @@ static int brw_wm_projective__alpha(struct brw_compile *p, int dw,
bool
brw_wm_kernel__projective(struct brw_compile *p, int dispatch)
{
- if (p->gen < 60)
+ if (p->gen < 060)
brw_wm_xy(p, dispatch);
brw_wm_write(p, dispatch, brw_wm_projective(p, dispatch, 0, 1, 12));
@@ -606,7 +606,7 @@ brw_wm_kernel__projective_mask(struct brw_compile *p, int dispatch)
{
int src, mask;
- if (p->gen < 60)
+ if (p->gen < 060)
brw_wm_xy(p, dispatch);
src = brw_wm_projective(p, dispatch, 0, 1, 12);
@@ -621,7 +621,7 @@ brw_wm_kernel__projective_mask_ca(struct brw_compile *p, int dispatch)
{
int src, mask;
- if (p->gen < 60)
+ if (p->gen < 060)
brw_wm_xy(p, dispatch);
src = brw_wm_projective(p, dispatch, 0, 1, 12);
@@ -636,7 +636,7 @@ brw_wm_kernel__projective_mask_sa(struct brw_compile *p, int dispatch)
{
int src, mask;
- if (p->gen < 60)
+ if (p->gen < 060)
brw_wm_xy(p, dispatch);
src = brw_wm_projective__alpha(p, dispatch, 0, 1, 12);
@@ -651,9 +651,9 @@ brw_wm_kernel__affine_opacity(struct brw_compile *p, int dispatch)
{
int src, mask;
- if (p->gen < 60) {
+ if (p->gen < 060) {
brw_wm_xy(p, dispatch);
- mask = 4;
+ mask = 5;
} else
mask = dispatch == 16 ? 8 : 6;
@@ -668,9 +668,9 @@ brw_wm_kernel__projective_opacity(struct brw_compile *p, int dispatch)
{
int src, mask;
- if (p->gen < 60) {
+ if (p->gen < 060) {
brw_wm_xy(p, dispatch);
- mask = 4;
+ mask = 5;
} else
mask = dispatch == 16 ? 8 : 6;
diff --git a/src/sna/compiler.h b/src/sna/compiler.h
index ff80365eb..b985f2bca 100644
--- a/src/sna/compiler.h
+++ b/src/sna/compiler.h
@@ -36,6 +36,7 @@
#define fastcall __attribute__((regparm(3)))
#define must_check __attribute__((warn_unused_result))
#define constant __attribute__((const))
+#define __packed__ __attribute__((__packed__))
#else
#define likely(expr) (expr)
#define unlikely(expr) (expr)
@@ -44,6 +45,7 @@
#define fastcall
#define must_check
#define constant
+#define __packed__
#endif
#ifdef HAVE_VALGRIND
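
The new __packed__ wrapper follows the pattern of the existing macros in compiler.h: it expands to __attribute__((__packed__)) under GCC and to nothing otherwise, so structs that must match a hardware or wire layout can be declared portably. An illustrative use (the struct itself is hypothetical):

    #include <stdint.h>

    /* Unpacked, the compiler would pad cmd out to a 4-byte
     * boundary; packed, the struct occupies exactly 5 bytes. */
    struct hw_command {
        uint8_t  cmd;
        uint32_t payload;
    } __packed__;
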
diff --git a/src/sna/fb/Makefile.in b/src/sna/fb/Makefile.in
index d9ca271f0..d21411b96 100644
--- a/src/sna/fb/Makefile.in
+++ b/src/sna/fb/Makefile.in
@@ -182,7 +182,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -221,6 +220,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -319,7 +320,7 @@ all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -344,9 +345,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
diff --git a/src/sna/fb/fb.h b/src/sna/fb/fb.h
index e58e03967..d99453da6 100644
--- a/src/sna/fb/fb.h
+++ b/src/sna/fb/fb.h
@@ -33,6 +33,7 @@
#include <gcstruct.h>
#include <colormap.h>
#include <windowstr.h>
+#include <regionstr.h>
#include <stdbool.h>
#include <pixman.h>
@@ -45,6 +46,8 @@
#include "sfb.h"
+#include "../../compat-api.h"
+
#define WRITE(ptr, val) (*(ptr) = (val))
#define READ(ptr) (*(ptr))
@@ -294,12 +297,12 @@ extern DevPrivateKeyRec sna_window_key;
static inline FbGCPrivate *fb_gc(GCPtr gc)
{
- return dixGetPrivateAddr(&gc->devPrivates, &sna_gc_key);
+ return (FbGCPrivate *)__get_private(gc, sna_gc_key);
}
static inline PixmapPtr fbGetWindowPixmap(WindowPtr window)
{
- return *(PixmapPtr *)dixGetPrivateAddr(&window->devPrivates, &sna_window_key);
+ return *(PixmapPtr *)__get_private(window, sna_window_key);
}
#ifdef ROOTLESS
@@ -360,8 +363,14 @@ static inline PixmapPtr fbGetWindowPixmap(WindowPtr window)
* XFree86 empties the root BorderClip when the VT is inactive,
* here's a macro which uses that to disable GetImage and GetSpans
*/
+
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,10,0,0,0)
#define fbWindowEnabled(pWin) \
- RegionNotEmpty(&(pWin)->drawable.pScreen->root->borderClip)
+ RegionNotEmpty(&(pWin)->drawable.pScreen->root->borderClip)
+#else
+#define fbWindowEnabled(pWin) \
+ RegionNotEmpty(&WindowTable[(pWin)->drawable.pScreen->myNum]->borderClip)
+#endif
#define fbDrawableEnabled(drawable) \
((drawable)->type == DRAWABLE_PIXMAP ? \
TRUE : fbWindowEnabled((WindowPtr) drawable))
diff --git a/src/sna/fb/fbbitmap.c b/src/sna/fb/fbbitmap.c
index 7c037fe36..2ea92a992 100644
--- a/src/sna/fb/fbbitmap.c
+++ b/src/sna/fb/fbbitmap.c
@@ -25,21 +25,50 @@
#include "fb.h"
+static Bool region_grow(RegionPtr region)
+{
+ RegDataPtr data;
+ int n;
+
+ n = 16;
+ if (!region->data) {
+ region->data = malloc(RegionSizeof(n));
+ if (!region->data)
+ return RegionBreak(region);
+ region->data->numRects = 1;
+ *RegionBoxptr(region) = region->extents;
+ } else if (!region->data->size) {
+ region->data = malloc(RegionSizeof(n));
+ if (!region->data)
+ return RegionBreak(region);
+ region->data->numRects = 0;
+ } else {
+ n = 2 * region->data->numRects;
+ data = (RegDataPtr) realloc(region->data, RegionSizeof(n));
+ if (!data)
+ return RegionBreak(region);
+ region->data = data;
+ }
+ region->data->size = n;
+ return TRUE;
+}
+
static inline void add(RegionPtr region,
int16_t x1, int16_t y1, int16_t x2, int16_t y2)
{
BoxPtr r;
- if (region->data->numRects == region->data->size)
- RegionRectAlloc(region, 1);
+ if (region->data->numRects == region->data->size &&
+ !region_grow(region))
+ return;
r = RegionBoxptr(region) + region->data->numRects++;
r->x1 = x1; r->y1 = y1;
r->x2 = x2; r->y2 = y2;
- DBG(("%s[%d/%d]: (%d, %d), (%d, %d)\n",
+ DBG(("%s[%ld/%ld]: (%d, %d), (%d, %d)\n",
__FUNCTION__,
- region->data->numRects, region->data->size,
+ (long)region->data->numRects, (long)region->data->size,
x1, y1, x2, y2));
if (x1 < region->extents.x1)
@@ -149,11 +178,11 @@ fbBitmapToRegion(PixmapPtr pixmap)
} else
region->extents.x1 = region->extents.x2 = 0;
- DBG(("%s: region extents=(%d, %d), (%d, %d) x %d\n",
+ DBG(("%s: region extents=(%d, %d), (%d, %d) x %ld\n",
__FUNCTION__,
region->extents.x1, region->extents.y1,
region->extents.x2, region->extents.y2,
- RegionNumRects(region)));
+ (long)RegionNumRects(region)));
return region;
}
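
region_grow() above replaces the previous call into the server's RegionRectAlloc() with a local helper: the box array starts at 16 entries, doubles whenever add() fills it, and falls back to RegionBreak() on allocation failure. The same amortized-doubling pattern in a generic, self-contained form (not tied to the Region types):

    #include <stdlib.h>

    /* Doubling append: capacities run 16, 32, 64, ..., so n appends
     * cost O(log n) reallocations, as in region_grow() above. */
    static int append(int **items, int *len, int *cap, int value)
    {
        if (*len == *cap) {
            int n = *cap ? 2 * *cap : 16;
            int *tmp = realloc(*items, n * sizeof(int));
            if (!tmp)
                return 0; /* caller bails out, cf. RegionBreak() */
            *items = tmp;
            *cap = n;
        }
        (*items)[(*len)++] = value;
        return 1;
    }
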
diff --git a/src/sna/fb/fbblt.c b/src/sna/fb/fbblt.c
index d4d20b68b..5ad2e2e25 100644
--- a/src/sna/fb/fbblt.c
+++ b/src/sna/fb/fbblt.c
@@ -270,7 +270,7 @@ fbBlt(FbBits *srcLine, FbStride srcStride, int srcX,
int alu, FbBits pm, int bpp,
Bool reverse, Bool upsidedown)
{
- DBG(("%s %dx%d, alu=%d, pm=%d, bpp=%d (reverse=%d, upsidedown=%d)\n",
+ DBG(("%s %dx%d, alu=%d, pm=%x, bpp=%d (reverse=%d, upsidedown=%d)\n",
__FUNCTION__, width, height, alu, pm, bpp, reverse, upsidedown));
if (alu == GXcopy && pm == FB_ALLONES && ((srcX|dstX|width) & 7) == 0) {
@@ -285,9 +285,9 @@ fbBlt(FbBits *srcLine, FbStride srcStride, int srcX,
s += srcX >> 3;
d += dstX >> 3;
- DBG(("%s fast blt, src_stride=%d, dst_stride=%d, width=%d (offset=%d)\n",
+ DBG(("%s fast blt, src_stride=%d, dst_stride=%d, width=%d (offset=%ld)\n",
__FUNCTION__,
- srcStride, dstStride, width, s - d));
+ srcStride, dstStride, width, (long)(s - d)));
if (width == srcStride && width == dstStride) {
width *= height;
diff --git a/src/sna/fb/fbpict.c b/src/sna/fb/fbpict.c
index a2038518e..906a5f316 100644
--- a/src/sna/fb/fbpict.c
+++ b/src/sna/fb/fbpict.c
@@ -26,25 +26,19 @@
#include <string.h>
#include "fb.h"
-
-#include <picturestr.h>
-#include <mipict.h>
#include "fbpict.h"
static void
SourceValidateOnePicture(PicturePtr picture)
{
DrawablePtr drawable = picture->pDrawable;
- ScreenPtr screen;
if (!drawable)
return;
- screen = drawable->pScreen;
- if (screen->SourceValidate)
- screen->SourceValidate(drawable,
- 0, 0, drawable->width, drawable->height,
- picture->subWindowMode);
+ SourceValidate(drawable,
+ 0, 0, drawable->width, drawable->height,
+ picture->subWindowMode);
}
static void
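
SourceValidateOnePicture() now calls a SourceValidate() wrapper rather than poking screen->SourceValidate directly; given the new '#include "../../compat-api.h"' in fb.h above, this is presumably a compatibility shim smoothing over the hook's signature changes across Xorg server versions. A hypothetical shape for such a wrapper (not the actual compat-api.h code):

    /* Hypothetical compat shim: forward to the screen hook when set. */
    static inline void
    SourceValidate(DrawablePtr drawable,
                   int x, int y, int width, int height,
                   unsigned int subWindowMode)
    {
        ScreenPtr screen = drawable->pScreen;
        if (screen->SourceValidate)
            screen->SourceValidate(drawable, x, y, width, height,
                                   subWindowMode);
    }
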
diff --git a/src/sna/fb/fbpict.h b/src/sna/fb/fbpict.h
index 1ce09df25..560138533 100644
--- a/src/sna/fb/fbpict.h
+++ b/src/sna/fb/fbpict.h
@@ -24,17 +24,23 @@
#ifndef FBPICT_H
#define FBPICT_H
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <picturestr.h>
+
#include "sfb.h"
extern void
fbComposite(CARD8 op,
- PicturePtr pSrc,
- PicturePtr pMask,
- PicturePtr pDst,
- INT16 xSrc,
- INT16 ySrc,
- INT16 xMask,
- INT16 yMask, INT16 xDst, INT16 yDst, CARD16 width, CARD16 height);
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc, INT16 ySrc,
+ INT16 xMask, INT16 yMask,
+ INT16 xDst, INT16 yDst,
+ CARD16 width, CARD16 height);
extern pixman_image_t *image_from_pict(PicturePtr pict,
Bool has_clip,
diff --git a/src/sna/fb/fbpoint.c b/src/sna/fb/fbpoint.c
index 3df79a261..c5f0f876f 100644
--- a/src/sna/fb/fbpoint.c
+++ b/src/sna/fb/fbpoint.c
@@ -93,10 +93,10 @@ fbPolyPoint(DrawablePtr drawable, GCPtr gc,
int xoff, int yoff,
FbBits and, FbBits xor);
- DBG(("%s x %d, clip=[(%d, %d), (%d, %d)]x%d\n", __FUNCTION__, n,
+ DBG(("%s x %d, clip=[(%d, %d), (%d, %d)]x%ld\n", __FUNCTION__, n,
gc->pCompositeClip->extents.x1, gc->pCompositeClip->extents.y1,
gc->pCompositeClip->extents.x2, gc->pCompositeClip->extents.y2,
- RegionNumRects(gc->pCompositeClip)));
+ (long)RegionNumRects(gc->pCompositeClip)));
if (mode == CoordModePrevious)
fbFixCoordModePrevious(n, pt);
diff --git a/src/sna/fb/fbseg.c b/src/sna/fb/fbseg.c
index 5b8173f08..67ad38958 100644
--- a/src/sna/fb/fbseg.c
+++ b/src/sna/fb/fbseg.c
@@ -353,7 +353,8 @@ fbSelectBres(DrawablePtr drawable, GCPtr gc)
FbBres *bres;
DBG(("%s: line=%d, fill=%d, and=%lx, bgand=%lx\n",
- __FUNCTION__, gc->lineStyle, gc->fillStyle, pgc->and, pgc->bgand));
+ __FUNCTION__, gc->lineStyle, gc->fillStyle,
+ (long)pgc->and, (long)pgc->bgand));
assert(gc->lineWidth == 0);
if (gc->lineStyle == LineSolid) {
diff --git a/src/sna/gen2_render.c b/src/sna/gen2_render.c
index 501266241..4d92adcf7 100644
--- a/src/sna/gen2_render.c
+++ b/src/sna/gen2_render.c
@@ -46,9 +46,6 @@
#define NO_FILL_ONE 0
#define NO_FILL_BOXES 0
-#define PREFER_BLT_FILL 1
-#define PREFER_BLT_COPY 1
-
#define MAX_3D_SIZE 2048
#define MAX_3D_PITCH 8192
@@ -175,7 +172,7 @@ gen2_get_card_format(struct sna *sna, uint32_t format)
if (i8xx_tex_formats[i].fmt == format)
return i8xx_tex_formats[i].card_fmt;
- if (sna->kgem.gen < 21) {
+ if (sna->kgem.gen < 021) {
/* Whilst these are not directly supported on 830/845,
* we only enable them when we can implicitly convert
* them to a supported variant through the texture
@@ -203,7 +200,7 @@ gen2_check_format(struct sna *sna, PicturePtr p)
if (i8xx_tex_formats[i].fmt == p->format)
return true;
- if (sna->kgem.gen > 21) {
+ if (sna->kgem.gen > 021) {
for (i = 0; i < ARRAY_SIZE(i85x_tex_formats); i++)
if (i85x_tex_formats[i].fmt == p->format)
return true;
@@ -396,6 +393,15 @@ gen2_get_blend_factors(const struct sna_composite_op *op,
cblend |= TB0C_OP_MODULATE;
ablend |= TB0A_OP_MODULATE;
+ } else if (op->mask.is_solid) {
+ cblend |= TB0C_ARG2_SEL_DIFFUSE;
+ ablend |= TB0A_ARG2_SEL_DIFFUSE;
+
+ if (op->dst.format == PICT_a8 || !op->has_component_alpha)
+ cblend |= TB0C_ARG2_REPLICATE_ALPHA;
+
+ cblend |= TB0C_OP_MODULATE;
+ ablend |= TB0A_OP_MODULATE;
} else {
cblend |= TB0C_OP_ARG1;
ablend |= TB0A_OP_ARG1;
@@ -504,6 +510,7 @@ static void gen2_emit_invariant(struct sna *sna)
ENABLE_TEX_CACHE);
BATCH(_3DSTATE_STIPPLE);
+ BATCH(0);
BATCH(_3DSTATE_MAP_BLEND_OP_CMD(0) |
TEXPIPE_COLOR |
@@ -536,9 +543,9 @@ static void gen2_emit_invariant(struct sna *sna)
}
static void
-gen2_get_batch(struct sna *sna)
+gen2_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
if (!kgem_check_batch(&sna->kgem, INVARIANT_SIZE+40)) {
DBG(("%s: flushing batch: size %d > %d\n",
@@ -574,7 +581,7 @@ static void gen2_emit_target(struct sna *sna, const struct sna_composite_op *op)
{
assert(!too_large(op->dst.width, op->dst.height));
assert(op->dst.bo->pitch >= 8 && op->dst.bo->pitch <= MAX_3D_PITCH);
- assert(sna->render_state.gen2.vertex_offset == 0);
+ assert(sna->render.vertex_offset == 0);
if (sna->render_state.gen2.target == op->dst.bo->unique_id) {
kgem_bo_mark_dirty(op->dst.bo);
@@ -662,7 +669,7 @@ static void gen2_emit_composite_state(struct sna *sna,
uint32_t cblend, ablend;
int tex;
- gen2_get_batch(sna);
+ gen2_get_batch(sna, op);
if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
if (op->src.bo == op->dst.bo || op->mask.bo == op->dst.bo)
@@ -726,6 +733,12 @@ static void gen2_emit_composite_state(struct sna *sna,
else
texcoordfmt |= TEXCOORDFMT_3D << (2*tex);
gen2_emit_texture(sna, &op->mask, tex++);
+ } else if (op->mask.is_solid) {
+ if (op->mask.u.gen2.pixel != sna->render_state.gen2.diffuse) {
+ BATCH(_3DSTATE_DFLT_DIFFUSE_CMD);
+ BATCH(op->mask.u.gen2.pixel);
+ sna->render_state.gen2.diffuse = op->mask.u.gen2.pixel;
+ }
}
v = _3DSTATE_VERTEX_FORMAT_2_CMD | texcoordfmt;
@@ -749,9 +762,9 @@ gen2_emit_composite_linear(struct sna *sna,
{
float v;
- v = (x * channel->u.gen2.linear_dx +
- y * channel->u.gen2.linear_dy +
- channel->u.gen2.linear_offset);
+ v = (x * channel->u.linear.dx +
+ y * channel->u.linear.dy +
+ channel->u.linear.offset);
DBG(("%s: (%d, %d) -> %f\n", __FUNCTION__, x, y, v));
VERTEX(v);
VERTEX(v);
@@ -895,33 +908,30 @@ gen2_emit_composite_primitive_affine(struct sna *sna,
const struct sna_composite_rectangles *r)
{
PictTransform *transform = op->src.transform;
- int16_t dst_x = r->dst.x + op->dst.x;
- int16_t dst_y = r->dst.y + op->dst.y;
int src_x = r->src.x + (int)op->src.offset[0];
int src_y = r->src.y + (int)op->src.offset[1];
- float sx, sy;
+ float *v;
- _sna_get_transformed_coordinates(src_x + r->width, src_y + r->height,
- transform,
- &sx, &sy);
+ v = (float *)sna->kgem.batch + sna->kgem.nbatch;
+ sna->kgem.nbatch += 12;
- gen2_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
- VERTEX(sx * op->src.scale[0]);
- VERTEX(sy * op->src.scale[1]);
+ v[8] = v[4] = r->dst.x + op->dst.x;
+ v[0] = v[4] + r->width;
- _sna_get_transformed_coordinates(src_x, src_y + r->height,
- transform,
- &sx, &sy);
- gen2_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
- VERTEX(sx * op->src.scale[0]);
- VERTEX(sy * op->src.scale[1]);
+ v[9] = r->dst.y + op->dst.y;
+ v[5] = v[1] = v[9] + r->height;
- _sna_get_transformed_coordinates(src_x, src_y,
- transform,
- &sx, &sy);
- gen2_emit_composite_dstcoord(sna, dst_x, dst_y);
- VERTEX(sx * op->src.scale[0]);
- VERTEX(sy * op->src.scale[1]);
+ _sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
+ transform, op->src.scale,
+ &v[2], &v[3]);
+
+ _sna_get_transformed_scaled(src_x, src_y + r->height,
+ transform, op->src.scale,
+ &v[6], &v[7]);
+
+ _sna_get_transformed_scaled(src_x, src_y,
+ transform, op->src.scale,
+ &v[10], &v[11]);
}
fastcall static void
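
The rewritten affine primitive above reserves twelve floats directly in the batch and fills them as three four-float (x, y, s, t) vertices — the bottom-right, bottom-left and top-left corners of a PRIM3D_RECTLIST rectangle — with _sna_get_transformed_scaled() folding the source transform and texture scale into a single step. The layout it writes, restated as a generic helper:

    /* Three (x, y, s, t) vertices in the order the hunk above emits:
     * v[0..3] bottom-right, v[4..7] bottom-left, v[8..11] top-left. */
    static void emit_rect(float *v, float x, float y, float w, float h,
                          const float st[6])
    {
        v[0] = x + w; v[1]  = y + h; v[2]  = st[0]; v[3]  = st[1];
        v[4] = x;     v[5]  = y + h; v[6]  = st[2]; v[7]  = st[3];
        v[8] = x;     v[9]  = y;     v[10] = st[4]; v[11] = st[5];
    }
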
@@ -959,7 +969,7 @@ static void gen2_magic_ca_pass(struct sna *sna,
return;
DBG(("%s: batch=%x, vertex=%x\n", __FUNCTION__,
- sna->kgem.nbatch, sna->render_state.gen2.vertex_offset));
+ sna->kgem.nbatch, sna->render.vertex_offset));
assert(op->mask.bo);
assert(op->has_component_alpha);
@@ -978,7 +988,7 @@ static void gen2_magic_ca_pass(struct sna *sna,
BATCH(ablend);
sna->render_state.gen2.ls2 = 0;
- src = sna->kgem.batch + sna->render_state.gen2.vertex_offset;
+ src = sna->kgem.batch + sna->render.vertex_offset;
dst = sna->kgem.batch + sna->kgem.nbatch;
n = 1 + sna->render.vertex_index;
sna->kgem.nbatch += n;
@@ -993,12 +1003,12 @@ static void gen2_vertex_flush(struct sna *sna,
if (sna->render.vertex_index == 0)
return;
- sna->kgem.batch[sna->render_state.gen2.vertex_offset] |=
+ sna->kgem.batch[sna->render.vertex_offset] |=
sna->render.vertex_index - 1;
gen2_magic_ca_pass(sna, op);
- sna->render_state.gen2.vertex_offset = 0;
+ sna->render.vertex_offset = 0;
sna->render.vertex_index = 0;
}
@@ -1006,7 +1016,6 @@ inline static int gen2_get_rectangles(struct sna *sna,
const struct sna_composite_op *op,
int want)
{
- struct gen2_render_state *state = &sna->render_state.gen2;
int rem = batch_space(sna), size, need;
DBG(("%s: want=%d, floats_per_vertex=%d, rem=%d\n",
@@ -1030,16 +1039,17 @@ inline static int gen2_get_rectangles(struct sna *sna,
}
rem -= need;
- if (state->vertex_offset == 0) {
+ if (sna->render.vertex_offset == 0) {
if ((sna->kgem.batch[sna->kgem.nbatch-1] & ~0xffff) ==
(PRIM3D_INLINE | PRIM3D_RECTLIST)) {
uint32_t *b = &sna->kgem.batch[sna->kgem.nbatch-1];
+ assert(*b & 0xffff);
sna->render.vertex_index = 1 + (*b & 0xffff);
*b = PRIM3D_INLINE | PRIM3D_RECTLIST;
- state->vertex_offset = sna->kgem.nbatch - 1;
+ sna->render.vertex_offset = sna->kgem.nbatch - 1;
assert(!op->need_magic_ca_pass);
} else {
- state->vertex_offset = sna->kgem.nbatch;
+ sna->render.vertex_offset = sna->kgem.nbatch;
BATCH(PRIM3D_INLINE | PRIM3D_RECTLIST);
}
}
@@ -1144,6 +1154,7 @@ gen2_composite_solid_init(struct sna *sna,
channel->filter = PictFilterNearest;
channel->repeat = RepeatNormal;
channel->is_solid = true;
+ channel->is_affine = true;
channel->width = 1;
channel->height = 1;
channel->pict_format = PICT_a8r8g8b8;
@@ -1251,12 +1262,12 @@ gen2_composite_linear_init(struct sna *sna,
dx /= sf;
dy /= sf;
- channel->u.gen2.linear_dx = dx;
- channel->u.gen2.linear_dy = dy;
- channel->u.gen2.linear_offset = -dx*(x0+dst_x-x) + -dy*(y0+dst_y-y);
+ channel->u.linear.dx = dx;
+ channel->u.linear.dy = dy;
+ channel->u.linear.offset = -dx*(x0+dst_x-x) + -dy*(y0+dst_y-y);
DBG(("%s: dx=%f, dy=%f, offset=%f\n",
- __FUNCTION__, dx, dy, channel->u.gen2.linear_offset));
+ __FUNCTION__, dx, dy, channel->u.linear.offset));
return channel->bo != NULL;
}
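
With the rename from u.gen2.linear_* to u.linear.*, the channel stores a normalized gradient direction (dx, dy) and a precomputed offset, so the gradient coordinate of any pixel reduces to a dot product. A sketch of the per-pixel evaluation these fields imply (hypothetical helper, not part of the diff):

	static float
	linear_gradient_coord(float px, float py,
			      float dx, float dy, float offset)
	{
		/* 0 at the gradient start point, 1 at the end point */
		return dx * px + dy * py + offset;
	}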
@@ -1304,7 +1315,8 @@ static bool
gen2_check_card_format(struct sna *sna,
PicturePtr picture,
struct sna_composite_channel *channel,
- int x, int y, int w, int h)
+ int x, int y, int w, int h,
+ bool *fixup_alpha)
{
uint32_t format = picture->format;
unsigned int i;
@@ -1316,7 +1328,7 @@ gen2_check_card_format(struct sna *sna,
for (i = 0; i < ARRAY_SIZE(i85x_tex_formats); i++) {
if (i85x_tex_formats[i].fmt == format) {
- if (sna->kgem.gen >= 21)
+ if (sna->kgem.gen >= 021)
return true;
if (source_is_covered(picture, x, y, w,h)) {
@@ -1324,10 +1336,12 @@ gen2_check_card_format(struct sna *sna,
return true;
}
+ *fixup_alpha = true;
return false;
}
}
+ *fixup_alpha = false;
return false;
}
@@ -1343,6 +1357,7 @@ gen2_composite_picture(struct sna *sna,
PixmapPtr pixmap;
uint32_t color;
int16_t dx, dy;
+ bool fixup_alpha;
DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
__FUNCTION__, x, y, w, h, dst_x, dst_y));
@@ -1417,9 +1432,9 @@ gen2_composite_picture(struct sna *sna,
} else
channel->transform = picture->transform;
- if (!gen2_check_card_format(sna, picture, channel, x, y, w ,h))
+ if (!gen2_check_card_format(sna, picture, channel, x, y, w, h, &fixup_alpha))
return sna_render_picture_convert(sna, picture, channel, pixmap,
- x, y, w, h, dst_x, dst_y);
+ x, y, w, h, dst_x, dst_y, fixup_alpha);
channel->pict_format = picture->format;
if (too_large(pixmap->drawable.width, pixmap->drawable.height))
@@ -1499,49 +1514,6 @@ gen2_composite_set_target(struct sna *sna,
}
static bool
-try_blt(struct sna *sna,
- PicturePtr dst,
- PicturePtr src,
- int width, int height)
-{
- uint32_t color;
-
- if (sna->kgem.mode != KGEM_RENDER) {
- DBG(("%s: already performing BLT\n", __FUNCTION__));
- return true;
- }
-
- if (too_large(width, height)) {
- DBG(("%s: operation too large for 3D pipe (%d, %d)\n",
- __FUNCTION__, width, height));
- return true;
- }
-
- if (too_large(dst->pDrawable->width, dst->pDrawable->height)) {
- DBG(("%s: target too large for 3D pipe (%d, %d)\n",
- __FUNCTION__,
- dst->pDrawable->width, dst->pDrawable->height));
- return true;
- }
-
- /* If it is a solid, try to use the BLT paths */
- if (sna_picture_is_solid(src, &color))
- return true;
-
- if (!src->pDrawable)
- return false;
-
- if (too_large(src->pDrawable->width, src->pDrawable->height)) {
- DBG(("%s: source too large for 3D pipe (%d, %d)\n",
- __FUNCTION__,
- src->pDrawable->width, src->pDrawable->height));
- return true;
- }
-
- return !is_gpu(src->pDrawable);
-}
-
-static bool
is_unhandled_gradient(PicturePtr picture)
{
if (picture->pDrawable)
@@ -1563,12 +1535,6 @@ has_alphamap(PicturePtr p)
}
static bool
-untransformed(PicturePtr p)
-{
- return !p->transform || pixman_transform_is_int_translate(p->transform);
-}
-
-static bool
need_upload(PicturePtr p)
{
return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
@@ -1614,7 +1580,6 @@ gen2_composite_fallback(struct sna *sna,
PicturePtr mask,
PicturePtr dst)
{
- struct sna_pixmap *priv;
PixmapPtr src_pixmap;
PixmapPtr mask_pixmap;
PixmapPtr dst_pixmap;
@@ -1653,8 +1618,7 @@ gen2_composite_fallback(struct sna *sna,
}
/* If anything is on the GPU, push everything out to the GPU */
- priv = sna_pixmap(dst_pixmap);
- if (priv && priv->gpu_damage && !priv->clear) {
+ if (dst_use_gpu(dst_pixmap)) {
DBG(("%s: dst is already on the GPU, try to use GPU\n",
__FUNCTION__));
return false;
@@ -1689,14 +1653,14 @@ gen2_composite_fallback(struct sna *sna,
if (too_large(dst_pixmap->drawable.width,
dst_pixmap->drawable.height) &&
- (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage))) {
+ dst_is_cpu(dst_pixmap)) {
DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
return true;
}
DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
__FUNCTION__));
- return false;
+ return dst_use_cpu(dst_pixmap);
}
static int
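
The open-coded damage inspection (priv->gpu_damage && !priv->clear) gives way to the dst_use_gpu()/dst_is_cpu()/dst_use_cpu() predicates, centralizing the fallback policy. A sketch of the shape such a predicate takes — the field names are illustrative, not the driver's exact layout:

	struct pixmap_priv {
		void *gpu_damage, *cpu_damage;
		int clear;
	};

	static int
	dst_prefers_gpu(const struct pixmap_priv *priv)
	{
		/* Anything already damaged on the GPU (and not merely a
		 * pending clear) pulls the whole operation onto the GPU. */
		return priv && priv->gpu_damage && !priv->clear;
	}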
@@ -1709,6 +1673,12 @@ reuse_source(struct sna *sna,
if (src_x != msk_x || src_y != msk_y)
return false;
+ if (sna_picture_is_solid(mask, &color))
+ return gen2_composite_solid_init(sna, mc, color);
+
+ if (sc->is_solid)
+ return false;
+
if (src == mask) {
DBG(("%s: mask is source\n", __FUNCTION__));
*mc = *sc;
@@ -1716,12 +1686,6 @@ reuse_source(struct sna *sna,
return true;
}
- if (sna_picture_is_solid(mask, &color))
- return gen2_composite_solid_init(sna, mc, color);
-
- if (sc->is_solid)
- return false;
-
if (src->pDrawable == NULL || mask->pDrawable != src->pDrawable)
return false;
@@ -1773,13 +1737,8 @@ gen2_render_composite(struct sna *sna,
return false;
}
- /* Try to use the BLT engine unless it implies a
- * 3D -> 2D context switch.
- */
if (mask == NULL &&
- try_blt(sna, dst, src, width, height) &&
- sna_blt_composite(sna,
- op, src, dst,
+ sna_blt_composite(sna, op, src, dst,
src_x, src_y,
dst_x, dst_y,
width, height,
@@ -1805,6 +1764,8 @@ gen2_render_composite(struct sna *sna,
}
tmp->op = op;
+
+ sna_render_composite_redirect_init(tmp);
if (too_large(tmp->dst.width, tmp->dst.height) ||
tmp->dst.bo->pitch > MAX_3D_PITCH) {
if (!sna_render_composite_redirect(sna, tmp,
@@ -1818,6 +1779,8 @@ gen2_render_composite(struct sna *sna,
dst_x, dst_y,
dst->polyMode == PolyModePrecise)) {
case -1:
+ DBG(("%s: fallback -- unable to prepare source\n",
+ __FUNCTION__));
goto cleanup_dst;
case 0:
gen2_composite_solid_init(sna, &tmp->src, 0);
@@ -1841,6 +1804,8 @@ gen2_render_composite(struct sna *sna,
dst_x, dst_y,
dst->polyMode == PolyModePrecise)) {
case -1:
+ DBG(("%s: fallback -- unable to prepare mask\n",
+ __FUNCTION__));
goto cleanup_src;
case 0:
gen2_composite_solid_init(sna, &tmp->mask, 0);
@@ -1857,8 +1822,12 @@ gen2_render_composite(struct sna *sna,
tmp->has_component_alpha = true;
if (gen2_blend_op[op].src_alpha &&
(gen2_blend_op[op].src_blend != BLENDFACTOR_ZERO)) {
- if (op != PictOpOver)
- return false;
+ if (op != PictOpOver) {
+ DBG(("%s: fallback -- unsupported CA blend (src_blend=%d)\n",
+ __FUNCTION__,
+ gen2_blend_op[op].src_blend));
+ goto cleanup_src;
+ }
tmp->need_magic_ca_pass = true;
tmp->op = PictOpOutReverse;
@@ -1866,8 +1835,12 @@ gen2_render_composite(struct sna *sna,
}
/* convert solid to a texture (pure convenience) */
- if (tmp->mask.is_solid)
+ if (tmp->mask.is_solid && tmp->src.is_solid) {
+ assert(tmp->mask.is_affine);
tmp->mask.bo = sna_render_get_solid(sna, tmp->mask.u.gen2.pixel);
+ if (!tmp->mask.bo)
+ goto cleanup_src;
+ }
}
tmp->floats_per_vertex = 2;
@@ -1880,18 +1853,27 @@ gen2_render_composite(struct sna *sna,
tmp->prim_emit = gen2_emit_composite_primitive;
if (tmp->mask.bo) {
if (tmp->mask.transform == NULL) {
- if (tmp->src.is_solid)
+ if (tmp->src.is_solid) {
+ assert(tmp->floats_per_rect == 12);
tmp->prim_emit = gen2_emit_composite_primitive_constant_identity_mask;
+ }
}
} else {
- if (tmp->src.is_solid)
+ if (tmp->src.is_solid) {
+ assert(tmp->floats_per_rect == 6);
tmp->prim_emit = gen2_emit_composite_primitive_constant;
- else if (tmp->src.is_linear)
+ } else if (tmp->src.is_linear) {
+ assert(tmp->floats_per_rect == 12);
tmp->prim_emit = gen2_emit_composite_primitive_linear;
- else if (tmp->src.transform == NULL)
+ } else if (tmp->src.transform == NULL) {
+ assert(tmp->floats_per_rect == 12);
tmp->prim_emit = gen2_emit_composite_primitive_identity;
- else if (tmp->src.is_affine)
+ } else if (tmp->src.is_affine) {
+ assert(tmp->floats_per_rect == 12);
+ tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
+ tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
tmp->prim_emit = gen2_emit_composite_primitive_affine;
+ }
}
tmp->blt = gen2_render_composite_blt;
@@ -1905,8 +1887,11 @@ gen2_render_composite(struct sna *sna,
kgem_submit(&sna->kgem);
if (!kgem_check_bo(&sna->kgem,
tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
- NULL))
+ NULL)) {
+ DBG(("%s: fallback, operation does not fit into GTT\n",
+ __FUNCTION__));
goto cleanup_mask;
+ }
}
gen2_emit_composite_state(sna, tmp);
@@ -2016,8 +2001,8 @@ gen2_emit_composite_spans_primitive_affine_source(struct sna *sna,
{
PictTransform *transform = op->base.src.transform;
uint32_t alpha = (uint8_t)(255 * opacity) << 24;
- float x, y, *v;
-
+ float *v;
+
v = (float *)sna->kgem.batch + sna->kgem.nbatch;
sna->kgem.nbatch += 15;
@@ -2029,26 +2014,20 @@ gen2_emit_composite_spans_primitive_affine_source(struct sna *sna,
*((uint32_t *)v + 7) = alpha;
*((uint32_t *)v + 12) = alpha;
- _sna_get_transformed_coordinates((int)op->base.src.offset[0] + box->x2,
- (int)op->base.src.offset[1] + box->y2,
- transform,
- &x, &y);
- v[3] = x * op->base.src.scale[0];
- v[4] = y * op->base.src.scale[1];
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2,
+ (int)op->base.src.offset[1] + box->y2,
+ transform, op->base.src.scale,
+ &v[3], &v[4]);
- _sna_get_transformed_coordinates((int)op->base.src.offset[0] + box->x1,
- (int)op->base.src.offset[1] + box->y2,
- transform,
- &x, &y);
- v[8] = x * op->base.src.scale[0];
- v[9] = y * op->base.src.scale[1];
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
+ (int)op->base.src.offset[1] + box->y2,
+ transform, op->base.src.scale,
+ &v[8], &v[9]);
- _sna_get_transformed_coordinates((int)op->base.src.offset[0] + box->x1,
- (int)op->base.src.offset[1] + box->y1,
- transform,
- &x, &y);
- v[13] = x * op->base.src.scale[0];
- v[14] = y * op->base.src.scale[1];
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
+ (int)op->base.src.offset[1] + box->y1,
+ transform, op->base.src.scale,
+ &v[13], &v[14]);
}
static void
@@ -2131,7 +2110,7 @@ static void gen2_emit_composite_spans_state(struct sna *sna,
{
uint32_t unwind;
- gen2_get_batch(sna);
+ gen2_get_batch(sna, &op->base);
gen2_emit_target(sna, &op->base);
unwind = sna->kgem.nbatch;
@@ -2248,7 +2227,7 @@ gen2_check_composite_spans(struct sna *sna,
return false;
if (need_tiling(sna, width, height)) {
- if (!is_gpu(dst->pDrawable)) {
+ if (!is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
DBG(("%s: fallback, tiled operation not on GPU\n",
__FUNCTION__));
return false;
@@ -2289,6 +2268,8 @@ gen2_render_composite_spans(struct sna *sna,
}
tmp->base.op = op;
+
+ sna_render_composite_redirect_init(&tmp->base);
if (too_large(tmp->base.dst.width, tmp->base.dst.height) ||
tmp->base.dst.bo->pitch > MAX_3D_PITCH) {
if (!sna_render_composite_redirect(sna, &tmp->base,
@@ -2321,8 +2302,11 @@ gen2_render_composite_spans(struct sna *sna,
tmp->base.floats_per_vertex += tmp->base.src.is_affine ? 2 : 3;
if (tmp->base.src.transform == NULL)
tmp->prim_emit = gen2_emit_composite_spans_primitive_identity_source;
- else if (tmp->base.src.is_affine)
+ else if (tmp->base.src.is_affine) {
+ tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
+ tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
tmp->prim_emit = gen2_emit_composite_spans_primitive_affine_source;
+ }
}
tmp->base.mask.bo = NULL;
tmp->base.floats_per_rect = 3*tmp->base.floats_per_vertex;
@@ -2387,7 +2371,7 @@ static void gen2_emit_fill_composite_state(struct sna *sna,
{
uint32_t ls1;
- gen2_get_batch(sna);
+ gen2_get_batch(sna, op);
gen2_emit_target(sna, op);
ls1 = sna->kgem.nbatch;
@@ -2443,24 +2427,6 @@ gen2_render_fill_boxes_try_blt(struct sna *sna,
pixel, box, n);
}
-static inline bool prefer_blt_fill(struct sna *sna)
-{
-#if PREFER_BLT_FILL
- return true;
-#else
- return sna->kgem.mode != KGEM_RENDER;
-#endif
-}
-
-static inline bool prefer_blt_copy(struct sna *sna, unsigned flags)
-{
-#if PREFER_BLT_COPY
- return true;
-#else
- return sna->kgem.mode != KGEM_RENDER;
-#endif
-}
-
static bool
gen2_render_fill_boxes(struct sna *sna,
CARD8 op,
@@ -2483,6 +2449,11 @@ gen2_render_fill_boxes(struct sna *sna,
dst, dst_bo,
box, n);
#endif
+ if (gen2_render_fill_boxes_try_blt(sna, op, format, color,
+ dst, dst_bo,
+ box, n))
+ return true;
+
DBG(("%s (op=%d, format=%x, color=(%04x,%04x,%04x, %04x))\n",
__FUNCTION__, op, (int)format,
@@ -2493,11 +2464,6 @@ gen2_render_fill_boxes(struct sna *sna,
!gen2_check_dst_format(format)) {
DBG(("%s: try blt, too large or incompatible destination\n",
__FUNCTION__));
- if (gen2_render_fill_boxes_try_blt(sna, op, format, color,
- dst, dst_bo,
- box, n))
- return true;
-
if (!gen2_check_dst_format(format))
return false;
@@ -2506,12 +2472,6 @@ gen2_render_fill_boxes(struct sna *sna,
dst, dst_bo, box, n);
}
- if (prefer_blt_fill(sna) &&
- gen2_render_fill_boxes_try_blt(sna, op, format, color,
- dst, dst_bo,
- box, n))
- return true;
-
if (op == PictOpClear)
pixel = 0;
else if (!sna_get_pixel_from_rgba(&pixel,
@@ -2572,7 +2532,7 @@ static void gen2_emit_fill_state(struct sna *sna,
{
uint32_t ls1;
- gen2_get_batch(sna);
+ gen2_get_batch(sna, op);
gen2_emit_target(sna, op);
ls1 = sna->kgem.nbatch;
@@ -2683,8 +2643,7 @@ gen2_render_fill(struct sna *sna, uint8_t alu,
#endif
/* Prefer to use the BLT if already engaged */
- if (prefer_blt_fill(sna) &&
- sna_blt_fill(sna, alu,
+ if (sna_blt_fill(sna, alu,
dst_bo, dst->drawable.bitsPerPixel,
color,
tmp))
@@ -2693,10 +2652,7 @@ gen2_render_fill(struct sna *sna, uint8_t alu,
/* Must use the BLT if we can't RENDER... */
if (too_large(dst->drawable.width, dst->drawable.height) ||
dst_bo->pitch < 8 || dst_bo->pitch > MAX_3D_PITCH)
- return sna_blt_fill(sna, alu,
- dst_bo, dst->drawable.bitsPerPixel,
- color,
- tmp);
+ return false;
tmp->base.op = alu;
tmp->base.dst.pixmap = dst;
@@ -2761,16 +2717,14 @@ gen2_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
#endif
/* Prefer to use the BLT if already engaged */
- if (prefer_blt_fill(sna) &&
- gen2_render_fill_one_try_blt(sna, dst, bo, color,
+ if (gen2_render_fill_one_try_blt(sna, dst, bo, color,
x1, y1, x2, y2, alu))
return true;
/* Must use the BLT if we can't RENDER... */
if (too_large(dst->drawable.width, dst->drawable.height) ||
bo->pitch < 8 || bo->pitch > MAX_3D_PITCH)
- return gen2_render_fill_one_try_blt(sna, dst, bo, color,
- x1, y1, x2, y2, alu);
+ return false;
if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
kgem_submit(&sna->kgem);
@@ -2865,7 +2819,7 @@ static void gen2_emit_copy_state(struct sna *sna, const struct sna_composite_op
{
uint32_t ls1, v;
- gen2_get_batch(sna);
+ gen2_get_batch(sna, op);
if (kgem_bo_is_dirty(op->src.bo)) {
if (op->src.bo == op->dst.bo)
@@ -2925,8 +2879,7 @@ gen2_render_copy_boxes(struct sna *sna, uint8_t alu,
DBG(("%s (%d, %d)->(%d, %d) x %d\n",
__FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n));
- if (prefer_blt_copy(sna, flags) &&
- sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+ if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
sna_blt_copy_boxes(sna, alu,
src_bo, src_dx, src_dy,
dst_bo, dst_dx, dst_dy,
@@ -3091,8 +3044,7 @@ gen2_render_copy(struct sna *sna, uint8_t alu,
#endif
/* Prefer to use the BLT */
- if (prefer_blt_copy(sna, 0) &&
- sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+ if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
sna_blt_copy(sna, alu,
src_bo, dst_bo,
dst->drawable.bitsPerPixel,
@@ -3145,7 +3097,6 @@ gen2_render_reset(struct sna *sna)
{
sna->render_state.gen2.need_invariant = true;
sna->render_state.gen2.logic_op_enabled = 0;
- sna->render_state.gen2.vertex_offset = 0;
sna->render_state.gen2.target = 0;
sna->render_state.gen2.ls1 = 0;
@@ -3160,6 +3111,7 @@ static void
gen2_render_flush(struct sna *sna)
{
assert(sna->render.vertex_index == 0);
+ assert(sna->render.vertex_offset == 0);
}
static void
@@ -3168,13 +3120,13 @@ gen2_render_context_switch(struct kgem *kgem,
{
struct sna *sna = container_of(kgem, struct sna, kgem);
- if (!kgem->mode)
+ if (!kgem->nbatch)
return;
/* Reload BLT registers following a lost context */
sna->blt_state.fill_bo = 0;
- if (kgem_is_idle(kgem)) {
+ if (kgem_ring_is_idle(kgem, kgem->ring)) {
DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
_kgem_submit(kgem);
}
@@ -3191,10 +3143,12 @@ bool gen2_render_init(struct sna *sna)
*/
#if !NO_COMPOSITE
render->composite = gen2_render_composite;
+ render->prefer_gpu |= PREFER_GPU_RENDER;
#endif
#if !NO_COMPOSITE_SPANS
render->check_composite_spans = gen2_check_composite_spans;
render->composite_spans = gen2_render_composite_spans;
+ render->prefer_gpu |= PREFER_GPU_SPANS;
#endif
render->fill_boxes = gen2_render_fill_boxes;
render->fill = gen2_render_fill;
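
A theme running through this file (and gen3_render.c below) is the migration of the open-primitive bookkeeping out of the per-generation state and into the shared render state, which is what lets the common flush/finish/close paths assert on and reset it. Schematically, with the unrelated fields elided:

	/* before: duplicated per generation */
	struct gen2_render_state { unsigned vertex_offset; /* ... */ };
	struct gen3_render_state { unsigned vertex_offset; /* ... */ };

	/* after: one copy in the shared state */
	struct sna_render {
		unsigned vertex_offset;	/* batch index of the open primitive */
		int vertex_index;	/* vertices emitted so far */
		/* ... */
	};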
diff --git a/src/sna/gen3_render.c b/src/sna/gen3_render.c
index 7c303f419..95d44ab56 100644
--- a/src/sna/gen3_render.c
+++ b/src/sna/gen3_render.c
@@ -431,6 +431,26 @@ gen3_emit_composite_primitive_constant(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_boxes_constant(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ v[0] = box->x2;
+ v[1] = box->y2;
+
+ v[2] = box->x1;
+ v[3] = box->y2;
+
+ v[4] = box->x1;
+ v[5] = box->y1;
+
+ box++;
+ v += 6;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_primitive_identity_gradient(struct sna *sna,
const struct sna_composite_op *op,
const struct sna_composite_rectangles *r)
@@ -457,6 +477,32 @@ gen3_emit_composite_primitive_identity_gradient(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_boxes_identity_gradient(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ v[0] = box->x2;
+ v[1] = box->y2;
+ v[2] = box->x2 + op->src.offset[0];
+ v[3] = box->y2 + op->src.offset[1];
+
+ v[4] = box->x1;
+ v[5] = box->y2;
+ v[6] = box->x1 + op->src.offset[0];
+ v[7] = box->y2 + op->src.offset[1];
+
+ v[8] = box->x1;
+ v[9] = box->y1;
+ v[10] = box->x1 + op->src.offset[0];
+ v[11] = box->y1 + op->src.offset[1];
+
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_primitive_affine_gradient(struct sna *sna,
const struct sna_composite_op *op,
const struct sna_composite_rectangles *r)
@@ -494,6 +540,40 @@ gen3_emit_composite_primitive_affine_gradient(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_boxes_affine_gradient(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ const PictTransform *transform = op->src.transform;
+
+ do {
+ v[0] = box->x2;
+ v[1] = box->y2;
+ sna_get_transformed_coordinates(box->x2 + op->src.offset[0],
+ box->y2 + op->src.offset[1],
+ transform,
+ &v[2], &v[3]);
+
+ v[4] = box->x1;
+ v[5] = box->y2;
+ sna_get_transformed_coordinates(box->x1 + op->src.offset[0],
+ box->y2 + op->src.offset[1],
+ transform,
+ &v[6], &v[7]);
+
+ v[8] = box->x1;
+ v[9] = box->y1;
+ sna_get_transformed_coordinates(box->x1 + op->src.offset[0],
+ box->y1 + op->src.offset[1],
+ transform,
+ &v[10], &v[11]);
+
+ box++;
+ v += 12;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_primitive_identity_source(struct sna *sna,
const struct sna_composite_op *op,
const struct sna_composite_rectangles *r)
@@ -519,6 +599,28 @@ gen3_emit_composite_primitive_identity_source(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_boxes_identity_source(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ v[0] = box->x2 + op->dst.x;
+ v[8] = v[4] = box->x1 + op->dst.x;
+ v[5] = v[1] = box->y2 + op->dst.y;
+ v[9] = box->y1 + op->dst.y;
+
+ v[10] = v[6] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+ v[2] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+ v[11] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+ v[7] = v[3] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_primitive_identity_source_no_offset(struct sna *sna,
const struct sna_composite_op *op,
const struct sna_composite_rectangles *r)
@@ -544,6 +646,28 @@ gen3_emit_composite_primitive_identity_source_no_offset(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_boxes_identity_source_no_offset(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ v[0] = box->x2;
+ v[8] = v[4] = box->x1;
+ v[5] = v[1] = box->y2;
+ v[9] = box->y1;
+
+ v[10] = v[6] = box->x1 * op->src.scale[0];
+ v[2] = box->x2 * op->src.scale[0];
+
+ v[11] = box->y1 * op->src.scale[1];
+ v[7] = v[3] = box->y2 * op->src.scale[1];
+
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_primitive_affine_source(struct sna *sna,
const struct sna_composite_op *op,
const struct sna_composite_rectangles *r)
@@ -553,29 +677,60 @@ gen3_emit_composite_primitive_affine_source(struct sna *sna,
int16_t dst_y = r->dst.y + op->dst.y;
int src_x = r->src.x + (int)op->src.offset[0];
int src_y = r->src.y + (int)op->src.offset[1];
- float sx, sy;
+ float *v;
- _sna_get_transformed_coordinates(src_x + r->width, src_y + r->height,
- transform,
- &sx, &sy);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
- gen3_emit_composite_dstcoord(sna, dst_x + r->width, dst_y + r->height);
- OUT_VERTEX(sx * op->src.scale[0]);
- OUT_VERTEX(sy * op->src.scale[1]);
+ v[0] = dst_x + r->width;
+ v[5] = v[1] = dst_y + r->height;
+ v[8] = v[4] = dst_x;
+ v[9] = dst_y;
- _sna_get_transformed_coordinates(src_x, src_y + r->height,
- transform,
- &sx, &sy);
- gen3_emit_composite_dstcoord(sna, dst_x, dst_y + r->height);
- OUT_VERTEX(sx * op->src.scale[0]);
- OUT_VERTEX(sy * op->src.scale[1]);
+ _sna_get_transformed_scaled(src_x + r->width, src_y + r->height,
+ transform, op->src.scale,
+ &v[2], &v[3]);
- _sna_get_transformed_coordinates(src_x, src_y,
- transform,
- &sx, &sy);
- gen3_emit_composite_dstcoord(sna, dst_x, dst_y);
- OUT_VERTEX(sx * op->src.scale[0]);
- OUT_VERTEX(sy * op->src.scale[1]);
+ _sna_get_transformed_scaled(src_x, src_y + r->height,
+ transform, op->src.scale,
+ &v[6], &v[7]);
+
+ _sna_get_transformed_scaled(src_x, src_y,
+ transform, op->src.scale,
+ &v[10], &v[11]);
+}
+
+fastcall static void
+gen3_emit_composite_boxes_affine_source(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ const PictTransform *transform = op->src.transform;
+
+ do {
+ v[0] = box->x2;
+ v[5] = v[1] = box->y2;
+ v[8] = v[4] = box->x1;
+ v[9] = box->y1;
+
+ _sna_get_transformed_scaled(box->x2 + op->src.offset[0],
+ box->y2 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[2], &v[3]);
+
+ _sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ box->y2 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[6], &v[7]);
+
+ _sna_get_transformed_scaled(box->x1 + op->src.offset[0],
+ box->y1 + op->src.offset[1],
+ transform, op->src.scale,
+ &v[10], &v[11]);
+
+ v += 12;
+ box++;
+ } while (--nbox);
}
fastcall static void
@@ -929,13 +1084,6 @@ gen3_composite_emit_shader(struct sna *sna,
if (mask->u.gen3.type == SHADER_NONE)
mask = NULL;
- if (mask && src->is_opaque &&
- gen3_blend_op[blend].src_alpha &&
- op->has_component_alpha) {
- src = mask;
- mask = NULL;
- }
-
id = (src->u.gen3.type |
src->is_affine << 4 |
src->alpha_fixup << 5 |
@@ -1298,9 +1446,9 @@ static void gen3_emit_invariant(struct sna *sna)
#define MAX_OBJECTS 3 /* worst case: dst + src + mask */
static void
-gen3_get_batch(struct sna *sna)
+gen3_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
if (!kgem_check_batch(&sna->kgem, 200)) {
DBG(("%s: flushing batch: size %d > %d\n",
@@ -1389,7 +1537,7 @@ static void gen3_emit_composite_state(struct sna *sna,
unsigned int tex_count, n;
uint32_t ss2;
- gen3_get_batch(sna);
+ gen3_get_batch(sna, op);
if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
if (op->src.bo == op->dst.bo || op->mask.bo == op->dst.bo)
@@ -1578,11 +1726,11 @@ static void gen3_emit_composite_state(struct sna *sna,
gen3_composite_emit_shader(sna, op, op->op);
}
-static void gen3_magic_ca_pass(struct sna *sna,
+static bool gen3_magic_ca_pass(struct sna *sna,
const struct sna_composite_op *op)
{
if (!op->need_magic_ca_pass)
- return;
+ return false;
DBG(("%s(%d)\n", __FUNCTION__,
sna->render.vertex_index - sna->render.vertex_start));
@@ -1596,23 +1744,24 @@ static void gen3_magic_ca_pass(struct sna *sna,
OUT_BATCH(sna->render.vertex_start);
sna->render_state.gen3.last_blend = 0;
+ return true;
}
static void gen3_vertex_flush(struct sna *sna)
{
- assert(sna->render_state.gen3.vertex_offset);
+ assert(sna->render.vertex_offset);
DBG(("%s[%x] = %d\n", __FUNCTION__,
- 4*sna->render_state.gen3.vertex_offset,
+ 4*sna->render.vertex_offset,
sna->render.vertex_index - sna->render.vertex_start));
- sna->kgem.batch[sna->render_state.gen3.vertex_offset] =
+ sna->kgem.batch[sna->render.vertex_offset] =
PRIM3D_RECTLIST | PRIM3D_INDIRECT_SEQUENTIAL |
(sna->render.vertex_index - sna->render.vertex_start);
- sna->kgem.batch[sna->render_state.gen3.vertex_offset + 1] =
+ sna->kgem.batch[sna->render.vertex_offset + 1] =
sna->render.vertex_start;
- sna->render_state.gen3.vertex_offset = 0;
+ sna->render.vertex_offset = 0;
}
static int gen3_vertex_finish(struct sna *sna)
@@ -1622,24 +1771,27 @@ static int gen3_vertex_finish(struct sna *sna)
DBG(("%s: used=%d/%d, vbo active? %d\n",
__FUNCTION__, sna->render.vertex_used, sna->render.vertex_size,
sna->render.vbo ? sna->render.vbo->handle : 0));
+ assert(sna->render.vertex_offset == 0);
assert(sna->render.vertex_used);
assert(sna->render.vertex_used <= sna->render.vertex_size);
+ sna_vertex_wait__locked(&sna->render);
+
bo = sna->render.vbo;
if (bo) {
- if (sna->render_state.gen3.vertex_offset)
- gen3_vertex_flush(sna);
-
DBG(("%s: reloc = %d\n", __FUNCTION__,
sna->render.vertex_reloc[0]));
- sna->kgem.batch[sna->render.vertex_reloc[0]] =
- kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
- bo, I915_GEM_DOMAIN_VERTEX << 16, 0);
+ if (sna->render.vertex_reloc[0]) {
+ sna->kgem.batch[sna->render.vertex_reloc[0]] =
+ kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
+ bo, I915_GEM_DOMAIN_VERTEX << 16, 0);
- sna->render.vertex_reloc[0] = 0;
+ sna->render.vertex_reloc[0] = 0;
+ }
sna->render.vertex_used = 0;
sna->render.vertex_index = 0;
+ sna->render.vbo = NULL;
kgem_bo_destroy(&sna->kgem, bo);
}
@@ -1671,15 +1823,14 @@ static void gen3_vertex_close(struct sna *sna)
struct kgem_bo *bo, *free_bo = NULL;
unsigned int delta = 0;
- assert(sna->render_state.gen3.vertex_offset == 0);
+ assert(sna->render.vertex_offset == 0);
+ if (sna->render.vertex_reloc[0] == 0)
+ return;
DBG(("%s: used=%d/%d, vbo active? %d\n",
__FUNCTION__, sna->render.vertex_used, sna->render.vertex_size,
sna->render.vbo ? sna->render.vbo->handle : 0));
- if (sna->render.vertex_used == 0)
- return;
-
bo = sna->render.vbo;
if (bo) {
if (sna->render.vertex_size - sna->render.vertex_used < 64) {
@@ -1713,7 +1864,8 @@ static void gen3_vertex_close(struct sna *sna)
DBG(("%s: new vbo: %d\n", __FUNCTION__,
sna->render.vertex_used));
bo = kgem_create_linear(&sna->kgem,
- 4*sna->render.vertex_used, 0);
+ 4*sna->render.vertex_used,
+ CREATE_NO_THROTTLE);
if (bo) {
assert(bo->snoop == false);
kgem_bo_write(&sna->kgem, bo,
@@ -1724,15 +1876,11 @@ static void gen3_vertex_close(struct sna *sna)
}
}
- DBG(("%s: reloc = %d\n", __FUNCTION__,
- sna->render.vertex_reloc[0]));
-
- if (sna->render.vertex_reloc[0]) {
- sna->kgem.batch[sna->render.vertex_reloc[0]] =
- kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
- bo, I915_GEM_DOMAIN_VERTEX << 16, delta);
- sna->render.vertex_reloc[0] = 0;
- }
+ DBG(("%s: reloc = %d\n", __FUNCTION__, sna->render.vertex_reloc[0]));
+ sna->kgem.batch[sna->render.vertex_reloc[0]] =
+ kgem_add_reloc(&sna->kgem, sna->render.vertex_reloc[0],
+ bo, I915_GEM_DOMAIN_VERTEX << 16, delta);
+ sna->render.vertex_reloc[0] = 0;
if (sna->render.vbo == NULL) {
DBG(("%s: resetting vbo\n", __FUNCTION__));
@@ -1752,6 +1900,9 @@ static bool gen3_rectangle_begin(struct sna *sna,
struct gen3_render_state *state = &sna->render_state.gen3;
int ndwords, i1_cmd = 0, i1_len = 0;
+ if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
+ return true;
+
ndwords = 2;
if (op->need_magic_ca_pass)
ndwords += 100;
@@ -1774,14 +1925,15 @@ static bool gen3_rectangle_begin(struct sna *sna,
}
}
- if (sna->kgem.nbatch == 2 + state->last_vertex_offset) {
- state->vertex_offset = state->last_vertex_offset;
+ if (sna->kgem.nbatch == 2 + state->last_vertex_offset &&
+ !op->need_magic_ca_pass) {
+ sna->render.vertex_offset = state->last_vertex_offset;
} else {
- state->vertex_offset = sna->kgem.nbatch;
+ sna->render.vertex_offset = sna->kgem.nbatch;
OUT_BATCH(MI_NOOP); /* to be filled later */
OUT_BATCH(MI_NOOP);
sna->render.vertex_start = sna->render.vertex_index;
- state->last_vertex_offset = state->vertex_offset;
+ state->last_vertex_offset = sna->render.vertex_offset;
}
return true;
@@ -1790,13 +1942,28 @@ static bool gen3_rectangle_begin(struct sna *sna,
static int gen3_get_rectangles__flush(struct sna *sna,
const struct sna_composite_op *op)
{
+ /* Prevent the new vbo from being discarded after lock contention */
+ if (sna_vertex_wait__locked(&sna->render)) {
+ int rem = vertex_space(sna);
+ if (rem > op->floats_per_rect)
+ return rem;
+ }
+
if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 105: 5))
return 0;
if (!kgem_check_reloc_and_exec(&sna->kgem, 1))
return 0;
- if (op->need_magic_ca_pass && sna->render.vbo)
- return 0;
+ if (sna->render.vertex_offset) {
+ gen3_vertex_flush(sna);
+ if (gen3_magic_ca_pass(sna, op)) {
+ OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(6) | 0);
+ OUT_BATCH(gen3_get_blend_cntl(op->op,
+ op->has_component_alpha,
+ op->dst.format));
+ gen3_composite_emit_shader(sna, op, op->op);
+ }
+ }
return gen3_vertex_finish(sna);
}
@@ -1822,7 +1989,7 @@ start:
goto flush;
}
- if (unlikely(sna->render_state.gen3.vertex_offset == 0 &&
+ if (unlikely(sna->render.vertex_offset == 0 &&
!gen3_rectangle_begin(sna, op)))
goto flush;
@@ -1836,12 +2003,15 @@ start:
flush:
DBG(("%s: flushing batch\n", __FUNCTION__));
- if (sna->render_state.gen3.vertex_offset) {
+ if (sna->render.vertex_offset) {
gen3_vertex_flush(sna);
gen3_magic_ca_pass(sna, op);
}
+ sna_vertex_wait__locked(&sna->render);
_kgem_submit(&sna->kgem);
gen3_emit_composite_state(sna, op);
+ assert(sna->render.vertex_offset == 0);
+ assert(sna->render.vertex_reloc[0] == 0);
goto start;
}
@@ -1886,9 +2056,9 @@ gen3_render_composite_box(struct sna *sna,
}
static void
-gen3_render_composite_boxes(struct sna *sna,
- const struct sna_composite_op *op,
- const BoxRec *box, int nbox)
+gen3_render_composite_boxes__blt(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
{
DBG(("%s: nbox=%d, src=+(%d, %d), mask=+(%d, %d), dst=+(%d, %d)\n",
__FUNCTION__, nbox,
@@ -1922,12 +2092,66 @@ gen3_render_composite_boxes(struct sna *sna,
}
static void
+gen3_render_composite_boxes(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen3_get_rectangles(sna, op, nbox);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+ } while (nbox);
+}
+
+static void
+gen3_render_composite_boxes__thread(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen3_get_rectangles(sna, op, nbox);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
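+
+/* The __thread emitters reserve space in the shared vertex buffer under
+ * sna_vertex_lock(), mark the reservation in flight, then fill it outside
+ * the lock so several threads can pack boxes into one VBO concurrently;
+ * flushing waits until no reservation is outstanding. A sketch of that
+ * reserve-then-fill pattern using pthreads directly (the sna_vertex_*
+ * helpers are assumed to wrap an equivalent mutex, condition variable
+ * and active count):
+ *
+ *	struct vertex_pool {
+ *		pthread_mutex_t lock;
+ *		pthread_cond_t wait;
+ *		float *vertices;
+ *		int used, active;
+ *	};
+ *
+ *	static float *reserve(struct vertex_pool *p, int nfloats)
+ *	{
+ *		float *v;
+ *		pthread_mutex_lock(&p->lock);
+ *		v = p->vertices + p->used;	// claim the space...
+ *		p->used += nfloats;
+ *		p->active++;			// ...and mark it in flight
+ *		pthread_mutex_unlock(&p->lock);
+ *		return v;			// filled outside the lock
+ *	}
+ *
+ *	static void release(struct vertex_pool *p)
+ *	{
+ *		pthread_mutex_lock(&p->lock);
+ *		if (--p->active == 0)
+ *			pthread_cond_broadcast(&p->wait);
+ *		pthread_mutex_unlock(&p->lock);
+ *	}
+ *
+ *	static void wait_idle(struct vertex_pool *p)	// flush side
+ *	{
+ *		pthread_mutex_lock(&p->lock);
+ *		while (p->active)
+ *			pthread_cond_wait(&p->wait, &p->lock);
+ *		pthread_mutex_unlock(&p->lock);
+ *	}
+ */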
+
+static void
gen3_render_composite_done(struct sna *sna,
const struct sna_composite_op *op)
{
DBG(("%s()\n", __FUNCTION__));
- if (sna->render_state.gen3.vertex_offset) {
+ if (sna->render.vertex_offset) {
gen3_vertex_flush(sna);
gen3_magic_ca_pass(sna, op);
}
@@ -1971,7 +2195,6 @@ gen3_render_reset(struct sna *sna)
state->floats_per_vertex = 0;
state->last_floats_per_vertex = 0;
state->last_vertex_offset = 0;
- state->vertex_offset = 0;
if (sna->render.vbo != NULL &&
!kgem_bo_is_mappable(&sna->kgem, sna->render.vbo)) {
@@ -1979,6 +2202,9 @@ gen3_render_reset(struct sna *sna)
__FUNCTION__, sna->render.vbo->presumed_offset));
discard_vbo(sna);
}
+
+ sna->render.vertex_reloc[0] = 0;
+ sna->render.vertex_offset = 0;
}
static void
@@ -2401,7 +2627,8 @@ gen3_composite_picture(struct sna *sna,
if (!gen3_composite_channel_set_format(channel, picture->format) &&
!gen3_composite_channel_set_xformat(picture, channel, x, y, w, h))
return sna_render_picture_convert(sna, picture, channel, pixmap,
- x, y, w, h, dst_x, dst_y);
+ x, y, w, h, dst_x, dst_y,
+ false);
if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
DBG(("%s: pixmap too large (%dx%d), extracting (%d, %d)x(%d,%d)\n",
@@ -2431,7 +2658,7 @@ source_use_blt(struct sna *sna, PicturePtr picture)
if (too_large(picture->pDrawable->width, picture->pDrawable->height))
return true;
- return !is_gpu(picture->pDrawable);
+ return !is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER);
}
static bool
@@ -2589,12 +2816,6 @@ has_alphamap(PicturePtr p)
}
static bool
-untransformed(PicturePtr p)
-{
- return !p->transform || pixman_transform_is_int_translate(p->transform);
-}
-
-static bool
need_upload(PicturePtr p)
{
return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
@@ -2641,7 +2862,6 @@ gen3_composite_fallback(struct sna *sna,
PicturePtr mask,
PicturePtr dst)
{
- struct sna_pixmap *priv;
PixmapPtr src_pixmap;
PixmapPtr mask_pixmap;
PixmapPtr dst_pixmap;
@@ -2681,17 +2901,16 @@ gen3_composite_fallback(struct sna *sna,
if (mask &&
mask->componentAlpha && PICT_FORMAT_RGB(mask->format) &&
- op != PictOpOver &&
- gen3_blend_op[op].src_blend != BLENDFACT_ZERO)
- {
+ gen3_blend_op[op].src_alpha &&
+ gen3_blend_op[op].src_blend != BLENDFACT_ZERO &&
+ op != PictOpOver) {
DBG(("%s: component-alpha mask with op=%d, should fallback\n",
__FUNCTION__, op));
return true;
}
/* If anything is on the GPU, push everything out to the GPU */
- priv = sna_pixmap(dst_pixmap);
- if (priv && priv->gpu_damage && !priv->clear) {
+ if (dst_use_gpu(dst_pixmap)) {
DBG(("%s: dst is already on the GPU, try to use GPU\n",
__FUNCTION__));
return false;
@@ -2726,14 +2945,14 @@ gen3_composite_fallback(struct sna *sna,
if (too_large(dst_pixmap->drawable.width,
dst_pixmap->drawable.height) &&
- (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage))) {
+ dst_is_cpu(dst_pixmap)) {
DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
return true;
}
- DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
- __FUNCTION__));
- return false;
+ DBG(("%s: dst is not on the GPU and the operation should not fallback: use-cpu? %d\n",
+ __FUNCTION__, dst_use_cpu(dst_pixmap)));
+ return dst_use_cpu(dst_pixmap);
}
static int
@@ -2922,13 +3141,12 @@ gen3_render_composite(struct sna *sna,
tmp->mask.u.gen3.type = SHADER_NONE;
tmp->has_component_alpha = false;
} else if (gen3_blend_op[op].src_alpha &&
- (gen3_blend_op[op].src_blend != BLENDFACT_ZERO)) {
+ gen3_blend_op[op].src_blend != BLENDFACT_ZERO) {
if (op != PictOpOver)
goto cleanup_mask;
tmp->need_magic_ca_pass = true;
tmp->op = PictOpOutReverse;
- sna->render.vertex_start = sna->render.vertex_index;
}
} else {
if (tmp->mask.is_opaque) {
@@ -2978,22 +3196,33 @@ gen3_render_composite(struct sna *sna,
case SHADER_WHITE:
case SHADER_CONSTANT:
tmp->prim_emit = gen3_emit_composite_primitive_constant;
+ tmp->emit_boxes = gen3_emit_composite_boxes_constant;
break;
case SHADER_LINEAR:
case SHADER_RADIAL:
- if (tmp->src.transform == NULL)
+ if (tmp->src.transform == NULL) {
tmp->prim_emit = gen3_emit_composite_primitive_identity_gradient;
- else if (tmp->src.is_affine)
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_gradient;
+ } else if (tmp->src.is_affine) {
tmp->prim_emit = gen3_emit_composite_primitive_affine_gradient;
+ tmp->emit_boxes = gen3_emit_composite_boxes_affine_gradient;
+ }
break;
case SHADER_TEXTURE:
if (tmp->src.transform == NULL) {
- if ((tmp->src.offset[0]|tmp->src.offset[1]|tmp->dst.x|tmp->dst.y) == 0)
+ if ((tmp->src.offset[0]|tmp->src.offset[1]|tmp->dst.x|tmp->dst.y) == 0) {
tmp->prim_emit = gen3_emit_composite_primitive_identity_source_no_offset;
- else
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_source_no_offset;
+ } else {
tmp->prim_emit = gen3_emit_composite_primitive_identity_source;
- } else if (tmp->src.is_affine)
+ tmp->emit_boxes = gen3_emit_composite_boxes_identity_source;
+ }
+ } else if (tmp->src.is_affine) {
+ tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
+ tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
tmp->prim_emit = gen3_emit_composite_primitive_affine_source;
+ tmp->emit_boxes = gen3_emit_composite_boxes_affine_source;
+ }
break;
}
} else if (tmp->mask.u.gen3.type == SHADER_TEXTURE) {
@@ -3024,7 +3253,11 @@ gen3_render_composite(struct sna *sna,
tmp->blt = gen3_render_composite_blt;
tmp->box = gen3_render_composite_box;
- tmp->boxes = gen3_render_composite_boxes;
+ tmp->boxes = gen3_render_composite_boxes__blt;
+ if (tmp->emit_boxes) {
+ tmp->boxes = gen3_render_composite_boxes;
+ tmp->thread_boxes = gen3_render_composite_boxes__thread;
+ }
tmp->done = gen3_render_composite_done;
if (!kgem_check_bo(&sna->kgem,
@@ -3084,6 +3317,26 @@ gen3_emit_composite_spans_primitive_zero(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_spans_primitive_zero__boxes(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox, float *v)
+{
+ do {
+ v[0] = op->base.dst.x + b->box.x2;
+ v[1] = op->base.dst.y + b->box.y2;
+
+ v[2] = op->base.dst.x + b->box.x1;
+ v[3] = v[1];
+
+ v[4] = v[2];
+ v[5] = op->base.dst.y + b->box.y1;
+
+ v += 6;
+ b++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_spans_primitive_zero_no_offset(struct sna *sna,
const struct sna_composite_spans_op *op,
const BoxRec *box,
@@ -3099,6 +3352,22 @@ gen3_emit_composite_spans_primitive_zero_no_offset(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_spans_primitive_zero_no_offset__boxes(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox, float *v)
+{
+ do {
+ v[0] = b->box.x2;
+ v[3] = v[1] = b->box.y2;
+ v[4] = v[2] = b->box.x1;
+ v[5] = b->box.y1;
+
+ b++;
+ v += 6;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_spans_primitive_constant(struct sna *sna,
const struct sna_composite_spans_op *op,
const BoxRec *box,
@@ -3115,6 +3384,24 @@ gen3_emit_composite_spans_primitive_constant(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_spans_primitive_constant__boxes(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox,
+ float *v)
+{
+ do {
+ v[0] = op->base.dst.x + b->box.x2;
+ v[6] = v[3] = op->base.dst.x + b->box.x1;
+ v[4] = v[1] = op->base.dst.y + b->box.y2;
+ v[7] = op->base.dst.y + b->box.y1;
+ v[8] = v[5] = v[2] = b->alpha;
+
+ v += 9;
+ b++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_spans_primitive_constant_no_offset(struct sna *sna,
const struct sna_composite_spans_op *op,
const BoxRec *box,
@@ -3131,6 +3418,23 @@ gen3_emit_composite_spans_primitive_constant_no_offset(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_spans_primitive_constant_no_offset__boxes(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox, float *v)
+{
+ do {
+ v[0] = b->box.x2;
+ v[6] = v[3] = b->box.x1;
+ v[4] = v[1] = b->box.y2;
+ v[7] = b->box.y1;
+ v[8] = v[5] = v[2] = b->alpha;
+
+ v += 9;
+ b++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_spans_primitive_identity_source(struct sna *sna,
const struct sna_composite_spans_op *op,
const BoxRec *box,
@@ -3159,13 +3463,43 @@ gen3_emit_composite_spans_primitive_identity_source(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_spans_primitive_identity_source__boxes(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox,
+ float *v)
+{
+ do {
+ v[0] = op->base.dst.x + b->box.x2;
+ v[1] = op->base.dst.y + b->box.y2;
+ v[2] = (op->base.src.offset[0] + b->box.x2) * op->base.src.scale[0];
+ v[3] = (op->base.src.offset[1] + b->box.y2) * op->base.src.scale[1];
+ v[4] = b->alpha;
+
+ v[5] = op->base.dst.x + b->box.x1;
+ v[6] = v[1];
+ v[7] = (op->base.src.offset[0] + b->box.x1) * op->base.src.scale[0];
+ v[8] = v[3];
+ v[9] = b->alpha;
+
+ v[10] = v[5];
+ v[11] = op->base.dst.y + b->box.y1;
+ v[12] = v[7];
+ v[13] = (op->base.src.offset[1] + b->box.y1) * op->base.src.scale[1];
+ v[14] = b->alpha;
+
+ v += 15;
+ b++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_spans_primitive_affine_source(struct sna *sna,
const struct sna_composite_spans_op *op,
const BoxRec *box,
float opacity)
{
PictTransform *transform = op->base.src.transform;
- float x, y, *v;
+ float *v;
v = sna->render.vertices + sna->render.vertex_used;
sna->render.vertex_used += 15;
@@ -3174,30 +3508,56 @@ gen3_emit_composite_spans_primitive_affine_source(struct sna *sna,
v[6] = v[1] = op->base.dst.y + box->y2;
v[10] = v[5] = op->base.dst.x + box->x1;
v[11] = op->base.dst.y + box->y1;
- v[4] = opacity;
- v[9] = opacity;
- v[14] = opacity;
-
- _sna_get_transformed_coordinates((int)op->base.src.offset[0] + box->x2,
- (int)op->base.src.offset[1] + box->y2,
- transform,
- &x, &y);
- v[2] = x * op->base.src.scale[0];
- v[3] = y * op->base.src.scale[1];
+ v[14] = v[9] = v[4] = opacity;
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x2,
+ (int)op->base.src.offset[1] + box->y2,
+ transform, op->base.src.scale,
+ &v[2], &v[3]);
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
+ (int)op->base.src.offset[1] + box->y2,
+ transform, op->base.src.scale,
+ &v[7], &v[8]);
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + box->x1,
+ (int)op->base.src.offset[1] + box->y1,
+ transform, op->base.src.scale,
+ &v[12], &v[13]);
+}
- _sna_get_transformed_coordinates((int)op->base.src.offset[0] + box->x1,
- (int)op->base.src.offset[1] + box->y2,
- transform,
- &x, &y);
- v[7] = x * op->base.src.scale[0];
- v[8] = y * op->base.src.scale[1];
+fastcall static void
+gen3_emit_composite_spans_primitive_affine_source__boxes(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox,
+ float *v)
+{
+ PictTransform *transform = op->base.src.transform;
- _sna_get_transformed_coordinates((int)op->base.src.offset[0] + box->x1,
- (int)op->base.src.offset[1] + box->y1,
- transform,
- &x, &y);
- v[12] = x * op->base.src.scale[0];
- v[13] = y * op->base.src.scale[1];
+ do {
+ v[0] = op->base.dst.x + b->box.x2;
+ v[6] = v[1] = op->base.dst.y + b->box.y2;
+ v[10] = v[5] = op->base.dst.x + b->box.x1;
+ v[11] = op->base.dst.y + b->box.y1;
+ v[14] = v[9] = v[4] = b->alpha;
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x2,
+ (int)op->base.src.offset[1] + b->box.y2,
+ transform, op->base.src.scale,
+ &v[2], &v[3]);
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1,
+ (int)op->base.src.offset[1] + b->box.y2,
+ transform, op->base.src.scale,
+ &v[7], &v[8]);
+
+ _sna_get_transformed_scaled((int)op->base.src.offset[0] + b->box.x1,
+ (int)op->base.src.offset[1] + b->box.y1,
+ transform, op->base.src.scale,
+ &v[12], &v[13]);
+ v += 15;
+ b++;
+ } while (--nbox);
}
fastcall static void
@@ -3229,6 +3589,36 @@ gen3_emit_composite_spans_primitive_identity_gradient(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_spans_primitive_identity_gradient__boxes(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox,
+ float *v)
+{
+ do {
+ v[0] = op->base.dst.x + b->box.x2;
+ v[1] = op->base.dst.y + b->box.y2;
+ v[2] = op->base.src.offset[0] + b->box.x2;
+ v[3] = op->base.src.offset[1] + b->box.y2;
+ v[4] = b->alpha;
+
+ v[5] = op->base.dst.x + b->box.x1;
+ v[6] = v[1];
+ v[7] = op->base.src.offset[0] + b->box.x1;
+ v[8] = v[3];
+ v[9] = b->alpha;
+
+ v[10] = v[5];
+ v[11] = op->base.dst.y + b->box.y1;
+ v[12] = v[7];
+ v[13] = op->base.src.offset[1] + b->box.y1;
+ v[14] = b->alpha;
+
+ v += 15;
+ b++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_spans_primitive_affine_gradient(struct sna *sna,
const struct sna_composite_spans_op *op,
const BoxRec *box,
@@ -3264,6 +3654,43 @@ gen3_emit_composite_spans_primitive_affine_gradient(struct sna *sna,
}
fastcall static void
+gen3_emit_composite_spans_primitive_affine_gradient__boxes(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox,
+ float *v)
+{
+ PictTransform *transform = op->base.src.transform;
+
+ do {
+ v[0] = op->base.dst.x + b->box.x2;
+ v[1] = op->base.dst.y + b->box.y2;
+ _sna_get_transformed_coordinates((int)op->base.src.offset[0] + b->box.x2,
+ (int)op->base.src.offset[1] + b->box.y2,
+ transform,
+ &v[2], &v[3]);
+ v[4] = b->alpha;
+
+ v[5] = op->base.dst.x + b->box.x1;
+ v[6] = v[1];
+ _sna_get_transformed_coordinates((int)op->base.src.offset[0] + b->box.x1,
+ (int)op->base.src.offset[1] + b->box.y2,
+ transform,
+ &v[7], &v[8]);
+ v[9] = b->alpha;
+
+ v[10] = v[5];
+ v[11] = op->base.dst.y + b->box.y1;
+ _sna_get_transformed_coordinates((int)op->base.src.offset[0] + b->box.x1,
+ (int)op->base.src.offset[1] + b->box.y1,
+ transform,
+ &v[12], &v[13]);
+ v[14] = b->alpha;
+ v += 15;
+ b++;
+ } while (--nbox);
+}
+
+fastcall static void
gen3_emit_composite_spans_primitive(struct sna *sna,
const struct sna_composite_spans_op *op,
const BoxRec *box,
@@ -3308,6 +3735,48 @@ gen3_render_composite_spans_constant_box(struct sna *sna,
}
fastcall static void
+gen3_render_composite_spans_constant_thread_boxes(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *box,
+ int nbox)
+{
+ DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+ __FUNCTION__, nbox,
+ op->base.src.offset[0], op->base.src.offset[1],
+ op->base.dst.x, op->base.dst.y));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * 9;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ do {
+ v[0] = box->box.x2;
+ v[6] = v[3] = box->box.x1;
+ v[4] = v[1] = box->box.y2;
+ v[7] = box->box.y1;
+ v[8] = v[5] = v[2] = box->alpha;
+ v += 9;
+ box++;
+ } while (--nbox_this_time);
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
gen3_render_composite_spans_box(struct sna *sna,
const struct sna_composite_spans_op *op,
const BoxRec *box, float opacity)
@@ -3355,10 +3824,45 @@ gen3_render_composite_spans_boxes(struct sna *sna,
}
fastcall static void
+gen3_render_composite_spans_boxes__thread(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *box,
+ int nbox)
+{
+ DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+ __FUNCTION__, nbox,
+ op->base.src.offset[0], op->base.src.offset[1],
+ op->base.dst.x, op->base.dst.y));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen3_get_rectangles(sna, &op->base, nbox);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
gen3_render_composite_spans_done(struct sna *sna,
const struct sna_composite_spans_op *op)
{
- if (sna->render_state.gen3.vertex_offset)
+ if (sna->render.vertex_offset)
gen3_vertex_flush(sna);
DBG(("%s()\n", __FUNCTION__));
@@ -3380,12 +3884,11 @@ gen3_check_composite_spans(struct sna *sna,
if (gen3_composite_fallback(sna, op, src, NULL, dst))
return false;
- if (need_tiling(sna, width, height)) {
- if (!is_gpu(dst->pDrawable)) {
- DBG(("%s: fallback, tiled operation not on GPU\n",
- __FUNCTION__));
- return false;
- }
+ if (need_tiling(sna, width, height) &&
+ !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
+ DBG(("%s: fallback, tiled operation not on GPU\n",
+ __FUNCTION__));
+ return false;
}
return true;
@@ -3458,37 +3961,58 @@ gen3_render_composite_spans(struct sna *sna,
no_offset = tmp->base.dst.x == 0 && tmp->base.dst.y == 0;
tmp->box = gen3_render_composite_spans_box;
tmp->boxes = gen3_render_composite_spans_boxes;
+ tmp->thread_boxes = gen3_render_composite_spans_boxes__thread;
tmp->done = gen3_render_composite_spans_done;
tmp->prim_emit = gen3_emit_composite_spans_primitive;
switch (tmp->base.src.u.gen3.type) {
case SHADER_NONE:
assert(0);
case SHADER_ZERO:
- tmp->prim_emit = no_offset ? gen3_emit_composite_spans_primitive_zero_no_offset : gen3_emit_composite_spans_primitive_zero;
+ if (no_offset) {
+ tmp->prim_emit = gen3_emit_composite_spans_primitive_zero_no_offset;
+ tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero_no_offset__boxes;
+ } else {
+ tmp->prim_emit = gen3_emit_composite_spans_primitive_zero;
+ tmp->emit_boxes = gen3_emit_composite_spans_primitive_zero__boxes;
+ }
break;
case SHADER_BLACK:
case SHADER_WHITE:
case SHADER_CONSTANT:
if (no_offset) {
tmp->box = gen3_render_composite_spans_constant_box;
+ tmp->thread_boxes = gen3_render_composite_spans_constant_thread_boxes;
tmp->prim_emit = gen3_emit_composite_spans_primitive_constant_no_offset;
- } else
+ tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant_no_offset__boxes;
+ } else {
tmp->prim_emit = gen3_emit_composite_spans_primitive_constant;
+ tmp->emit_boxes = gen3_emit_composite_spans_primitive_constant__boxes;
+ }
break;
case SHADER_LINEAR:
case SHADER_RADIAL:
- if (tmp->base.src.transform == NULL)
+ if (tmp->base.src.transform == NULL) {
tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_gradient;
- else if (tmp->base.src.is_affine)
+ tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_gradient__boxes;
+ } else if (tmp->base.src.is_affine) {
tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_gradient;
+ tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_gradient__boxes;
+ }
break;
case SHADER_TEXTURE:
- if (tmp->base.src.transform == NULL)
+ if (tmp->base.src.transform == NULL) {
tmp->prim_emit = gen3_emit_composite_spans_primitive_identity_source;
- else if (tmp->base.src.is_affine)
+ tmp->emit_boxes = gen3_emit_composite_spans_primitive_identity_source__boxes;
+ } else if (tmp->base.src.is_affine) {
+ tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
+ tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
tmp->prim_emit = gen3_emit_composite_spans_primitive_affine_source;
+ tmp->emit_boxes = gen3_emit_composite_spans_primitive_affine_source__boxes;
+ }
break;
}
+ if (tmp->emit_boxes == NULL)
+ tmp->thread_boxes = NULL;
tmp->base.mask.bo = NULL;
@@ -3528,7 +4052,8 @@ gen3_emit_video_state(struct sna *sna,
struct sna_video_frame *frame,
PixmapPtr pixmap,
struct kgem_bo *dst_bo,
- int width, int height)
+ int width, int height,
+ bool bilinear)
{
struct gen3_render_state *state = &sna->render_state.gen3;
uint32_t id, ms3, rewind;
@@ -3841,9 +4366,9 @@ gen3_emit_video_state(struct sna *sna,
}
static void
-gen3_video_get_batch(struct sna *sna)
+gen3_video_get_batch(struct sna *sna, struct kgem_bo *bo)
{
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
if (!kgem_check_batch(&sna->kgem, 120) ||
!kgem_check_reloc(&sna->kgem, 4) ||
@@ -3875,18 +4400,18 @@ gen3_render_video(struct sna *sna,
RegionPtr dstRegion,
short src_w, short src_h,
short drw_w, short drw_h,
+ short dx, short dy,
PixmapPtr pixmap)
{
struct sna_pixmap *priv = sna_pixmap(pixmap);
BoxPtr pbox = REGION_RECTS(dstRegion);
int nbox = REGION_NUM_RECTS(dstRegion);
- int dxo = dstRegion->extents.x1;
- int dyo = dstRegion->extents.y1;
- int width = dstRegion->extents.x2 - dxo;
- int height = dstRegion->extents.y2 - dyo;
+ int width = dstRegion->extents.x2 - dstRegion->extents.x1;
+ int height = dstRegion->extents.y2 - dstRegion->extents.y1;
float src_scale_x, src_scale_y;
int pix_xoff, pix_yoff;
struct kgem_bo *dst_bo;
+ bool bilinear;
int copy = 0;
DBG(("%s: %dx%d -> %dx%d\n", __FUNCTION__, src_w, src_h, drw_w, drw_h));
@@ -3908,8 +4433,8 @@ gen3_render_video(struct sna *sna,
if (!dst_bo)
return false;
- pix_xoff = -dxo;
- pix_yoff = -dyo;
+ pix_xoff = -dstRegion->extents.x1;
+ pix_yoff = -dstRegion->extents.y1;
copy = 1;
} else {
width = pixmap->drawable.width;
@@ -3927,22 +4452,24 @@ gen3_render_video(struct sna *sna,
#endif
}
+ bilinear = src_w != drw_w || src_h != drw_h;
+
src_scale_x = ((float)src_w / frame->width) / drw_w;
src_scale_y = ((float)src_h / frame->height) / drw_h;
DBG(("%s: src offset=(%d, %d), scale=(%f, %f), dst offset=(%d, %d)\n",
__FUNCTION__,
- dxo, dyo, src_scale_x, src_scale_y, pix_xoff, pix_yoff));
+ dx, dy, src_scale_x, src_scale_y, pix_xoff, pix_yoff));
- gen3_video_get_batch(sna);
+ gen3_video_get_batch(sna, dst_bo);
gen3_emit_video_state(sna, video, frame, pixmap,
- dst_bo, width, height);
+ dst_bo, width, height, bilinear);
do {
int nbox_this_time = gen3_get_inline_rectangles(sna, nbox, 4);
if (nbox_this_time == 0) {
- gen3_video_get_batch(sna);
+ gen3_video_get_batch(sna, dst_bo);
gen3_emit_video_state(sna, video, frame, pixmap,
- dst_bo, width, height);
+ dst_bo, width, height, bilinear);
nbox_this_time = gen3_get_inline_rectangles(sna, nbox, 4);
}
nbox -= nbox_this_time;
@@ -3962,20 +4489,20 @@ gen3_render_video(struct sna *sna,
/* bottom right */
OUT_BATCH_F(box_x2 + pix_xoff);
OUT_BATCH_F(box_y2 + pix_yoff);
- OUT_BATCH_F((box_x2 - dxo) * src_scale_x);
- OUT_BATCH_F((box_y2 - dyo) * src_scale_y);
+ OUT_BATCH_F((box_x2 - dx) * src_scale_x);
+ OUT_BATCH_F((box_y2 - dy) * src_scale_y);
/* bottom left */
OUT_BATCH_F(box_x1 + pix_xoff);
OUT_BATCH_F(box_y2 + pix_yoff);
- OUT_BATCH_F((box_x1 - dxo) * src_scale_x);
- OUT_BATCH_F((box_y2 - dyo) * src_scale_y);
+ OUT_BATCH_F((box_x1 - dx) * src_scale_x);
+ OUT_BATCH_F((box_y2 - dy) * src_scale_y);
/* top left */
OUT_BATCH_F(box_x1 + pix_xoff);
OUT_BATCH_F(box_y1 + pix_yoff);
- OUT_BATCH_F((box_x1 - dxo) * src_scale_x);
- OUT_BATCH_F((box_y1 - dyo) * src_scale_y);
+ OUT_BATCH_F((box_x1 - dx) * src_scale_x);
+ OUT_BATCH_F((box_y1 - dy) * src_scale_y);
}
} while (nbox);
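
The dxo/dyo pair derived from the region extents is replaced by the dx/dy arguments supplied by the caller, and the texture coordinates map each destination pixel back into normalized texel space. The per-vertex mapping performed by the OUT_BATCH_F() calls above, as a standalone sketch:

	static float
	video_tex_coord(int box_x, int dx, int src_w, int frame_width, int drw_w)
	{
		/* (src_w / frame_width) spans the valid texels; dividing
		 * by drw_w spreads drw_w destination pixels across it. */
		float src_scale = ((float)src_w / frame_width) / drw_w;
		return (box_x - dx) * src_scale;
	}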
@@ -3988,7 +4515,7 @@ gen3_render_video(struct sna *sna,
pix_yoff = 0;
#endif
sna_blt_copy_boxes(sna, GXcopy,
- dst_bo, -dxo, -dyo,
+ dst_bo, -dstRegion->extents.x1, -dstRegion->extents.y1,
priv->gpu_bo, pix_xoff, pix_yoff,
pixmap->drawable.bitsPerPixel,
REGION_RECTS(dstRegion),
@@ -4207,7 +4734,7 @@ gen3_render_copy_blt(struct sna *sna,
static void
gen3_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
{
- if (sna->render_state.gen3.vertex_offset)
+ if (sna->render.vertex_offset)
gen3_vertex_flush(sna);
}
@@ -4500,7 +5027,7 @@ gen3_render_fill_op_boxes(struct sna *sna,
static void
gen3_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
{
- if (sna->render_state.gen3.vertex_offset)
+ if (sna->render.vertex_offset)
gen3_vertex_flush(sna);
}
@@ -4661,6 +5188,9 @@ gen3_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
static void gen3_render_flush(struct sna *sna)
{
gen3_vertex_close(sna);
+
+ assert(sna->render.vertex_reloc[0] == 0);
+ assert(sna->render.vertex_offset == 0);
}
static void
@@ -4674,10 +5204,12 @@ bool gen3_render_init(struct sna *sna)
#if !NO_COMPOSITE
render->composite = gen3_render_composite;
+ render->prefer_gpu |= PREFER_GPU_RENDER;
#endif
#if !NO_COMPOSITE_SPANS
render->check_composite_spans = gen3_check_composite_spans;
render->composite_spans = gen3_render_composite_spans;
+ render->prefer_gpu |= PREFER_GPU_SPANS;
#endif
render->video = gen3_render_video;
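Note: gen3_render_init now advertises which operation classes it wants on the GPU through the prefer_gpu bitmask, set alongside the corresponding hooks above. A small sketch of that pattern; the flag encodings here are assumptions for illustration, not the real PREFER_GPU_* values:

	/* Sketch of the prefer_gpu bitmask set up in gen3_render_init.
	 * Flag values are assumed, standing in for PREFER_GPU_RENDER
	 * and PREFER_GPU_SPANS. */
	#include <stdio.h>

	#define PREFER_GPU_RENDER (1 << 0)	/* assumed encoding */
	#define PREFER_GPU_SPANS  (1 << 1)	/* assumed encoding */

	int main(void)
	{
		unsigned prefer_gpu = 0;

		prefer_gpu |= PREFER_GPU_RENDER; /* composite hook enabled */
		prefer_gpu |= PREFER_GPU_SPANS;	 /* spans hook enabled */

		if (prefer_gpu & PREFER_GPU_SPANS)
			printf("span operations preferred on the GPU\n");
		return 0;
	}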
diff --git a/src/sna/gen4_render.c b/src/sna/gen4_render.c
index ceef528f0..53fe52a92 100644
--- a/src/sna/gen4_render.c
+++ b/src/sna/gen4_render.c
@@ -42,13 +42,15 @@
#include "brw/brw.h"
#include "gen4_render.h"
+#include "gen4_source.h"
+#include "gen4_vertex.h"
/* gen4 has a serious issue with its shaders: we need to flush
 * after every rectangle... So until that is resolved, prefer
 * the BLT engine.
 */
-#define PREFER_BLT 1
-#define FLUSH_EVERY_VERTEX 1
+#define FORCE_SPANS 0
+#define FORCE_NONRECTILINEAR_SPANS -1
#define NO_COMPOSITE 0
#define NO_COMPOSITE_SPANS 0
@@ -59,19 +61,6 @@
#define NO_FILL_BOXES 0
#define NO_VIDEO 0
-#if FLUSH_EVERY_VERTEX
-#define _FLUSH() do { \
- gen4_vertex_flush(sna); \
- OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH); \
-} while (0)
-#define FLUSH(OP) do { \
- if ((OP)->mask.bo == NULL) _FLUSH(); \
-} while (0)
-#else
-#define _FLUSH()
-#define FLUSH(OP)
-#endif
-
#define GEN4_GRF_BLOCKS(nreg) ((nreg + 15) / 16 - 1)
/* Set up a default static partitioning of the URB, which is supposed to
@@ -100,15 +89,9 @@
#define SF_KERNEL_NUM_GRF 16
#define PS_KERNEL_NUM_GRF 32
-static const struct gt_info {
- uint32_t max_sf_threads;
- uint32_t max_wm_threads;
- uint32_t urb_size;
-} gen4_gt_info = {
- 24, 32, 256,
-}, g4x_gt_info = {
- 24, 50, 384,
-};
+#define GEN4_MAX_SF_THREADS 24
+#define GEN4_MAX_WM_THREADS 32
+#define G4X_MAX_WM_THREADS 50
static const uint32_t ps_kernel_packed_static[][4] = {
#include "exa_wm_xy.g4b"
@@ -225,21 +208,20 @@ gen4_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
return base + !is_affine;
}
-static void gen4_magic_ca_pass(struct sna *sna,
+static bool gen4_magic_ca_pass(struct sna *sna,
const struct sna_composite_op *op)
{
struct gen4_render_state *state = &sna->render_state.gen4;
if (!op->need_magic_ca_pass)
- return;
+ return false;
+
+ assert(sna->render.vertex_index > sna->render.vertex_start);
DBG(("%s: CA fixup\n", __FUNCTION__));
assert(op->mask.bo != NULL);
assert(op->has_component_alpha);
- if (FLUSH_EVERY_VERTEX)
- OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
-
gen4_emit_pipelined_pointers(sna, op, PictOpAdd,
gen4_choose_composite_kernel(PictOpAdd,
true, true, op->is_affine));
@@ -256,154 +238,9 @@ static void gen4_magic_ca_pass(struct sna *sna,
OUT_BATCH(0); /* index buffer offset, ignored */
state->last_primitive = sna->kgem.nbatch;
+ return true;
}
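Note: gen4_magic_ca_pass now reports whether it emitted the fixup, so callers know the pipelined-pointer state was clobbered and must be re-emitted. Conceptually the pass replays the vertex range already written with an ADD blend to complete component-alpha compositing. A toy sketch of that two-pass replay (pseudo-driver code, not the actual emitters):

	/* Sketch of the two-pass component-alpha fixup: the same vertex
	 * range is drawn a second time with an ADD blend. The blend
	 * names stand in for the real pipelined-pointer state. */
	#include <stdio.h>

	struct render { int vertex_start, vertex_index; };

	static void draw(const struct render *r, const char *blend)
	{
		printf("3DPRIMITIVE: vertices [%d, %d) with %s blend\n",
		       r->vertex_start, r->vertex_index, blend);
	}

	int main(void)
	{
		struct render r = { 0, 3 }; /* one RECTLIST rect = 3 vertices */

		draw(&r, "component-alpha"); /* first pass */
		draw(&r, "ADD");	     /* CA fixup replays the range */
		return 0;
	}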
-static void gen4_vertex_flush(struct sna *sna)
-{
- if (sna->render_state.gen4.vertex_offset == 0)
- return;
-
- DBG(("%s[%x] = %d\n", __FUNCTION__,
- 4*sna->render_state.gen4.vertex_offset,
- sna->render.vertex_index - sna->render.vertex_start));
- sna->kgem.batch[sna->render_state.gen4.vertex_offset] =
- sna->render.vertex_index - sna->render.vertex_start;
- sna->render_state.gen4.vertex_offset = 0;
-}
-
-static int gen4_vertex_finish(struct sna *sna)
-{
- struct kgem_bo *bo;
- unsigned int i;
-
- assert(sna->render.vertex_used);
- assert(sna->render.nvertex_reloc);
-
- /* Note: we only need dword alignment (currently) */
-
- bo = sna->render.vbo;
- if (bo) {
- gen4_vertex_flush(sna);
-
- for (i = 0; i < sna->render.nvertex_reloc; i++) {
- DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
- i, sna->render.vertex_reloc[i]));
-
- sna->kgem.batch[sna->render.vertex_reloc[i]] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i], bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- 0);
- }
-
- sna->render.nvertex_reloc = 0;
- sna->render.vertex_used = 0;
- sna->render.vertex_index = 0;
- sna->render_state.gen4.vb_id = 0;
-
- kgem_bo_destroy(&sna->kgem, bo);
- }
-
- sna->render.vertices = NULL;
- sna->render.vbo = kgem_create_linear(&sna->kgem,
- 256*1024, CREATE_GTT_MAP);
- if (sna->render.vbo)
- sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
- if (sna->render.vertices == NULL) {
- if (sna->render.vbo)
- kgem_bo_destroy(&sna->kgem, sna->render.vbo);
- sna->render.vbo = NULL;
- return 0;
- }
-
- if (sna->render.vertex_used) {
- memcpy(sna->render.vertices,
- sna->render.vertex_data,
- sizeof(float)*sna->render.vertex_used);
- }
- sna->render.vertex_size = 64 * 1024 - 1;
- return sna->render.vertex_size - sna->render.vertex_used;
-}
-
-static void gen4_vertex_close(struct sna *sna)
-{
- struct kgem_bo *bo, *free_bo = NULL;
- unsigned int i, delta = 0;
-
- assert(sna->render_state.gen4.vertex_offset == 0);
- if (!sna->render_state.gen4.vb_id)
- return;
-
- DBG(("%s: used=%d, vbo active? %d\n",
- __FUNCTION__, sna->render.vertex_used, sna->render.vbo != NULL));
-
- bo = sna->render.vbo;
- if (bo) {
- if (sna->render.vertex_size - sna->render.vertex_used < 64) {
- DBG(("%s: discarding full vbo\n", __FUNCTION__));
- sna->render.vbo = NULL;
- sna->render.vertices = sna->render.vertex_data;
- sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
- free_bo = bo;
- } else if (IS_CPU_MAP(bo->map)) {
- DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
- sna->render.vertices =
- kgem_bo_map__gtt(&sna->kgem, sna->render.vbo);
- if (sna->render.vertices == NULL) {
- sna->render.vbo = NULL;
- sna->render.vertices = sna->render.vertex_data;
- sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
- free_bo = bo;
- }
- }
- } else {
- if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
- DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
- sna->render.vertex_used, sna->kgem.nbatch));
- memcpy(sna->kgem.batch + sna->kgem.nbatch,
- sna->render.vertex_data,
- sna->render.vertex_used * 4);
- delta = sna->kgem.nbatch * 4;
- bo = NULL;
- sna->kgem.nbatch += sna->render.vertex_used;
- } else {
- bo = kgem_create_linear(&sna->kgem,
- 4*sna->render.vertex_used, 0);
- if (bo && !kgem_bo_write(&sna->kgem, bo,
- sna->render.vertex_data,
- 4*sna->render.vertex_used)) {
- kgem_bo_destroy(&sna->kgem, bo);
- bo = NULL;
- }
- DBG(("%s: new vbo: %d\n", __FUNCTION__,
- sna->render.vertex_used));
- free_bo = bo;
- }
- }
-
- assert(sna->render.nvertex_reloc);
- for (i = 0; i < sna->render.nvertex_reloc; i++) {
- DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
- i, sna->render.vertex_reloc[i]));
-
- sna->kgem.batch[sna->render.vertex_reloc[i]] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i], bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- delta);
- }
- sna->render.nvertex_reloc = 0;
-
- if (sna->render.vbo == NULL) {
- sna->render.vertex_used = 0;
- sna->render.vertex_index = 0;
- }
-
- if (free_bo)
- kgem_bo_destroy(&sna->kgem, free_bo);
-}
-
-
static uint32_t gen4_get_blend(int op,
bool has_component_alpha,
uint32_t dst_format)
@@ -635,6 +472,17 @@ static bool gen4_check_repeat(PicturePtr picture)
}
}
+static uint32_t
+gen4_tiling_bits(uint32_t tiling)
+{
+ switch (tiling) {
+ default: assert(0);
+ case I915_TILING_NONE: return 0;
+ case I915_TILING_X: return GEN4_SURFACE_TILED;
+ case I915_TILING_Y: return GEN4_SURFACE_TILED | GEN4_SURFACE_TILED_Y;
+ }
+}
+
/**
 * Sets up the common fields of a surface state buffer for the given
 * picture.
@@ -647,11 +495,11 @@ gen4_bind_bo(struct sna *sna,
uint32_t format,
bool is_dst)
{
- struct gen4_surface_state *ss;
uint32_t domains;
uint16_t offset;
+ uint32_t *ss;
- assert(!kgem_bo_is_snoop(bo));
+ assert(sna->kgem.gen != 040 || !kgem_bo_is_snoop(bo));
/* After the first bind, we manage the cache domains within the batch */
offset = kgem_bo_get_binding(bo, format);
@@ -663,340 +511,58 @@ gen4_bind_bo(struct sna *sna,
offset = sna->kgem.surface -=
sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
- ss = memset(sna->kgem.batch + offset, 0, sizeof(*ss));
+ ss = sna->kgem.batch + offset;
- ss->ss0.surface_type = GEN4_SURFACE_2D;
- ss->ss0.surface_format = format;
+ ss[0] = (GEN4_SURFACE_2D << GEN4_SURFACE_TYPE_SHIFT |
+ GEN4_SURFACE_BLEND_ENABLED |
+ format << GEN4_SURFACE_FORMAT_SHIFT);
if (is_dst)
domains = I915_GEM_DOMAIN_RENDER << 16 | I915_GEM_DOMAIN_RENDER;
else
domains = I915_GEM_DOMAIN_SAMPLER << 16;
+ ss[1] = kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0);
- ss->ss0.data_return_format = GEN4_SURFACERETURNFORMAT_FLOAT32;
- ss->ss0.color_blend = 1;
- ss->ss1.base_addr =
- kgem_add_reloc(&sna->kgem, offset + 1, bo, domains, 0);
-
- ss->ss2.height = height - 1;
- ss->ss2.width = width - 1;
- ss->ss3.pitch = bo->pitch - 1;
- ss->ss3.tiled_surface = bo->tiling != I915_TILING_NONE;
- ss->ss3.tile_walk = bo->tiling == I915_TILING_Y;
+ ss[2] = ((width - 1) << GEN4_SURFACE_WIDTH_SHIFT |
+ (height - 1) << GEN4_SURFACE_HEIGHT_SHIFT);
+ ss[3] = (gen4_tiling_bits(bo->tiling) |
+ (bo->pitch - 1) << GEN4_SURFACE_PITCH_SHIFT);
+ ss[4] = 0;
+ ss[5] = 0;
kgem_bo_set_binding(bo, format, offset);
DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
- offset, bo->handle, ss->ss1.base_addr,
- ss->ss0.surface_format, width, height, bo->pitch, bo->tiling,
+ offset, bo->handle, ss[1],
+ format, width, height, bo->pitch, bo->tiling,
domains & 0xffff ? "render" : "sampler"));
return offset * sizeof(uint32_t);
}
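Note: the rewrite of gen4_bind_bo drops the bitfield struct in favour of packing the surface-state dwords by hand, as in ss[2] and ss[3] above. A hedged sketch of the size-dword packing; the shift values below are placeholders standing in for the GEN4_SURFACE_* constants in gen4_render.h:

	/* Sketch of the hand-packed size dword (ss[2] above). The
	 * shift values are assumptions for illustration only. */
	#include <assert.h>
	#include <stdint.h>

	#define SURFACE_WIDTH_SHIFT   6	/* assumed */
	#define SURFACE_HEIGHT_SHIFT 19	/* assumed */

	static uint32_t pack_size(int width, int height)
	{
		/* Hardware stores size minus one, as in the diff. */
		assert(width >= 1 && height >= 1);
		return (uint32_t)(width - 1) << SURFACE_WIDTH_SHIFT |
		       (uint32_t)(height - 1) << SURFACE_HEIGHT_SHIFT;
	}

	int main(void)
	{
		return pack_size(1024, 768) ? 0 : 1;
	}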
-fastcall static void
-gen4_emit_composite_primitive_solid(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = 1.;
- v[2] = 1.;
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- v[4] = 0.;
- v[5] = 1.;
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- v[7] = 0.;
- v[8] = 0.;
-}
-
-fastcall static void
-gen4_emit_composite_primitive_identity_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- const float *sf = op->src.scale;
- float sx, sy, *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- sx = r->src.x + op->src.offset[0];
- sy = r->src.y + op->src.offset[1];
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = (sx + r->width) * sf[0];
- v[2] = (sy + r->height) * sf[1];
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- v[4] = sx * sf[0];
- v[5] = v[2];
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- v[7] = v[4];
- v[8] = sy * sf[1];
-}
-
-fastcall static void
-gen4_emit_composite_primitive_affine_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float *v;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x + r->width,
- op->src.offset[1] + r->src.y + r->height,
- op->src.transform,
- &v[1], &v[2]);
- v[1] *= op->src.scale[0];
- v[2] *= op->src.scale[1];
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x,
- op->src.offset[1] + r->src.y + r->height,
- op->src.transform,
- &v[4], &v[5]);
- v[4] *= op->src.scale[0];
- v[5] *= op->src.scale[1];
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x,
- op->src.offset[1] + r->src.y,
- op->src.transform,
- &v[7], &v[8]);
- v[7] *= op->src.scale[0];
- v[8] *= op->src.scale[1];
-}
-
-fastcall static void
-gen4_emit_composite_primitive_identity_source_mask(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float src_x, src_y;
- float msk_x, msk_y;
- float w, h;
- float *v;
-
- src_x = r->src.x + op->src.offset[0];
- src_y = r->src.y + op->src.offset[1];
- msk_x = r->mask.x + op->mask.offset[0];
- msk_y = r->mask.y + op->mask.offset[1];
- w = r->width;
- h = r->height;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 15;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = (src_x + w) * op->src.scale[0];
- v[2] = (src_y + h) * op->src.scale[1];
- v[3] = (msk_x + w) * op->mask.scale[0];
- v[4] = (msk_y + h) * op->mask.scale[1];
-
- dst.p.x = r->dst.x;
- v[5] = dst.f;
- v[6] = src_x * op->src.scale[0];
- v[7] = v[2];
- v[8] = msk_x * op->mask.scale[0];
- v[9] = v[4];
-
- dst.p.y = r->dst.y;
- v[10] = dst.f;
- v[11] = v[6];
- v[12] = src_y * op->src.scale[1];
- v[13] = v[8];
- v[14] = msk_y * op->mask.scale[1];
-}
-
-fastcall static void
-gen4_emit_composite_primitive(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
- bool is_affine = op->is_affine;
- const float *src_sf = op->src.scale;
- const float *mask_sf = op->mask.scale;
-
- if (is_affine) {
- sna_get_transformed_coordinates(r->src.x + op->src.offset[0],
- r->src.y + op->src.offset[1],
- op->src.transform,
- &src_x[0],
- &src_y[0]);
-
- sna_get_transformed_coordinates(r->src.x + op->src.offset[0],
- r->src.y + op->src.offset[1] + r->height,
- op->src.transform,
- &src_x[1],
- &src_y[1]);
-
- sna_get_transformed_coordinates(r->src.x + op->src.offset[0] + r->width,
- r->src.y + op->src.offset[1] + r->height,
- op->src.transform,
- &src_x[2],
- &src_y[2]);
- } else {
- sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0],
- r->src.y + op->src.offset[1],
- op->src.transform,
- &src_x[0],
- &src_y[0],
- &src_w[0]);
- sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0],
- r->src.y + op->src.offset[1] + r->height,
- op->src.transform,
- &src_x[1],
- &src_y[1],
- &src_w[1]);
- sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0] + r->width,
- r->src.y + op->src.offset[1] + r->height,
- op->src.transform,
- &src_x[2],
- &src_y[2],
- &src_w[2]);
- }
-
- if (op->mask.bo) {
- if (is_affine) {
- sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0],
- r->mask.y + op->mask.offset[1],
- op->mask.transform,
- &mask_x[0],
- &mask_y[0]);
-
- sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0],
- r->mask.y + op->mask.offset[1] + r->height,
- op->mask.transform,
- &mask_x[1],
- &mask_y[1]);
-
- sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0] + r->width,
- r->mask.y + op->mask.offset[1] + r->height,
- op->mask.transform,
- &mask_x[2],
- &mask_y[2]);
- } else {
- sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0],
- r->mask.y + op->mask.offset[1],
- op->mask.transform,
- &mask_x[0],
- &mask_y[0],
- &mask_w[0]);
- sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0],
- r->mask.y + op->mask.offset[1] + r->height,
- op->mask.transform,
- &mask_x[1],
- &mask_y[1],
- &mask_w[1]);
- sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0] + r->width,
- r->mask.y + op->mask.offset[1] + r->height,
- op->mask.transform,
- &mask_x[2],
- &mask_y[2],
- &mask_w[2]);
- }
- }
-
- OUT_VERTEX(r->dst.x + r->width, r->dst.y + r->height);
- OUT_VERTEX_F(src_x[2] * src_sf[0]);
- OUT_VERTEX_F(src_y[2] * src_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(src_w[2]);
- if (op->mask.bo) {
- OUT_VERTEX_F(mask_x[2] * mask_sf[0]);
- OUT_VERTEX_F(mask_y[2] * mask_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(mask_w[2]);
- }
-
- OUT_VERTEX(r->dst.x, r->dst.y + r->height);
- OUT_VERTEX_F(src_x[1] * src_sf[0]);
- OUT_VERTEX_F(src_y[1] * src_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(src_w[1]);
- if (op->mask.bo) {
- OUT_VERTEX_F(mask_x[1] * mask_sf[0]);
- OUT_VERTEX_F(mask_y[1] * mask_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(mask_w[1]);
- }
-
- OUT_VERTEX(r->dst.x, r->dst.y);
- OUT_VERTEX_F(src_x[0] * src_sf[0]);
- OUT_VERTEX_F(src_y[0] * src_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(src_w[0]);
- if (op->mask.bo) {
- OUT_VERTEX_F(mask_x[0] * mask_sf[0]);
- OUT_VERTEX_F(mask_y[0] * mask_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(mask_w[0]);
- }
-}
-
static void gen4_emit_vertex_buffer(struct sna *sna,
const struct sna_composite_op *op)
{
int id = op->u.gen4.ve_id;
+ assert((sna->render.vb_id & (1 << id)) == 0);
+
OUT_BATCH(GEN4_3DSTATE_VERTEX_BUFFERS | 3);
OUT_BATCH((id << VB0_BUFFER_INDEX_SHIFT) | VB0_VERTEXDATA |
(4*op->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT));
+ assert(sna->render.nvertex_reloc < ARRAY_SIZE(sna->render.vertex_reloc));
sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
OUT_BATCH(0);
OUT_BATCH(0);
OUT_BATCH(0);
- sna->render_state.gen4.vb_id |= 1 << id;
+ sna->render.vb_id |= 1 << id;
}
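Note: vertex-buffer state is now tracked in the shared sna->render.vb_id bitmask rather than per-generation state, so 3DSTATE_VERTEX_BUFFERS is emitted at most once per vertex layout per batch. A minimal sketch of that bind-once pattern:

	/* Sketch of the "emit the vertex buffer only once per batch"
	 * pattern used by gen4_rectangle_begin above. */
	#include <stdint.h>
	#include <stdio.h>

	static uint32_t vb_id;	/* cleared whenever the batch is reset */

	static void bind_vertex_buffer(int id)
	{
		if (vb_id & (1u << id))
			return;	/* already emitted for this batch */

		printf("emit 3DSTATE_VERTEX_BUFFERS for ve_id=%d\n", id);
		vb_id |= 1u << id;
	}

	int main(void)
	{
		bind_vertex_buffer(2);
		bind_vertex_buffer(2);	/* second call is a no-op */
		return 0;
	}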
static void gen4_emit_primitive(struct sna *sna)
{
if (sna->kgem.nbatch == sna->render_state.gen4.last_primitive) {
- sna->render_state.gen4.vertex_offset = sna->kgem.nbatch - 5;
+ sna->render.vertex_offset = sna->kgem.nbatch - 5;
return;
}
@@ -1005,7 +571,7 @@ static void gen4_emit_primitive(struct sna *sna)
(_3DPRIM_RECTLIST << GEN4_3DPRIMITIVE_TOPOLOGY_SHIFT) |
(0 << 9) |
4);
- sna->render_state.gen4.vertex_offset = sna->kgem.nbatch;
+ sna->render.vertex_offset = sna->kgem.nbatch;
OUT_BATCH(0); /* vertex count, to be filled in later */
OUT_BATCH(sna->render.vertex_index);
OUT_BATCH(1); /* single instance */
@@ -1022,19 +588,20 @@ static bool gen4_rectangle_begin(struct sna *sna,
int id = op->u.gen4.ve_id;
int ndwords;
+ if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
+ return true;
+
/* 7xpipelined pointers + 6xprimitive + 1xflush */
ndwords = op->need_magic_ca_pass? 20 : 6;
- if (FLUSH_EVERY_VERTEX)
- ndwords += 1;
- if ((sna->render_state.gen4.vb_id & (1 << id)) == 0)
+ if ((sna->render.vb_id & (1 << id)) == 0)
ndwords += 5;
if (!kgem_check_batch(&sna->kgem, ndwords))
return false;
- if ((sna->render_state.gen4.vb_id & (1 << id)) == 0)
+ if ((sna->render.vb_id & (1 << id)) == 0)
gen4_emit_vertex_buffer(sna, op);
- if (sna->render_state.gen4.vertex_offset == 0)
+ if (sna->render.vertex_offset == 0)
gen4_emit_primitive(sna);
return true;
@@ -1043,14 +610,28 @@ static bool gen4_rectangle_begin(struct sna *sna,
static int gen4_get_rectangles__flush(struct sna *sna,
const struct sna_composite_op *op)
{
- if (!kgem_check_batch(&sna->kgem, (FLUSH_EVERY_VERTEX || op->need_magic_ca_pass) ? 25 : 6))
+ /* Prevent the new vbo from being discarded after lock contention */
+ if (sna_vertex_wait__locked(&sna->render)) {
+ int rem = vertex_space(sna);
+ if (rem > op->floats_per_rect)
+ return rem;
+ }
+
+ if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 25 : 6))
return 0;
- if (!kgem_check_reloc_and_exec(&sna->kgem, 1))
+ if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
return 0;
if (op->need_magic_ca_pass && sna->render.vbo)
return 0;
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
+ if (gen4_magic_ca_pass(sna, op))
+ gen4_emit_pipelined_pointers(sna, op, op->op,
+ op->u.gen4.wm_kernel);
+ }
+
return gen4_vertex_finish(sna);
}
@@ -1063,7 +644,7 @@ inline static int gen4_get_rectangles(struct sna *sna,
start:
rem = vertex_space(sna);
- if (rem < op->floats_per_rect) {
+ if (unlikely(rem < op->floats_per_rect)) {
DBG(("flushing vbo for %s: %d < %d\n",
__FUNCTION__, rem, op->floats_per_rect));
rem = gen4_get_rectangles__flush(sna, op);
@@ -1071,7 +652,7 @@ start:
goto flush;
}
- if (unlikely(sna->render_state.gen4.vertex_offset == 0 &&
+ if (unlikely(sna->render.vertex_offset == 0 &&
!gen4_rectangle_begin(sna, op)))
goto flush;
@@ -1082,18 +663,18 @@ start:
return want;
flush:
- if (sna->render_state.gen4.vertex_offset) {
+ if (sna->render.vertex_offset) {
gen4_vertex_flush(sna);
gen4_magic_ca_pass(sna, op);
}
+ sna_vertex_wait__locked(&sna->render);
_kgem_submit(&sna->kgem);
emit_state(sna, op);
goto start;
}
static uint32_t *
-gen4_composite_get_binding_table(struct sna *sna,
- uint16_t *offset)
+gen4_composite_get_binding_table(struct sna *sna, uint16_t *offset)
{
sna->kgem.surface -=
sizeof(struct gen4_surface_state_padded) / sizeof(uint32_t);
@@ -1129,6 +710,9 @@ gen4_emit_urb(struct sna *sna)
urb_cs_start = urb_sf_start + urb_sf_size;
urb_cs_size = URB_CS_ENTRIES * URB_CS_ENTRY_SIZE;
+ while ((sna->kgem.nbatch & 15) > 12)
+ OUT_BATCH(MI_NOOP);
+
OUT_BATCH(GEN4_URB_FENCE |
UF0_CS_REALLOC |
UF0_SF_REALLOC |
@@ -1176,7 +760,7 @@ gen4_emit_invariant(struct sna *sna)
{
assert(sna->kgem.surface == sna->kgem.batch_size);
- if (sna->kgem.gen >= 45)
+ if (sna->kgem.gen >= 045)
OUT_BATCH(NEW_PIPELINE_SELECT | PIPELINE_SELECT_3D);
else
OUT_BATCH(GEN4_PIPELINE_SELECT | PIPELINE_SELECT_3D);
@@ -1187,9 +771,9 @@ gen4_emit_invariant(struct sna *sna)
}
static void
-gen4_get_batch(struct sna *sna)
+gen4_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 4)) {
DBG(("%s: flushing batch: %d < %d+%d\n",
@@ -1245,11 +829,11 @@ gen4_emit_pipelined_pointers(struct sna *sna,
const struct sna_composite_op *op,
int blend, int kernel)
{
- uint32_t key;
uint16_t sp, bp;
+ uint32_t key;
DBG(("%s: has_mask=%d, src=(%d, %d), mask=(%d, %d),kernel=%d, blend=%d, ca=%d, format=%x\n",
- __FUNCTION__, op->mask.bo != NULL,
+ __FUNCTION__, op->u.gen4.ve_id & 2,
op->src.filter, op->src.repeat,
op->mask.filter, op->mask.repeat,
kernel, blend, op->has_component_alpha, (int)op->dst.format));
@@ -1260,8 +844,7 @@ gen4_emit_pipelined_pointers(struct sna *sna,
bp = gen4_get_blend(blend, op->has_component_alpha, op->dst.format);
DBG(("%s: sp=%d, bp=%d\n", __FUNCTION__, sp, bp));
-
- key = sp | bp << 16;
+ key = sp | (uint32_t)bp << 16;
if (key == sna->render_state.gen4.last_pipelined_pointers)
return;
@@ -1269,7 +852,7 @@ gen4_emit_pipelined_pointers(struct sna *sna,
OUT_BATCH(sna->render_state.gen4.vs);
OUT_BATCH(GEN4_GS_DISABLE); /* passthrough */
OUT_BATCH(GEN4_CLIP_DISABLE); /* passthrough */
- OUT_BATCH(sna->render_state.gen4.sf[op->mask.bo != NULL]);
+ OUT_BATCH(sna->render_state.gen4.sf);
OUT_BATCH(sna->render_state.gen4.wm + sp);
OUT_BATCH(sna->render_state.gen4.cc + bp);
@@ -1277,7 +860,7 @@ gen4_emit_pipelined_pointers(struct sna *sna,
gen4_emit_urb(sna);
}
-static void
+static bool
gen4_emit_drawing_rectangle(struct sna *sna, const struct sna_composite_op *op)
{
uint32_t limit = (op->dst.height - 1) << 16 | (op->dst.width - 1);
@@ -1288,7 +871,8 @@ gen4_emit_drawing_rectangle(struct sna *sna, const struct sna_composite_op *op)
if (sna->render_state.gen4.drawrect_limit == limit &&
sna->render_state.gen4.drawrect_offset == offset)
- return;
+ return true;
+
sna->render_state.gen4.drawrect_offset = offset;
sna->render_state.gen4.drawrect_limit = limit;
@@ -1296,6 +880,7 @@ gen4_emit_drawing_rectangle(struct sna *sna, const struct sna_composite_op *op)
OUT_BATCH(0);
OUT_BATCH(limit);
OUT_BATCH(offset);
+ return false;
}
static void
@@ -1309,65 +894,108 @@ gen4_emit_vertex_elements(struct sna *sna,
* texture coordinate 1 if (has_mask is true): same as above
*/
struct gen4_render_state *render = &sna->render_state.gen4;
- bool has_mask = op->mask.bo != NULL;
- int nelem = has_mask ? 2 : 1;
- int selem;
- uint32_t w_component;
- uint32_t src_format;
+ uint32_t src_format, dw;
int id = op->u.gen4.ve_id;
if (render->ve_id == id)
return;
-
render->ve_id = id;
- if (op->is_affine) {
- src_format = GEN4_SURFACEFORMAT_R32G32_FLOAT;
- w_component = GEN4_VFCOMPONENT_STORE_1_FLT;
- selem = 2;
- } else {
- src_format = GEN4_SURFACEFORMAT_R32G32B32_FLOAT;
- w_component = GEN4_VFCOMPONENT_STORE_SRC;
- selem = 3;
- }
-
/* The VUE layout
* dword 0-3: position (x, y, 1.0, 1.0),
* dword 4-7: texture coordinate 0 (u0, v0, w0, 1.0)
* [optional] dword 8-11: texture coordinate 1 (u1, v1, w1, 1.0)
*/
- OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (2 * (1 + nelem) - 1));
+ OUT_BATCH(GEN4_3DSTATE_VERTEX_ELEMENTS | (2 * (1 + 2) - 1));
/* x,y */
OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
GEN4_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
- 0 << VE0_OFFSET_SHIFT); /* offsets vb in bytes */
- OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
- GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
- GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
- GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
- (1*4) << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT); /* VUE offset in dwords */
+ 0 << VE0_OFFSET_SHIFT);
+ OUT_BATCH(VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+ VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+ VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
+ VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
+ (1*4) << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
/* u0, v0, w0 */
+ DBG(("%s: first channel %d floats, offset=4b\n", __FUNCTION__, id & 3));
+ dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
+ switch (id & 3) {
+ default:
+ assert(0);
+ case 0:
+ src_format = GEN4_SURFACEFORMAT_R16G16_SSCALED;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ case 1:
+ src_format = GEN4_SURFACEFORMAT_R32_FLOAT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ case 2:
+ src_format = GEN4_SURFACEFORMAT_R32G32_FLOAT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ case 3:
+ src_format = GEN4_SURFACEFORMAT_R32G32B32_FLOAT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ }
OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
src_format << VE0_FORMAT_SHIFT |
- 4 << VE0_OFFSET_SHIFT); /* offset vb in bytes */
- OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
- GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
- w_component << VE1_VFCOMPONENT_2_SHIFT |
- GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
- (2*4) << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT); /* VUE offset in dwords */
+ 4 << VE0_OFFSET_SHIFT);
+ OUT_BATCH(dw | 8 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
/* u1, v1, w1 */
- if (has_mask) {
+ if (id >> 2) {
+ unsigned src_offset = 4 + ((id & 3) ?: 1) * sizeof(float);
+ DBG(("%s: second channel %d floats, offset=%db\n", __FUNCTION__,
+ id >> 2, src_offset));
+ dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
+ switch (id >> 2) {
+ case 1:
+ src_format = GEN4_SURFACEFORMAT_R32_FLOAT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ default:
+ assert(0);
+ case 2:
+ src_format = GEN4_SURFACEFORMAT_R32G32_FLOAT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ case 3:
+ src_format = GEN4_SURFACEFORMAT_R32G32B32_FLOAT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ }
OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
src_format << VE0_FORMAT_SHIFT |
- ((1 + selem) * 4) << VE0_OFFSET_SHIFT); /* vb offset in bytes */
- OUT_BATCH(GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
- GEN4_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
- w_component << VE1_VFCOMPONENT_2_SHIFT |
- GEN4_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
- (3*4) << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT); /* VUE offset in dwords */
+ src_offset << VE0_OFFSET_SHIFT);
+ OUT_BATCH(dw | 12 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
+ } else {
+ OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
+ GEN4_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+ 0 << VE0_OFFSET_SHIFT);
+ OUT_BATCH(VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
+ VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
+ VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
+ VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT |
+ 12 << VE1_DESTINATION_ELEMENT_OFFSET_SHIFT);
}
}
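Note: the rewritten element emission derives both texture channels from the ve_id bitfield: the low two bits select the first channel's float count (0 reuses the x,y position), and the bits above select the second channel's. A small sketch of decoding that encoding, assuming it matches the switch statements above:

	/* Sketch: decoding the ve_id bitfield used above. Illustrative
	 * only; the mapping is inferred from the emitters in the hunk. */
	#include <stdio.h>

	static void decode_ve_id(int id)
	{
		int chan0 = id & 3; /* 0: reuse x,y; 1: u; 2: u,v; 3: u,v,w */
		int chan1 = id >> 2; /* 0: none; 1: u; 2: u,v; 3: u,v,w */

		printf("ve_id=%d: first channel %d floats, second %d floats\n",
		       id, chan0, chan1);
	}

	int main(void)
	{
		decode_ve_id(2); /* e.g. the video path: affine u,v, no mask */
		decode_ve_id(2 | (2 << 2));
		return 0;
	}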
@@ -1376,32 +1004,37 @@ gen4_emit_state(struct sna *sna,
const struct sna_composite_op *op,
uint16_t wm_binding_table)
{
- if (FLUSH_EVERY_VERTEX)
- OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
-
- gen4_emit_drawing_rectangle(sna, op);
- gen4_emit_binding_table(sna, wm_binding_table);
- gen4_emit_pipelined_pointers(sna, op, op->op, op->u.gen4.wm_kernel);
- gen4_emit_vertex_elements(sna, op);
+ bool flush;
+ flush = wm_binding_table & 1;
if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
- DBG(("%s: flushing dirty (%d, %d)\n", __FUNCTION__,
+ DBG(("%s: flushing dirty (%d, %d), forced? %d\n", __FUNCTION__,
kgem_bo_is_dirty(op->src.bo),
- kgem_bo_is_dirty(op->mask.bo)));
+ kgem_bo_is_dirty(op->mask.bo),
+ flush));
OUT_BATCH(MI_FLUSH);
kgem_clear_dirty(&sna->kgem);
kgem_bo_mark_dirty(op->dst.bo);
+ flush = false;
}
+ flush &= gen4_emit_drawing_rectangle(sna, op);
+ if (flush && op->op > PictOpSrc)
+ OUT_BATCH(MI_FLUSH | MI_INHIBIT_RENDER_CACHE_FLUSH);
+
+ gen4_emit_binding_table(sna, wm_binding_table & ~1);
+ gen4_emit_pipelined_pointers(sna, op, op->op, op->u.gen4.wm_kernel);
+ gen4_emit_vertex_elements(sna, op);
}
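Note: gen4_emit_state now receives the destination-dirty flag smuggled into bit 0 of wm_binding_table (callers pass offset | dirty, and the table is emitted with the bit masked off). The offset counts whole padded surface-state entries, so it is assumed here to be even and the low bit free. A generic sketch of that low-bit tagging trick:

	/* Sketch of carrying a boolean in the low bit of an aligned
	 * offset, as gen4_bind_surfaces does with "offset | dirty". */
	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	static uint16_t tag(uint16_t offset, bool dirty)
	{
		assert((offset & 1) == 0); /* low bit must be free */
		return offset | dirty;
	}

	static uint16_t untag(uint16_t tagged, bool *dirty)
	{
		*dirty = tagged & 1;
		return tagged & ~1;
	}

	int main(void)
	{
		bool dirty;
		return untag(tag(64, true), &dirty) == 64 && dirty ? 0 : 1;
	}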
static void
gen4_bind_surfaces(struct sna *sna,
const struct sna_composite_op *op)
{
+ bool dirty = kgem_bo_is_dirty(op->dst.bo);
uint32_t *binding_table;
uint16_t offset;
- gen4_get_batch(sna);
+ gen4_get_batch(sna, op);
binding_table = gen4_composite_get_binding_table(sna, &offset);
@@ -1415,7 +1048,8 @@ gen4_bind_surfaces(struct sna *sna,
op->src.bo, op->src.width, op->src.height,
op->src.card_format,
false);
- if (op->mask.bo)
+ if (op->mask.bo) {
+ assert(op->u.gen4.ve_id >> 2);
binding_table[2] =
gen4_bind_bo(sna,
op->mask.bo,
@@ -1423,6 +1057,7 @@ gen4_bind_surfaces(struct sna *sna,
op->mask.height,
op->mask.card_format,
false);
+ }
if (sna->kgem.surface == offset &&
*(uint64_t *)(sna->kgem.batch + sna->render_state.gen4.surface_table) == *(uint64_t*)binding_table &&
@@ -1432,7 +1067,7 @@ gen4_bind_surfaces(struct sna *sna,
offset = sna->render_state.gen4.surface_table;
}
- gen4_emit_state(sna, op, offset);
+ gen4_emit_state(sna, op, offset | dirty);
}
fastcall static void
@@ -1449,9 +1084,6 @@ gen4_render_composite_blt(struct sna *sna,
gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);
op->prim_emit(sna, op, r);
-
- /* XXX are the shaders fubar? */
- FLUSH(op);
}
fastcall static void
@@ -1461,19 +1093,25 @@ gen4_render_composite_box(struct sna *sna,
{
struct sna_composite_rectangles r;
+ DBG((" %s: (%d, %d), (%d, %d)\n",
+ __FUNCTION__,
+ box->x1, box->y1, box->x2, box->y2));
+
+ gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);
+
r.dst.x = box->x1;
r.dst.y = box->y1;
r.width = box->x2 - box->x1;
r.height = box->y2 - box->y1;
r.mask = r.src = r.dst;
- gen4_render_composite_blt(sna, op, &r);
+ op->prim_emit(sna, op, &r);
}
static void
-gen4_render_composite_boxes(struct sna *sna,
- const struct sna_composite_op *op,
- const BoxRec *box, int nbox)
+gen4_render_composite_boxes__blt(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
{
DBG(("%s(%d) delta=(%d, %d), src=(%d, %d)/(%d, %d), mask=(%d, %d)/(%d, %d)\n",
__FUNCTION__, nbox, op->dst.x, op->dst.y,
@@ -1483,16 +1121,84 @@ gen4_render_composite_boxes(struct sna *sna,
op->mask.width, op->mask.height));
do {
- struct sna_composite_rectangles r;
-
- r.dst.x = box->x1;
- r.dst.y = box->y1;
- r.width = box->x2 - box->x1;
- r.height = box->y2 - box->y1;
- r.mask = r.src = r.dst;
- gen4_render_composite_blt(sna, op, &r);
- box++;
- } while (--nbox);
+ int nbox_this_time;
+
+ nbox_this_time = gen4_get_rectangles(sna, op, nbox,
+ gen4_bind_surfaces);
+ nbox -= nbox_this_time;
+
+ do {
+ struct sna_composite_rectangles r;
+
+ DBG((" %s: (%d, %d), (%d, %d)\n",
+ __FUNCTION__,
+ box->x1, box->y1, box->x2, box->y2));
+
+ r.dst.x = box->x1;
+ r.dst.y = box->y1;
+ r.width = box->x2 - box->x1;
+ r.height = box->y2 - box->y1;
+ r.mask = r.src = r.dst;
+ op->prim_emit(sna, op, &r);
+ box++;
+ } while (--nbox_this_time);
+ } while (nbox);
+}
+
+static void
+gen4_render_composite_boxes(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen4_get_rectangles(sna, op, nbox,
+ gen4_bind_surfaces);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+ } while (nbox);
+}
+
+static void
+gen4_render_composite_boxes__thread(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen4_get_rectangles(sna, op, nbox,
+ gen4_bind_surfaces);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
}
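Note: the new __thread box emitters reserve a private slice of the vertex buffer while holding the render lock, then drop it so other threads can fill their slices concurrently; sna_vertex_acquire/release track the outstanding writers. A rough stand-in for that reserve-then-fill pattern using a plain pthread mutex (illustrative only):

	/* Rough sketch of the reserve-under-lock, fill-outside-lock
	 * pattern in gen4_render_composite_boxes__thread. */
	#include <pthread.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static float vertices[4096];
	static int vertex_used;

	static void emit_boxes(int nfloats)
	{
		float *v;

		pthread_mutex_lock(&lock);
		v = vertices + vertex_used; /* reserve a private range */
		vertex_used += nfloats;
		pthread_mutex_unlock(&lock);

		for (int i = 0; i < nfloats; i++) /* fill without the lock */
			v[i] = 0.0f;
	}

	int main(void)
	{
		emit_boxes(9);	/* one rectangle's worth of floats */
		return vertex_used == 9 ? 0 : 1;
	}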
#ifndef MAX
@@ -1533,6 +1239,7 @@ static uint32_t gen4_bind_video_source(struct sna *sna,
static void gen4_video_bind_surfaces(struct sna *sna,
const struct sna_composite_op *op)
{
+ bool dirty = kgem_bo_is_dirty(op->dst.bo);
struct sna_video_frame *frame = op->priv;
uint32_t src_surf_format;
uint32_t src_surf_base[6];
@@ -1574,10 +1281,9 @@ static void gen4_video_bind_surfaces(struct sna *sna,
n_src = 1;
}
- gen4_get_batch(sna);
+ gen4_get_batch(sna, op);
binding_table = gen4_composite_get_binding_table(sna, &offset);
-
binding_table[0] =
gen4_bind_bo(sna,
op->dst.bo, op->dst.width, op->dst.height,
@@ -1594,7 +1300,7 @@ static void gen4_video_bind_surfaces(struct sna *sna,
src_surf_format);
}
- gen4_emit_state(sna, op, offset);
+ gen4_emit_state(sna, op, offset | dirty);
}
static bool
@@ -1604,10 +1310,11 @@ gen4_render_video(struct sna *sna,
RegionPtr dstRegion,
short src_w, short src_h,
short drw_w, short drw_h,
+ short dx, short dy,
PixmapPtr pixmap)
{
struct sna_composite_op tmp;
- int nbox, dxo, dyo, pix_xoff, pix_yoff;
+ int nbox, pix_xoff, pix_yoff;
float src_scale_x, src_scale_y;
struct sna_pixmap *priv;
BoxPtr box;
@@ -1627,13 +1334,16 @@ gen4_render_video(struct sna *sna,
tmp.dst.format = sna_format_for_depth(pixmap->drawable.depth);
tmp.dst.bo = priv->gpu_bo;
- tmp.src.filter = SAMPLER_FILTER_BILINEAR;
+ if (src_w == drw_w && src_h == drw_h)
+ tmp.src.filter = SAMPLER_FILTER_NEAREST;
+ else
+ tmp.src.filter = SAMPLER_FILTER_BILINEAR;
tmp.src.repeat = SAMPLER_EXTEND_PAD;
tmp.src.bo = frame->bo;
tmp.mask.bo = NULL;
tmp.u.gen4.wm_kernel =
is_planar_fourcc(frame->id) ? WM_KERNEL_VIDEO_PLANAR : WM_KERNEL_VIDEO_PACKED;
- tmp.u.gen4.ve_id = 1;
+ tmp.u.gen4.ve_id = 2;
tmp.is_affine = true;
tmp.floats_per_vertex = 3;
tmp.floats_per_rect = 9;
@@ -1658,9 +1368,6 @@ gen4_render_video(struct sna *sna,
pix_yoff = 0;
#endif
- dxo = dstRegion->extents.x1;
- dyo = dstRegion->extents.y1;
-
/* Use normalized texture coordinates */
src_scale_x = ((float)src_w / frame->width) / (float)drw_w;
src_scale_y = ((float)src_h / frame->height) / (float)drw_h;
@@ -1678,18 +1385,16 @@ gen4_render_video(struct sna *sna,
gen4_get_rectangles(sna, &tmp, 1, gen4_video_bind_surfaces);
OUT_VERTEX(r.x2, r.y2);
- OUT_VERTEX_F((box->x2 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y2 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x2 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y2 - dy) * src_scale_y);
OUT_VERTEX(r.x1, r.y2);
- OUT_VERTEX_F((box->x1 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y2 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x1 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y2 - dy) * src_scale_y);
OUT_VERTEX(r.x1, r.y1);
- OUT_VERTEX_F((box->x1 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y1 - dyo) * src_scale_y);
-
- _FLUSH();
+ OUT_VERTEX_F((box->x1 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y1 - dy) * src_scale_y);
if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
sna_damage_add_box(&priv->gpu_damage, &r);
@@ -1703,141 +1408,6 @@ gen4_render_video(struct sna *sna,
return true;
}
-static bool
-gen4_composite_solid_init(struct sna *sna,
- struct sna_composite_channel *channel,
- uint32_t color)
-{
- channel->filter = PictFilterNearest;
- channel->repeat = RepeatNormal;
- channel->is_affine = true;
- channel->is_solid = true;
- channel->transform = NULL;
- channel->width = 1;
- channel->height = 1;
- channel->card_format = GEN4_SURFACEFORMAT_B8G8R8A8_UNORM;
-
- channel->bo = sna_render_get_solid(sna, color);
-
- channel->scale[0] = channel->scale[1] = 1;
- channel->offset[0] = channel->offset[1] = 0;
- return channel->bo != NULL;
-}
-
-static bool
-gen4_composite_linear_init(struct sna *sna,
- PicturePtr picture,
- struct sna_composite_channel *channel,
- int x, int y,
- int w, int h,
- int dst_x, int dst_y)
-{
- PictLinearGradient *linear =
- (PictLinearGradient *)picture->pSourcePict;
- pixman_fixed_t tx, ty;
- float x0, y0, sf;
- float dx, dy;
-
- DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
- __FUNCTION__,
- pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
- pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
- x, y, dst_x, dst_y, w, h));
-
- if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
- return 0;
-
- if (!sna_transform_is_affine(picture->transform)) {
- DBG(("%s: fallback due to projective transform\n",
- __FUNCTION__));
- return sna_render_picture_fixup(sna, picture, channel,
- x, y, w, h, dst_x, dst_y);
- }
-
- channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
- if (!channel->bo)
- return 0;
-
- channel->filter = PictFilterNearest;
- channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
- channel->width = channel->bo->pitch / 4;
- channel->height = 1;
- channel->pict_format = PICT_a8r8g8b8;
-
- channel->scale[0] = channel->scale[1] = 1;
- channel->offset[0] = channel->offset[1] = 0;
-
- if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
- dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
- dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
-
- x0 = pixman_fixed_to_double(linear->p1.x);
- y0 = pixman_fixed_to_double(linear->p1.y);
-
- if (tx | ty) {
- x0 -= pixman_fixed_to_double(tx);
- y0 -= pixman_fixed_to_double(ty);
- }
- } else {
- struct pixman_f_vector p1, p2;
- struct pixman_f_transform m, inv;
-
- pixman_f_transform_from_pixman_transform(&m, picture->transform);
- DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
- __FUNCTION__,
- m.m[0][0], m.m[0][1], m.m[0][2],
- m.m[1][0], m.m[1][1], m.m[1][2],
- m.m[2][0], m.m[2][1], m.m[2][2]));
- if (!pixman_f_transform_invert(&inv, &m))
- return 0;
-
- p1.v[0] = pixman_fixed_to_double(linear->p1.x);
- p1.v[1] = pixman_fixed_to_double(linear->p1.y);
- p1.v[2] = 1.;
- pixman_f_transform_point(&inv, &p1);
-
- p2.v[0] = pixman_fixed_to_double(linear->p2.x);
- p2.v[1] = pixman_fixed_to_double(linear->p2.y);
- p2.v[2] = 1.;
- pixman_f_transform_point(&inv, &p2);
-
- DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
- __FUNCTION__,
- p1.v[0], p1.v[1], p1.v[2],
- p2.v[0], p2.v[1], p2.v[2]));
-
- dx = p2.v[0] - p1.v[0];
- dy = p2.v[1] - p1.v[1];
-
- x0 = p1.v[0];
- y0 = p1.v[1];
- }
-
- sf = dx*dx + dy*dy;
- dx /= sf;
- dy /= sf;
-
- channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
- channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
- channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
-
- channel->embedded_transform.matrix[1][0] = 0;
- channel->embedded_transform.matrix[1][1] = 0;
- channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
-
- channel->embedded_transform.matrix[2][0] = 0;
- channel->embedded_transform.matrix[2][1] = 0;
- channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
-
- channel->transform = &channel->embedded_transform;
- channel->is_affine = 1;
-
- DBG(("%s: dx=%f, dy=%f, offset=%f\n",
- __FUNCTION__, dx, dy, -dx*(x0-x+dst_x) + -dy*(y0-y+dst_y)));
-
- return channel->bo != NULL;
-}
-
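Note: the removed gen4_composite_linear_init (its replacement, gen4_channel_init_linear, lives in the new gen4_source.c) builds a one-row transform that projects each pixel onto the gradient axis: t = ((x - x0)*dx + (y - y0)*dy) / (dx*dx + dy*dy), which is why dx and dy are divided by the squared length sf above. A standalone sketch of that projection:

	/* Sketch of the gradient projection computed by the removed
	 * code: (x, y) maps to t in [0, 1] along p1->p2 by dividing the
	 * dot product by the squared axis length (the sf term above). */
	#include <stdio.h>

	static double gradient_t(double x, double y,
				 double x0, double y0,	/* p1 */
				 double x1, double y1)	/* p2 */
	{
		double dx = x1 - x0, dy = y1 - y0;
		double sf = dx * dx + dy * dy;

		return ((x - x0) * dx + (y - y0) * dy) / sf;
	}

	int main(void)
	{
		/* Midpoint of the axis maps to t = 0.5. */
		printf("t=%f\n", gradient_t(5, 5, 0, 0, 10, 10));
		return 0;
	}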
static int
gen4_composite_picture(struct sna *sna,
PicturePtr picture,
@@ -1858,16 +1428,16 @@ gen4_composite_picture(struct sna *sna,
channel->card_format = -1;
if (sna_picture_is_solid(picture, &color))
- return gen4_composite_solid_init(sna, channel, color);
+ return gen4_channel_init_solid(sna, channel, color);
if (picture->pDrawable == NULL) {
int ret;
if (picture->pSourcePict->type == SourcePictTypeLinear)
- return gen4_composite_linear_init(sna, picture, channel,
- x, y,
- w, h,
- dst_x, dst_y);
+ return gen4_channel_init_linear(sna, picture, channel,
+ x, y,
+ w, h,
+ dst_x, dst_y);
DBG(("%s -- fixup, gradient\n", __FUNCTION__));
ret = -1;
@@ -1922,7 +1492,8 @@ gen4_composite_picture(struct sna *sna,
channel->card_format = gen4_get_card_format(picture->format);
if (channel->card_format == -1)
return sna_render_picture_convert(sna, picture, channel, pixmap,
- x, y, w, h, dst_x, dst_y);
+ x, y, w, h, dst_x, dst_y,
+ false);
if (too_large(pixmap->drawable.width, pixmap->drawable.height))
return sna_render_picture_extract(sna, picture, channel,
@@ -1950,7 +1521,7 @@ gen4_render_composite_done(struct sna *sna,
{
DBG(("%s()\n", __FUNCTION__));
- if (sna->render_state.gen4.vertex_offset) {
+ if (sna->render.vertex_offset) {
gen4_vertex_flush(sna);
gen4_magic_ca_pass(sna, op);
}
@@ -1964,54 +1535,49 @@ gen4_render_composite_done(struct sna *sna,
}
static bool
-gen4_composite_set_target(PicturePtr dst, struct sna_composite_op *op)
+gen4_composite_set_target(struct sna *sna,
+ struct sna_composite_op *op,
+ PicturePtr dst,
+ int x, int y, int w, int h)
{
- struct sna_pixmap *priv;
-
- if (!gen4_check_dst_format(dst->format)) {
- DBG(("%s: incompatible render target format %08x\n",
- __FUNCTION__, dst->format));
- return false;
- }
+ BoxRec box;
op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
op->dst.width = op->dst.pixmap->drawable.width;
op->dst.height = op->dst.pixmap->drawable.height;
op->dst.format = dst->format;
- priv = sna_pixmap_force_to_gpu(op->dst.pixmap, MOVE_READ | MOVE_WRITE);
- if (priv == NULL)
- return false;
+ if (w && h) {
+ box.x1 = x;
+ box.y1 = y;
+ box.x2 = x + w;
+ box.y2 = y + h;
+ } else
+ sna_render_picture_extents(dst, &box);
- op->dst.bo = priv->gpu_bo;
- op->damage = &priv->gpu_damage;
- if (sna_damage_is_all(&priv->gpu_damage, op->dst.width, op->dst.height))
- op->damage = NULL;
- DBG(("%s: all-damaged=%d, damage=%p\n", __FUNCTION__,
- sna_damage_is_all(&priv->gpu_damage, op->dst.width, op->dst.height),
- op->damage));
+ op->dst.bo = sna_drawable_use_bo (dst->pDrawable,
+ PREFER_GPU | FORCE_GPU | RENDER_GPU,
+ &box, &op->damage);
+ if (op->dst.bo == NULL)
+ return false;
get_drawable_deltas(dst->pDrawable, op->dst.pixmap,
&op->dst.x, &op->dst.y);
- return true;
-}
-static inline bool
-picture_is_cpu(PicturePtr picture)
-{
- if (!picture->pDrawable)
- return false;
+ DBG(("%s: pixmap=%p, format=%08x, size=%dx%d, pitch=%d, delta=(%d,%d),damage=%p\n",
+ __FUNCTION__,
+ op->dst.pixmap, (int)op->dst.format,
+ op->dst.width, op->dst.height,
+ op->dst.bo->pitch,
+ op->dst.x, op->dst.y,
+ op->damage ? *op->damage : (void *)-1));
- return !is_gpu(picture->pDrawable);
-}
+ assert(op->dst.bo->proxy == NULL);
+
+ if (too_large(op->dst.width, op->dst.height) &&
+ !sna_render_composite_redirect(sna, op, x, y, w, h))
+ return false;
-static inline bool prefer_blt(struct sna *sna)
-{
-#if PREFER_BLT
return true;
- (void)sna;
-#else
- return sna->kgem.mode != KGEM_RENDER;
-#endif
}
static bool
@@ -2019,7 +1585,7 @@ try_blt(struct sna *sna,
PicturePtr dst, PicturePtr src,
int width, int height)
{
- if (prefer_blt(sna)) {
+ if (sna->kgem.mode != KGEM_RENDER) {
DBG(("%s: already performing BLT\n", __FUNCTION__));
return true;
}
@@ -2038,7 +1604,7 @@ try_blt(struct sna *sna,
return true;
/* is the source picture only in cpu memory e.g. a shm pixmap? */
- return picture_is_cpu(src);
+ return picture_is_cpu(sna, src);
}
static bool
@@ -2060,15 +1626,10 @@ has_alphamap(PicturePtr p)
}
static bool
-untransformed(PicturePtr p)
+need_upload(struct sna *sna, PicturePtr p)
{
- return !p->transform || pixman_transform_is_int_translate(p->transform);
-}
-
-static bool
-need_upload(PicturePtr p)
-{
- return p->pDrawable && untransformed(p) && !is_gpu(p->pDrawable);
+ return p->pDrawable && untransformed(p) &&
+ !is_gpu(sna, p->pDrawable, PREFER_GPU_RENDER);
}
static bool
@@ -2084,11 +1645,14 @@ source_is_busy(PixmapPtr pixmap)
if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
return true;
+ if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+ return true;
+
return priv->gpu_damage && !priv->cpu_damage;
}
static bool
-source_fallback(PicturePtr p, PixmapPtr pixmap)
+source_fallback(struct sna *sna, PicturePtr p, PixmapPtr pixmap)
{
if (sna_picture_is_solid(p, NULL))
return false;
@@ -2103,7 +1667,7 @@ source_fallback(PicturePtr p, PixmapPtr pixmap)
if (pixmap && source_is_busy(pixmap))
return false;
- return has_alphamap(p) || !gen4_check_filter(p) || need_upload(p);
+ return has_alphamap(p) || !gen4_check_filter(p) || need_upload(sna, p);
}
static bool
@@ -2112,7 +1676,6 @@ gen4_composite_fallback(struct sna *sna,
PicturePtr mask,
PicturePtr dst)
{
- struct sna_pixmap *priv;
PixmapPtr src_pixmap;
PixmapPtr mask_pixmap;
PixmapPtr dst_pixmap;
@@ -2127,11 +1690,11 @@ gen4_composite_fallback(struct sna *sna,
dst_pixmap = get_drawable_pixmap(dst->pDrawable);
src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
- src_fallback = source_fallback(src, src_pixmap);
+ src_fallback = source_fallback(sna, src, src_pixmap);
if (mask) {
mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
- mask_fallback = source_fallback(mask, mask_pixmap);
+ mask_fallback = source_fallback(sna, mask, mask_pixmap);
} else {
mask_pixmap = NULL;
mask_fallback = false;
@@ -2151,8 +1714,7 @@ gen4_composite_fallback(struct sna *sna,
}
/* If anything is on the GPU, push everything out to the GPU */
- priv = sna_pixmap(dst_pixmap);
- if (priv && priv->gpu_damage && !priv->clear) {
+ if (dst_use_gpu(dst_pixmap)) {
DBG(("%s: dst is already on the GPU, try to use GPU\n",
__FUNCTION__));
return false;
@@ -2187,14 +1749,14 @@ gen4_composite_fallback(struct sna *sna,
if (too_large(dst_pixmap->drawable.width,
dst_pixmap->drawable.height) &&
- (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage))) {
+ dst_is_cpu(dst_pixmap)) {
DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
return true;
}
DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
__FUNCTION__));
- return false;
+ return dst_use_cpu(dst_pixmap);
}
static int
@@ -2215,7 +1777,7 @@ reuse_source(struct sna *sna,
}
if (sna_picture_is_solid(mask, &color))
- return gen4_composite_solid_init(sna, mc, color);
+ return gen4_channel_init_solid(sna, mc, color);
if (sc->is_solid)
return false;
@@ -2291,15 +1853,13 @@ gen4_render_composite(struct sna *sna,
width, height,
tmp);
- if (!gen4_composite_set_target(dst, tmp))
- return false;
- sna_render_reduce_damage(tmp, dst_x, dst_y, width, height);
-
- if (too_large(tmp->dst.width, tmp->dst.height) &&
- !sna_render_composite_redirect(sna, tmp,
- dst_x, dst_y, width, height))
+ if (!gen4_composite_set_target(sna, tmp, dst,
+ dst_x, dst_y, width, height)) {
+ DBG(("%s: failed to set composite target\n", __FUNCTION__));
return false;
+ }
+ tmp->op = op;
switch (gen4_composite_picture(sna, src, &tmp->src,
src_x, src_y,
width, height,
@@ -2309,7 +1869,7 @@ gen4_render_composite(struct sna *sna,
DBG(("%s: failed to prepare source\n", __FUNCTION__));
goto cleanup_dst;
case 0:
- if (!gen4_composite_solid_init(sna, &tmp->src, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->src, 0))
goto cleanup_dst;
/* fall through to fixup */
case 1:
@@ -2323,12 +1883,10 @@ gen4_render_composite(struct sna *sna,
break;
}
- tmp->op = op;
tmp->is_affine = tmp->src.is_affine;
tmp->has_component_alpha = false;
tmp->need_magic_ca_pass = false;
- tmp->prim_emit = gen4_emit_composite_primitive;
if (mask) {
if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
tmp->has_component_alpha = true;
@@ -2363,7 +1921,7 @@ gen4_render_composite(struct sna *sna,
DBG(("%s: failed to prepare mask\n", __FUNCTION__));
goto cleanup_src;
case 0:
- if (!gen4_composite_solid_init(sna, &tmp->mask, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
goto cleanup_src;
/* fall through to fixup */
case 1:
@@ -2373,33 +1931,22 @@ gen4_render_composite(struct sna *sna,
}
tmp->is_affine &= tmp->mask.is_affine;
-
- if (tmp->src.transform == NULL && tmp->mask.transform == NULL)
- tmp->prim_emit = gen4_emit_composite_primitive_identity_source_mask;
-
- tmp->floats_per_vertex = 5 + 2 * !tmp->is_affine;
- } else {
- if (tmp->src.is_solid)
- tmp->prim_emit = gen4_emit_composite_primitive_solid;
- else if (tmp->src.transform == NULL)
- tmp->prim_emit = gen4_emit_composite_primitive_identity_source;
- else if (tmp->src.is_affine)
- tmp->prim_emit = gen4_emit_composite_primitive_affine_source;
-
- tmp->floats_per_vertex = 3 + !tmp->is_affine;
}
- tmp->floats_per_rect = 3*tmp->floats_per_vertex;
tmp->u.gen4.wm_kernel =
gen4_choose_composite_kernel(tmp->op,
tmp->mask.bo != NULL,
tmp->has_component_alpha,
tmp->is_affine);
- tmp->u.gen4.ve_id = (tmp->mask.bo != NULL) << 1 | tmp->is_affine;
+ tmp->u.gen4.ve_id = gen4_choose_composite_emitter(tmp);
tmp->blt = gen4_render_composite_blt;
tmp->box = gen4_render_composite_box;
- tmp->boxes = gen4_render_composite_boxes;
+ tmp->boxes = gen4_render_composite_boxes__blt;
+ if (tmp->emit_boxes) {
+ tmp->boxes = gen4_render_composite_boxes;
+ tmp->thread_boxes = gen4_render_composite_boxes__thread;
+ }
tmp->done = gen4_render_composite_done;
if (!kgem_check_bo(&sna->kgem,
@@ -2428,127 +1975,7 @@ cleanup_dst:
return false;
}
-/* A poor man's span interface. But better than nothing? */
#if !NO_COMPOSITE_SPANS
-inline static void
-gen4_emit_composite_texcoord(struct sna *sna,
- const struct sna_composite_channel *channel,
- int16_t x, int16_t y)
-{
- float t[3];
-
- if (channel->is_affine) {
- sna_get_transformed_coordinates(x + channel->offset[0],
- y + channel->offset[1],
- channel->transform,
- &t[0], &t[1]);
- OUT_VERTEX_F(t[0] * channel->scale[0]);
- OUT_VERTEX_F(t[1] * channel->scale[1]);
- } else {
- t[0] = t[1] = 0; t[2] = 1;
- sna_get_transformed_coordinates_3d(x + channel->offset[0],
- y + channel->offset[1],
- channel->transform,
- &t[0], &t[1], &t[2]);
- OUT_VERTEX_F(t[0] * channel->scale[0]);
- OUT_VERTEX_F(t[1] * channel->scale[1]);
- OUT_VERTEX_F(t[2]);
- }
-}
-
-inline static void
-gen4_emit_composite_texcoord_affine(struct sna *sna,
- const struct sna_composite_channel *channel,
- int16_t x, int16_t y)
-{
- float t[2];
-
- sna_get_transformed_coordinates(x + channel->offset[0],
- y + channel->offset[1],
- channel->transform,
- &t[0], &t[1]);
- OUT_VERTEX_F(t[0] * channel->scale[0]);
- OUT_VERTEX_F(t[1] * channel->scale[1]);
-}
-
-inline static void
-gen4_emit_composite_spans_vertex(struct sna *sna,
- const struct sna_composite_spans_op *op,
- int16_t x, int16_t y)
-{
- OUT_VERTEX(x, y);
- gen4_emit_composite_texcoord(sna, &op->base.src, x, y);
-}
-
-fastcall static void
-gen4_emit_composite_spans_primitive(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- gen4_emit_composite_spans_vertex(sna, op, box->x2, box->y2);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(1);
- if (!op->base.is_affine)
- OUT_VERTEX_F(1);
-
- gen4_emit_composite_spans_vertex(sna, op, box->x1, box->y2);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(1);
- if (!op->base.is_affine)
- OUT_VERTEX_F(1);
-
- gen4_emit_composite_spans_vertex(sna, op, box->x1, box->y1);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(0);
- if (!op->base.is_affine)
- OUT_VERTEX_F(1);
-}
-
-fastcall static void
-gen4_emit_composite_spans_solid(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- OUT_VERTEX(box->x2, box->y2);
- OUT_VERTEX_F(1); OUT_VERTEX_F(1);
- OUT_VERTEX_F(opacity); OUT_VERTEX_F(1);
-
- OUT_VERTEX(box->x1, box->y2);
- OUT_VERTEX_F(0); OUT_VERTEX_F(1);
- OUT_VERTEX_F(opacity); OUT_VERTEX_F(1);
-
- OUT_VERTEX(box->x1, box->y1);
- OUT_VERTEX_F(0); OUT_VERTEX_F(0);
- OUT_VERTEX_F(opacity); OUT_VERTEX_F(0);
-}
-
-fastcall static void
-gen4_emit_composite_spans_affine(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- OUT_VERTEX(box->x2, box->y2);
- gen4_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x2, box->y2);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(1);
-
- OUT_VERTEX(box->x1, box->y2);
- gen4_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x1, box->y2);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(1);
-
- OUT_VERTEX(box->x1, box->y1);
- gen4_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x1, box->y1);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(0);
-}
-
fastcall static void
gen4_render_composite_spans_box(struct sna *sna,
const struct sna_composite_spans_op *op,
@@ -2580,22 +2007,69 @@ gen4_render_composite_spans_boxes(struct sna *sna,
op->base.dst.x, op->base.dst.y));
do {
- gen4_render_composite_spans_box(sna, op, box++, opacity);
- } while (--nbox);
+ int nbox_this_time;
+
+ nbox_this_time = gen4_get_rectangles(sna, &op->base, nbox,
+ gen4_bind_surfaces);
+ nbox -= nbox_this_time;
+
+ do {
+ DBG((" %s: (%d, %d) x (%d, %d)\n", __FUNCTION__,
+ box->x1, box->y1,
+ box->x2 - box->x1,
+ box->y2 - box->y1));
+
+ op->prim_emit(sna, op, box++, opacity);
+ } while (--nbox_this_time);
+ } while (nbox);
+}
+
+fastcall static void
+gen4_render_composite_spans_boxes__thread(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *box,
+ int nbox)
+{
+ DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+ __FUNCTION__, nbox,
+ op->base.src.offset[0], op->base.src.offset[1],
+ op->base.dst.x, op->base.dst.y));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen4_get_rectangles(sna, &op->base, nbox,
+ gen4_bind_surfaces);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
}
fastcall static void
gen4_render_composite_spans_done(struct sna *sna,
const struct sna_composite_spans_op *op)
{
- gen4_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
DBG(("%s()\n", __FUNCTION__));
- kgem_bo_destroy(&sna->kgem, op->base.mask.bo);
- if (op->base.src.bo)
- kgem_bo_destroy(&sna->kgem, op->base.src.bo);
-
+ kgem_bo_destroy(&sna->kgem, op->base.src.bo);
sna_render_composite_redirect_done(sna, &op->base);
}
@@ -2605,17 +2079,43 @@ gen4_check_composite_spans(struct sna *sna,
int16_t width, int16_t height,
unsigned flags)
{
- if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0)
- return false;
+ DBG(("%s: op=%d, width=%d, height=%d, flags=%x\n",
+ __FUNCTION__, op, width, height, flags));
if (op >= ARRAY_SIZE(gen4_blend_op))
return false;
- if (gen4_composite_fallback(sna, src, NULL, dst))
+ if (gen4_composite_fallback(sna, src, NULL, dst)) {
+ DBG(("%s: operation would fallback\n", __FUNCTION__));
return false;
+ }
- if (need_tiling(sna, width, height) && !is_gpu(dst->pDrawable))
+ if (need_tiling(sna, width, height) &&
+ !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
+ DBG(("%s: fallback, tiled operation not on GPU\n",
+ __FUNCTION__));
return false;
+ }
+
+ if (FORCE_SPANS)
+ return FORCE_SPANS > 0;
+
+ if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0) {
+ struct sna_pixmap *priv = sna_pixmap_from_drawable(dst->pDrawable);
+ assert(priv);
+
+ if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+ return true;
+
+ if (flags & COMPOSITE_SPANS_INPLACE_HINT)
+ return false;
+
+ if ((sna->render.prefer_gpu & PREFER_GPU_SPANS) == 0 &&
+ dst->format == PICT_a8)
+ return false;
+
+ return priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo);
+ }
return true;
}
@@ -2645,15 +2145,9 @@ gen4_render_composite_spans(struct sna *sna,
}
tmp->base.op = op;
- if (!gen4_composite_set_target(dst, &tmp->base))
+ if (!gen4_composite_set_target(sna, &tmp->base, dst,
+ dst_x, dst_y, width, height))
return false;
- sna_render_reduce_damage(&tmp->base, dst_x, dst_y, width, height);
-
- if (too_large(tmp->base.dst.width, tmp->base.dst.height)) {
- if (!sna_render_composite_redirect(sna, &tmp->base,
- dst_x, dst_y, width, height))
- return false;
- }
switch (gen4_composite_picture(sna, src, &tmp->base.src,
src_x, src_y,
@@ -2663,7 +2157,7 @@ gen4_render_composite_spans(struct sna *sna,
case -1:
goto cleanup_dst;
case 0:
- if (!gen4_composite_solid_init(sna, &tmp->base.src, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
goto cleanup_dst;
/* fall through to fixup */
case 1:
@@ -2671,27 +2165,21 @@ gen4_render_composite_spans(struct sna *sna,
break;
}
- tmp->base.mask.bo = sna_render_get_solid(sna, 0);
- if (tmp->base.mask.bo == NULL)
- goto cleanup_src;
+ tmp->base.mask.bo = NULL;
+ tmp->base.mask.filter = SAMPLER_FILTER_NEAREST;
+ tmp->base.mask.repeat = SAMPLER_EXTEND_NONE;
tmp->base.is_affine = tmp->base.src.is_affine;
tmp->base.has_component_alpha = false;
tmp->base.need_magic_ca_pass = false;
- tmp->prim_emit = gen4_emit_composite_spans_primitive;
- if (tmp->base.src.is_solid)
- tmp->prim_emit = gen4_emit_composite_spans_solid;
- else if (tmp->base.is_affine)
- tmp->prim_emit = gen4_emit_composite_spans_affine;
- tmp->base.floats_per_vertex = 5 + 2*!tmp->base.is_affine;
- tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;
-
- tmp->base.u.gen5.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
- tmp->base.u.gen4.ve_id = 1 << 1 | tmp->base.is_affine;
+ tmp->base.u.gen4.ve_id = gen4_choose_spans_emitter(tmp);
+ tmp->base.u.gen4.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
tmp->box = gen4_render_composite_spans_box;
tmp->boxes = gen4_render_composite_spans_boxes;
+ if (tmp->emit_boxes)
+ tmp->thread_boxes = gen4_render_composite_spans_boxes__thread;
tmp->done = gen4_render_composite_spans_done;
if (!kgem_check_bo(&sna->kgem,
@@ -2721,10 +2209,11 @@ cleanup_dst:
static void
gen4_copy_bind_surfaces(struct sna *sna, const struct sna_composite_op *op)
{
+ bool dirty = kgem_bo_is_dirty(op->dst.bo);
uint32_t *binding_table;
uint16_t offset;
- gen4_get_batch(sna);
+ gen4_get_batch(sna, op);
binding_table = gen4_composite_get_binding_table(sna, &offset);
@@ -2745,7 +2234,7 @@ gen4_copy_bind_surfaces(struct sna *sna, const struct sna_composite_op *op)
offset = sna->render_state.gen4.surface_table;
}
- gen4_emit_state(sna, op, offset);
+ gen4_emit_state(sna, op, offset | dirty);
}
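
[annotation] Note the `offset | dirty` in the final call: surface-state entries are padded, so binding-table offsets always have bit 0 clear, which lets the dirty flag (destination bo already written, flush before reuse) ride along into gen4_emit_state() without widening its signature. A small sketch of the pack/unpack; the alignment is the one assumption:

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* padded surface-state entries leave bit 0 of the offset free */
	static uint16_t pack_offset(uint16_t offset, bool dirty)
	{
		assert((offset & 1) == 0);
		return offset | (uint16_t)dirty;
	}

	static uint16_t unpack_offset(uint16_t packed, bool *dirty)
	{
		*dirty = packed & 1;   /* emit a flush before reusing the dst */
		return packed & ~1;    /* the real binding-table offset */
	}
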
static void
@@ -2768,19 +2257,6 @@ gen4_render_copy_one(struct sna *sna,
OUT_VERTEX(dx, dy);
OUT_VERTEX_F(sx*op->src.scale[0]);
OUT_VERTEX_F(sy*op->src.scale[1]);
-
- _FLUSH();
-}
-
-static inline bool prefer_blt_copy(struct sna *sna, unsigned flags)
-{
-#if PREFER_BLT
- return true;
- (void)sna;
-#else
- return sna->kgem.mode != KGEM_RENDER;
-#endif
- (void)flags;
}
static bool
@@ -2793,8 +2269,7 @@ gen4_render_copy_boxes(struct sna *sna, uint8_t alu,
DBG(("%s x %d\n", __FUNCTION__, n));
- if (prefer_blt_copy(sna, flags) &&
- sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+ if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
sna_blt_copy_boxes(sna, alu,
src_bo, src_dx, src_dy,
dst_bo, dst_dx, dst_dy,
@@ -2899,7 +2374,7 @@ fallback_blt:
tmp.floats_per_vertex = 3;
tmp.floats_per_rect = 9;
tmp.u.gen4.wm_kernel = WM_KERNEL;
- tmp.u.gen4.ve_id = 1;
+ tmp.u.gen4.ve_id = 2;
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
kgem_submit(&sna->kgem);
@@ -2936,6 +2411,14 @@ fallback_tiled_dst:
if (tmp.redirect.real_bo)
kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
fallback_tiled:
+ if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+ sna_blt_copy_boxes(sna, alu,
+ src_bo, src_dx, src_dy,
+ dst_bo, dst_dx, dst_dy,
+ dst->drawable.bitsPerPixel,
+ box, n))
+ return true;
+
return sna_tiling_copy_boxes(sna, alu,
src, src_bo, src_dx, src_dy,
dst, dst_bo, dst_dx, dst_dy,
@@ -2955,7 +2438,8 @@ gen4_render_copy_blt(struct sna *sna,
static void
gen4_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
{
- gen4_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
}
static bool
@@ -2970,8 +2454,7 @@ gen4_render_copy(struct sna *sna, uint8_t alu,
dst->drawable.serialNumber,
alu));
- if (prefer_blt(sna) &&
- sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+ if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
sna_blt_copy(sna, alu,
src_bo, dst_bo,
dst->drawable.bitsPerPixel,
@@ -3021,7 +2504,7 @@ fallback:
op->base.floats_per_vertex = 3;
op->base.floats_per_rect = 9;
op->base.u.gen4.wm_kernel = WM_KERNEL;
- op->base.u.gen4.ve_id = 1;
+ op->base.u.gen4.ve_id = 2;
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
kgem_submit(&sna->kgem);
@@ -3047,56 +2530,20 @@ fallback:
}
static void
-gen4_fill_bind_surfaces(struct sna *sna, const struct sna_composite_op *op)
-{
- uint32_t *binding_table;
- uint16_t offset;
-
- gen4_get_batch(sna);
-
- binding_table = gen4_composite_get_binding_table(sna, &offset);
-
- binding_table[0] =
- gen4_bind_bo(sna,
- op->dst.bo, op->dst.width, op->dst.height,
- gen4_get_dest_format(op->dst.format),
- true);
- binding_table[1] =
- gen4_bind_bo(sna,
- op->src.bo, 1, 1,
- GEN4_SURFACEFORMAT_B8G8R8A8_UNORM,
- false);
-
- if (sna->kgem.surface == offset &&
- *(uint64_t *)(sna->kgem.batch + sna->render_state.gen4.surface_table) == *(uint64_t*)binding_table) {
- sna->kgem.surface +=
- sizeof(struct gen4_surface_state_padded)/sizeof(uint32_t);
- offset = sna->render_state.gen4.surface_table;
- }
-
- gen4_emit_state(sna, op, offset);
-}
-
-static void
gen4_render_fill_rectangle(struct sna *sna,
const struct sna_composite_op *op,
int x, int y, int w, int h)
{
- gen4_get_rectangles(sna, op, 1, gen4_fill_bind_surfaces);
+ gen4_get_rectangles(sna, op, 1, gen4_bind_surfaces);
OUT_VERTEX(x+w, y+h);
- OUT_VERTEX_F(1);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(x, y+h);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(x, y);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(0);
-
- _FLUSH();
+ OUT_VERTEX_F(.5);
}
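
[annotation] With the source bound as a 1x1 solid-colour bo, per-corner texture coordinates are redundant: every corner samples the same texel, so a single constant 0.5 (the texel centre) replaces the old (s, t) pair and the rectangle shrinks from 9 floats to 6. The count, spelled out under that layout:

	/* old solid-fill layout: 3 vertices x { packed(x,y), s, t }  = 9 floats
	 * new solid-fill layout: 3 vertices x { packed(x,y), 0.5f }  = 6 floats
	 * 0.5 addresses the centre of the 1x1 solid bo from every corner. */
	enum { OLD_FLOATS_PER_RECT = 3 * 3, NEW_FLOATS_PER_RECT = 3 * 2 };
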
static bool
@@ -3116,10 +2563,7 @@ gen4_render_fill_boxes(struct sna *sna,
return false;
}
- if (op <= PictOpSrc &&
- (prefer_blt(sna) ||
- too_large(dst->drawable.width, dst->drawable.height) ||
- !gen4_check_dst_format(format))) {
+ if (op <= PictOpSrc) {
uint8_t alu = GXinvalid;
pixel = 0;
@@ -3170,13 +2614,11 @@ gen4_render_fill_boxes(struct sna *sna,
tmp.dst.format = format;
tmp.dst.bo = dst_bo;
- tmp.src.bo = sna_render_get_solid(sna, pixel);
- tmp.src.filter = SAMPLER_FILTER_NEAREST;
- tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
+ gen4_channel_init_solid(sna, &tmp.src, pixel);
tmp.is_affine = true;
- tmp.floats_per_vertex = 3;
- tmp.floats_per_rect = 9;
+ tmp.floats_per_vertex = 2;
+ tmp.floats_per_rect = 6;
tmp.u.gen4.wm_kernel = WM_KERNEL;
tmp.u.gen4.ve_id = 1;
@@ -3185,7 +2627,7 @@ gen4_render_fill_boxes(struct sna *sna,
assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
}
- gen4_fill_bind_surfaces(sna, &tmp);
+ gen4_bind_surfaces(sna, &tmp);
gen4_align_vertex(sna, &tmp);
do {
@@ -3235,7 +2677,8 @@ gen4_render_fill_op_boxes(struct sna *sna,
static void
gen4_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
{
- gen4_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, op->base.src.bo);
}
@@ -3245,8 +2688,7 @@ gen4_render_fill(struct sna *sna, uint8_t alu,
uint32_t color,
struct sna_fill_op *op)
{
- if (prefer_blt(sna) &&
- sna_blt_fill(sna, alu,
+ if (sna_blt_fill(sna, alu,
dst_bo, dst->drawable.bitsPerPixel,
color,
op))
@@ -3274,20 +2716,14 @@ gen4_render_fill(struct sna *sna, uint8_t alu,
op->base.need_magic_ca_pass = 0;
op->base.has_component_alpha = 0;
- op->base.src.bo =
- sna_render_get_solid(sna,
- sna_rgba_for_color(color,
- dst->drawable.depth));
- op->base.src.filter = SAMPLER_FILTER_NEAREST;
- op->base.src.repeat = SAMPLER_EXTEND_REPEAT;
-
+ gen4_channel_init_solid(sna, &op->base.src,
+ sna_rgba_for_color(color,
+ dst->drawable.depth));
op->base.mask.bo = NULL;
- op->base.mask.filter = SAMPLER_FILTER_NEAREST;
- op->base.mask.repeat = SAMPLER_EXTEND_NONE;
op->base.is_affine = true;
- op->base.floats_per_vertex = 3;
- op->base.floats_per_rect = 9;
+ op->base.floats_per_vertex = 2;
+ op->base.floats_per_rect = 6;
op->base.u.gen4.wm_kernel = WM_KERNEL;
op->base.u.gen4.ve_id = 1;
@@ -3296,7 +2732,7 @@ gen4_render_fill(struct sna *sna, uint8_t alu,
assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
}
- gen4_fill_bind_surfaces(sna, &op->base);
+ gen4_bind_surfaces(sna, &op->base);
gen4_align_vertex(sna, &op->base);
op->blt = gen4_render_fill_op_blt;
@@ -3356,32 +2792,29 @@ gen4_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
tmp.dst.bo = bo;
tmp.dst.x = tmp.dst.y = 0;
- tmp.src.bo =
- sna_render_get_solid(sna,
- sna_rgba_for_color(color,
- dst->drawable.depth));
- tmp.src.filter = SAMPLER_FILTER_NEAREST;
- tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
-
+ gen4_channel_init_solid(sna, &tmp.src,
+ sna_rgba_for_color(color,
+ dst->drawable.depth));
tmp.mask.bo = NULL;
- tmp.mask.filter = SAMPLER_FILTER_NEAREST;
- tmp.mask.repeat = SAMPLER_EXTEND_NONE;
tmp.is_affine = true;
- tmp.floats_per_vertex = 3;
- tmp.floats_per_rect = 9;
- tmp.has_component_alpha = 0;
+ tmp.floats_per_vertex = 2;
+ tmp.floats_per_rect = 6;
+ tmp.has_component_alpha = false;
tmp.need_magic_ca_pass = false;
tmp.u.gen4.wm_kernel = WM_KERNEL;
tmp.u.gen4.ve_id = 1;
if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
- _kgem_submit(&sna->kgem);
- assert(kgem_check_bo(&sna->kgem, bo, NULL));
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
+ kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+ return false;
+ }
}
- gen4_fill_bind_surfaces(sna, &tmp);
+ gen4_bind_surfaces(sna, &tmp);
gen4_align_vertex(sna, &tmp);
gen4_render_fill_rectangle(sna, &tmp, x1, y1, x2 - x1, y2 - y1);
@@ -3396,6 +2829,9 @@ static void
gen4_render_flush(struct sna *sna)
{
gen4_vertex_close(sna);
+
+ assert(sna->render.vb_id == 0);
+ assert(sna->render.vertex_offset == 0);
}
static void
@@ -3438,7 +2874,6 @@ static void gen4_render_reset(struct sna *sna)
{
sna->render_state.gen4.needs_invariant = true;
sna->render_state.gen4.needs_urb = true;
- sna->render_state.gen4.vb_id = 0;
sna->render_state.gen4.ve_id = -1;
sna->render_state.gen4.last_primitive = -1;
sna->render_state.gen4.last_pipelined_pointers = -1;
@@ -3452,6 +2887,10 @@ static void gen4_render_reset(struct sna *sna)
DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
discard_vbo(sna);
}
+
+ sna->render.vertex_offset = 0;
+ sna->render.nvertex_reloc = 0;
+ sna->render.vb_id = 0;
}
static void gen4_render_fini(struct sna *sna)
@@ -3473,8 +2912,7 @@ static uint32_t gen4_create_vs_unit_state(struct sna_static_stream *stream)
}
static uint32_t gen4_create_sf_state(struct sna_static_stream *stream,
- const struct gt_info *info,
- uint32_t kernel)
+ int gen, uint32_t kernel)
{
struct gen4_sf_unit_state *sf;
@@ -3488,7 +2926,7 @@ static uint32_t gen4_create_sf_state(struct sna_static_stream *stream,
/* don't smash vertex header, read start from dw8 */
sf->thread3.urb_entry_read_offset = 1;
sf->thread3.dispatch_grf_start_reg = 3;
- sf->thread4.max_threads = info->max_sf_threads - 1;
+ sf->thread4.max_threads = GEN4_MAX_SF_THREADS - 1;
sf->thread4.urb_entry_allocation_size = URB_SF_ENTRY_SIZE - 1;
sf->thread4.nr_urb_entries = URB_SF_ENTRIES;
sf->sf5.viewport_transform = false; /* skip viewport */
@@ -3519,7 +2957,7 @@ static uint32_t gen4_create_sampler_state(struct sna_static_stream *stream,
}
static void gen4_init_wm_state(struct gen4_wm_unit_state *wm,
- const struct gt_info *info,
+ int gen,
bool has_mask,
uint32_t kernel,
uint32_t sampler)
@@ -3540,7 +2978,7 @@ static void gen4_init_wm_state(struct gen4_wm_unit_state *wm,
wm->wm4.sampler_state_pointer = sampler >> 5;
wm->wm4.sampler_count = 1;
- wm->wm5.max_threads = info->max_wm_threads - 1;
+ wm->wm5.max_threads = gen >= 045 ? G4X_MAX_WM_THREADS - 1 : GEN4_MAX_WM_THREADS - 1;
wm->wm5.transposed_urb_read = 0;
wm->wm5.thread_dispatch_enable = 1;
/* just use 16-pixel dispatch (4 subspans), don't need to change kernel
@@ -3560,23 +2998,11 @@ static void gen4_init_wm_state(struct gen4_wm_unit_state *wm,
}
}
-static uint32_t gen4_create_cc_viewport(struct sna_static_stream *stream)
-{
- struct gen4_cc_viewport vp;
-
- vp.min_depth = -1.e35;
- vp.max_depth = 1.e35;
-
- return sna_static_stream_add(stream, &vp, sizeof(vp), 32);
-}
-
static uint32_t gen4_create_cc_unit_state(struct sna_static_stream *stream)
{
uint8_t *ptr, *base;
- uint32_t vp;
int i, j;
- vp = gen4_create_cc_viewport(stream);
base = ptr =
sna_static_stream_map(stream,
GEN4_BLENDFACTOR_COUNT*GEN4_BLENDFACTOR_COUNT*64,
@@ -3589,7 +3015,6 @@ static uint32_t gen4_create_cc_unit_state(struct sna_static_stream *stream)
state->cc3.blend_enable =
!(j == GEN4_BLENDFACTOR_ZERO && i == GEN4_BLENDFACTOR_ONE);
- state->cc4.cc_viewport_state_offset = vp >> 5;
state->cc5.logicop_func = 0xc; /* COPY */
state->cc5.ia_blend_function = GEN4_BLENDFUNCTION_ADD;
@@ -3616,15 +3041,9 @@ static bool gen4_render_setup(struct sna *sna)
struct gen4_render_state *state = &sna->render_state.gen4;
struct sna_static_stream general;
struct gen4_wm_unit_state_padded *wm_state;
- const struct gt_info *info;
- uint32_t sf[2], wm[KERNEL_COUNT];
+ uint32_t sf, wm[KERNEL_COUNT];
int i, j, k, l, m;
- if (sna->kgem.gen == 45)
- info = &g4x_gt_info;
- else
- info = &gen4_gt_info;
-
sna_static_stream_init(&general);
/* Zero pad the start. If you see an offset of 0x0 in the batchbuffer
@@ -3632,8 +3051,7 @@ static bool gen4_render_setup(struct sna *sna)
*/
null_create(&general);
- sf[0] = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__nomask);
- sf[1] = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__mask);
+ sf = sna_static_stream_compile_sf(sna, &general, brw_sf_kernel__mask);
for (m = 0; m < KERNEL_COUNT; m++) {
if (wm_kernels[m].size) {
wm[m] = sna_static_stream_add(&general,
@@ -3648,8 +3066,7 @@ static bool gen4_render_setup(struct sna *sna)
}
state->vs = gen4_create_vs_unit_state(&general);
- state->sf[0] = gen4_create_sf_state(&general, info, sf[0]);
- state->sf[1] = gen4_create_sf_state(&general, info, sf[1]);
+ state->sf = gen4_create_sf_state(&general, sna->kgem.gen, sf);
wm_state = sna_static_stream_map(&general,
sizeof(*wm_state) * KERNEL_COUNT *
@@ -3669,7 +3086,8 @@ static bool gen4_render_setup(struct sna *sna)
k, l);
for (m = 0; m < KERNEL_COUNT; m++) {
- gen4_init_wm_state(&wm_state->state, info,
+ gen4_init_wm_state(&wm_state->state,
+ sna->kgem.gen,
wm_kernels[m].has_mask,
wm[m], sampler_state);
wm_state++;
@@ -3695,10 +3113,13 @@ bool gen4_render_init(struct sna *sna)
#if !NO_COMPOSITE
sna->render.composite = gen4_render_composite;
+ sna->render.prefer_gpu |= PREFER_GPU_RENDER;
#endif
#if !NO_COMPOSITE_SPANS
sna->render.check_composite_spans = gen4_check_composite_spans;
sna->render.composite_spans = gen4_render_composite_spans;
+ if (0)
+ sna->render.prefer_gpu |= PREFER_GPU_SPANS;
#endif
#if !NO_VIDEO
diff --git a/src/sna/gen4_render.h b/src/sna/gen4_render.h
index 49d232e88..53c7fc2f7 100644
--- a/src/sna/gen4_render.h
+++ b/src/sna/gen4_render.h
@@ -25,8 +25,8 @@
*
**************************************************************************/
-#ifndef GEN5_RENDER_H
-#define GEN5_RENDER_H
+#ifndef GEN4_RENDER_H
+#define GEN4_RENDER_H
#define GEN4_3D(Pipeline,Opcode,Subopcode) ((3 << 29) | \
((Pipeline) << 27) | \
@@ -661,15 +661,14 @@
#define GEN4_VERTEXBUFFER_ACCESS_VERTEXDATA 0
#define GEN4_VERTEXBUFFER_ACCESS_INSTANCEDATA 1
-#define GEN4_VFCOMPONENT_NOSTORE 0
-#define GEN4_VFCOMPONENT_STORE_SRC 1
-#define GEN4_VFCOMPONENT_STORE_0 2
-#define GEN4_VFCOMPONENT_STORE_1_FLT 3
-#define GEN4_VFCOMPONENT_STORE_1_INT 4
-#define GEN4_VFCOMPONENT_STORE_VID 5
-#define GEN4_VFCOMPONENT_STORE_IID 6
-#define GEN4_VFCOMPONENT_STORE_PID 7
-
+#define VFCOMPONENT_NOSTORE 0
+#define VFCOMPONENT_STORE_SRC 1
+#define VFCOMPONENT_STORE_0 2
+#define VFCOMPONENT_STORE_1_FLT 3
+#define VFCOMPONENT_STORE_1_INT 4
+#define VFCOMPONENT_STORE_VID 5
+#define VFCOMPONENT_STORE_IID 6
+#define VFCOMPONENT_STORE_PID 7
/* Execution Unit (EU) defines
@@ -725,8 +724,8 @@
#define GEN4_INSTRUCTION_NORMAL 0
#define GEN4_INSTRUCTION_SATURATE 1
-#define GEN4_MASK_ENABLE 0
-#define GEN4_MASK_DISABLE 1
+#define _MASK_ENABLE 0
+#define _MASK_DISABLE 1
#define GEN4_OPCODE_MOV 1
#define GEN4_OPCODE_SEL 2
@@ -2043,6 +2042,54 @@ struct gen4_surface_state
} ss5;
};
+/* Surface state DW0 */
+#define GEN4_SURFACE_RC_READ_WRITE (1 << 8)
+#define GEN4_SURFACE_MIPLAYOUT_SHIFT 10
+#define GEN4_SURFACE_MIPMAPLAYOUT_BELOW 0
+#define GEN4_SURFACE_MIPMAPLAYOUT_RIGHT 1
+#define GEN4_SURFACE_CUBEFACE_ENABLES 0x3f
+#define GEN4_SURFACE_BLEND_ENABLED (1 << 13)
+#define GEN4_SURFACE_WRITEDISABLE_B_SHIFT 14
+#define GEN4_SURFACE_WRITEDISABLE_G_SHIFT 15
+#define GEN4_SURFACE_WRITEDISABLE_R_SHIFT 16
+#define GEN4_SURFACE_WRITEDISABLE_A_SHIFT 17
+#define GEN4_SURFACE_FORMAT_SHIFT 18
+#define GEN4_SURFACE_FORMAT_MASK _MASK(26, 18)
+
+#define GEN4_SURFACE_TYPE_SHIFT 29
+#define GEN4_SURFACE_TYPE_MASK _MASK(31, 29)
+#define GEN4_SURFACE_1D 0
+#define GEN4_SURFACE_2D 1
+#define GEN4_SURFACE_3D 2
+#define GEN4_SURFACE_CUBE 3
+#define GEN4_SURFACE_BUFFER 4
+#define GEN4_SURFACE_NULL 7
+
+/* Surface state DW2 */
+#define GEN4_SURFACE_HEIGHT_SHIFT 19
+#define GEN4_SURFACE_HEIGHT_MASK _MASK(31, 19)
+#define GEN4_SURFACE_WIDTH_SHIFT 6
+#define GEN4_SURFACE_WIDTH_MASK _MASK(18, 6)
+#define GEN4_SURFACE_LOD_SHIFT 2
+#define GEN4_SURFACE_LOD_MASK _MASK(5, 2)
+
+/* Surface state DW3 */
+#define GEN4_SURFACE_DEPTH_SHIFT 21
+#define GEN4_SURFACE_DEPTH_MASK _MASK(31, 21)
+#define GEN4_SURFACE_PITCH_SHIFT 3
+#define GEN4_SURFACE_PITCH_MASK _MASK(19, 3)
+#define GEN4_SURFACE_TILED (1 << 1)
+#define GEN4_SURFACE_TILED_Y (1 << 0)
+
+/* Surface state DW4 */
+#define GEN4_SURFACE_MIN_LOD_SHIFT 28
+#define GEN4_SURFACE_MIN_LOD_MASK _MASK(31, 28)
+
+/* Surface state DW5 */
+#define GEN4_SURFACE_X_OFFSET_SHIFT 25
+#define GEN4_SURFACE_X_OFFSET_MASK _MASK(31, 25)
+#define GEN4_SURFACE_Y_OFFSET_SHIFT 20
+#define GEN4_SURFACE_Y_OFFSET_MASK _MASK(23, 20)
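
[annotation] These follow the usual shift/mask pattern for hand-packed hardware state. A hedged sketch of composing DW0 and DW2 for a 2D surface, assuming _MASK(high, low) builds an inclusive bit range as its uses above suggest:

	#include <assert.h>
	#include <stdint.h>

	/* assumed helper: inclusive bit range [low, high] */
	#define _MASK(high, low) (((1u << ((high) - (low) + 1)) - 1) << (low))

	static uint32_t surface_dw0(uint32_t format)
	{
		uint32_t dw0 = 1u /* GEN4_SURFACE_2D */ << 29 /* TYPE_SHIFT */;
		assert(((format << 18) & ~_MASK(26, 18)) == 0);
		return dw0 | format << 18 /* FORMAT_SHIFT */;
	}

	static uint32_t surface_dw2(uint32_t width, uint32_t height)
	{
		/* the hardware stores size minus one */
		return (height - 1) << 19 | (width - 1) << 6;
	}
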
struct gen4_vertex_buffer_state
diff --git a/src/sna/gen4_source.c b/src/sna/gen4_source.c
new file mode 100644
index 000000000..749de8d60
--- /dev/null
+++ b/src/sna/gen4_source.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright © 2011,2012,2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Chris Wilson <chris@chris-wilson.co.uk>
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "sna.h"
+#include "sna_render.h"
+#include "sna_render_inline.h"
+#include "gen4_source.h"
+#include "gen4_render.h"
+
+bool
+gen4_channel_init_solid(struct sna *sna,
+ struct sna_composite_channel *channel,
+ uint32_t color)
+{
+ channel->filter = PictFilterNearest;
+ channel->repeat = RepeatNormal;
+ channel->is_affine = true;
+ channel->is_solid = true;
+ channel->is_opaque = (color >> 24) == 0xff;
+ channel->transform = NULL;
+ channel->width = 1;
+ channel->height = 1;
+ channel->pict_format = PICT_a8r8g8b8;
+ channel->card_format = GEN4_SURFACEFORMAT_B8G8R8A8_UNORM;
+
+ channel->bo = sna_render_get_solid(sna, color);
+
+ channel->scale[0] = channel->scale[1] = 1;
+ channel->offset[0] = channel->offset[1] = 0;
+ return channel->bo != NULL;
+}
+
+bool
+gen4_channel_init_linear(struct sna *sna,
+ PicturePtr picture,
+ struct sna_composite_channel *channel,
+ int x, int y,
+ int w, int h,
+ int dst_x, int dst_y)
+{
+ PictLinearGradient *linear =
+ (PictLinearGradient *)picture->pSourcePict;
+ pixman_fixed_t tx, ty;
+ float x0, y0, sf;
+ float dx, dy;
+
+ DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
+ __FUNCTION__,
+ pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
+ pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
+ x, y, dst_x, dst_y, w, h));
+
+ if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
+ return false;
+
+ if (!sna_transform_is_affine(picture->transform)) {
+ DBG(("%s: fallback due to projective transform\n",
+ __FUNCTION__));
+ return sna_render_picture_fixup(sna, picture, channel,
+ x, y, w, h, dst_x, dst_y);
+ }
+
+ channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
+ if (!channel->bo)
+ return false;
+
+ channel->filter = PictFilterNearest;
+ channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
+ channel->width = channel->bo->pitch / 4;
+ channel->height = 1;
+ channel->pict_format = PICT_a8r8g8b8;
+ channel->card_format = GEN4_SURFACEFORMAT_B8G8R8A8_UNORM;
+ channel->is_linear = 1;
+ channel->is_affine = 1;
+
+ channel->scale[0] = channel->scale[1] = 1;
+ channel->offset[0] = channel->offset[1] = 0;
+
+ if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
+ dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
+ dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
+
+ x0 = pixman_fixed_to_double(linear->p1.x);
+ y0 = pixman_fixed_to_double(linear->p1.y);
+
+ if (tx | ty) {
+ x0 -= pixman_fixed_to_double(tx);
+ y0 -= pixman_fixed_to_double(ty);
+ }
+ } else {
+ struct pixman_f_vector p1, p2;
+ struct pixman_f_transform m, inv;
+
+ pixman_f_transform_from_pixman_transform(&m, picture->transform);
+ DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
+ __FUNCTION__,
+ m.m[0][0], m.m[0][1], m.m[0][2],
+ m.m[1][0], m.m[1][1], m.m[1][2],
+ m.m[2][0], m.m[2][1], m.m[2][2]));
+ if (!pixman_f_transform_invert(&inv, &m))
+ return false;
+
+ p1.v[0] = pixman_fixed_to_double(linear->p1.x);
+ p1.v[1] = pixman_fixed_to_double(linear->p1.y);
+ p1.v[2] = 1.;
+ pixman_f_transform_point(&inv, &p1);
+
+ p2.v[0] = pixman_fixed_to_double(linear->p2.x);
+ p2.v[1] = pixman_fixed_to_double(linear->p2.y);
+ p2.v[2] = 1.;
+ pixman_f_transform_point(&inv, &p2);
+
+ DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
+ __FUNCTION__,
+ p1.v[0], p1.v[1], p1.v[2],
+ p2.v[0], p2.v[1], p2.v[2]));
+
+ dx = p2.v[0] - p1.v[0];
+ dy = p2.v[1] - p1.v[1];
+
+ x0 = p1.v[0];
+ y0 = p1.v[1];
+ }
+
+ sf = dx*dx + dy*dy;
+ dx /= sf;
+ dy /= sf;
+
+ channel->u.linear.dx = dx;
+ channel->u.linear.dy = dy;
+ channel->u.linear.offset = -dx*(x0+dst_x-x) + -dy*(y0+dst_y-y);
+
+ channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
+ channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
+ channel->embedded_transform.matrix[0][2] = pixman_double_to_fixed(channel->u.linear.offset);
+
+ channel->embedded_transform.matrix[1][0] = 0;
+ channel->embedded_transform.matrix[1][1] = 0;
+ channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
+
+ channel->embedded_transform.matrix[2][0] = 0;
+ channel->embedded_transform.matrix[2][1] = 0;
+ channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
+
+ channel->transform = &channel->embedded_transform;
+
+ DBG(("%s: dx=%f, dy=%f, offset=%f\n",
+ __FUNCTION__, dx, dy, channel->u.linear.offset));
+
+ return channel->bo != NULL;
+}
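
[annotation] The normalisation above implements the standard linear-gradient projection: for d = p2 - p1, the gradient parameter of a point P is t = (P - p1) . d / |d|^2, so dividing dx and dy by sf = dx*dx + dy*dy folds the division into the vector. The embedded transform then maps (x, y, 1) to (t, 0.5), sampling the 1-D gradient ramp along its centre line. A worked check with assumed endpoints:

	/* assumed gradient: p1 = (0, 0), p2 = (100, 0)
	 *   dx = 100, dy = 0, sf = 100*100 + 0*0 = 10000
	 *   dx /= sf -> 0.01, dy -> 0.0
	 * pixel (50, y): t = 0.01*50 + 0.0*y + offset(=0) = 0.5,
	 * i.e. halfway along the ramp, independent of y, as expected. */
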
diff --git a/src/sna/gen4_source.h b/src/sna/gen4_source.h
new file mode 100644
index 000000000..c73afaca9
--- /dev/null
+++ b/src/sna/gen4_source.h
@@ -0,0 +1,22 @@
+#ifndef GEN4_SOURCE_H
+#define GEN4_SOURCE_H
+
+#include "compiler.h"
+
+#include "sna.h"
+#include "sna_render.h"
+
+bool
+gen4_channel_init_solid(struct sna *sna,
+ struct sna_composite_channel *channel,
+ uint32_t color);
+
+bool
+gen4_channel_init_linear(struct sna *sna,
+ PicturePtr picture,
+ struct sna_composite_channel *channel,
+ int x, int y,
+ int w, int h,
+ int dst_x, int dst_y);
+
+#endif /* GEN4_SOURCE_H */
diff --git a/src/sna/gen4_vertex.c b/src/sna/gen4_vertex.c
new file mode 100644
index 000000000..5062ebdf0
--- /dev/null
+++ b/src/sna/gen4_vertex.c
@@ -0,0 +1,1543 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Chris Wilson <chris@chris-wilson.co.uk>
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "sna.h"
+#include "sna_render.h"
+#include "sna_render_inline.h"
+#include "gen4_vertex.h"
+
+void gen4_vertex_flush(struct sna *sna)
+{
+ DBG(("%s[%x] = %d\n", __FUNCTION__,
+ 4*sna->render.vertex_offset,
+ sna->render.vertex_index - sna->render.vertex_start));
+
+ assert(sna->render.vertex_offset);
+ assert(sna->render.vertex_index > sna->render.vertex_start);
+
+ sna->kgem.batch[sna->render.vertex_offset] =
+ sna->render.vertex_index - sna->render.vertex_start;
+ sna->render.vertex_offset = 0;
+}
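
[annotation] gen4_vertex_flush() back-patches the batch: when a primitive is opened its vertex count is unknown, so a placeholder dword is emitted and its index kept in vertex_offset, to be overwritten here once the count is final. The same deferred-patch idiom in isolation (all names hypothetical):

	#include <stdint.h>

	struct batch {
		uint32_t buf[1024];
		int      n;            /* next free dword */
		int      count_slot;   /* index of the deferred count, 0 = none */
		int      start, index; /* first and next vertex of the primitive */
	};

	static void begin_primitive(struct batch *b)
	{
		b->count_slot = b->n;   /* remember where the count goes */
		b->buf[b->n++] = 0;     /* placeholder, patched at flush */
		b->start = b->index;
	}

	static void flush_primitive(struct batch *b)
	{
		b->buf[b->count_slot] = b->index - b->start;  /* back-patch */
		b->count_slot = 0;
	}
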
+
+int gen4_vertex_finish(struct sna *sna)
+{
+ struct kgem_bo *bo;
+ unsigned int i;
+ unsigned hint, size;
+
+ DBG(("%s: used=%d / %d\n", __FUNCTION__,
+ sna->render.vertex_used, sna->render.vertex_size));
+ assert(sna->render.vertex_offset == 0);
+ assert(sna->render.vertex_used);
+
+ sna_vertex_wait__locked(&sna->render);
+
+ /* Note: we only need dword alignment (currently) */
+
+ bo = sna->render.vbo;
+ if (bo) {
+ for (i = 0; i < sna->render.nvertex_reloc; i++) {
+ DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
+ i, sna->render.vertex_reloc[i]));
+
+ sna->kgem.batch[sna->render.vertex_reloc[i]] =
+ kgem_add_reloc(&sna->kgem,
+ sna->render.vertex_reloc[i], bo,
+ I915_GEM_DOMAIN_VERTEX << 16,
+ 0);
+ }
+
+ assert(!sna->render.active);
+ sna->render.nvertex_reloc = 0;
+ sna->render.vertex_used = 0;
+ sna->render.vertex_index = 0;
+ sna->render.vbo = NULL;
+ sna->render.vb_id = 0;
+
+ kgem_bo_destroy(&sna->kgem, bo);
+ }
+
+ hint = CREATE_GTT_MAP;
+ if (bo)
+ hint |= CREATE_CACHED | CREATE_NO_THROTTLE;
+
+ size = 256*1024;
+ assert(!sna->render.active);
+ sna->render.vertices = NULL;
+ sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
+ while (sna->render.vbo == NULL && size > 16*1024) {
+ size /= 2;
+ sna->render.vbo = kgem_create_linear(&sna->kgem, size, hint);
+ }
+ if (sna->render.vbo == NULL)
+ sna->render.vbo = kgem_create_linear(&sna->kgem,
+ 256*1024, CREATE_GTT_MAP);
+ if (sna->render.vbo)
+ sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
+ if (sna->render.vertices == NULL) {
+ if (sna->render.vbo) {
+ kgem_bo_destroy(&sna->kgem, sna->render.vbo);
+ sna->render.vbo = NULL;
+ }
+ sna->render.vertices = sna->render.vertex_data;
+ sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+ return 0;
+ }
+
+ if (sna->render.vertex_used) {
+ DBG(("%s: copying initial buffer x %d to handle=%d\n",
+ __FUNCTION__,
+ sna->render.vertex_used,
+ sna->render.vbo->handle));
+ assert(sizeof(float)*sna->render.vertex_used <=
+ __kgem_bo_size(sna->render.vbo));
+ memcpy(sna->render.vertices,
+ sna->render.vertex_data,
+ sizeof(float)*sna->render.vertex_used);
+ }
+
+ size = __kgem_bo_size(sna->render.vbo)/4;
+ if (size >= UINT16_MAX)
+ size = UINT16_MAX - 1;
+
+ DBG(("%s: create vbo handle=%d, size=%d\n",
+ __FUNCTION__, sna->render.vbo->handle, size));
+
+ sna->render.vertex_size = size;
+ return sna->render.vertex_size - sna->render.vertex_used;
+}
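
[annotation] The allocation policy above is a geometric back-off: try 256 KiB with relaxed hints, halve on failure down to 16 KiB, then retry 256 KiB with no hints (which may stall) before surrendering to the small static vertex_data array. The same shape in isolation, with a hypothetical allocator standing in for kgem_create_linear():

	#include <stdlib.h>

	/* hypothetical stand-in for kgem_create_linear(); may return NULL */
	static void *try_alloc(size_t size, unsigned relaxed)
	{
		(void)relaxed;
		return malloc(size);
	}

	static void *alloc_vbo(size_t *out_size)
	{
		size_t size = 256 * 1024;
		void *bo = try_alloc(size, 1);

		while (bo == NULL && size > 16 * 1024) {
			size /= 2;                     /* geometric back-off */
			bo = try_alloc(size, 1);
		}
		if (bo == NULL)                        /* last resort: may stall */
			bo = try_alloc(size = 256 * 1024, 0);

		*out_size = bo ? size : 0;             /* 0 => use static storage */
		return bo;
	}
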
+
+void gen4_vertex_close(struct sna *sna)
+{
+ struct kgem_bo *bo, *free_bo = NULL;
+ unsigned int i, delta = 0;
+
+ assert(sna->render.vertex_offset == 0);
+ if (!sna->render.vb_id)
+ return;
+
+ DBG(("%s: used=%d, vbo active? %d, vb=%x, nreloc=%d\n",
+ __FUNCTION__, sna->render.vertex_used, sna->render.vbo ? sna->render.vbo->handle : 0,
+ sna->render.vb_id, sna->render.nvertex_reloc));
+
+ assert(!sna->render.active);
+
+ bo = sna->render.vbo;
+ if (bo) {
+ if (sna->render.vertex_size - sna->render.vertex_used < 64) {
+ DBG(("%s: discarding vbo (full), handle=%d\n", __FUNCTION__, sna->render.vbo->handle));
+ sna->render.vbo = NULL;
+ sna->render.vertices = sna->render.vertex_data;
+ sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+ free_bo = bo;
+ } else if (IS_CPU_MAP(bo->map) && !sna->kgem.has_llc) {
+ DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
+ sna->render.vertices =
+ kgem_bo_map__gtt(&sna->kgem, sna->render.vbo);
+ if (sna->render.vertices == NULL) {
+ sna->render.vbo = NULL;
+ sna->render.vertices = sna->render.vertex_data;
+ sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
+ free_bo = bo;
+ }
+ }
+ } else {
+ if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
+ DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
+ sna->render.vertex_used, sna->kgem.nbatch));
+ memcpy(sna->kgem.batch + sna->kgem.nbatch,
+ sna->render.vertex_data,
+ sna->render.vertex_used * 4);
+ delta = sna->kgem.nbatch * 4;
+ bo = NULL;
+ sna->kgem.nbatch += sna->render.vertex_used;
+ } else {
+ bo = kgem_create_linear(&sna->kgem,
+ 4*sna->render.vertex_used,
+ CREATE_NO_THROTTLE);
+ if (bo && !kgem_bo_write(&sna->kgem, bo,
+ sna->render.vertex_data,
+ 4*sna->render.vertex_used)) {
+ kgem_bo_destroy(&sna->kgem, bo);
+ bo = NULL;
+ }
+ DBG(("%s: new vbo: %d\n", __FUNCTION__,
+ sna->render.vertex_used));
+ free_bo = bo;
+ }
+ }
+
+ assert(sna->render.nvertex_reloc);
+ for (i = 0; i < sna->render.nvertex_reloc; i++) {
+ DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
+ i, sna->render.vertex_reloc[i]));
+
+ sna->kgem.batch[sna->render.vertex_reloc[i]] =
+ kgem_add_reloc(&sna->kgem,
+ sna->render.vertex_reloc[i], bo,
+ I915_GEM_DOMAIN_VERTEX << 16,
+ delta);
+ }
+ sna->render.nvertex_reloc = 0;
+ sna->render.vb_id = 0;
+
+ if (sna->render.vbo == NULL) {
+ assert(!sna->render.active);
+ sna->render.vertex_used = 0;
+ sna->render.vertex_index = 0;
+ assert(sna->render.vertices == sna->render.vertex_data);
+ assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
+ }
+
+ if (free_bo)
+ kgem_bo_destroy(&sna->kgem, free_bo);
+}
+
+/* specialised vertex emission routines */
+
+#define OUT_VERTEX(x,y) vertex_emit_2s(sna, x,y) /* XXX assert(!too_large(x, y)); */
+#define OUT_VERTEX_F(v) vertex_emit(sna, v)
+
+inline static float
+compute_linear(const struct sna_composite_channel *channel,
+ int16_t x, int16_t y)
+{
+ return ((x+channel->offset[0]) * channel->u.linear.dx +
+ (y+channel->offset[1]) * channel->u.linear.dy +
+ channel->u.linear.offset);
+}
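
[annotation] compute_linear() only needs evaluating at a rectangle's three emitted corners; being affine, the hardware interpolates it across the triangle. A usage sketch with assumed channel values (dx = 0.01, dy = 0, offset = 0, channel->offset zeroed, i.e. a gradient running 0..1 over x = 0..100):

	static void linear_corner_example(const struct sna_composite_channel *ch)
	{
		float t_left  = compute_linear(ch, 25, 0);   /* = 0.25 */
		float t_right = compute_linear(ch, 75, 0);   /* = 0.75 */
		(void)t_left;
		(void)t_right;
	}
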
+
+inline static void
+emit_texcoord(struct sna *sna,
+ const struct sna_composite_channel *channel,
+ int16_t x, int16_t y)
+{
+ if (channel->is_solid) {
+ OUT_VERTEX_F(x);
+ return;
+ }
+
+ x += channel->offset[0];
+ y += channel->offset[1];
+
+ if (channel->is_affine) {
+ float s, t;
+
+ sna_get_transformed_coordinates(x, y,
+ channel->transform,
+ &s, &t);
+ OUT_VERTEX_F(s * channel->scale[0]);
+ OUT_VERTEX_F(t * channel->scale[1]);
+ } else {
+ float s, t, w;
+
+ sna_get_transformed_coordinates_3d(x, y,
+ channel->transform,
+ &s, &t, &w);
+ OUT_VERTEX_F(s * channel->scale[0]);
+ OUT_VERTEX_F(t * channel->scale[1]);
+ OUT_VERTEX_F(w);
+ }
+}
+
+inline static void
+emit_vertex(struct sna *sna,
+ const struct sna_composite_op *op,
+ int16_t srcX, int16_t srcY,
+ int16_t mskX, int16_t mskY,
+ int16_t dstX, int16_t dstY)
+{
+ OUT_VERTEX(dstX, dstY);
+ emit_texcoord(sna, &op->src, srcX, srcY);
+}
+
+fastcall static void
+emit_primitive(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ emit_vertex(sna, op,
+ r->src.x + r->width, r->src.y + r->height,
+ r->mask.x + r->width, r->mask.y + r->height,
+ r->dst.x + r->width, r->dst.y + r->height);
+ emit_vertex(sna, op,
+ r->src.x, r->src.y + r->height,
+ r->mask.x, r->mask.y + r->height,
+ r->dst.x, r->dst.y + r->height);
+ emit_vertex(sna, op,
+ r->src.x, r->src.y,
+ r->mask.x, r->mask.y,
+ r->dst.x, r->dst.y);
+}
+
+inline static void
+emit_vertex_mask(struct sna *sna,
+ const struct sna_composite_op *op,
+ int16_t srcX, int16_t srcY,
+ int16_t mskX, int16_t mskY,
+ int16_t dstX, int16_t dstY)
+{
+ OUT_VERTEX(dstX, dstY);
+ emit_texcoord(sna, &op->src, srcX, srcY);
+ emit_texcoord(sna, &op->mask, mskX, mskY);
+}
+
+fastcall static void
+emit_primitive_mask(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ emit_vertex_mask(sna, op,
+ r->src.x + r->width, r->src.y + r->height,
+ r->mask.x + r->width, r->mask.y + r->height,
+ r->dst.x + r->width, r->dst.y + r->height);
+ emit_vertex_mask(sna, op,
+ r->src.x, r->src.y + r->height,
+ r->mask.x, r->mask.y + r->height,
+ r->dst.x, r->dst.y + r->height);
+ emit_vertex_mask(sna, op,
+ r->src.x, r->src.y,
+ r->mask.x, r->mask.y,
+ r->dst.x, r->dst.y);
+}
+
+fastcall static void
+emit_primitive_solid(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float *v;
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ assert(op->floats_per_rect == 6);
+ assert((sna->render.vertex_used % 2) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 6;
+ assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ dst.p.x = r->dst.x;
+ v[2] = dst.f;
+ dst.p.y = r->dst.y;
+ v[4] = dst.f;
+
+ v[5] = v[3] = v[1] = .5;
+}
+
+fastcall static void
+emit_boxes_solid(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ dst.p.x = box->x1;
+ v[2] = dst.f;
+ dst.p.y = box->y1;
+ v[4] = dst.f;
+
+ v[5] = v[3] = v[1] = .5;
+ box++;
+ v += 6;
+ } while (--nbox);
+}
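
[annotation] The dst.f assignments above are a type pun, not a conversion: the two int16 pixel coordinates are stored into the bit pattern of one float, matching the packed SHORT2 position element the vertex fetcher is programmed with. A self-contained illustration, with a stand-in for struct sna_coordinate:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct coord { int16_t x, y; };      /* stand-in for sna_coordinate */

	int main(void)
	{
		union { struct coord p; float f; } u;
		float vertex[2];
		uint32_t bits;

		u.p.x = 300;
		u.p.y = 200;
		vertex[0] = u.f;     /* bit copy, never used as a number */
		vertex[1] = 0.5f;    /* constant texcoord (solid fill) */

		memcpy(&bits, &vertex[0], sizeof(bits));
		printf("packed bits: 0x%08x\n", bits); /* GPU reads (300, 200) */
		return 0;
	}
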
+
+fastcall static void
+emit_primitive_linear(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float *v;
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ assert(op->floats_per_rect == 6);
+ assert((sna->render.vertex_used % 2) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 6;
+ assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ dst.p.x = r->dst.x;
+ v[2] = dst.f;
+ dst.p.y = r->dst.y;
+ v[4] = dst.f;
+
+ v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
+ v[3] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
+ v[5] = compute_linear(&op->src, r->src.x, r->src.y);
+}
+
+fastcall static void
+emit_boxes_linear(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ do {
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ dst.p.x = box->x1;
+ v[2] = dst.f;
+ dst.p.y = box->y1;
+ v[4] = dst.f;
+
+ v[1] = compute_linear(&op->src, box->x2, box->y2);
+ v[3] = compute_linear(&op->src, box->x1, box->y2);
+ v[5] = compute_linear(&op->src, box->x1, box->y1);
+
+ v += 6;
+ box++;
+ } while (--nbox);
+}
+
+fastcall static void
+emit_primitive_identity_source(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+ float *v;
+
+ assert(op->floats_per_rect == 9);
+ assert((sna->render.vertex_used % 3) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 9;
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ dst.p.x = r->dst.x;
+ v[3] = dst.f;
+ dst.p.y = r->dst.y;
+ v[6] = dst.f;
+
+ v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
+ v[1] = v[4] + r->width * op->src.scale[0];
+
+ v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
+ v[5] = v[2] = v[8] + r->height * op->src.scale[1];
+}
+
+fastcall static void
+emit_boxes_identity_source(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ dst.p.x = box->x1;
+ v[3] = dst.f;
+ dst.p.y = box->y1;
+ v[6] = dst.f;
+
+ v[7] = v[4] = (box->x1 + op->src.offset[0]) * op->src.scale[0];
+ v[1] = (box->x2 + op->src.offset[0]) * op->src.scale[0];
+
+ v[8] = (box->y1 + op->src.offset[1]) * op->src.scale[1];
+ v[2] = v[5] = (box->y2 + op->src.offset[1]) * op->src.scale[1];
+
+ v += 9;
+ box++;
+ } while (--nbox);
+}
+
+fastcall static void
+emit_primitive_simple_source(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float *v;
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ float xx = op->src.transform->matrix[0][0];
+ float x0 = op->src.transform->matrix[0][2];
+ float yy = op->src.transform->matrix[1][1];
+ float y0 = op->src.transform->matrix[1][2];
+ float sx = op->src.scale[0];
+ float sy = op->src.scale[1];
+ int16_t tx = op->src.offset[0];
+ int16_t ty = op->src.offset[1];
+
+ assert(op->floats_per_rect == 9);
+ assert((sna->render.vertex_used % 3) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 3*3;
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
+ v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
+
+ dst.p.x = r->dst.x;
+ v[3] = dst.f;
+ v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
+
+ dst.p.y = r->dst.y;
+ v[6] = dst.f;
+ v[8] = ((r->src.y + ty) * yy + y0) * sy;
+}
+
+fastcall static void
+emit_boxes_simple_source(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ float xx = op->src.transform->matrix[0][0];
+ float x0 = op->src.transform->matrix[0][2];
+ float yy = op->src.transform->matrix[1][1];
+ float y0 = op->src.transform->matrix[1][2];
+ float sx = op->src.scale[0];
+ float sy = op->src.scale[1];
+ int16_t tx = op->src.offset[0];
+ int16_t ty = op->src.offset[1];
+
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ v[1] = ((box->x2 + tx) * xx + x0) * sx;
+ v[5] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+ dst.p.x = box->x1;
+ v[3] = dst.f;
+ v[7] = v[4] = ((box->x1 + tx) * xx + x0) * sx;
+
+ dst.p.y = box->y1;
+ v[6] = dst.f;
+ v[8] = ((box->y1 + ty) * yy + y0) * sy;
+
+ v += 9;
+ box++;
+ } while (--nbox);
+}
+
+fastcall static void
+emit_primitive_affine_source(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+ float *v;
+
+ assert(op->floats_per_rect == 9);
+ assert((sna->render.vertex_used % 3) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 9;
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + r->src.x + r->width,
+ op->src.offset[1] + r->src.y + r->height,
+ op->src.transform, op->src.scale,
+ &v[1], &v[2]);
+
+ dst.p.x = r->dst.x;
+ v[3] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
+ op->src.offset[1] + r->src.y + r->height,
+ op->src.transform, op->src.scale,
+ &v[4], &v[5]);
+
+ dst.p.y = r->dst.y;
+ v[6] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
+ op->src.offset[1] + r->src.y,
+ op->src.transform, op->src.scale,
+ &v[7], &v[8]);
+}
+
+fastcall static void
+emit_boxes_affine_source(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + box->x2,
+ op->src.offset[1] + box->y2,
+ op->src.transform, op->src.scale,
+ &v[1], &v[2]);
+
+ dst.p.x = box->x1;
+ v[3] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + box->x1,
+ op->src.offset[1] + box->y2,
+ op->src.transform, op->src.scale,
+ &v[4], &v[5]);
+
+ dst.p.y = box->y1;
+ v[6] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + box->x1,
+ op->src.offset[1] + box->y1,
+ op->src.transform, op->src.scale,
+ &v[7], &v[8]);
+ box++;
+ v += 9;
+ } while (--nbox);
+}
+
+fastcall static void
+emit_primitive_identity_mask(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+ float msk_x, msk_y;
+ float w, h;
+ float *v;
+
+ msk_x = r->mask.x + op->mask.offset[0];
+ msk_y = r->mask.y + op->mask.offset[1];
+ w = r->width;
+ h = r->height;
+
+ DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
+ __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
+
+ assert(op->floats_per_rect == 12);
+ assert((sna->render.vertex_used % 4) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ v[2] = (msk_x + w) * op->mask.scale[0];
+ v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
+
+ dst.p.x = r->dst.x;
+ v[4] = dst.f;
+ v[10] = v[6] = msk_x * op->mask.scale[0];
+
+ dst.p.y = r->dst.y;
+ v[8] = dst.f;
+ v[11] = msk_y * op->mask.scale[1];
+
+ v[9] = v[5] = v[1] = .5;
+}
+
+fastcall static void
+emit_boxes_identity_mask(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ float msk_x = op->mask.offset[0];
+ float msk_y = op->mask.offset[1];
+
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ v[2] = (msk_x + box->x2) * op->mask.scale[0];
+ v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
+
+ dst.p.x = box->x1;
+ v[4] = dst.f;
+ v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
+
+ dst.p.y = box->y1;
+ v[8] = dst.f;
+ v[11] = (msk_y + box->y1) * op->mask.scale[1];
+
+ v[9] = v[5] = v[1] = .5;
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+fastcall static void
+emit_primitive_linear_identity_mask(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+ float msk_x, msk_y;
+ float w, h;
+ float *v;
+
+ msk_x = r->mask.x + op->mask.offset[0];
+ msk_y = r->mask.y + op->mask.offset[1];
+ w = r->width;
+ h = r->height;
+
+ DBG(("%s: dst=(%d, %d), mask=(%f, %f) x (%f, %f)\n",
+ __FUNCTION__, r->dst.x, r->dst.y, msk_x, msk_y, w, h));
+
+ assert(op->floats_per_rect == 12);
+ assert((sna->render.vertex_used % 4) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ v[2] = (msk_x + w) * op->mask.scale[0];
+ v[7] = v[3] = (msk_y + h) * op->mask.scale[1];
+
+ dst.p.x = r->dst.x;
+ v[4] = dst.f;
+ v[10] = v[6] = msk_x * op->mask.scale[0];
+
+ dst.p.y = r->dst.y;
+ v[8] = dst.f;
+ v[11] = msk_y * op->mask.scale[1];
+
+ v[1] = compute_linear(&op->src, r->src.x+r->width, r->src.y+r->height);
+ v[5] = compute_linear(&op->src, r->src.x, r->src.y+r->height);
+ v[9] = compute_linear(&op->src, r->src.x, r->src.y);
+}
+
+fastcall static void
+emit_boxes_linear_identity_mask(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v)
+{
+ float msk_x = op->mask.offset[0];
+ float msk_y = op->mask.offset[1];
+
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ v[2] = (msk_x + box->x2) * op->mask.scale[0];
+ v[7] = v[3] = (msk_y + box->y2) * op->mask.scale[1];
+
+ dst.p.x = box->x1;
+ v[4] = dst.f;
+ v[10] = v[6] = (msk_x + box->x1) * op->mask.scale[0];
+
+ dst.p.y = box->y1;
+ v[8] = dst.f;
+ v[11] = (msk_y + box->y1) * op->mask.scale[1];
+
+ v[1] = compute_linear(&op->src, box->x2, box->y2);
+ v[5] = compute_linear(&op->src, box->x1, box->y2);
+ v[9] = compute_linear(&op->src, box->x1, box->y1);
+
+ v += 12;
+ box++;
+ } while (--nbox);
+}
+
+fastcall static void
+emit_primitive_identity_source_mask(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+ float src_x, src_y;
+ float msk_x, msk_y;
+ float w, h;
+ float *v;
+
+ src_x = r->src.x + op->src.offset[0];
+ src_y = r->src.y + op->src.offset[1];
+ msk_x = r->mask.x + op->mask.offset[0];
+ msk_y = r->mask.y + op->mask.offset[1];
+ w = r->width;
+ h = r->height;
+
+ assert(op->floats_per_rect == 15);
+ assert((sna->render.vertex_used % 5) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 15;
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ v[1] = (src_x + w) * op->src.scale[0];
+ v[2] = (src_y + h) * op->src.scale[1];
+ v[3] = (msk_x + w) * op->mask.scale[0];
+ v[4] = (msk_y + h) * op->mask.scale[1];
+
+ dst.p.x = r->dst.x;
+ v[5] = dst.f;
+ v[6] = src_x * op->src.scale[0];
+ v[7] = v[2];
+ v[8] = msk_x * op->mask.scale[0];
+ v[9] = v[4];
+
+ dst.p.y = r->dst.y;
+ v[10] = dst.f;
+ v[11] = v[6];
+ v[12] = src_y * op->src.scale[1];
+ v[13] = v[8];
+ v[14] = msk_y * op->mask.scale[1];
+}
+
+fastcall static void
+emit_primitive_simple_source_identity(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float *v;
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ float xx = op->src.transform->matrix[0][0];
+ float x0 = op->src.transform->matrix[0][2];
+ float yy = op->src.transform->matrix[1][1];
+ float y0 = op->src.transform->matrix[1][2];
+ float sx = op->src.scale[0];
+ float sy = op->src.scale[1];
+ int16_t tx = op->src.offset[0];
+ int16_t ty = op->src.offset[1];
+ float msk_x = r->mask.x + op->mask.offset[0];
+ float msk_y = r->mask.y + op->mask.offset[1];
+ float w = r->width, h = r->height;
+
+ assert(op->floats_per_rect == 15);
+ assert((sna->render.vertex_used % 5) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 3*5;
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
+ v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
+ v[3] = (msk_x + w) * op->mask.scale[0];
+ v[4] = (msk_y + h) * op->mask.scale[1];
+
+ dst.p.x = r->dst.x;
+ v[5] = dst.f;
+ v[6] = ((r->src.x + tx) * xx + x0) * sx;
+ v[7] = v[2];
+ v[8] = msk_x * op->mask.scale[0];
+ v[9] = v[4];
+
+ dst.p.y = r->dst.y;
+ v[10] = dst.f;
+ v[11] = v[6];
+ v[12] = ((r->src.y + ty) * yy + y0) * sy;
+ v[13] = v[8];
+ v[14] = msk_y * op->mask.scale[1];
+}
+
+fastcall static void
+emit_primitive_affine_source_identity(struct sna *sna,
+ const struct sna_composite_op *op,
+ const struct sna_composite_rectangles *r)
+{
+ float *v;
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+ float msk_x = r->mask.x + op->mask.offset[0];
+ float msk_y = r->mask.y + op->mask.offset[1];
+ float w = r->width, h = r->height;
+
+ assert(op->floats_per_rect == 15);
+ assert((sna->render.vertex_used % 5) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 3*5;
+
+ dst.p.x = r->dst.x + r->width;
+ dst.p.y = r->dst.y + r->height;
+ v[0] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + r->src.x + r->width,
+ op->src.offset[1] + r->src.y + r->height,
+ op->src.transform, op->src.scale,
+ &v[1], &v[2]);
+ v[3] = (msk_x + w) * op->mask.scale[0];
+ v[4] = (msk_y + h) * op->mask.scale[1];
+
+ dst.p.x = r->dst.x;
+ v[5] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
+ op->src.offset[1] + r->src.y + r->height,
+ op->src.transform, op->src.scale,
+ &v[6], &v[7]);
+ v[8] = msk_x * op->mask.scale[0];
+ v[9] = v[4];
+
+ dst.p.y = r->dst.y;
+ v[10] = dst.f;
+ _sna_get_transformed_scaled(op->src.offset[0] + r->src.x,
+ op->src.offset[1] + r->src.y,
+ op->src.transform, op->src.scale,
+ &v[11], &v[12]);
+ v[13] = v[8];
+ v[14] = msk_y * op->mask.scale[1];
+}
+
+inline static void
+emit_composite_texcoord_affine(struct sna *sna,
+ const struct sna_composite_channel *channel,
+ int16_t x, int16_t y)
+{
+ float t[2];
+
+ sna_get_transformed_coordinates(x + channel->offset[0],
+ y + channel->offset[1],
+ channel->transform,
+ &t[0], &t[1]);
+ OUT_VERTEX_F(t[0] * channel->scale[0]);
+ OUT_VERTEX_F(t[1] * channel->scale[1]);
+}
+
+unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp)
+{
+ unsigned vb;
+
+ if (tmp->mask.bo) {
+ if (tmp->mask.transform == NULL) {
+ if (tmp->src.is_solid) {
+ DBG(("%s: solid, identity mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_identity_mask;
+ tmp->emit_boxes = emit_boxes_identity_mask;
+ tmp->floats_per_vertex = 4;
+ vb = 1 | 2 << 2;
+ } else if (tmp->src.is_linear) {
+ DBG(("%s: linear, identity mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_linear_identity_mask;
+ tmp->emit_boxes = emit_boxes_linear_identity_mask;
+ tmp->floats_per_vertex = 4;
+ vb = 1 | 2 << 2;
+ } else if (tmp->src.transform == NULL) {
+ DBG(("%s: identity source, identity mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_identity_source_mask;
+ tmp->floats_per_vertex = 5;
+ vb = 2 << 2 | 2;
+ } else if (tmp->src.is_affine) {
+ tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
+ tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
+ if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
+ DBG(("%s: simple src, identity mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_simple_source_identity;
+ } else {
+ DBG(("%s: affine src, identity mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_affine_source_identity;
+ }
+ tmp->floats_per_vertex = 5;
+ vb = 2 << 2 | 2;
+ } else {
+ DBG(("%s: projective source, identity mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_mask;
+ tmp->floats_per_vertex = 6;
+ vb = 2 << 2 | 3;
+ }
+ } else {
+ tmp->prim_emit = emit_primitive_mask;
+ tmp->floats_per_vertex = 1;
+ vb = 0;
+ if (tmp->mask.is_solid) {
+ tmp->floats_per_vertex += 1;
+ vb |= 1 << 2;
+ } else if (tmp->mask.is_affine) {
+ tmp->floats_per_vertex += 2;
+ vb |= 2 << 2;
+ } else {
+ tmp->floats_per_vertex += 3;
+ vb |= 3 << 2;
+ }
+ if (tmp->src.is_solid) {
+ tmp->floats_per_vertex += 1;
+ vb |= 1;
+ } else if (tmp->src.is_affine) {
+ tmp->floats_per_vertex += 2;
+ vb |= 2;
+ } else {
+ tmp->floats_per_vertex += 3;
+ vb |= 3;
+ }
+ DBG(("%s: general mask: floats-per-vertex=%d, vb=%x\n",
+ __FUNCTION__, tmp->floats_per_vertex, vb));
+ }
+ } else {
+ if (tmp->src.is_solid) {
+ DBG(("%s: solid, no mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_solid;
+ tmp->emit_boxes = emit_boxes_solid;
+ if (tmp->src.is_opaque && tmp->op == PictOpOver)
+ tmp->op = PictOpSrc;
+ tmp->floats_per_vertex = 2;
+ vb = 1;
+ } else if (tmp->src.is_linear) {
+ DBG(("%s: linear, no mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_linear;
+ tmp->emit_boxes = emit_boxes_linear;
+ tmp->floats_per_vertex = 2;
+ vb = 1;
+ } else if (tmp->src.transform == NULL) {
+ DBG(("%s: identity src, no mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_identity_source;
+ tmp->emit_boxes = emit_boxes_identity_source;
+ tmp->floats_per_vertex = 3;
+ vb = 2;
+ } else if (tmp->src.is_affine) {
+ tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
+ tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
+ if (!sna_affine_transform_is_rotation(tmp->src.transform)) {
+ DBG(("%s: simple src, no mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_simple_source;
+ tmp->emit_boxes = emit_boxes_simple_source;
+ } else {
+ DBG(("%s: affine src, no mask\n", __FUNCTION__));
+ tmp->prim_emit = emit_primitive_affine_source;
+ tmp->emit_boxes = emit_boxes_affine_source;
+ }
+ tmp->floats_per_vertex = 3;
+ vb = 2;
+ } else {
+ DBG(("%s: projective src, no mask\n", __FUNCTION__));
+ assert(!tmp->src.is_solid);
+ tmp->prim_emit = emit_primitive;
+ tmp->floats_per_vertex = 4;
+ vb = 3;
+ }
+ }
+ tmp->floats_per_rect = 3 * tmp->floats_per_vertex;
+
+ return vb;
+}
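
[annotation] The returned vb lands in u.gen4.ve_id; judging by the assignments above, the low two bits give the source texcoord width (1 solid/linear, 2 affine, 3 projective) and bits 3:2 the mask's, so floats_per_vertex is always 1 (the packed position) plus the two widths. A decoding sketch under that reading:

	/* decode the ve_id produced by gen4_choose_composite_emitter() */
	static inline int src_floats(unsigned ve_id)  { return ve_id & 3; }
	static inline int mask_floats(unsigned ve_id) { return (ve_id >> 2) & 3; }

	static inline int floats_per_vertex(unsigned ve_id)
	{
		/* 1 packed (x, y) position + per-channel texcoords */
		return 1 + src_floats(ve_id) + mask_floats(ve_id);
	}
	/* e.g. identity source + identity mask: vb = 2 << 2 | 2 -> 1+2+2 = 5,
	 * solid source, no mask: vb = 1 -> 1+1+0 = 2, matching the values set above. */
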
+
+inline static void
+emit_span_vertex(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ int16_t x, int16_t y)
+{
+ OUT_VERTEX(x, y);
+ emit_texcoord(sna, &op->base.src, x, y);
+}
+
+fastcall static void
+emit_composite_spans_primitive(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ emit_span_vertex(sna, op, box->x2, box->y2);
+ OUT_VERTEX_F(opacity);
+
+ emit_span_vertex(sna, op, box->x1, box->y2);
+ OUT_VERTEX_F(opacity);
+
+ emit_span_vertex(sna, op, box->x1, box->y1);
+ OUT_VERTEX_F(opacity);
+}
+
+fastcall static void
+emit_span_solid(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ float *v;
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ assert(op->base.floats_per_rect == 9);
+ assert((sna->render.vertex_used % 3) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 3*3;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+
+ dst.p.x = box->x1;
+ v[3] = dst.f;
+
+ dst.p.y = box->y1;
+ v[6] = dst.f;
+
+ v[7] = v[4] = v[1] = .5;
+ v[8] = v[5] = v[2] = opacity;
+}
+
+fastcall static void
+emit_span_boxes_solid(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b,
+ int nbox, float *v)
+{
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = b->box.x2;
+ dst.p.y = b->box.y2;
+ v[0] = dst.f;
+
+ dst.p.x = b->box.x1;
+ v[3] = dst.f;
+
+ dst.p.y = b->box.y1;
+ v[6] = dst.f;
+
+ v[7] = v[4] = v[1] = .5;
+ v[8] = v[5] = v[2] = b->alpha;
+
+ v += 9;
+ b++;
+ } while (--nbox);
+}
+
+fastcall static void
+emit_span_identity(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ float *v;
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ float sx = op->base.src.scale[0];
+ float sy = op->base.src.scale[1];
+ int16_t tx = op->base.src.offset[0];
+ int16_t ty = op->base.src.offset[1];
+
+ assert(op->base.floats_per_rect == 12);
+ assert((sna->render.vertex_used % 4) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 3*4;
+ assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ v[1] = (box->x2 + tx) * sx;
+ v[6] = v[2] = (box->y2 + ty) * sy;
+
+ dst.p.x = box->x1;
+ v[4] = dst.f;
+ v[9] = v[5] = (box->x1 + tx) * sx;
+
+ dst.p.y = box->y1;
+ v[8] = dst.f;
+ v[10] = (box->y1 + ty) * sy;
+
+ v[11] = v[7] = v[3] = opacity;
+}
+
+fastcall static void
+emit_span_boxes_identity(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b, int nbox,
+ float *v)
+{
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ float sx = op->base.src.scale[0];
+ float sy = op->base.src.scale[1];
+ int16_t tx = op->base.src.offset[0];
+ int16_t ty = op->base.src.offset[1];
+
+ dst.p.x = b->box.x2;
+ dst.p.y = b->box.y2;
+ v[0] = dst.f;
+ v[1] = (b->box.x2 + tx) * sx;
+ v[6] = v[2] = (b->box.y2 + ty) * sy;
+
+ dst.p.x = b->box.x1;
+ v[4] = dst.f;
+ v[9] = v[5] = (b->box.x1 + tx) * sx;
+
+ dst.p.y = b->box.y1;
+ v[8] = dst.f;
+ v[10] = (b->box.y1 + ty) * sy;
+
+ v[11] = v[7] = v[3] = b->alpha;
+
+ v += 12;
+ b++;
+ } while (--nbox);
+}
+
+fastcall static void
+emit_span_simple(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ float *v;
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ float xx = op->base.src.transform->matrix[0][0];
+ float x0 = op->base.src.transform->matrix[0][2];
+ float yy = op->base.src.transform->matrix[1][1];
+ float y0 = op->base.src.transform->matrix[1][2];
+ float sx = op->base.src.scale[0];
+ float sy = op->base.src.scale[1];
+ int16_t tx = op->base.src.offset[0];
+ int16_t ty = op->base.src.offset[1];
+
+ assert(op->base.floats_per_rect == 12);
+ assert((sna->render.vertex_used % 4) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 3*4;
+ assert(sna->render.vertex_used <= sna->render.vertex_size);
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ v[1] = ((box->x2 + tx) * xx + x0) * sx;
+ v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
+
+ dst.p.x = box->x1;
+ v[4] = dst.f;
+ v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
+
+ dst.p.y = box->y1;
+ v[8] = dst.f;
+ v[10] = ((box->y1 + ty) * yy + y0) * sy;
+
+ v[11] = v[7] = v[3] = opacity;
+}
+
+fastcall static void
+emit_span_boxes_simple(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b, int nbox,
+ float *v)
+{
+ float xx = op->base.src.transform->matrix[0][0];
+ float x0 = op->base.src.transform->matrix[0][2];
+ float yy = op->base.src.transform->matrix[1][1];
+ float y0 = op->base.src.transform->matrix[1][2];
+ float sx = op->base.src.scale[0];
+ float sy = op->base.src.scale[1];
+ int16_t tx = op->base.src.offset[0];
+ int16_t ty = op->base.src.offset[1];
+
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = b->box.x2;
+ dst.p.y = b->box.y2;
+ v[0] = dst.f;
+ v[1] = ((b->box.x2 + tx) * xx + x0) * sx;
+ v[6] = v[2] = ((b->box.y2 + ty) * yy + y0) * sy;
+
+ dst.p.x = b->box.x1;
+ v[4] = dst.f;
+ v[9] = v[5] = ((b->box.x1 + tx) * xx + x0) * sx;
+
+ dst.p.y = b->box.y1;
+ v[8] = dst.f;
+ v[10] = ((b->box.y1 + ty) * yy + y0) * sy;
+
+ v[11] = v[7] = v[3] = b->alpha;
+
+ v += 12;
+ b++;
+ } while (--nbox);
+}
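
The "simple" variants handle affine transforms without rotation or shear,
so only the matrix diagonal and translation column are read and each axis
costs one multiply-add before normalization:

    /* Sketch of the scale/translate-only path in emit_span_simple(). */
    static inline float texcoord_simple(int16_t x, int16_t offset,
                                        float xx, float x0, float scale)
    {
        return ((x + offset) * xx + x0) * scale;
    }
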
+
+fastcall static void
+emit_span_affine(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+ float *v;
+
+ assert(op->base.floats_per_rect == 12);
+ assert((sna->render.vertex_used % 4) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 12;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ _sna_get_transformed_scaled(op->base.src.offset[0] + box->x2,
+ op->base.src.offset[1] + box->y2,
+ op->base.src.transform,
+ op->base.src.scale,
+ &v[1], &v[2]);
+
+ dst.p.x = box->x1;
+ v[4] = dst.f;
+ _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+ op->base.src.offset[1] + box->y2,
+ op->base.src.transform,
+ op->base.src.scale,
+ &v[5], &v[6]);
+
+ dst.p.y = box->y1;
+ v[8] = dst.f;
+ _sna_get_transformed_scaled(op->base.src.offset[0] + box->x1,
+ op->base.src.offset[1] + box->y1,
+ op->base.src.transform,
+ op->base.src.scale,
+ &v[9], &v[10]);
+
+ v[11] = v[7] = v[3] = opacity;
+}
+
+fastcall static void
+emit_span_boxes_affine(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b, int nbox,
+ float *v)
+{
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = b->box.x2;
+ dst.p.y = b->box.y2;
+ v[0] = dst.f;
+ _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x2,
+ op->base.src.offset[1] + b->box.y2,
+ op->base.src.transform,
+ op->base.src.scale,
+ &v[1], &v[2]);
+
+ dst.p.x = b->box.x1;
+ v[4] = dst.f;
+ _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+ op->base.src.offset[1] + b->box.y2,
+ op->base.src.transform,
+ op->base.src.scale,
+ &v[5], &v[6]);
+
+ dst.p.y = b->box.y1;
+ v[8] = dst.f;
+ _sna_get_transformed_scaled(op->base.src.offset[0] + b->box.x1,
+ op->base.src.offset[1] + b->box.y1,
+ op->base.src.transform,
+ op->base.src.scale,
+ &v[9], &v[10]);
+
+ v[11] = v[7] = v[3] = b->alpha;
+
+ v += 12;
+ b++;
+ } while (--nbox);
+}
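
General affine transforms defer to _sna_get_transformed_scaled(), which is
defined elsewhere; judging from the call sites it plausibly applies the
full 2x3 affine matrix and then the per-axis normalization, roughly:

    /* A guess at the affine case of _sna_get_transformed_scaled(); the
     * real helper works on the driver's fixed-point PictTransform, plain
     * floats here are for illustration only. */
    static void transformed_scaled_sketch(int x, int y,
                                          const float m[2][3],
                                          const float scale[2],
                                          float *u, float *v)
    {
        *u = (m[0][0] * x + m[0][1] * y + m[0][2]) * scale[0];
        *v = (m[1][0] * x + m[1][1] * y + m[1][2]) * scale[1];
    }
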
+
+fastcall static void
+emit_span_linear(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const BoxRec *box,
+ float opacity)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+ float *v;
+
+ assert(op->base.floats_per_rect == 9);
+ assert((sna->render.vertex_used % 3) == 0);
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += 9;
+
+ dst.p.x = box->x2;
+ dst.p.y = box->y2;
+ v[0] = dst.f;
+ dst.p.x = box->x1;
+ v[3] = dst.f;
+ dst.p.y = box->y1;
+ v[6] = dst.f;
+
+ v[1] = compute_linear(&op->base.src, box->x2, box->y2);
+ v[4] = compute_linear(&op->base.src, box->x1, box->y2);
+ v[7] = compute_linear(&op->base.src, box->x1, box->y1);
+
+ v[8] = v[5] = v[2] = opacity;
+}
+
+fastcall static void
+emit_span_boxes_linear(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *b, int nbox,
+ float *v)
+{
+ do {
+ union {
+ struct sna_coordinate p;
+ float f;
+ } dst;
+
+ dst.p.x = b->box.x2;
+ dst.p.y = b->box.y2;
+ v[0] = dst.f;
+ dst.p.x = b->box.x1;
+ v[3] = dst.f;
+ dst.p.y = b->box.y1;
+ v[6] = dst.f;
+
+ v[1] = compute_linear(&op->base.src, b->box.x2, b->box.y2);
+ v[4] = compute_linear(&op->base.src, b->box.x1, b->box.y2);
+ v[7] = compute_linear(&op->base.src, b->box.x1, b->box.y1);
+
+ v[8] = v[5] = v[2] = b->alpha;
+
+ v += 9;
+ b++;
+ } while (--nbox);
+}
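
compute_linear() is also defined elsewhere; given that the (now deleted)
gen5_composite_linear_init() later in this patch folds the gradient
direction and offset into the first row of an embedded transform, the
per-vertex value is presumably a dot product with that row:

    /* A guess at compute_linear(): project (x, y) onto the gradient axis. */
    static inline float compute_linear_sketch(float dx, float dy, float offset,
                                              int16_t x, int16_t y)
    {
        return x * dx + y * dy + offset;
    }
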
+
+inline static uint32_t
+gen4_choose_spans_vertex_buffer(const struct sna_composite_op *op)
+{
+ int id = op->src.is_solid ? 1 : 2 + !op->src.is_affine;
+ DBG(("%s: id=%x (%d, 1)\n", __FUNCTION__, 1 << 2 | id, id));
+ return 1 << 2 | id;
+}
+
+unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp)
+{
+ unsigned vb;
+
+ if (tmp->base.src.is_solid) {
+ tmp->prim_emit = emit_span_solid;
+ tmp->emit_boxes = emit_span_boxes_solid;
+ tmp->base.floats_per_vertex = 3;
+ vb = 1 << 2 | 1;
+ } else if (tmp->base.src.is_linear) {
+ tmp->prim_emit = emit_span_linear;
+ tmp->emit_boxes = emit_span_boxes_linear;
+ tmp->base.floats_per_vertex = 3;
+ vb = 1 << 2 | 1;
+ } else if (tmp->base.src.transform == NULL) {
+ tmp->prim_emit = emit_span_identity;
+ tmp->emit_boxes = emit_span_boxes_identity;
+ tmp->base.floats_per_vertex = 4;
+ vb = 1 << 2 | 2;
+ } else if (tmp->base.is_affine) {
+ tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
+ tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
+ if (!sna_affine_transform_is_rotation(tmp->base.src.transform)) {
+ tmp->prim_emit = emit_span_simple;
+ tmp->emit_boxes = emit_span_boxes_simple;
+ } else {
+ tmp->prim_emit = emit_span_affine;
+ tmp->emit_boxes = emit_span_boxes_affine;
+ }
+ tmp->base.floats_per_vertex = 4;
+ vb = 1 << 2 | 2;
+ } else {
+ tmp->prim_emit = emit_composite_spans_primitive;
+ tmp->base.floats_per_vertex = 5;
+ vb = 1 << 2 | 3;
+ }
+ tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;
+ return vb;
+}
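
Both choosers pack the vertex layout into one small id: bits 0-1 carry the
float count of the source texcoord channel, the bits above carry the
mask/opacity channel. gen5_emit_vertex_elements() below unpacks it as
id & 3 and id >> 2; hypothetical decode helpers for reference:

    /* Decode the packed ve_id produced by the choosers above. */
    static inline int ve_id_src_floats(unsigned id)  { return id & 3; }
    static inline int ve_id_mask_floats(unsigned id) { return id >> 2; }
    static inline int ve_id_has_mask(unsigned id)    { return (id >> 2) != 0; }
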
diff --git a/src/sna/gen4_vertex.h b/src/sna/gen4_vertex.h
new file mode 100644
index 000000000..431b545eb
--- /dev/null
+++ b/src/sna/gen4_vertex.h
@@ -0,0 +1,16 @@
+#ifndef GEN4_VERTEX_H
+#define GEN4_VERTEX_H
+
+#include "compiler.h"
+
+#include "sna.h"
+#include "sna_render.h"
+
+void gen4_vertex_flush(struct sna *sna);
+int gen4_vertex_finish(struct sna *sna);
+void gen4_vertex_close(struct sna *sna);
+
+unsigned gen4_choose_composite_emitter(struct sna_composite_op *tmp);
+unsigned gen4_choose_spans_emitter(struct sna_composite_spans_op *tmp);
+
+#endif /* GEN4_VERTEX_H */
diff --git a/src/sna/gen5_render.c b/src/sna/gen5_render.c
index 5d559377b..6e1199638 100644
--- a/src/sna/gen5_render.c
+++ b/src/sna/gen5_render.c
@@ -42,7 +42,10 @@
#include "brw/brw.h"
#include "gen5_render.h"
+#include "gen4_source.h"
+#include "gen4_vertex.h"
+#define NO_COMPOSITE 0
#define NO_COMPOSITE_SPANS 0
#define PREFER_BLT_FILL 1
@@ -196,17 +199,19 @@ gen5_choose_composite_kernel(int op, bool has_mask, bool is_ca, bool is_affine)
return base + !is_affine;
}
-static void gen5_magic_ca_pass(struct sna *sna,
+static bool gen5_magic_ca_pass(struct sna *sna,
const struct sna_composite_op *op)
{
struct gen5_render_state *state = &sna->render_state.gen5;
if (!op->need_magic_ca_pass)
- return;
+ return false;
assert(sna->render.vertex_index > sna->render.vertex_start);
DBG(("%s: CA fixup\n", __FUNCTION__));
+ assert(op->mask.bo != NULL);
+ assert(op->has_component_alpha);
gen5_emit_pipelined_pointers
(sna, op, PictOpAdd,
@@ -225,162 +230,7 @@ static void gen5_magic_ca_pass(struct sna *sna,
OUT_BATCH(0); /* index buffer offset, ignored */
state->last_primitive = sna->kgem.nbatch;
-}
-
-static void gen5_vertex_flush(struct sna *sna)
-{
- assert(sna->render_state.gen5.vertex_offset);
- assert(sna->render.vertex_index > sna->render.vertex_start);
-
- DBG(("%s[%x] = %d\n", __FUNCTION__,
- 4*sna->render_state.gen5.vertex_offset,
- sna->render.vertex_index - sna->render.vertex_start));
- sna->kgem.batch[sna->render_state.gen5.vertex_offset] =
- sna->render.vertex_index - sna->render.vertex_start;
- sna->render_state.gen5.vertex_offset = 0;
-}
-
-static int gen5_vertex_finish(struct sna *sna)
-{
- struct kgem_bo *bo;
- unsigned int i;
-
- assert(sna->render.vertex_used);
- assert(sna->render.nvertex_reloc);
-
- /* Note: we only need dword alignment (currently) */
-
- bo = sna->render.vbo;
- if (bo) {
- if (sna->render_state.gen5.vertex_offset)
- gen5_vertex_flush(sna);
-
- for (i = 0; i < sna->render.nvertex_reloc; i++) {
- DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
- i, sna->render.vertex_reloc[i]));
-
- sna->kgem.batch[sna->render.vertex_reloc[i]] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i], bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- 0);
- sna->kgem.batch[sna->render.vertex_reloc[i]+1] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i]+1, bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- sna->render.vertex_used * 4 - 1);
- }
-
- sna->render.nvertex_reloc = 0;
- sna->render.vertex_used = 0;
- sna->render.vertex_index = 0;
- sna->render_state.gen5.vb_id = 0;
-
- kgem_bo_destroy(&sna->kgem, bo);
- }
-
- sna->render.vertices = NULL;
- sna->render.vbo = kgem_create_linear(&sna->kgem,
- 256*1024, CREATE_GTT_MAP);
- if (sna->render.vbo)
- sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
- if (sna->render.vertices == NULL) {
- if (sna->render.vbo)
- kgem_bo_destroy(&sna->kgem, sna->render.vbo);
- sna->render.vbo = NULL;
- return 0;
- }
-
- if (sna->render.vertex_used) {
- memcpy(sna->render.vertices,
- sna->render.vertex_data,
- sizeof(float)*sna->render.vertex_used);
- }
- sna->render.vertex_size = 64 * 1024 - 1;
- return sna->render.vertex_size - sna->render.vertex_used;
-}
-
-static void gen5_vertex_close(struct sna *sna)
-{
- struct kgem_bo *bo, *free_bo = NULL;
- unsigned int i, delta = 0;
-
- assert(sna->render_state.gen5.vertex_offset == 0);
- if (!sna->render_state.gen5.vb_id)
- return;
-
- DBG(("%s: used=%d, vbo active? %d\n",
- __FUNCTION__, sna->render.vertex_used, sna->render.vbo != NULL));
-
- bo = sna->render.vbo;
- if (bo) {
- if (sna->render.vertex_size - sna->render.vertex_used < 64) {
- DBG(("%s: discarding full vbo\n", __FUNCTION__));
- sna->render.vbo = NULL;
- sna->render.vertices = sna->render.vertex_data;
- sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
- free_bo = bo;
- } else if (IS_CPU_MAP(bo->map)) {
- DBG(("%s: converting CPU map to GTT\n", __FUNCTION__));
- sna->render.vertices =
- kgem_bo_map__gtt(&sna->kgem, sna->render.vbo);
- if (sna->render.vertices == NULL) {
- sna->render.vbo = NULL;
- sna->render.vertices = sna->render.vertex_data;
- sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
- free_bo = bo;
- }
- }
- } else {
- if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
- DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
- sna->render.vertex_used, sna->kgem.nbatch));
- memcpy(sna->kgem.batch + sna->kgem.nbatch,
- sna->render.vertex_data,
- sna->render.vertex_used * 4);
- delta = sna->kgem.nbatch * 4;
- bo = NULL;
- sna->kgem.nbatch += sna->render.vertex_used;
- } else {
- bo = kgem_create_linear(&sna->kgem,
- 4*sna->render.vertex_used, 0);
- if (bo && !kgem_bo_write(&sna->kgem, bo,
- sna->render.vertex_data,
- 4*sna->render.vertex_used)) {
- kgem_bo_destroy(&sna->kgem, bo);
- bo = NULL;
- }
- DBG(("%s: new vbo: %d\n", __FUNCTION__,
- sna->render.vertex_used));
- free_bo = bo;
- }
- }
-
- assert(sna->render.nvertex_reloc);
- for (i = 0; i < sna->render.nvertex_reloc; i++) {
- DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
- i, sna->render.vertex_reloc[i]));
-
- sna->kgem.batch[sna->render.vertex_reloc[i]] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i], bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- delta);
- sna->kgem.batch[sna->render.vertex_reloc[i]+1] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i]+1, bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- delta + sna->render.vertex_used * 4 - 1);
- }
- sna->render.nvertex_reloc = 0;
-
- if (sna->render.vbo == NULL) {
- sna->render.vertex_used = 0;
- sna->render.vertex_index = 0;
- }
-
- if (free_bo)
- kgem_bo_destroy(&sna->kgem, free_bo);
+ return true;
}
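
gen5_magic_ca_pass() now reports whether it clobbered the blend state, so
the flush path added to gen5_get_rectangles__flush() below can restore the
operation's own pipelined pointers only when the CA pass actually ran:

    if (sna->render.vertex_offset) {
        gen4_vertex_flush(sna);
        if (gen5_magic_ca_pass(sna, op))
            gen5_emit_pipelined_pointers(sna, op, op->op,
                                         op->u.gen5.wm_kernel);
    }
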
static uint32_t gen5_get_blend(int op,
@@ -679,310 +529,29 @@ gen5_bind_bo(struct sna *sna,
return offset * sizeof(uint32_t);
}
-fastcall static void
-gen5_emit_composite_primitive_solid(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = 1.;
- v[2] = 1.;
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- v[4] = 0.;
- v[5] = 1.;
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- v[7] = 0.;
- v[8] = 0.;
-}
-
-fastcall static void
-gen5_emit_composite_primitive_identity_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- const float *sf = op->src.scale;
- float sx, sy, *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- sx = r->src.x + op->src.offset[0];
- sy = r->src.y + op->src.offset[1];
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = (sx + r->width) * sf[0];
- v[5] = v[2] = (sy + r->height) * sf[1];
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- v[7] = v[4] = sx * sf[0];
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- v[8] = sy * sf[1];
-}
-
-fastcall static void
-gen5_emit_composite_primitive_affine_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float *v;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x + r->width,
- op->src.offset[1] + r->src.y + r->height,
- op->src.transform,
- &v[1], &v[2]);
- v[1] *= op->src.scale[0];
- v[2] *= op->src.scale[1];
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x,
- op->src.offset[1] + r->src.y + r->height,
- op->src.transform,
- &v[4], &v[5]);
- v[4] *= op->src.scale[0];
- v[5] *= op->src.scale[1];
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x,
- op->src.offset[1] + r->src.y,
- op->src.transform,
- &v[7], &v[8]);
- v[7] *= op->src.scale[0];
- v[8] *= op->src.scale[1];
-}
-
-fastcall static void
-gen5_emit_composite_primitive_identity_source_mask(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float src_x, src_y;
- float msk_x, msk_y;
- float w, h;
- float *v;
-
- src_x = r->src.x + op->src.offset[0];
- src_y = r->src.y + op->src.offset[1];
- msk_x = r->mask.x + op->mask.offset[0];
- msk_y = r->mask.y + op->mask.offset[1];
- w = r->width;
- h = r->height;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 15;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = (src_x + w) * op->src.scale[0];
- v[2] = (src_y + h) * op->src.scale[1];
- v[3] = (msk_x + w) * op->mask.scale[0];
- v[4] = (msk_y + h) * op->mask.scale[1];
-
- dst.p.x = r->dst.x;
- v[5] = dst.f;
- v[6] = src_x * op->src.scale[0];
- v[7] = v[2];
- v[8] = msk_x * op->mask.scale[0];
- v[9] = v[4];
-
- dst.p.y = r->dst.y;
- v[10] = dst.f;
- v[11] = v[6];
- v[12] = src_y * op->src.scale[1];
- v[13] = v[8];
- v[14] = msk_y * op->mask.scale[1];
-}
-
-fastcall static void
-gen5_emit_composite_primitive(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- float src_x[3], src_y[3], src_w[3], mask_x[3], mask_y[3], mask_w[3];
- bool is_affine = op->is_affine;
- const float *src_sf = op->src.scale;
- const float *mask_sf = op->mask.scale;
-
- if (is_affine) {
- sna_get_transformed_coordinates(r->src.x + op->src.offset[0],
- r->src.y + op->src.offset[1],
- op->src.transform,
- &src_x[0],
- &src_y[0]);
-
- sna_get_transformed_coordinates(r->src.x + op->src.offset[0],
- r->src.y + op->src.offset[1] + r->height,
- op->src.transform,
- &src_x[1],
- &src_y[1]);
-
- sna_get_transformed_coordinates(r->src.x + op->src.offset[0] + r->width,
- r->src.y + op->src.offset[1] + r->height,
- op->src.transform,
- &src_x[2],
- &src_y[2]);
- } else {
- sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0],
- r->src.y + op->src.offset[1],
- op->src.transform,
- &src_x[0],
- &src_y[0],
- &src_w[0]);
- sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0],
- r->src.y + op->src.offset[1] + r->height,
- op->src.transform,
- &src_x[1],
- &src_y[1],
- &src_w[1]);
- sna_get_transformed_coordinates_3d(r->src.x + op->src.offset[0] + r->width,
- r->src.y + op->src.offset[1] + r->height,
- op->src.transform,
- &src_x[2],
- &src_y[2],
- &src_w[2]);
- }
-
- if (op->mask.bo) {
- if (is_affine) {
- sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0],
- r->mask.y + op->mask.offset[1],
- op->mask.transform,
- &mask_x[0],
- &mask_y[0]);
-
- sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0],
- r->mask.y + op->mask.offset[1] + r->height,
- op->mask.transform,
- &mask_x[1],
- &mask_y[1]);
-
- sna_get_transformed_coordinates(r->mask.x + op->mask.offset[0] + r->width,
- r->mask.y + op->mask.offset[1] + r->height,
- op->mask.transform,
- &mask_x[2],
- &mask_y[2]);
- } else {
- sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0],
- r->mask.y + op->mask.offset[1],
- op->mask.transform,
- &mask_x[0],
- &mask_y[0],
- &mask_w[0]);
-
- sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0],
- r->mask.y + op->mask.offset[1] + r->height,
- op->mask.transform,
- &mask_x[1],
- &mask_y[1],
- &mask_w[1]);
- sna_get_transformed_coordinates_3d(r->mask.x + op->mask.offset[0] + r->width,
- r->mask.y + op->mask.offset[1] + r->height,
- op->mask.transform,
- &mask_x[2],
- &mask_y[2],
- &mask_w[2]);
- }
- }
-
- OUT_VERTEX(r->dst.x + r->width, r->dst.y + r->height);
- OUT_VERTEX_F(src_x[2] * src_sf[0]);
- OUT_VERTEX_F(src_y[2] * src_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(src_w[2]);
- if (op->mask.bo) {
- OUT_VERTEX_F(mask_x[2] * mask_sf[0]);
- OUT_VERTEX_F(mask_y[2] * mask_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(mask_w[2]);
- }
-
- OUT_VERTEX(r->dst.x, r->dst.y + r->height);
- OUT_VERTEX_F(src_x[1] * src_sf[0]);
- OUT_VERTEX_F(src_y[1] * src_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(src_w[1]);
- if (op->mask.bo) {
- OUT_VERTEX_F(mask_x[1] * mask_sf[0]);
- OUT_VERTEX_F(mask_y[1] * mask_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(mask_w[1]);
- }
-
- OUT_VERTEX(r->dst.x, r->dst.y);
- OUT_VERTEX_F(src_x[0] * src_sf[0]);
- OUT_VERTEX_F(src_y[0] * src_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(src_w[0]);
- if (op->mask.bo) {
- OUT_VERTEX_F(mask_x[0] * mask_sf[0]);
- OUT_VERTEX_F(mask_y[0] * mask_sf[1]);
- if (!is_affine)
- OUT_VERTEX_F(mask_w[0]);
- }
-}
-
static void gen5_emit_vertex_buffer(struct sna *sna,
const struct sna_composite_op *op)
{
int id = op->u.gen5.ve_id;
- assert((unsigned)id <= 3);
+ assert((sna->render.vb_id & (1 << id)) == 0);
OUT_BATCH(GEN5_3DSTATE_VERTEX_BUFFERS | 3);
- OUT_BATCH((id << VB0_BUFFER_INDEX_SHIFT) | VB0_VERTEXDATA |
+ OUT_BATCH(id << VB0_BUFFER_INDEX_SHIFT | VB0_VERTEXDATA |
(4*op->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT));
+ assert(sna->render.nvertex_reloc < ARRAY_SIZE(sna->render.vertex_reloc));
sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
OUT_BATCH(0);
- OUT_BATCH(0);
+ OUT_BATCH(~0); /* max address: disabled */
OUT_BATCH(0);
- sna->render_state.gen5.vb_id |= 1 << id;
+ sna->render.vb_id |= 1 << id;
}
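
The OUT_BATCH() placeholders after the recorded relocation index are
rewritten once the final vbo is known; the removed gen5_vertex_close()
above shows the pattern, condensed here (the shared gen4_vertex.c
presumably does the same):

    /* Condensed from the removed gen5_vertex_close(): patch each recorded
     * batch slot with the real buffer address via a relocation. */
    for (i = 0; i < sna->render.nvertex_reloc; i++)
        sna->kgem.batch[sna->render.vertex_reloc[i]] =
            kgem_add_reloc(&sna->kgem,
                           sna->render.vertex_reloc[i], bo,
                           I915_GEM_DOMAIN_VERTEX << 16,
                           delta);
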
static void gen5_emit_primitive(struct sna *sna)
{
if (sna->kgem.nbatch == sna->render_state.gen5.last_primitive) {
- sna->render_state.gen5.vertex_offset = sna->kgem.nbatch - 5;
+ sna->render.vertex_offset = sna->kgem.nbatch - 5;
return;
}
@@ -991,7 +560,7 @@ static void gen5_emit_primitive(struct sna *sna)
(_3DPRIM_RECTLIST << GEN5_3DPRIMITIVE_TOPOLOGY_SHIFT) |
(0 << 9) |
4);
- sna->render_state.gen5.vertex_offset = sna->kgem.nbatch;
+ sna->render.vertex_offset = sna->kgem.nbatch;
OUT_BATCH(0); /* vertex count, to be filled in later */
OUT_BATCH(sna->render.vertex_index);
OUT_BATCH(1); /* single instance */
@@ -1008,18 +577,19 @@ static bool gen5_rectangle_begin(struct sna *sna,
int id = op->u.gen5.ve_id;
int ndwords;
- assert((unsigned)id <= 3);
+ if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
+ return true;
ndwords = op->need_magic_ca_pass ? 20 : 6;
- if ((sna->render_state.gen5.vb_id & (1 << id)) == 0)
+ if ((sna->render.vb_id & (1 << id)) == 0)
ndwords += 5;
if (!kgem_check_batch(&sna->kgem, ndwords))
return false;
- if ((sna->render_state.gen5.vb_id & (1 << id)) == 0)
+ if ((sna->render.vb_id & (1 << id)) == 0)
gen5_emit_vertex_buffer(sna, op);
- if (sna->render_state.gen5.vertex_offset == 0)
+ if (sna->render.vertex_offset == 0)
gen5_emit_primitive(sna);
return true;
@@ -1028,17 +598,26 @@ static bool gen5_rectangle_begin(struct sna *sna,
static int gen5_get_rectangles__flush(struct sna *sna,
const struct sna_composite_op *op)
{
+ /* Prevent discarding a new vbo after lock contention */
+ if (sna_vertex_wait__locked(&sna->render)) {
+ int rem = vertex_space(sna);
+ if (rem > op->floats_per_rect)
+ return rem;
+ }
+
if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 20 : 6))
return 0;
- if (!kgem_check_exec(&sna->kgem, 1))
- return 0;
- if (!kgem_check_reloc(&sna->kgem, 2))
+ if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
return 0;
- if (op->need_magic_ca_pass && sna->render.vbo)
- return 0;
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
+ if (gen5_magic_ca_pass(sna, op))
+ gen5_emit_pipelined_pointers(sna, op, op->op,
+ op->u.gen5.wm_kernel);
+ }
- return gen5_vertex_finish(sna);
+ return gen4_vertex_finish(sna);
}
inline static int gen5_get_rectangles(struct sna *sna,
@@ -1051,7 +630,7 @@ inline static int gen5_get_rectangles(struct sna *sna,
start:
rem = vertex_space(sna);
- if (rem < op->floats_per_rect) {
+ if (unlikely(rem < op->floats_per_rect)) {
DBG(("flushing vbo for %s: %d < %d\n",
__FUNCTION__, rem, op->floats_per_rect));
rem = gen5_get_rectangles__flush(sna, op);
@@ -1059,21 +638,22 @@ start:
goto flush;
}
- if (unlikely(sna->render_state.gen5.vertex_offset == 0 &&
+ if (unlikely(sna->render.vertex_offset == 0 &&
!gen5_rectangle_begin(sna, op)))
goto flush;
- if (want * op->floats_per_rect > rem)
+ if (want > 1 && want * op->floats_per_rect > rem)
want = rem / op->floats_per_rect;
sna->render.vertex_index += 3*want;
return want;
flush:
- if (sna->render_state.gen5.vertex_offset) {
- gen5_vertex_flush(sna);
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
gen5_magic_ca_pass(sna, op);
}
+ sna_vertex_wait__locked(&sna->render);
_kgem_submit(&sna->kgem);
emit_state(sna, op);
goto start;
@@ -1083,18 +663,15 @@ static uint32_t *
gen5_composite_get_binding_table(struct sna *sna,
uint16_t *offset)
{
- uint32_t *table;
-
sna->kgem.surface -=
sizeof(struct gen5_surface_state_padded) / sizeof(uint32_t);
- /* Clear all surplus entries to zero in case of prefetch */
- table = memset(sna->kgem.batch + sna->kgem.surface,
- 0, sizeof(struct gen5_surface_state_padded));
- *offset = sna->kgem.surface;
DBG(("%s(%x)\n", __FUNCTION__, 4*sna->kgem.surface));
- return table;
+ /* Clear all surplus entries to zero in case of prefetch */
+ *offset = sna->kgem.surface;
+ return memset(sna->kgem.batch + sna->kgem.surface,
+ 0, sizeof(struct gen5_surface_state_padded));
}
static void
@@ -1181,9 +758,9 @@ gen5_emit_invariant(struct sna *sna)
}
static void
-gen5_get_batch(struct sna *sna)
+gen5_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 4)) {
DBG(("%s: flushing batch: %d < %d+%d\n",
@@ -1200,9 +777,10 @@ gen5_get_batch(struct sna *sna)
static void
gen5_align_vertex(struct sna *sna, const struct sna_composite_op *op)
{
+ assert(op->floats_per_rect == 3*op->floats_per_vertex);
if (op->floats_per_vertex != sna->render_state.gen5.floats_per_vertex) {
if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
- gen5_vertex_finish(sna);
+ gen4_vertex_finish(sna);
DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
sna->render_state.gen5.floats_per_vertex,
@@ -1215,12 +793,12 @@ gen5_align_vertex(struct sna *sna, const struct sna_composite_op *op)
}
}
-static bool
+static void
gen5_emit_binding_table(struct sna *sna, uint16_t offset)
{
if (!DBG_NO_STATE_CACHE &&
sna->render_state.gen5.surface_table == offset)
- return false;
+ return;
sna->render_state.gen5.surface_table = offset;
@@ -1232,8 +810,6 @@ gen5_emit_binding_table(struct sna *sna, uint16_t offset)
OUT_BATCH(0); /* sf */
/* Only the PS uses the binding table */
OUT_BATCH(offset*4);
-
- return true;
}
static bool
@@ -1241,33 +817,36 @@ gen5_emit_pipelined_pointers(struct sna *sna,
const struct sna_composite_op *op,
int blend, int kernel)
{
- uint16_t offset = sna->kgem.nbatch, last;
+ uint16_t sp, bp;
+ uint32_t key;
+
+ DBG(("%s: has_mask=%d, src=(%d, %d), mask=(%d, %d),kernel=%d, blend=%d, ca=%d, format=%x\n",
+ __FUNCTION__, op->u.gen5.ve_id & 2,
+ op->src.filter, op->src.repeat,
+ op->mask.filter, op->mask.repeat,
+ kernel, blend, op->has_component_alpha, (int)op->dst.format));
+
+ sp = SAMPLER_OFFSET(op->src.filter, op->src.repeat,
+ op->mask.filter, op->mask.repeat,
+ kernel);
+ bp = gen5_get_blend(blend, op->has_component_alpha, op->dst.format);
+
+ DBG(("%s: sp=%d, bp=%d\n", __FUNCTION__, sp, bp));
+ key = sp | (uint32_t)bp << 16 | (op->mask.bo != NULL) << 31;
+ if (key == sna->render_state.gen5.last_pipelined_pointers)
+ return false;
+
OUT_BATCH(GEN5_3DSTATE_PIPELINED_POINTERS | 5);
OUT_BATCH(sna->render_state.gen5.vs);
OUT_BATCH(GEN5_GS_DISABLE); /* passthrough */
OUT_BATCH(GEN5_CLIP_DISABLE); /* passthrough */
OUT_BATCH(sna->render_state.gen5.sf[op->mask.bo != NULL]);
- OUT_BATCH(sna->render_state.gen5.wm +
- SAMPLER_OFFSET(op->src.filter, op->src.repeat,
- op->mask.filter, op->mask.repeat,
- kernel));
- OUT_BATCH(sna->render_state.gen5.cc +
- gen5_get_blend(blend, op->has_component_alpha, op->dst.format));
-
- last = sna->render_state.gen5.last_pipelined_pointers;
- if (!DBG_NO_STATE_CACHE && last &&
- sna->kgem.batch[offset + 1] == sna->kgem.batch[last + 1] &&
- sna->kgem.batch[offset + 3] == sna->kgem.batch[last + 3] &&
- sna->kgem.batch[offset + 4] == sna->kgem.batch[last + 4] &&
- sna->kgem.batch[offset + 5] == sna->kgem.batch[last + 5] &&
- sna->kgem.batch[offset + 6] == sna->kgem.batch[last + 6]) {
- sna->kgem.nbatch = offset;
- return false;
- } else {
- sna->render_state.gen5.last_pipelined_pointers = offset;
- return true;
- }
+ OUT_BATCH(sna->render_state.gen5.wm + sp);
+ OUT_BATCH(sna->render_state.gen5.cc + bp);
+
+ sna->render_state.gen5.last_pipelined_pointers = key;
+ return true;
}
static void
@@ -1304,28 +883,16 @@ gen5_emit_vertex_elements(struct sna *sna,
* texture coordinate 1 if (has_mask is true): same as above
*/
struct gen5_render_state *render = &sna->render_state.gen5;
- bool has_mask = op->mask.bo != NULL;
- bool is_affine = op->is_affine;
- int nelem = has_mask ? 2 : 1;
- int selem = is_affine ? 2 : 3;
- uint32_t w_component;
- uint32_t src_format;
int id = op->u.gen5.ve_id;
+ bool has_mask = id >> 2;
+ uint32_t format, dw;
- assert((unsigned)id <= 3);
if (!DBG_NO_STATE_CACHE && render->ve_id == id)
return;
+ DBG(("%s: changing %d -> %d\n", __FUNCTION__, render->ve_id, id));
render->ve_id = id;
- if (is_affine) {
- src_format = GEN5_SURFACEFORMAT_R32G32_FLOAT;
- w_component = GEN5_VFCOMPONENT_STORE_1_FLT;
- } else {
- src_format = GEN5_SURFACEFORMAT_R32G32B32_FLOAT;
- w_component = GEN5_VFCOMPONENT_STORE_SRC;
- }
-
/* The VUE layout
* dword 0-3: pad (0.0, 0.0, 0.0. 0.0)
* dword 4-7: position (x, y, 1.0, 1.0),
@@ -1335,43 +902,92 @@ gen5_emit_vertex_elements(struct sna *sna,
* dword 4-15 are fetched from vertex buffer
*/
OUT_BATCH(GEN5_3DSTATE_VERTEX_ELEMENTS |
- ((2 * (2 + nelem)) + 1 - 2));
+ ((2 * (has_mask ? 4 : 3)) + 1 - 2));
OUT_BATCH((id << VE0_VERTEX_BUFFER_INDEX_SHIFT) | VE0_VALID |
(GEN5_SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT) |
(0 << VE0_OFFSET_SHIFT));
- OUT_BATCH((GEN5_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT));
+ OUT_BATCH((VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT) |
+ (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT) |
+ (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT) |
+ (VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT));
/* x,y */
- OUT_BATCH((id << VE0_VERTEX_BUFFER_INDEX_SHIFT) | VE0_VALID |
- (GEN5_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT) |
- (0 << VE0_OFFSET_SHIFT)); /* offsets vb in bytes */
- OUT_BATCH((GEN5_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT));
+ OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
+ GEN5_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
+ 0 << VE0_OFFSET_SHIFT);
+ OUT_BATCH(VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
+ VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
+ VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT |
+ VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
/* u0, v0, w0 */
- OUT_BATCH((id << VE0_VERTEX_BUFFER_INDEX_SHIFT) | VE0_VALID |
- (src_format << VE0_FORMAT_SHIFT) |
- (4 << VE0_OFFSET_SHIFT)); /* offset vb in bytes */
- OUT_BATCH((GEN5_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
- (w_component << VE1_VFCOMPONENT_2_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT));
+ DBG(("%s: id=%d, first channel %d floats, offset=4b\n", __FUNCTION__,
+ id, id & 3));
+ dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
+ switch (id & 3) {
+ default:
+ assert(0);
+ case 0:
+ format = GEN5_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ case 1:
+ format = GEN5_SURFACEFORMAT_R32_FLOAT << VE0_FORMAT_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ case 2:
+ format = GEN5_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ case 3:
+ format = GEN5_SURFACEFORMAT_R32G32B32_FLOAT << VE0_FORMAT_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ }
+ OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
+ format | 4 << VE0_OFFSET_SHIFT);
+ OUT_BATCH(dw);
/* u1, v1, w1 */
if (has_mask) {
- OUT_BATCH((id << VE0_VERTEX_BUFFER_INDEX_SHIFT) | VE0_VALID |
- (src_format << VE0_FORMAT_SHIFT) |
- (((1 + selem) * 4) << VE0_OFFSET_SHIFT)); /* vb offset in bytes */
- OUT_BATCH((GEN5_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT) |
- (w_component << VE1_VFCOMPONENT_2_SHIFT) |
- (GEN5_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT));
+ unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float);
+ DBG(("%s: id=%x, second channel %d floats, offset=%db\n", __FUNCTION__,
+ id, id >> 2, offset));
+ dw = VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
+ switch (id >> 2) {
+ case 1:
+ format = GEN5_SURFACEFORMAT_R32_FLOAT << VE0_FORMAT_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ default:
+ assert(0);
+ case 2:
+ format = GEN5_SURFACEFORMAT_R32G32_FLOAT << VE0_FORMAT_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ case 3:
+ format = GEN5_SURFACEFORMAT_R32G32B32_FLOAT << VE0_FORMAT_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_2_SHIFT;
+ break;
+ }
+ OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
+ format | offset << VE0_OFFSET_SHIFT);
+ OUT_BATCH(dw);
}
}
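
Both switches above map a channel's float count onto a vertex-element
surface format; condensed into one hypothetical helper:

    /* VE format per texcoord float count, as selected above. */
    static uint32_t ve_format_for_floats(int nfloats)
    {
        switch (nfloats) {
        case 1: return GEN5_SURFACEFORMAT_R32_FLOAT;
        case 2: return GEN5_SURFACEFORMAT_R32G32_FLOAT;
        case 3: return GEN5_SURFACEFORMAT_R32G32B32_FLOAT;
        default: return GEN5_SURFACEFORMAT_R16G16_SSCALED; /* packed x,y */
        }
    }
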
@@ -1380,23 +996,21 @@ gen5_emit_state(struct sna *sna,
const struct sna_composite_op *op,
uint16_t offset)
{
- bool flush;
+ if (kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
+ DBG(("%s: flushing dirty (%d, %d)\n", __FUNCTION__,
+ kgem_bo_is_dirty(op->src.bo),
+ kgem_bo_is_dirty(op->mask.bo)));
+ OUT_BATCH(MI_FLUSH);
+ kgem_clear_dirty(&sna->kgem);
+ kgem_bo_mark_dirty(op->dst.bo);
+ }
/* drawrect must be first for Ironlake BLT workaround */
gen5_emit_drawing_rectangle(sna, op);
-
- flush = gen5_emit_binding_table(sna, offset);
- if (gen5_emit_pipelined_pointers(sna, op, op->op, op->u.gen5.wm_kernel)) {
+ gen5_emit_binding_table(sna, offset);
+ if (gen5_emit_pipelined_pointers(sna, op, op->op, op->u.gen5.wm_kernel))
gen5_emit_urb(sna);
- flush = true;
- }
gen5_emit_vertex_elements(sna, op);
-
- if (flush || kgem_bo_is_dirty(op->src.bo) || kgem_bo_is_dirty(op->mask.bo)) {
- OUT_BATCH(MI_FLUSH);
- kgem_clear_dirty(&sna->kgem);
- kgem_bo_mark_dirty(op->dst.bo);
- }
}
static void gen5_bind_surfaces(struct sna *sna,
@@ -1405,7 +1019,7 @@ static void gen5_bind_surfaces(struct sna *sna,
uint32_t *binding_table;
uint16_t offset;
- gen5_get_batch(sna);
+ gen5_get_batch(sna, op);
binding_table = gen5_composite_get_binding_table(sna, &offset);
@@ -1419,7 +1033,8 @@ static void gen5_bind_surfaces(struct sna *sna,
op->src.bo, op->src.width, op->src.height,
op->src.card_format,
false);
- if (op->mask.bo)
+ if (op->mask.bo) {
+ assert(op->u.gen5.ve_id >> 2);
binding_table[2] =
gen5_bind_bo(sna,
op->mask.bo,
@@ -1427,6 +1042,7 @@ static void gen5_bind_surfaces(struct sna *sna,
op->mask.height,
op->mask.card_format,
false);
+ }
if (sna->kgem.surface == offset &&
*(uint64_t *)(sna->kgem.batch + sna->render_state.gen5.surface_table) == *(uint64_t*)binding_table &&
@@ -1478,9 +1094,9 @@ gen5_render_composite_box(struct sna *sna,
}
static void
-gen5_render_composite_boxes(struct sna *sna,
- const struct sna_composite_op *op,
- const BoxRec *box, int nbox)
+gen5_render_composite_boxes__blt(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
{
DBG(("%s(%d) delta=(%d, %d), src=(%d, %d)/(%d, %d), mask=(%d, %d)/(%d, %d)\n",
__FUNCTION__, nbox, op->dst.x, op->dst.y,
@@ -1514,6 +1130,62 @@ gen5_render_composite_boxes(struct sna *sna,
} while (nbox);
}
+static void
+gen5_render_composite_boxes(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen5_get_rectangles(sna, op, nbox,
+ gen5_bind_surfaces);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+ } while (nbox);
+}
+
+static void
+gen5_render_composite_boxes__thread(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen5_get_rectangles(sna, op, nbox,
+ gen5_bind_surfaces);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
+
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
@@ -1559,9 +1231,8 @@ static void gen5_video_bind_surfaces(struct sna *sna,
int src_height[6];
int src_pitch[6];
uint32_t *binding_table;
- int n_src, n;
uint16_t offset;
-
+ int n_src, n;
src_surf_base[0] = 0;
src_surf_base[1] = 0;
@@ -1594,9 +1265,9 @@ static void gen5_video_bind_surfaces(struct sna *sna,
n_src = 1;
}
- gen5_get_batch(sna);
- binding_table = gen5_composite_get_binding_table(sna, &offset);
+ gen5_get_batch(sna, op);
+ binding_table = gen5_composite_get_binding_table(sna, &offset);
binding_table[0] =
gen5_bind_bo(sna,
op->dst.bo, op->dst.width, op->dst.height,
@@ -1623,10 +1294,11 @@ gen5_render_video(struct sna *sna,
RegionPtr dstRegion,
short src_w, short src_h,
short drw_w, short drw_h,
+ short dx, short dy,
PixmapPtr pixmap)
{
struct sna_composite_op tmp;
- int nbox, dxo, dyo, pix_xoff, pix_yoff;
+ int nbox, pix_xoff, pix_yoff;
float src_scale_x, src_scale_y;
struct sna_pixmap *priv;
BoxPtr box;
@@ -1646,13 +1318,16 @@ gen5_render_video(struct sna *sna,
tmp.dst.format = sna_format_for_depth(pixmap->drawable.depth);
tmp.dst.bo = priv->gpu_bo;
- tmp.src.filter = SAMPLER_FILTER_BILINEAR;
+ if (src_w == drw_w && src_h == drw_h)
+ tmp.src.filter = SAMPLER_FILTER_NEAREST;
+ else
+ tmp.src.filter = SAMPLER_FILTER_BILINEAR;
tmp.src.repeat = SAMPLER_EXTEND_PAD;
tmp.src.bo = frame->bo;
tmp.mask.bo = NULL;
tmp.u.gen5.wm_kernel =
is_planar_fourcc(frame->id) ? WM_KERNEL_VIDEO_PLANAR : WM_KERNEL_VIDEO_PACKED;
- tmp.u.gen5.ve_id = 1;
+ tmp.u.gen5.ve_id = 2;
tmp.is_affine = true;
tmp.floats_per_vertex = 3;
tmp.floats_per_rect = 9;
@@ -1677,9 +1352,6 @@ gen5_render_video(struct sna *sna,
pix_yoff = 0;
#endif
- dxo = dstRegion->extents.x1;
- dyo = dstRegion->extents.y1;
-
/* Use normalized texture coordinates */
src_scale_x = ((float)src_w / frame->width) / (float)drw_w;
src_scale_y = ((float)src_h / frame->height) / (float)drw_h;
@@ -1697,16 +1369,16 @@ gen5_render_video(struct sna *sna,
gen5_get_rectangles(sna, &tmp, 1, gen5_video_bind_surfaces);
OUT_VERTEX(r.x2, r.y2);
- OUT_VERTEX_F((box->x2 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y2 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x2 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y2 - dy) * src_scale_y);
OUT_VERTEX(r.x1, r.y2);
- OUT_VERTEX_F((box->x1 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y2 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x1 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y2 - dy) * src_scale_y);
OUT_VERTEX(r.x1, r.y1);
- OUT_VERTEX_F((box->x1 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y1 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x1 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y1 - dy) * src_scale_y);
if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
sna_damage_add_box(&priv->gpu_damage, &r);
@@ -1716,146 +1388,11 @@ gen5_render_video(struct sna *sna,
}
priv->clear = false;
- gen5_vertex_flush(sna);
+ gen4_vertex_flush(sna);
return true;
}
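
The video path folds the src-to-drawable scaling and the normalization
into a single factor per axis; a sketch of the texcoord math used above:

    /* Map a destination box edge to a normalized source coordinate. */
    static inline float video_texcoord(int box_edge, int dst_origin,
                                       int src_size, int frame_size,
                                       int drw_size)
    {
        float scale = ((float)src_size / frame_size) / (float)drw_size;
        return (box_edge - dst_origin) * scale;
    }
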
static int
-gen5_composite_solid_init(struct sna *sna,
- struct sna_composite_channel *channel,
- uint32_t color)
-{
- channel->filter = PictFilterNearest;
- channel->repeat = RepeatNormal;
- channel->is_affine = true;
- channel->is_solid = true;
- channel->transform = NULL;
- channel->width = 1;
- channel->height = 1;
- channel->card_format = GEN5_SURFACEFORMAT_B8G8R8A8_UNORM;
-
- channel->bo = sna_render_get_solid(sna, color);
-
- channel->scale[0] = channel->scale[1] = 1;
- channel->offset[0] = channel->offset[1] = 0;
- return channel->bo != NULL;
-}
-
-static bool
-gen5_composite_linear_init(struct sna *sna,
- PicturePtr picture,
- struct sna_composite_channel *channel,
- int x, int y,
- int w, int h,
- int dst_x, int dst_y)
-{
- PictLinearGradient *linear =
- (PictLinearGradient *)picture->pSourcePict;
- pixman_fixed_t tx, ty;
- float x0, y0, sf;
- float dx, dy;
-
- DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
- __FUNCTION__,
- pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
- pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
- x, y, dst_x, dst_y, w, h));
-
- if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
- return 0;
-
- if (!sna_transform_is_affine(picture->transform)) {
- DBG(("%s: fallback due to projective transform\n",
- __FUNCTION__));
- return sna_render_picture_fixup(sna, picture, channel,
- x, y, w, h, dst_x, dst_y);
- }
-
- channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
- if (!channel->bo)
- return 0;
-
- channel->filter = PictFilterNearest;
- channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
- channel->width = channel->bo->pitch / 4;
- channel->height = 1;
- channel->pict_format = PICT_a8r8g8b8;
-
- channel->scale[0] = channel->scale[1] = 1;
- channel->offset[0] = channel->offset[1] = 0;
-
- if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
- dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
- dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
-
- x0 = pixman_fixed_to_double(linear->p1.x);
- y0 = pixman_fixed_to_double(linear->p1.y);
-
- if (tx | ty) {
- x0 -= pixman_fixed_to_double(tx);
- y0 -= pixman_fixed_to_double(ty);
- }
- } else {
- struct pixman_f_vector p1, p2;
- struct pixman_f_transform m, inv;
-
- pixman_f_transform_from_pixman_transform(&m, picture->transform);
- DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
- __FUNCTION__,
- m.m[0][0], m.m[0][1], m.m[0][2],
- m.m[1][0], m.m[1][1], m.m[1][2],
- m.m[2][0], m.m[2][1], m.m[2][2]));
- if (!pixman_f_transform_invert(&inv, &m))
- return 0;
-
- p1.v[0] = pixman_fixed_to_double(linear->p1.x);
- p1.v[1] = pixman_fixed_to_double(linear->p1.y);
- p1.v[2] = 1.;
- pixman_f_transform_point(&inv, &p1);
-
- p2.v[0] = pixman_fixed_to_double(linear->p2.x);
- p2.v[1] = pixman_fixed_to_double(linear->p2.y);
- p2.v[2] = 1.;
- pixman_f_transform_point(&inv, &p2);
-
- DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
- __FUNCTION__,
- p1.v[0], p1.v[1], p1.v[2],
- p2.v[0], p2.v[1], p2.v[2]));
-
- dx = p2.v[0] - p1.v[0];
- dy = p2.v[1] - p1.v[1];
-
- x0 = p1.v[0];
- y0 = p1.v[1];
- }
-
- sf = dx*dx + dy*dy;
- dx /= sf;
- dy /= sf;
-
- channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
- channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
- channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
-
- channel->embedded_transform.matrix[1][0] = 0;
- channel->embedded_transform.matrix[1][1] = 0;
- channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
-
- channel->embedded_transform.matrix[2][0] = 0;
- channel->embedded_transform.matrix[2][1] = 0;
- channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
-
- channel->transform = &channel->embedded_transform;
- channel->is_affine = 1;
-
- DBG(("%s: dx=%f, dy=%f, offset=%f\n",
- __FUNCTION__, dx, dy, -dx*(x0-x+dst_x) + -dy*(y0-y+dst_y)));
-
- return channel->bo != NULL;
-}
-
-static int
gen5_composite_picture(struct sna *sna,
PicturePtr picture,
struct sna_composite_channel *channel,
@@ -1875,16 +1412,16 @@ gen5_composite_picture(struct sna *sna,
channel->card_format = -1;
if (sna_picture_is_solid(picture, &color))
- return gen5_composite_solid_init(sna, channel, color);
+ return gen4_channel_init_solid(sna, channel, color);
if (picture->pDrawable == NULL) {
int ret;
if (picture->pSourcePict->type == SourcePictTypeLinear)
- return gen5_composite_linear_init(sna, picture, channel,
- x, y,
- w, h,
- dst_x, dst_y);
+ return gen4_channel_init_linear(sna, picture, channel,
+ x, y,
+ w, h,
+ dst_x, dst_y);
DBG(("%s -- fixup, gradient\n", __FUNCTION__));
ret = -1;
@@ -1935,7 +1472,8 @@ gen5_composite_picture(struct sna *sna,
channel->card_format = gen5_get_card_format(picture->format);
if (channel->card_format == -1)
return sna_render_picture_convert(sna, picture, channel, pixmap,
- x, y, w, h, dst_x, dst_y);
+ x, y, w, h, dst_x, dst_y,
+ false);
if (too_large(pixmap->drawable.width, pixmap->drawable.height))
return sna_render_picture_extract(sna, picture, channel,
@@ -1957,8 +1495,8 @@ static void
gen5_render_composite_done(struct sna *sna,
const struct sna_composite_op *op)
{
- if (sna->render_state.gen5.vertex_offset) {
- gen5_vertex_flush(sna);
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
gen5_magic_ca_pass(sna,op);
}
@@ -1981,10 +1519,9 @@ gen5_composite_set_target(struct sna *sna,
BoxRec box;
op->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
- op->dst.format = dst->format;
- op->dst.width = op->dst.pixmap->drawable.width;
+ op->dst.width = op->dst.pixmap->drawable.width;
op->dst.height = op->dst.pixmap->drawable.height;
-
+ op->dst.format = dst->format;
if (w && h) {
box.x1 = x;
box.y1 = y;
@@ -2019,15 +1556,6 @@ gen5_composite_set_target(struct sna *sna,
return true;
}
-static inline bool
-picture_is_cpu(PicturePtr picture)
-{
- if (!picture->pDrawable)
- return false;
-
- return !is_gpu(picture->pDrawable);
-}
-
static bool
try_blt(struct sna *sna,
PicturePtr dst, PicturePtr src,
@@ -2052,7 +1580,7 @@ try_blt(struct sna *sna,
return true;
/* is the source picture only in cpu memory e.g. a shm pixmap? */
- return picture_is_cpu(src);
+ return picture_is_cpu(sna, src);
}
static bool
@@ -2077,15 +1605,10 @@ has_alphamap(PicturePtr p)
}
static bool
-untransformed(PicturePtr p)
+need_upload(struct sna *sna, PicturePtr p)
{
- return !p->transform || pixman_transform_is_int_translate(p->transform);
-}
-
-static bool
-need_upload(PicturePtr p)
-{
- return p->pDrawable && untransformed(p) && !is_gpu(p->pDrawable);
+ return p->pDrawable && untransformed(p) &&
+ !is_gpu(sna, p->pDrawable, PREFER_GPU_RENDER);
}
static bool
@@ -2108,7 +1631,7 @@ source_is_busy(PixmapPtr pixmap)
}
static bool
-source_fallback(PicturePtr p, PixmapPtr pixmap)
+source_fallback(struct sna *sna, PicturePtr p, PixmapPtr pixmap)
{
if (sna_picture_is_solid(p, NULL))
return false;
@@ -2121,7 +1644,7 @@ source_fallback(PicturePtr p, PixmapPtr pixmap)
if (pixmap && source_is_busy(pixmap))
return false;
- return has_alphamap(p) || !gen5_check_filter(p) || need_upload(p);
+ return has_alphamap(p) || !gen5_check_filter(p) || need_upload(sna, p);
}
static bool
@@ -2130,7 +1653,6 @@ gen5_composite_fallback(struct sna *sna,
PicturePtr mask,
PicturePtr dst)
{
- struct sna_pixmap *priv;
PixmapPtr src_pixmap;
PixmapPtr mask_pixmap;
PixmapPtr dst_pixmap;
@@ -2145,11 +1667,11 @@ gen5_composite_fallback(struct sna *sna,
dst_pixmap = get_drawable_pixmap(dst->pDrawable);
src_pixmap = src->pDrawable ? get_drawable_pixmap(src->pDrawable) : NULL;
- src_fallback = source_fallback(src, src_pixmap);
+ src_fallback = source_fallback(sna, src, src_pixmap);
if (mask) {
mask_pixmap = mask->pDrawable ? get_drawable_pixmap(mask->pDrawable) : NULL;
- mask_fallback = source_fallback(mask, mask_pixmap);
+ mask_fallback = source_fallback(sna, mask, mask_pixmap);
} else {
mask_pixmap = NULL;
mask_fallback = false;
@@ -2169,8 +1691,7 @@ gen5_composite_fallback(struct sna *sna,
}
/* If anything is on the GPU, push everything out to the GPU */
- priv = sna_pixmap(dst_pixmap);
- if (priv && priv->gpu_damage && !priv->clear) {
+ if (dst_use_gpu(dst_pixmap)) {
DBG(("%s: dst is already on the GPU, try to use GPU\n",
__FUNCTION__));
return false;
@@ -2205,14 +1726,14 @@ gen5_composite_fallback(struct sna *sna,
if (too_large(dst_pixmap->drawable.width,
dst_pixmap->drawable.height) &&
- (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage))) {
+ dst_is_cpu(dst_pixmap)) {
DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
return true;
}
DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
__FUNCTION__));
- return false;
+ return dst_use_cpu(dst_pixmap);
}
static int
@@ -2233,7 +1754,7 @@ reuse_source(struct sna *sna,
}
if (sna_picture_is_solid(mask, &color))
- return gen5_composite_solid_init(sna, mc, color);
+ return gen4_channel_init_solid(sna, mc, color);
if (sc->is_solid)
return false;
@@ -2318,6 +1839,7 @@ gen5_render_composite(struct sna *sna,
}
DBG(("%s: preparing source\n", __FUNCTION__));
+ tmp->op = op;
switch (gen5_composite_picture(sna, src, &tmp->src,
src_x, src_y,
width, height,
@@ -2327,7 +1849,7 @@ gen5_render_composite(struct sna *sna,
DBG(("%s: failed to prepare source picture\n", __FUNCTION__));
goto cleanup_dst;
case 0:
- if (!gen5_composite_solid_init(sna, &tmp->src, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->src, 0))
goto cleanup_dst;
/* fall through to fixup */
case 1:
@@ -2341,12 +1863,10 @@ gen5_render_composite(struct sna *sna,
break;
}
- tmp->op = op;
tmp->is_affine = tmp->src.is_affine;
tmp->has_component_alpha = false;
tmp->need_magic_ca_pass = false;
- tmp->prim_emit = gen5_emit_composite_primitive;
if (mask) {
if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
tmp->has_component_alpha = true;
@@ -2380,7 +1900,7 @@ gen5_render_composite(struct sna *sna,
DBG(("%s: failed to prepare mask picture\n", __FUNCTION__));
goto cleanup_src;
case 0:
- if (!gen5_composite_solid_init(sna, &tmp->mask, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
goto cleanup_src;
/* fall through to fixup */
case 1:
@@ -2390,33 +1910,22 @@ gen5_render_composite(struct sna *sna,
}
tmp->is_affine &= tmp->mask.is_affine;
-
- if (tmp->src.transform == NULL && tmp->mask.transform == NULL)
- tmp->prim_emit = gen5_emit_composite_primitive_identity_source_mask;
-
- tmp->floats_per_vertex = 5 + 2 * !tmp->is_affine;
- } else {
- if (tmp->src.is_solid)
- tmp->prim_emit = gen5_emit_composite_primitive_solid;
- else if (tmp->src.transform == NULL)
- tmp->prim_emit = gen5_emit_composite_primitive_identity_source;
- else if (tmp->src.is_affine)
- tmp->prim_emit = gen5_emit_composite_primitive_affine_source;
-
- tmp->floats_per_vertex = 3 + !tmp->is_affine;
}
- tmp->floats_per_rect = 3*tmp->floats_per_vertex;
tmp->u.gen5.wm_kernel =
gen5_choose_composite_kernel(tmp->op,
tmp->mask.bo != NULL,
tmp->has_component_alpha,
tmp->is_affine);
- tmp->u.gen5.ve_id = (tmp->mask.bo != NULL) << 1 | tmp->is_affine;
+ tmp->u.gen5.ve_id = gen4_choose_composite_emitter(tmp);
tmp->blt = gen5_render_composite_blt;
tmp->box = gen5_render_composite_box;
- tmp->boxes = gen5_render_composite_boxes;
+ tmp->boxes = gen5_render_composite_boxes__blt;
+ if (tmp->emit_boxes) {
+ tmp->boxes = gen5_render_composite_boxes;
+ tmp->thread_boxes = gen5_render_composite_boxes__thread;
+ }
tmp->done = gen5_render_composite_done;
if (!kgem_check_bo(&sna->kgem,
@@ -2444,125 +1953,6 @@ cleanup_dst:
}
#if !NO_COMPOSITE_SPANS
-inline static void
-gen5_emit_composite_texcoord(struct sna *sna,
- const struct sna_composite_channel *channel,
- int16_t x, int16_t y)
-{
- float t[3];
-
- if (channel->is_affine) {
- sna_get_transformed_coordinates(x + channel->offset[0],
- y + channel->offset[1],
- channel->transform,
- &t[0], &t[1]);
- OUT_VERTEX_F(t[0] * channel->scale[0]);
- OUT_VERTEX_F(t[1] * channel->scale[1]);
- } else {
- t[0] = t[1] = 0; t[2] = 1;
- sna_get_transformed_coordinates_3d(x + channel->offset[0],
- y + channel->offset[1],
- channel->transform,
- &t[0], &t[1], &t[2]);
- OUT_VERTEX_F(t[0] * channel->scale[0]);
- OUT_VERTEX_F(t[1] * channel->scale[1]);
- OUT_VERTEX_F(t[2]);
- }
-}
-
-inline static void
-gen5_emit_composite_texcoord_affine(struct sna *sna,
- const struct sna_composite_channel *channel,
- int16_t x, int16_t y)
-{
- float t[2];
-
- sna_get_transformed_coordinates(x + channel->offset[0],
- y + channel->offset[1],
- channel->transform,
- &t[0], &t[1]);
- OUT_VERTEX_F(t[0] * channel->scale[0]);
- OUT_VERTEX_F(t[1] * channel->scale[1]);
-}
-
-inline static void
-gen5_emit_composite_spans_vertex(struct sna *sna,
- const struct sna_composite_spans_op *op,
- int16_t x, int16_t y)
-{
- OUT_VERTEX(x, y);
- gen5_emit_composite_texcoord(sna, &op->base.src, x, y);
-}
-
-fastcall static void
-gen5_emit_composite_spans_primitive(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- gen5_emit_composite_spans_vertex(sna, op, box->x2, box->y2);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(1);
- if (!op->base.is_affine)
- OUT_VERTEX_F(1);
-
- gen5_emit_composite_spans_vertex(sna, op, box->x1, box->y2);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(1);
- if (!op->base.is_affine)
- OUT_VERTEX_F(1);
-
- gen5_emit_composite_spans_vertex(sna, op, box->x1, box->y1);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(0);
- if (!op->base.is_affine)
- OUT_VERTEX_F(1);
-}
-
-fastcall static void
-gen5_emit_composite_spans_solid(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- OUT_VERTEX(box->x2, box->y2);
- OUT_VERTEX_F(1); OUT_VERTEX_F(1);
- OUT_VERTEX_F(opacity); OUT_VERTEX_F(1);
-
- OUT_VERTEX(box->x1, box->y2);
- OUT_VERTEX_F(0); OUT_VERTEX_F(1);
- OUT_VERTEX_F(opacity); OUT_VERTEX_F(1);
-
- OUT_VERTEX(box->x1, box->y1);
- OUT_VERTEX_F(0); OUT_VERTEX_F(0);
- OUT_VERTEX_F(opacity); OUT_VERTEX_F(0);
-}
-
-fastcall static void
-gen5_emit_composite_spans_affine(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- OUT_VERTEX(box->x2, box->y2);
- gen5_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x2, box->y2);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(1);
-
- OUT_VERTEX(box->x1, box->y2);
- gen5_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x1, box->y2);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(1);
-
- OUT_VERTEX(box->x1, box->y1);
- gen5_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x1, box->y1);
- OUT_VERTEX_F(opacity);
- OUT_VERTEX_F(0);
-}
-
fastcall static void
gen5_render_composite_spans_box(struct sna *sna,
const struct sna_composite_spans_op *op,
@@ -2612,18 +2002,51 @@ gen5_render_composite_spans_boxes(struct sna *sna,
}
fastcall static void
+gen5_render_composite_spans_boxes__thread(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *box,
+ int nbox)
+{
+ DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+ __FUNCTION__, nbox,
+ op->base.src.offset[0], op->base.src.offset[1],
+ op->base.dst.x, op->base.dst.y));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen5_get_rectangles(sna, &op->base, nbox,
+ gen5_bind_surfaces);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
+
+fastcall static void
gen5_render_composite_spans_done(struct sna *sna,
const struct sna_composite_spans_op *op)
{
- if (sna->render_state.gen5.vertex_offset)
- gen5_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
DBG(("%s()\n", __FUNCTION__));
- kgem_bo_destroy(&sna->kgem, op->base.mask.bo);
- if (op->base.src.bo)
- kgem_bo_destroy(&sna->kgem, op->base.src.bo);
-
+ kgem_bo_destroy(&sna->kgem, op->base.src.bo);
sna_render_composite_redirect_done(sna, &op->base);
}
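
/* The __thread span emitter added above reserves space in the shared vertex
 * buffer while holding sna_vertex_lock, then drops the lock for the actual
 * vertex writes, bracketing them with acquire/release so a concurrent flush
 * can wait for outstanding writers. A minimal sketch of that protocol in
 * plain pthreads (hypothetical, simplified types; the real counters live in
 * struct sna_render): */

#include <pthread.h>

struct render_shared {
	pthread_mutex_t lock;	/* PTHREAD_MUTEX_INITIALIZER */
	pthread_cond_t wait;	/* PTHREAD_COND_INITIALIZER */
	int active;		/* writers currently emitting vertices */
	int vertex_used;	/* floats reserved so far */
	float vertices[64*1024];
};

static float *reserve_vertices(struct render_shared *r, int nfloats)
{
	float *v;

	pthread_mutex_lock(&r->lock);
	v = r->vertices + r->vertex_used;	/* reserve under the lock */
	r->vertex_used += nfloats;
	r->active++;				/* acquire: writer is live */
	pthread_mutex_unlock(&r->lock);

	return v;	/* filled in outside the lock */
}

static void release_vertices(struct render_shared *r)
{
	pthread_mutex_lock(&r->lock);
	if (--r->active == 0)
		pthread_cond_broadcast(&r->wait);	/* wake any flusher */
	pthread_mutex_unlock(&r->lock);
}

static void wait_for_writers(struct render_shared *r)
{
	pthread_mutex_lock(&r->lock);
	while (r->active)
		pthread_cond_wait(&r->wait, &r->lock);
	pthread_mutex_unlock(&r->lock);
}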
@@ -2633,21 +2056,39 @@ gen5_check_composite_spans(struct sna *sna,
int16_t width, int16_t height,
unsigned flags)
{
- if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0)
- return false;
+ DBG(("%s: op=%d, width=%d, height=%d, flags=%x\n",
+ __FUNCTION__, op, width, height, flags));
if (op >= ARRAY_SIZE(gen5_blend_op))
return false;
- if (gen5_composite_fallback(sna, src, NULL, dst))
+ if (gen5_composite_fallback(sna, src, NULL, dst)) {
+ DBG(("%s: operation would fallback\n", __FUNCTION__));
return false;
+ }
- if (need_tiling(sna, width, height)) {
- if (!is_gpu(dst->pDrawable)) {
- DBG(("%s: fallback, tiled operation not on GPU\n",
- __FUNCTION__));
+ if (need_tiling(sna, width, height) &&
+ !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
+ DBG(("%s: fallback, tiled operation not on GPU\n",
+ __FUNCTION__));
+ return false;
+ }
+
+ if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0) {
+ struct sna_pixmap *priv = sna_pixmap_from_drawable(dst->pDrawable);
+ assert(priv);
+
+ if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+ return true;
+
+ if (flags & COMPOSITE_SPANS_INPLACE_HINT)
return false;
- }
+
+ if ((sna->render.prefer_gpu & PREFER_GPU_SPANS) == 0 &&
+ dst->format == PICT_a8)
+ return false;
+
+ return priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo);
}
return true;
@@ -2690,7 +2131,7 @@ gen5_render_composite_spans(struct sna *sna,
case -1:
goto cleanup_dst;
case 0:
- if (!gen5_composite_solid_init(sna, &tmp->base.src, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
goto cleanup_dst;
/* fall through to fixup */
case 1:
@@ -2698,27 +2139,19 @@ gen5_render_composite_spans(struct sna *sna,
break;
}
- tmp->base.mask.bo = sna_render_get_solid(sna, 0);
- if (tmp->base.mask.bo == NULL)
- goto cleanup_src;
+ tmp->base.mask.bo = NULL;
tmp->base.is_affine = tmp->base.src.is_affine;
tmp->base.has_component_alpha = false;
tmp->base.need_magic_ca_pass = false;
- tmp->prim_emit = gen5_emit_composite_spans_primitive;
- if (tmp->base.src.is_solid)
- tmp->prim_emit = gen5_emit_composite_spans_solid;
- else if (tmp->base.is_affine)
- tmp->prim_emit = gen5_emit_composite_spans_affine;
- tmp->base.floats_per_vertex = 5 + 2*!tmp->base.is_affine;
- tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;
-
+ tmp->base.u.gen5.ve_id = gen4_choose_spans_emitter(tmp);
tmp->base.u.gen5.wm_kernel = WM_KERNEL_OPACITY | !tmp->base.is_affine;
- tmp->base.u.gen5.ve_id = 1 << 1 | tmp->base.is_affine;
tmp->box = gen5_render_composite_spans_box;
tmp->boxes = gen5_render_composite_spans_boxes;
+ if (tmp->emit_boxes)
+ tmp->thread_boxes = gen5_render_composite_spans_boxes__thread;
tmp->done = gen5_render_composite_spans_done;
if (!kgem_check_bo(&sna->kgem,
@@ -2752,7 +2185,7 @@ gen5_copy_bind_surfaces(struct sna *sna,
uint32_t *binding_table;
uint16_t offset;
- gen5_get_batch(sna);
+ gen5_get_batch(sna, op);
binding_table = gen5_composite_get_binding_table(sna, &offset);
@@ -2846,7 +2279,6 @@ fallback_blt:
if (box[i].y2 > extents.y2)
extents.y2 = box[i].y2;
}
-
if (!sna_render_composite_redirect(sna, &tmp,
extents.x1 + dst_dx,
extents.y1 + dst_dy,
@@ -2893,7 +2325,7 @@ fallback_blt:
tmp.floats_per_vertex = 3;
tmp.floats_per_rect = 9;
tmp.u.gen5.wm_kernel = WM_KERNEL;
- tmp.u.gen5.ve_id = 1;
+ tmp.u.gen5.ve_id = 2;
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
kgem_submit(&sna->kgem);
@@ -2939,7 +2371,7 @@ fallback_blt:
} while (--n_this_time);
} while (n);
- gen5_vertex_flush(sna);
+ gen4_vertex_flush(sna);
sna_render_composite_redirect_done(sna, &tmp);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
return true;
@@ -2950,6 +2382,14 @@ fallback_tiled_dst:
if (tmp.redirect.real_bo)
kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
fallback_tiled:
+ if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+ sna_blt_copy_boxes(sna, alu,
+ src_bo, src_dx, src_dy,
+ dst_bo, dst_dx, dst_dy,
+ dst->drawable.bitsPerPixel,
+ box, n))
+ return true;
+
return sna_tiling_copy_boxes(sna, alu,
src, src_bo, src_dx, src_dy,
dst, dst_bo, dst_dx, dst_dy,
@@ -2985,8 +2425,8 @@ static void
gen5_render_copy_done(struct sna *sna,
const struct sna_copy_op *op)
{
- if (sna->render_state.gen5.vertex_offset)
- gen5_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
DBG(("%s()\n", __FUNCTION__));
}
@@ -3049,9 +2489,9 @@ fallback:
op->base.floats_per_vertex = 3;
op->base.floats_per_rect = 9;
op->base.u.gen5.wm_kernel = WM_KERNEL;
- op->base.u.gen5.ve_id = 1;
+ op->base.u.gen5.ve_id = 2;
- if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
+ if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
kgem_submit(&sna->kgem);
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
goto fallback;
@@ -3081,7 +2521,7 @@ gen5_fill_bind_surfaces(struct sna *sna,
uint32_t *binding_table;
uint16_t offset;
- gen5_get_batch(sna);
+ gen5_get_batch(sna, op);
binding_table = gen5_composite_get_binding_table(sna, &offset);
@@ -3168,16 +2608,19 @@ gen5_render_fill_boxes(struct sna *sna,
dst, dst_bo, box, n);
}
- if (op == PictOpClear)
+ if (op == PictOpClear) {
pixel = 0;
- else if (!sna_get_pixel_from_rgba(&pixel,
- color->red,
- color->green,
- color->blue,
- color->alpha,
- PICT_a8r8g8b8))
+ op = PictOpSrc;
+ } else if (!sna_get_pixel_from_rgba(&pixel,
+ color->red,
+ color->green,
+ color->blue,
+ color->alpha,
+ PICT_a8r8g8b8))
return false;
+ DBG(("%s(%08x x %d)\n", __FUNCTION__, pixel, n));
+
memset(&tmp, 0, sizeof(tmp));
tmp.op = op;
@@ -3193,8 +2636,8 @@ gen5_render_fill_boxes(struct sna *sna,
tmp.src.repeat = SAMPLER_EXTEND_REPEAT;
tmp.is_affine = true;
- tmp.floats_per_vertex = 3;
- tmp.floats_per_rect = 9;
+ tmp.floats_per_vertex = 2;
+ tmp.floats_per_rect = 6;
tmp.u.gen5.wm_kernel = WM_KERNEL;
tmp.u.gen5.ve_id = 1;
@@ -3217,22 +2660,19 @@ gen5_render_fill_boxes(struct sna *sna,
DBG((" (%d, %d), (%d, %d)\n",
box->x1, box->y1, box->x2, box->y2));
OUT_VERTEX(box->x2, box->y2);
- OUT_VERTEX_F(1);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(box->x1, box->y2);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(box->x1, box->y1);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(0);
+ OUT_VERTEX_F(.5);
box++;
} while (--n_this_time);
} while (n);
- gen5_vertex_flush(sna);
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
return true;
}
@@ -3247,16 +2687,13 @@ gen5_render_fill_op_blt(struct sna *sna,
gen5_get_rectangles(sna, &op->base, 1, gen5_fill_bind_surfaces);
OUT_VERTEX(x+w, y+h);
- OUT_VERTEX_F(1);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(x, y+h);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(x, y);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(0);
+ OUT_VERTEX_F(.5);
}
fastcall static void
@@ -3270,16 +2707,13 @@ gen5_render_fill_op_box(struct sna *sna,
gen5_get_rectangles(sna, &op->base, 1, gen5_fill_bind_surfaces);
OUT_VERTEX(box->x2, box->y2);
- OUT_VERTEX_F(1);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(box->x1, box->y2);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(box->x1, box->y1);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(0);
+ OUT_VERTEX_F(.5);
}
fastcall static void
@@ -3300,16 +2734,13 @@ gen5_render_fill_op_boxes(struct sna *sna,
do {
OUT_VERTEX(box->x2, box->y2);
- OUT_VERTEX_F(1);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(box->x1, box->y2);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(box->x1, box->y1);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(0);
+ OUT_VERTEX_F(.5);
box++;
} while (--nbox_this_time);
} while (nbox);
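
/* The fill emitters above shrink from three floats per vertex to two: one
 * dword holding the destination (x, y) packed as two int16s, plus a single
 * texture coordinate. Since the solid colour is a 1x1 surface sampled with
 * SAMPLER_EXTEND_REPEAT, any in-texel coordinate works, and .5 addresses the
 * texel centre. A sketch of the layout (hypothetical helper; in the driver
 * OUT_VERTEX performs the int16 packing): */

#include <stdint.h>

union packed_xy { struct { int16_t x, y; } p; float f; };

static void emit_fill_vertex(float *v, int16_t x, int16_t y)
{
	union packed_xy dst;

	dst.p.x = x;
	dst.p.y = y;
	v[0] = dst.f;	/* bit pattern of two int16s, read as R16G16_SSCALED */
	v[1] = .5f;	/* u: centre of the 1x1 solid-colour texel */
}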
@@ -3319,8 +2750,8 @@ static void
gen5_render_fill_op_done(struct sna *sna,
const struct sna_fill_op *op)
{
- if (sna->render_state.gen5.vertex_offset)
- gen5_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, op->base.src.bo);
DBG(("%s()\n", __FUNCTION__));
@@ -3375,8 +2806,8 @@ gen5_render_fill(struct sna *sna, uint8_t alu,
op->base.mask.repeat = SAMPLER_EXTEND_NONE;
op->base.is_affine = true;
- op->base.floats_per_vertex = 3;
- op->base.floats_per_rect = 9;
+ op->base.floats_per_vertex = 2;
+ op->base.floats_per_rect = 6;
op->base.u.gen5.wm_kernel = WM_KERNEL;
op->base.u.gen5.ve_id = 1;
@@ -3463,8 +2894,8 @@ gen5_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
tmp.mask.repeat = SAMPLER_EXTEND_NONE;
tmp.is_affine = true;
- tmp.floats_per_vertex = 3;
- tmp.floats_per_rect = 9;
+ tmp.floats_per_vertex = 2;
+ tmp.floats_per_rect = 6;
tmp.has_component_alpha = 0;
tmp.need_magic_ca_pass = false;
@@ -3472,7 +2903,11 @@ gen5_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
tmp.u.gen5.ve_id = 1;
if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
+ kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+ return false;
+ }
assert(kgem_check_bo(&sna->kgem, bo, NULL));
}
@@ -3483,18 +2918,15 @@ gen5_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
DBG((" (%d, %d), (%d, %d)\n", x1, y1, x2, y2));
OUT_VERTEX(x2, y2);
- OUT_VERTEX_F(1);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(x1, y2);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(1);
+ OUT_VERTEX_F(.5);
OUT_VERTEX(x1, y1);
- OUT_VERTEX_F(0);
- OUT_VERTEX_F(0);
+ OUT_VERTEX_F(.5);
- gen5_vertex_flush(sna);
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
return true;
@@ -3503,14 +2935,17 @@ gen5_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
static void
gen5_render_flush(struct sna *sna)
{
- gen5_vertex_close(sna);
+ gen4_vertex_close(sna);
+
+ assert(sna->render.vb_id == 0);
+ assert(sna->render.vertex_offset == 0);
}
static void
gen5_render_context_switch(struct kgem *kgem,
int new_mode)
{
- if (!kgem->mode)
+ if (!kgem->nbatch)
return;
/* WaNonPipelinedStateCommandFlush
@@ -3529,7 +2964,7 @@ gen5_render_context_switch(struct kgem *kgem,
sna->render_state.gen5.drawrect_limit = -1;
}
- if (kgem_is_idle(kgem)) {
+ if (kgem_ring_is_idle(kgem, kgem->ring)) {
DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
_kgem_submit(kgem);
}
@@ -3574,7 +3009,6 @@ gen5_render_expire(struct kgem *kgem)
static void gen5_render_reset(struct sna *sna)
{
sna->render_state.gen5.needs_invariant = true;
- sna->render_state.gen5.vb_id = 0;
sna->render_state.gen5.ve_id = -1;
sna->render_state.gen5.last_primitive = -1;
sna->render_state.gen5.last_pipelined_pointers = 0;
@@ -3588,6 +3022,10 @@ static void gen5_render_reset(struct sna *sna)
DBG(("%s: discarding unmappable vbo\n", __FUNCTION__));
discard_vbo(sna);
}
+
+ sna->render.vertex_offset = 0;
+ sna->render.nvertex_reloc = 0;
+ sna->render.vb_id = 0;
}
static void gen5_render_fini(struct sna *sna)
@@ -3703,23 +3141,11 @@ static void gen5_init_wm_state(struct gen5_wm_unit_state *state,
state->thread1.binding_table_entry_count = 0;
}
-static uint32_t gen5_create_cc_viewport(struct sna_static_stream *stream)
-{
- struct gen5_cc_viewport vp;
-
- vp.min_depth = -1.e35;
- vp.max_depth = 1.e35;
-
- return sna_static_stream_add(stream, &vp, sizeof(vp), 32);
-}
-
static uint32_t gen5_create_cc_unit_state(struct sna_static_stream *stream)
{
uint8_t *ptr, *base;
- uint32_t vp;
int i, j;
- vp = gen5_create_cc_viewport(stream);
base = ptr =
sna_static_stream_map(stream,
GEN5_BLENDFACTOR_COUNT*GEN5_BLENDFACTOR_COUNT*64,
@@ -3732,7 +3158,6 @@ static uint32_t gen5_create_cc_unit_state(struct sna_static_stream *stream)
state->cc3.blend_enable =
!(j == GEN5_BLENDFACTOR_ZERO && i == GEN5_BLENDFACTOR_ONE);
- state->cc4.cc_viewport_state_offset = vp >> 5;
state->cc5.logicop_func = 0xc; /* COPY */
state->cc5.ia_blend_function = GEN5_BLENDFUNCTION_ADD;
@@ -3816,8 +3241,7 @@ static bool gen5_render_setup(struct sna *sna)
for (m = 0; m < KERNEL_COUNT; m++) {
gen5_init_wm_state(&wm_state->state,
wm_kernels[m].has_mask,
- wm[m],
- sampler_state);
+ wm[m], sampler_state);
wm_state++;
}
}
@@ -3840,10 +3264,15 @@ bool gen5_render_init(struct sna *sna)
sna->kgem.retire = gen5_render_retire;
sna->kgem.expire = gen5_render_expire;
+#if !NO_COMPOSITE
sna->render.composite = gen5_render_composite;
+ sna->render.prefer_gpu |= PREFER_GPU_RENDER;
+#endif
#if !NO_COMPOSITE_SPANS
sna->render.check_composite_spans = gen5_check_composite_spans;
sna->render.composite_spans = gen5_render_composite_spans;
+ if (DEVICE_ID(sna->PciInfo) == 0x0044)
+ sna->render.prefer_gpu |= PREFER_GPU_SPANS;
#endif
sna->render.video = gen5_render_video;
diff --git a/src/sna/gen5_render.h b/src/sna/gen5_render.h
index b6e5b0c2e..0f6bae6b6 100644
--- a/src/sna/gen5_render.h
+++ b/src/sna/gen5_render.h
@@ -749,15 +749,14 @@
#define GEN5_VERTEXBUFFER_ACCESS_VERTEXDATA 0
#define GEN5_VERTEXBUFFER_ACCESS_INSTANCEDATA 1
-#define GEN5_VFCOMPONENT_NOSTORE 0
-#define GEN5_VFCOMPONENT_STORE_SRC 1
-#define GEN5_VFCOMPONENT_STORE_0 2
-#define GEN5_VFCOMPONENT_STORE_1_FLT 3
-#define GEN5_VFCOMPONENT_STORE_1_INT 4
-#define GEN5_VFCOMPONENT_STORE_VID 5
-#define GEN5_VFCOMPONENT_STORE_IID 6
-#define GEN5_VFCOMPONENT_STORE_PID 7
-
+#define VFCOMPONENT_NOSTORE 0
+#define VFCOMPONENT_STORE_SRC 1
+#define VFCOMPONENT_STORE_0 2
+#define VFCOMPONENT_STORE_1_FLT 3
+#define VFCOMPONENT_STORE_1_INT 4
+#define VFCOMPONENT_STORE_VID 5
+#define VFCOMPONENT_STORE_IID 6
+#define VFCOMPONENT_STORE_PID 7
/* Execution Unit (EU) defines
@@ -1990,50 +1989,43 @@ struct gen5_sampler_legacy_border_color {
uint8_t color[4];
};
-struct gen5_sampler_state
-{
-
- struct
- {
- unsigned int shadow_function:3;
- unsigned int lod_bias:11;
- unsigned int min_filter:3;
- unsigned int mag_filter:3;
- unsigned int mip_filter:2;
- unsigned int base_level:5;
+struct gen5_sampler_state {
+ struct {
+ unsigned int shadow_function:3;
+ unsigned int lod_bias:11;
+ unsigned int min_filter:3;
+ unsigned int mag_filter:3;
+ unsigned int mip_filter:2;
+ unsigned int base_level:5;
unsigned int pad:1;
- unsigned int lod_preclamp:1;
- unsigned int border_color_mode:1;
+ unsigned int lod_preclamp:1;
+ unsigned int border_color_mode:1;
unsigned int pad0:1;
- unsigned int disable:1;
+ unsigned int disable:1;
} ss0;
- struct
- {
- unsigned int r_wrap_mode:3;
- unsigned int t_wrap_mode:3;
- unsigned int s_wrap_mode:3;
+ struct {
+ unsigned int r_wrap_mode:3;
+ unsigned int t_wrap_mode:3;
+ unsigned int s_wrap_mode:3;
unsigned int pad:3;
- unsigned int max_lod:10;
- unsigned int min_lod:10;
+ unsigned int max_lod:10;
+ unsigned int min_lod:10;
} ss1;
-
- struct
- {
+ struct {
unsigned int pad:5;
- unsigned int border_color_pointer:27;
+ unsigned int border_color_pointer:27;
} ss2;
-
- struct
- {
- unsigned int pad:19;
- unsigned int max_aniso:3;
- unsigned int chroma_key_mode:1;
- unsigned int chroma_key_index:2;
- unsigned int chroma_key_enable:1;
- unsigned int monochrome_filter_width:3;
- unsigned int monochrome_filter_height:3;
+
+ struct {
+ uint32_t pad:13;
+ uint32_t address_round:6;
+ uint32_t max_aniso:3;
+ uint32_t chroma_key_mode:1;
+ uint32_t chroma_key_index:2;
+ uint32_t chroma_key_enable:1;
+ uint32_t mbz:6;
} ss3;
};

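/* The rewritten ss3 above adds the address_round field and an explicit mbz
 * tail; the widths must still sum to one 32-bit dword
 * (13+6+3+1+2+1+6 == 32). A compile-time check, as a sketch using C11
 * _Static_assert: */

#include <stdint.h>

struct ss3 {
	uint32_t pad:13;
	uint32_t address_round:6;
	uint32_t max_aniso:3;
	uint32_t chroma_key_mode:1;
	uint32_t chroma_key_index:2;
	uint32_t chroma_key_enable:1;
	uint32_t mbz:6;
};
_Static_assert(sizeof(struct ss3) == sizeof(uint32_t),
	       "sampler ss3 must occupy exactly one dword");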
diff --git a/src/sna/gen6_render.c b/src/sna/gen6_render.c
index fd7f2958b..3855f0449 100644
--- a/src/sna/gen6_render.c
+++ b/src/sna/gen6_render.c
@@ -42,6 +42,8 @@
#include "brw/brw.h"
#include "gen6_render.h"
+#include "gen4_source.h"
+#include "gen4_vertex.h"
#define NO_COMPOSITE 0
#define NO_COMPOSITE_SPANS 0
@@ -186,10 +188,6 @@ static const struct blendinfo {
#define FILL_FLAGS(op, format) GEN6_SET_FLAGS(FILL_SAMPLER, gen6_get_blend((op), false, (format)), GEN6_WM_KERNEL_NOMASK, FILL_VERTEX)
#define FILL_FLAGS_NOBLEND GEN6_SET_FLAGS(FILL_SAMPLER, NO_BLEND, GEN6_WM_KERNEL_NOMASK, FILL_VERTEX)
-#define VIDEO_SAMPLER \
- SAMPLER_OFFSET(SAMPLER_FILTER_BILINEAR, SAMPLER_EXTEND_PAD, \
- SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE)
-
#define GEN6_SAMPLER(f) (((f) >> 16) & 0xfff0)
#define GEN6_BLEND(f) (((f) >> 0) & 0xfff0)
#define GEN6_KERNEL(f) (((f) >> 16) & 0xf)
@@ -437,7 +435,7 @@ gen6_emit_viewports(struct sna *sna)
(4 - 2));
OUT_BATCH(0);
OUT_BATCH(0);
- OUT_BATCH(sna->render_state.gen6.cc_vp);
+ OUT_BATCH(0);
}
static void
@@ -734,7 +732,7 @@ gen6_emit_vertex_elements(struct sna *sna,
* texture coordinate 1 if (has_mask is true): same as above
*/
struct gen6_render_state *render = &sna->render_state.gen6;
- uint32_t src_format, dw, offset;
+ uint32_t src_format, dw;
int id = GEN6_VERTEX(op->u.gen6.flags);
bool has_mask;
@@ -744,40 +742,6 @@ gen6_emit_vertex_elements(struct sna *sna,
return;
render->ve_id = id;
- if (id == VERTEX_2s2s) {
- DBG(("%s: setup COPY\n", __FUNCTION__));
-
- OUT_BATCH(GEN6_3DSTATE_VERTEX_ELEMENTS |
- ((2 * (1 + 2)) + 1 - 2));
-
- OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
- GEN6_SURFACEFORMAT_R32G32B32A32_FLOAT << VE0_FORMAT_SHIFT |
- 0 << VE0_OFFSET_SHIFT);
- OUT_BATCH(GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_0_SHIFT |
- GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT |
- GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
- GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_3_SHIFT);
-
- /* x,y */
- OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
- GEN6_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
- 0 << VE0_OFFSET_SHIFT);
- OUT_BATCH(GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
- GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
- GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
- GEN6_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
-
- /* u0, v0, w0 */
- OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
- GEN6_SURFACEFORMAT_R16G16_SSCALED << VE0_FORMAT_SHIFT |
- 4 << VE0_OFFSET_SHIFT);
- OUT_BATCH(GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT |
- GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
- GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
- GEN6_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
- return;
- }
-
/* The VUE layout
* dword 0-3: pad (0.0, 0.0, 0.0, 0.0)
* dword 4-7: position (x, y, 1.0, 1.0),
@@ -806,20 +770,25 @@ gen6_emit_vertex_elements(struct sna *sna,
GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT |
GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT |
GEN6_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT);
- offset = 4;
/* u0, v0, w0 */
- DBG(("%s: first channel %d floats, offset=%d\n", __FUNCTION__, id & 3, offset));
+ DBG(("%s: first channel %d floats, offset=4b\n", __FUNCTION__, id & 3));
dw = GEN6_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
switch (id & 3) {
+ default:
+ assert(0);
+ case 0:
+ src_format = GEN6_SURFACEFORMAT_R16G16_SSCALED;
+ dw |= GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
+ dw |= GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_1_SHIFT;
+ dw |= GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT;
+ break;
case 1:
src_format = GEN6_SURFACEFORMAT_R32_FLOAT;
dw |= GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
dw |= GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_1_SHIFT;
dw |= GEN6_VFCOMPONENT_STORE_0 << VE1_VFCOMPONENT_2_SHIFT;
break;
- default:
- assert(0);
case 2:
src_format = GEN6_SURFACEFORMAT_R32G32_FLOAT;
dw |= GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
@@ -835,15 +804,15 @@ gen6_emit_vertex_elements(struct sna *sna,
}
OUT_BATCH(id << VE0_VERTEX_BUFFER_INDEX_SHIFT | VE0_VALID |
src_format << VE0_FORMAT_SHIFT |
- offset << VE0_OFFSET_SHIFT);
+ 4 << VE0_OFFSET_SHIFT);
OUT_BATCH(dw);
- offset += (id & 3) * sizeof(float);
/* u1, v1, w1 */
if (has_mask) {
- DBG(("%s: second channel %d floats, offset=%d\n", __FUNCTION__, (id >> 2) & 3, offset));
+ unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float);
+ DBG(("%s: second channel %d floats, offset=%db\n", __FUNCTION__, id >> 2, offset));
dw = GEN6_VFCOMPONENT_STORE_1_FLT << VE1_VFCOMPONENT_3_SHIFT;
- switch ((id >> 2) & 3) {
+ switch (id >> 2) {
case 1:
src_format = GEN6_SURFACEFORMAT_R32_FLOAT;
dw |= GEN6_VFCOMPONENT_STORE_SRC << VE1_VFCOMPONENT_0_SHIFT;
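
/* In the reworked element setup above, the vertex-element id doubles as a
 * layout descriptor: bits 0-1 give the float count of the first texcoord
 * channel (0 now selecting the short2 copy layout that used to be the
 * VERTEX_2s2s special case), bits 2-3 the second channel. A sketch of
 * deriving floats_per_vertex from it (an inference from the switch
 * statements, not a helper in the driver): */

static int floats_per_vertex_from_id(unsigned id)
{
	int src = id & 3;	/* channel 0: u,v[,w] float count */
	int msk = id >> 2;	/* channel 1: 0 when there is no mask */

	if (src == 0)		/* copy layout: short2 coords share a dword */
		return 2;

	return 1 + src + msk;	/* packed (x,y) dword + texcoord floats */
}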
@@ -917,13 +886,13 @@ gen6_emit_state(struct sna *sna,
sna->render_state.gen6.first_state_packet = false;
}
-static void gen6_magic_ca_pass(struct sna *sna,
+static bool gen6_magic_ca_pass(struct sna *sna,
const struct sna_composite_op *op)
{
struct gen6_render_state *state = &sna->render_state.gen6;
if (!op->need_magic_ca_pass)
- return;
+ return false;
DBG(("%s: CA fixup (%d -> %d)\n", __FUNCTION__,
sna->render.vertex_start, sna->render.vertex_index));
@@ -949,163 +918,7 @@ static void gen6_magic_ca_pass(struct sna *sna,
OUT_BATCH(0); /* index buffer offset, ignored */
state->last_primitive = sna->kgem.nbatch;
-}
-
-static void gen6_vertex_flush(struct sna *sna)
-{
- assert(sna->render_state.gen6.vertex_offset);
-
- DBG(("%s[%x] = %d\n", __FUNCTION__,
- 4*sna->render_state.gen6.vertex_offset,
- sna->render.vertex_index - sna->render.vertex_start));
- sna->kgem.batch[sna->render_state.gen6.vertex_offset] =
- sna->render.vertex_index - sna->render.vertex_start;
- sna->render_state.gen6.vertex_offset = 0;
-}
-
-static int gen6_vertex_finish(struct sna *sna)
-{
- struct kgem_bo *bo;
- unsigned int i;
-
- DBG(("%s: used=%d / %d\n", __FUNCTION__,
- sna->render.vertex_used, sna->render.vertex_size));
- assert(sna->render.vertex_used);
- assert(sna->render.nvertex_reloc);
-
- /* Note: we only need dword alignment (currently) */
-
- bo = sna->render.vbo;
- if (bo) {
- if (sna->render_state.gen6.vertex_offset)
- gen6_vertex_flush(sna);
-
- for (i = 0; i < sna->render.nvertex_reloc; i++) {
- DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
- i, sna->render.vertex_reloc[i]));
-
- sna->kgem.batch[sna->render.vertex_reloc[i]] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i], bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- 0);
- sna->kgem.batch[sna->render.vertex_reloc[i]+1] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i]+1, bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- sna->render.vertex_used * 4 - 1);
- }
-
- sna->render.nvertex_reloc = 0;
- sna->render.vertex_used = 0;
- sna->render.vertex_index = 0;
- sna->render_state.gen6.vb_id = 0;
-
- kgem_bo_destroy(&sna->kgem, bo);
- }
-
- sna->render.vertices = NULL;
- sna->render.vbo = kgem_create_linear(&sna->kgem,
- 256*1024, CREATE_GTT_MAP);
- if (sna->render.vbo)
- sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
- if (sna->render.vertices == NULL) {
- if (sna->render.vbo)
- kgem_bo_destroy(&sna->kgem, sna->render.vbo);
- sna->render.vbo = NULL;
- return 0;
- }
-
- DBG(("%s: create vbo handle=%d\n", __FUNCTION__, sna->render.vbo->handle));
-
- kgem_bo_sync__cpu(&sna->kgem, sna->render.vbo);
- if (sna->render.vertex_used) {
- DBG(("%s: copying initial buffer x %d to handle=%d\n",
- __FUNCTION__,
- sna->render.vertex_used,
- sna->render.vbo->handle));
- memcpy(sna->render.vertices,
- sna->render.vertex_data,
- sizeof(float)*sna->render.vertex_used);
- }
- sna->render.vertex_size = 64 * 1024 - 1;
- return sna->render.vertex_size - sna->render.vertex_used;
-}
-
-static void gen6_vertex_close(struct sna *sna)
-{
- struct kgem_bo *bo, *free_bo = NULL;
- unsigned int i, delta = 0;
-
- assert(sna->render_state.gen6.vertex_offset == 0);
-
- if (!sna->render_state.gen6.vb_id)
- return;
-
- DBG(("%s: used=%d, vbo active? %d\n",
- __FUNCTION__, sna->render.vertex_used, sna->render.vbo ? sna->render.vbo->handle : 0));
-
- bo = sna->render.vbo;
- if (bo) {
- if (sna->render.vertex_size - sna->render.vertex_used < 64) {
- DBG(("%s: discarding vbo (full), handle=%d\n", __FUNCTION__, sna->render.vbo->handle));
- sna->render.vbo = NULL;
- sna->render.vertices = sna->render.vertex_data;
- sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
- free_bo = bo;
- }
- } else {
- if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
- DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
- sna->render.vertex_used, sna->kgem.nbatch));
- memcpy(sna->kgem.batch + sna->kgem.nbatch,
- sna->render.vertex_data,
- sna->render.vertex_used * 4);
- delta = sna->kgem.nbatch * 4;
- bo = NULL;
- sna->kgem.nbatch += sna->render.vertex_used;
- } else {
- bo = kgem_create_linear(&sna->kgem,
- 4*sna->render.vertex_used, 0);
- if (bo && !kgem_bo_write(&sna->kgem, bo,
- sna->render.vertex_data,
- 4*sna->render.vertex_used)) {
- kgem_bo_destroy(&sna->kgem, bo);
- bo = NULL;
- }
- DBG(("%s: new vbo: %d\n", __FUNCTION__,
- sna->render.vertex_used));
- free_bo = bo;
- }
- }
-
- assert(sna->render.nvertex_reloc);
- for (i = 0; i < sna->render.nvertex_reloc; i++) {
- DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
- i, sna->render.vertex_reloc[i]));
-
- sna->kgem.batch[sna->render.vertex_reloc[i]] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i], bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- delta);
- sna->kgem.batch[sna->render.vertex_reloc[i]+1] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i]+1, bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- delta + sna->render.vertex_used * 4 - 1);
- }
- sna->render.nvertex_reloc = 0;
-
- if (sna->render.vbo == NULL) {
- sna->render.vertex_used = 0;
- sna->render.vertex_index = 0;
- assert(sna->render.vertices == sna->render.vertex_data);
- assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
- }
-
- if (free_bo)
- kgem_bo_destroy(&sna->kgem, free_bo);
+ return true;
}
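
/* The vertex-buffer management deleted above moved, essentially verbatim,
 * into the shared gen4_vertex.c. Its close path picks the cheapest backing
 * store: if the gap between the emitted commands and the surface-state heap
 * can hold the accumulated vertices, they are memcpy'd inline and the
 * relocations resolve against the batch itself; otherwise they are uploaded
 * to a throwaway linear bo. A sketch of that decision (hypothetical,
 * simplified struct): */

#include <stdint.h>
#include <string.h>

struct batch {
	uint32_t *cmds;
	int nbatch;	/* dwords of commands emitted so far */
	int surface;	/* dword offset the surface state grows down from */
};

/* Returns the byte offset of the inlined vertices, or -1 when the caller
 * must upload them to a separate linear bo instead. */
static int inline_vertices(struct batch *b, const float *data, int used)
{
	if (b->nbatch + used > b->surface)
		return -1;

	memcpy(b->cmds + b->nbatch, data, used * sizeof(*data));
	b->nbatch += used;
	return (b->nbatch - used) * 4;
}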
typedef struct gen6_surface_state_padded {
@@ -1193,16 +1006,6 @@ sampler_fill_init(struct gen6_sampler_state *ss)
sampler_state_init(ss+1, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
}
-static uint32_t gen6_create_cc_viewport(struct sna_static_stream *stream)
-{
- struct gen6_cc_viewport vp;
-
- vp.min_depth = -1.e35;
- vp.max_depth = 1.e35;
-
- return sna_static_stream_add(stream, &vp, sizeof(vp), 32);
-}
-
static uint32_t
gen6_tiling_bits(uint32_t tiling)
{
@@ -1229,9 +1032,10 @@ gen6_bind_bo(struct sna *sna,
uint32_t *ss;
uint32_t domains;
uint16_t offset;
+ uint32_t is_scanout = is_dst && bo->scanout;
/* After the first bind, we manage the cache domains within the batch */
- offset = kgem_bo_get_binding(bo, format);
+ offset = kgem_bo_get_binding(bo, format | is_scanout << 31);
if (offset) {
DBG(("[%x] bo(handle=%d), format=%d, reuse %s binding\n",
offset, bo->handle, format,
@@ -1258,9 +1062,9 @@ gen6_bind_bo(struct sna *sna,
ss[3] = (gen6_tiling_bits(bo->tiling) |
(bo->pitch - 1) << GEN6_SURFACE_PITCH_SHIFT);
ss[4] = 0;
- ss[5] = 0;
+ ss[5] = is_scanout ? 0 : 3 << 16;
- kgem_bo_set_binding(bo, format, offset);
+ kgem_bo_set_binding(bo, format | is_scanout << 31, offset);
DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
offset, bo->handle, ss[1],
@@ -1270,254 +1074,6 @@ gen6_bind_bo(struct sna *sna,
return offset * sizeof(uint32_t);
}
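
/* gen6_bind_bo above starts keying its binding-table cache on
 * format | is_scanout << 31 because the surface state now differs by target:
 * ordinary surfaces get the cacheable bits (3 << 16) in ss[5], while scanout
 * buffers must stay uncached for the display engine, so the two variants
 * need distinct cache slots. A sketch mirroring the two expressions in the
 * hunk (hypothetical helpers): */

#include <stdint.h>

static uint32_t binding_key(uint32_t format, int is_dst, int is_scanout)
{
	/* one cache slot per (format, scanout) pair, since ss[5] differs */
	return format | (uint32_t)(is_dst && is_scanout) << 31;
}

static uint32_t surface_dword5(int is_scanout)
{
	return is_scanout ? 0 : 3u << 16;	/* cacheable unless scanned out */
}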
-fastcall static void
-gen6_emit_composite_primitive_solid(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- DBG(("%s: [%d+9] = (%d, %d)x(%d, %d)\n", __FUNCTION__,
- sna->render.vertex_used, r->dst.x, r->dst.y, r->width, r->height));
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
- assert(sna->render.vertex_used <= sna->render.vertex_size);
- assert(!too_large(op->dst.x + r->dst.x + r->width,
- op->dst.y + r->dst.y + r->height));
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- dst.p.y = r->dst.y;
- v[6] = dst.f;
-
- v[5] = v[2] = v[1] = 1.;
- v[8] = v[7] = v[4] = 0.;
-}
-
-fastcall static void
-gen6_emit_composite_primitive_identity_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float *v;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- dst.p.y = r->dst.y;
- v[6] = dst.f;
-
- v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
- v[1] = v[4] + r->width * op->src.scale[0];
-
- v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
- v[5] = v[2] = v[8] + r->height * op->src.scale[1];
-}
-
-fastcall static void
-gen6_emit_composite_primitive_simple_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- float xx = op->src.transform->matrix[0][0];
- float x0 = op->src.transform->matrix[0][2];
- float yy = op->src.transform->matrix[1][1];
- float y0 = op->src.transform->matrix[1][2];
- float sx = op->src.scale[0];
- float sy = op->src.scale[1];
- int16_t tx = op->src.offset[0];
- int16_t ty = op->src.offset[1];
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 3*3;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
- v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- v[8] = ((r->src.y + ty) * yy + y0) * sy;
-}
-
-fastcall static void
-gen6_emit_composite_primitive_affine_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float *v;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x + r->width,
- op->src.offset[1] + r->src.y + r->height,
- op->src.transform,
- &v[1], &v[2]);
- v[1] *= op->src.scale[0];
- v[2] *= op->src.scale[1];
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x,
- op->src.offset[1] + r->src.y + r->height,
- op->src.transform,
- &v[4], &v[5]);
- v[4] *= op->src.scale[0];
- v[5] *= op->src.scale[1];
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x,
- op->src.offset[1] + r->src.y,
- op->src.transform,
- &v[7], &v[8]);
- v[7] *= op->src.scale[0];
- v[8] *= op->src.scale[1];
-}
-
-fastcall static void
-gen6_emit_composite_primitive_identity_source_mask(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float src_x, src_y;
- float msk_x, msk_y;
- float w, h;
- float *v;
-
- src_x = r->src.x + op->src.offset[0];
- src_y = r->src.y + op->src.offset[1];
- msk_x = r->mask.x + op->mask.offset[0];
- msk_y = r->mask.y + op->mask.offset[1];
- w = r->width;
- h = r->height;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 15;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = (src_x + w) * op->src.scale[0];
- v[2] = (src_y + h) * op->src.scale[1];
- v[3] = (msk_x + w) * op->mask.scale[0];
- v[4] = (msk_y + h) * op->mask.scale[1];
-
- dst.p.x = r->dst.x;
- v[5] = dst.f;
- v[6] = src_x * op->src.scale[0];
- v[7] = v[2];
- v[8] = msk_x * op->mask.scale[0];
- v[9] = v[4];
-
- dst.p.y = r->dst.y;
- v[10] = dst.f;
- v[11] = v[6];
- v[12] = src_y * op->src.scale[1];
- v[13] = v[8];
- v[14] = msk_y * op->mask.scale[1];
-}
-
-inline static void
-gen6_emit_composite_texcoord(struct sna *sna,
- const struct sna_composite_channel *channel,
- int16_t x, int16_t y)
-{
- x += channel->offset[0];
- y += channel->offset[1];
-
- if (channel->is_affine) {
- float s, t;
-
- sna_get_transformed_coordinates(x, y,
- channel->transform,
- &s, &t);
- OUT_VERTEX_F(s * channel->scale[0]);
- OUT_VERTEX_F(t * channel->scale[1]);
- } else {
- float s, t, w;
-
- sna_get_transformed_coordinates_3d(x, y,
- channel->transform,
- &s, &t, &w);
- OUT_VERTEX_F(s * channel->scale[0]);
- OUT_VERTEX_F(t * channel->scale[1]);
- OUT_VERTEX_F(w);
- }
-}
-
-static void
-gen6_emit_composite_vertex(struct sna *sna,
- const struct sna_composite_op *op,
- int16_t srcX, int16_t srcY,
- int16_t mskX, int16_t mskY,
- int16_t dstX, int16_t dstY)
-{
- OUT_VERTEX(dstX, dstY);
- gen6_emit_composite_texcoord(sna, &op->src, srcX, srcY);
- gen6_emit_composite_texcoord(sna, &op->mask, mskX, mskY);
-}
-
-fastcall static void
-gen6_emit_composite_primitive(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- gen6_emit_composite_vertex(sna, op,
- r->src.x + r->width, r->src.y + r->height,
- r->mask.x + r->width, r->mask.y + r->height,
- r->dst.x + r->width, r->dst.y + r->height);
- gen6_emit_composite_vertex(sna, op,
- r->src.x, r->src.y + r->height,
- r->mask.x, r->mask.y + r->height,
- r->dst.x, r->dst.y + r->height);
- gen6_emit_composite_vertex(sna, op,
- r->src.x, r->src.y,
- r->mask.x, r->mask.y,
- r->dst.x, r->dst.y);
-}
-
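
/* The emitters deleted above were consolidated into gen4_vertex.c and are
 * now selected via gen4_choose_composite_emitter. They all normalise texture
 * coordinates the same way: channel->scale[] holds the reciprocal of the
 * surface size, so an identity-transform source needs only an add and a
 * multiply per component. A standalone sketch: */

/* e.g. a 256x128 source sampled at pixel (64, 32) with no offset:
 *   u = (64 + 0) * (1/256.f) = 0.25f
 *   v = (32 + 0) * (1/128.f) = 0.25f */
static void identity_texcoord(float x, float y,
			      float off_x, float off_y,
			      float scale_x, float scale_y,
			      float *u, float *v)
{
	*u = (x + off_x) * scale_x;
	*v = (y + off_y) * scale_y;
}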
static void gen6_emit_vertex_buffer(struct sna *sna,
const struct sna_composite_op *op)
{
@@ -1528,10 +1084,10 @@ static void gen6_emit_vertex_buffer(struct sna *sna,
4*op->floats_per_vertex << VB0_BUFFER_PITCH_SHIFT);
sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
OUT_BATCH(0);
- OUT_BATCH(0);
+ OUT_BATCH(~0); /* max address: disabled */
OUT_BATCH(0);
- sna->render_state.gen6.vb_id |= 1 << id;
+ sna->render.vb_id |= 1 << id;
}
static void gen6_emit_primitive(struct sna *sna)
@@ -1541,7 +1097,7 @@ static void gen6_emit_primitive(struct sna *sna)
__FUNCTION__,
sna->render.vertex_start,
sna->render.vertex_index));
- sna->render_state.gen6.vertex_offset = sna->kgem.nbatch - 5;
+ sna->render.vertex_offset = sna->kgem.nbatch - 5;
return;
}
@@ -1550,7 +1106,7 @@ static void gen6_emit_primitive(struct sna *sna)
_3DPRIM_RECTLIST << GEN6_3DPRIMITIVE_TOPOLOGY_SHIFT |
0 << 9 |
4);
- sna->render_state.gen6.vertex_offset = sna->kgem.nbatch;
+ sna->render.vertex_offset = sna->kgem.nbatch;
OUT_BATCH(0); /* vertex count, to be filled in later */
OUT_BATCH(sna->render.vertex_index);
OUT_BATCH(1); /* single instance */
@@ -1569,13 +1125,16 @@ static bool gen6_rectangle_begin(struct sna *sna,
int id = 1 << GEN6_VERTEX(op->u.gen6.flags);
int ndwords;
+ if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
+ return true;
+
ndwords = op->need_magic_ca_pass ? 60 : 6;
- if ((sna->render_state.gen6.vb_id & id) == 0)
+ if ((sna->render.vb_id & id) == 0)
ndwords += 5;
if (!kgem_check_batch(&sna->kgem, ndwords))
return false;
- if ((sna->render_state.gen6.vb_id & id) == 0)
+ if ((sna->render.vb_id & id) == 0)
gen6_emit_vertex_buffer(sna, op);
gen6_emit_primitive(sna);
@@ -1585,17 +1144,30 @@ static bool gen6_rectangle_begin(struct sna *sna,
static int gen6_get_rectangles__flush(struct sna *sna,
const struct sna_composite_op *op)
{
+	/* Prevent discarding the new vbo after lock contention */
+ if (sna_vertex_wait__locked(&sna->render)) {
+ int rem = vertex_space(sna);
+ if (rem > op->floats_per_rect)
+ return rem;
+ }
+
if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 65 : 5))
return 0;
- if (!kgem_check_exec(&sna->kgem, 1))
- return 0;
- if (!kgem_check_reloc(&sna->kgem, 2))
+ if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
return 0;
- if (op->need_magic_ca_pass && sna->render.vbo)
- return 0;
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
+ if (gen6_magic_ca_pass(sna, op)) {
+ gen6_emit_flush(sna);
+ gen6_emit_cc(sna, GEN6_BLEND(op->u.gen6.flags));
+ gen6_emit_wm(sna,
+ GEN6_KERNEL(op->u.gen6.flags),
+ GEN6_VERTEX(op->u.gen6.flags) >> 2);
+ }
+ }
- return gen6_vertex_finish(sna);
+ return gen4_vertex_finish(sna);
}
inline static int gen6_get_rectangles(struct sna *sna,
@@ -1607,7 +1179,7 @@ inline static int gen6_get_rectangles(struct sna *sna,
start:
rem = vertex_space(sna);
- if (rem < op->floats_per_rect) {
+ if (unlikely(rem < op->floats_per_rect)) {
DBG(("flushing vbo for %s: %d < %d\n",
__FUNCTION__, rem, op->floats_per_rect));
rem = gen6_get_rectangles__flush(sna, op);
@@ -1615,7 +1187,7 @@ start:
goto flush;
}
- if (unlikely(sna->render_state.gen6.vertex_offset == 0 &&
+ if (unlikely(sna->render.vertex_offset == 0 &&
!gen6_rectangle_begin(sna, op)))
goto flush;
@@ -1627,10 +1199,11 @@ start:
return want;
flush:
- if (sna->render_state.gen6.vertex_offset) {
- gen6_vertex_flush(sna);
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
gen6_magic_ca_pass(sna, op);
}
+ sna_vertex_wait__locked(&sna->render);
_kgem_submit(&sna->kgem);
emit_state(sna, op);
goto start;
@@ -1653,20 +1226,10 @@ inline static uint32_t *gen6_composite_get_binding_table(struct sna *sna,
return table;
}
-static uint32_t
-gen6_choose_composite_vertex_buffer(const struct sna_composite_op *op)
-{
- int id = 2 + !op->is_affine;
- if (op->mask.bo)
- id |= id << 2;
- assert(id > 0 && id < 16);
- return id;
-}
-
-static void
-gen6_get_batch(struct sna *sna)
+static bool
+gen6_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 4)) {
DBG(("%s: flushing batch: %d < %d+%d\n",
@@ -1678,6 +1241,8 @@ gen6_get_batch(struct sna *sna)
if (sna->render_state.gen6.needs_invariant)
gen6_emit_invariant(sna);
+
+ return kgem_bo_is_dirty(op->dst.bo);
}
static void gen6_emit_composite_state(struct sna *sna,
@@ -1687,8 +1252,7 @@ static void gen6_emit_composite_state(struct sna *sna,
uint16_t offset;
bool dirty;
- gen6_get_batch(sna);
- dirty = kgem_bo_is_dirty(op->dst.bo);
+ dirty = gen6_get_batch(sna, op);
binding_table = gen6_composite_get_binding_table(sna, &offset);
@@ -1726,11 +1290,10 @@ static void gen6_emit_composite_state(struct sna *sna,
static void
gen6_align_vertex(struct sna *sna, const struct sna_composite_op *op)
{
- assert (sna->render_state.gen6.vertex_offset == 0);
+ assert (sna->render.vertex_offset == 0);
if (op->floats_per_vertex != sna->render_state.gen6.floats_per_vertex) {
if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
- /* XXX propagate failure */
- gen6_vertex_finish(sna);
+ gen4_vertex_finish(sna);
DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
sna->render_state.gen6.floats_per_vertex,
@@ -1741,6 +1304,7 @@ gen6_align_vertex(struct sna *sna, const struct sna_composite_op *op)
sna->render.vertex_used = sna->render.vertex_index * op->floats_per_vertex;
sna->render_state.gen6.floats_per_vertex = op->floats_per_vertex;
}
+ assert((sna->render.vertex_used % op->floats_per_vertex) == 0);
}
fastcall static void
@@ -1775,9 +1339,9 @@ gen6_render_composite_box(struct sna *sna,
}
static void
-gen6_render_composite_boxes(struct sna *sna,
- const struct sna_composite_op *op,
- const BoxRec *box, int nbox)
+gen6_render_composite_boxes__blt(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
{
DBG(("composite_boxes(%d)\n", nbox));
@@ -1807,6 +1371,62 @@ gen6_render_composite_boxes(struct sna *sna,
} while (nbox);
}
+static void
+gen6_render_composite_boxes(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen6_get_rectangles(sna, op, nbox,
+ gen6_emit_composite_state);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+ } while (nbox);
+}
+
+static void
+gen6_render_composite_boxes__thread(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen6_get_rectangles(sna, op, nbox,
+ gen6_emit_composite_state);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
+
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
@@ -1887,8 +1507,7 @@ static void gen6_emit_video_state(struct sna *sna,
bool dirty;
int n_src, n;
- gen6_get_batch(sna);
- dirty = kgem_bo_is_dirty(op->dst.bo);
+ dirty = gen6_get_batch(sna, op);
src_surf_base[0] = 0;
src_surf_base[1] = 0;
@@ -1949,12 +1568,14 @@ gen6_render_video(struct sna *sna,
RegionPtr dstRegion,
short src_w, short src_h,
short drw_w, short drw_h,
+ short dx, short dy,
PixmapPtr pixmap)
{
struct sna_composite_op tmp;
- int nbox, dxo, dyo, pix_xoff, pix_yoff;
+ int nbox, pix_xoff, pix_yoff;
float src_scale_x, src_scale_y;
struct sna_pixmap *priv;
+ unsigned filter;
BoxPtr box;
DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
@@ -1983,15 +1604,22 @@ gen6_render_video(struct sna *sna,
tmp.floats_per_vertex = 3;
tmp.floats_per_rect = 9;
+ if (src_w == drw_w && src_h == drw_h)
+ filter = SAMPLER_FILTER_NEAREST;
+ else
+ filter = SAMPLER_FILTER_BILINEAR;
+
tmp.u.gen6.flags =
- GEN6_SET_FLAGS(VIDEO_SAMPLER, NO_BLEND,
+ GEN6_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
+ SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
+ NO_BLEND,
is_planar_fourcc(frame->id) ?
GEN6_WM_KERNEL_VIDEO_PLANAR :
GEN6_WM_KERNEL_VIDEO_PACKED,
2);
tmp.priv = frame;
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
kgem_submit(&sna->kgem);
assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
@@ -2012,9 +1640,6 @@ gen6_render_video(struct sna *sna,
pix_yoff = 0;
#endif
- dxo = dstRegion->extents.x1;
- dyo = dstRegion->extents.y1;
-
/* Use normalized texture coordinates */
src_scale_x = ((float)src_w / frame->width) / (float)drw_w;
src_scale_y = ((float)src_h / frame->height) / (float)drw_h;
@@ -2032,16 +1657,16 @@ gen6_render_video(struct sna *sna,
gen6_get_rectangles(sna, &tmp, 1, gen6_emit_video_state);
OUT_VERTEX(r.x2, r.y2);
- OUT_VERTEX_F((box->x2 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y2 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x2 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y2 - dy) * src_scale_y);
OUT_VERTEX(r.x1, r.y2);
- OUT_VERTEX_F((box->x1 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y2 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x1 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y2 - dy) * src_scale_y);
OUT_VERTEX(r.x1, r.y1);
- OUT_VERTEX_F((box->x1 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y1 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x1 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y1 - dy) * src_scale_y);
if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
sna_damage_add_box(&priv->gpu_damage, &r);
@@ -2051,148 +1676,10 @@ gen6_render_video(struct sna *sna,
}
priv->clear = false;
- gen6_vertex_flush(sna);
+ gen4_vertex_flush(sna);
return true;
}
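
/* The video path above now derives the sampler filter from whether any
 * scaling actually happens, and each scale factor folds two mappings into
 * one multiply: destination pixels to source pixels (src_w / drw_w) and
 * source pixels to normalised texels (1 / frame->width). A worked sketch,
 * assuming a 640x360 frame drawn into a 1280x720 region: */

static float video_texcoord_u(short box_x, short dx,
			      short src_w, short frame_width, short drw_w)
{
	/* e.g. src_w = 640, frame_width = 640, drw_w = 1280:
	 * scale = (640/640.f)/1280 = 1/1280,
	 * so box_x - dx == 1280 samples u = 1.0f, the right edge */
	float scale = ((float)src_w / frame_width) / (float)drw_w;
	return (box_x - dx) * scale;
}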
-static bool
-gen6_composite_solid_init(struct sna *sna,
- struct sna_composite_channel *channel,
- uint32_t color)
-{
- DBG(("%s: color=%x\n", __FUNCTION__, color));
-
- channel->filter = PictFilterNearest;
- channel->repeat = RepeatNormal;
- channel->is_affine = true;
- channel->is_solid = true;
- channel->is_opaque = (color >> 24) == 0xff;
- channel->transform = NULL;
- channel->width = 1;
- channel->height = 1;
- channel->card_format = GEN6_SURFACEFORMAT_B8G8R8A8_UNORM;
-
- channel->bo = sna_render_get_solid(sna, color);
-
- channel->scale[0] = channel->scale[1] = 1;
- channel->offset[0] = channel->offset[1] = 0;
- return channel->bo != NULL;
-}
-
-static bool
-gen6_composite_linear_init(struct sna *sna,
- PicturePtr picture,
- struct sna_composite_channel *channel,
- int x, int y,
- int w, int h,
- int dst_x, int dst_y)
-{
- PictLinearGradient *linear =
- (PictLinearGradient *)picture->pSourcePict;
- pixman_fixed_t tx, ty;
- float x0, y0, sf;
- float dx, dy;
-
- DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
- __FUNCTION__,
- pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
- pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
- x, y, dst_x, dst_y, w, h));
-
- if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
- return 0;
-
- if (!sna_transform_is_affine(picture->transform)) {
- DBG(("%s: fallback due to projective transform\n",
- __FUNCTION__));
- return sna_render_picture_fixup(sna, picture, channel,
- x, y, w, h, dst_x, dst_y);
- }
-
- channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
- if (!channel->bo)
- return 0;
-
- channel->filter = PictFilterNearest;
- channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
- channel->width = channel->bo->pitch / 4;
- channel->height = 1;
- channel->pict_format = PICT_a8r8g8b8;
-
- channel->scale[0] = channel->scale[1] = 1;
- channel->offset[0] = channel->offset[1] = 0;
-
- if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
- dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
- dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
-
- x0 = pixman_fixed_to_double(linear->p1.x);
- y0 = pixman_fixed_to_double(linear->p1.y);
-
- if (tx | ty) {
- x0 -= pixman_fixed_to_double(tx);
- y0 -= pixman_fixed_to_double(ty);
- }
- } else {
- struct pixman_f_vector p1, p2;
- struct pixman_f_transform m, inv;
-
- pixman_f_transform_from_pixman_transform(&m, picture->transform);
- DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
- __FUNCTION__,
- m.m[0][0], m.m[0][1], m.m[0][2],
- m.m[1][0], m.m[1][1], m.m[1][2],
- m.m[2][0], m.m[2][1], m.m[2][2]));
- if (!pixman_f_transform_invert(&inv, &m))
- return 0;
-
- p1.v[0] = pixman_fixed_to_double(linear->p1.x);
- p1.v[1] = pixman_fixed_to_double(linear->p1.y);
- p1.v[2] = 1.;
- pixman_f_transform_point(&inv, &p1);
-
- p2.v[0] = pixman_fixed_to_double(linear->p2.x);
- p2.v[1] = pixman_fixed_to_double(linear->p2.y);
- p2.v[2] = 1.;
- pixman_f_transform_point(&inv, &p2);
-
- DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
- __FUNCTION__,
- p1.v[0], p1.v[1], p1.v[2],
- p2.v[0], p2.v[1], p2.v[2]));
-
- dx = p2.v[0] - p1.v[0];
- dy = p2.v[1] - p1.v[1];
-
- x0 = p1.v[0];
- y0 = p1.v[1];
- }
-
- sf = dx*dx + dy*dy;
- dx /= sf;
- dy /= sf;
-
- channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
- channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
- channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
-
- channel->embedded_transform.matrix[1][0] = 0;
- channel->embedded_transform.matrix[1][1] = 0;
- channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
-
- channel->embedded_transform.matrix[2][0] = 0;
- channel->embedded_transform.matrix[2][1] = 0;
- channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
-
- channel->transform = &channel->embedded_transform;
- channel->is_affine = 1;
-
- DBG(("%s: dx=%f, dy=%f, offset=%f\n",
- __FUNCTION__, dx, dy, -dx*(x0-x+dst_x) + -dy*(y0-y+dst_y)));
-
- return channel->bo != NULL;
-}
-
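
/* The removed linear-gradient setup (now gen4_channel_init_linear) reduces
 * the gradient to a 1D texture lookup: with g = p2 - p1 and s = g / |g|^2,
 * the texcoord is t = s . (P - p1), which is 0 at p1 and 1 at p2; the code
 * above bakes s and the constant term into an embedded transform. A
 * standalone sketch of the projection: */

static float gradient_t(float px, float py,
			float p1x, float p1y, float p2x, float p2y)
{
	float dx = p2x - p1x, dy = p2y - p1y;
	float norm = dx * dx + dy * dy;	/* |p2 - p1|^2 */

	dx /= norm;
	dy /= norm;
	return dx * (px - p1x) + dy * (py - p1y);
}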
static int
gen6_composite_picture(struct sna *sna,
PicturePtr picture,
@@ -2213,16 +1700,16 @@ gen6_composite_picture(struct sna *sna,
channel->card_format = -1;
if (sna_picture_is_solid(picture, &color))
- return gen6_composite_solid_init(sna, channel, color);
+ return gen4_channel_init_solid(sna, channel, color);
if (picture->pDrawable == NULL) {
int ret;
if (picture->pSourcePict->type == SourcePictTypeLinear)
- return gen6_composite_linear_init(sna, picture, channel,
- x, y,
- w, h,
- dst_x, dst_y);
+ return gen4_channel_init_linear(sna, picture, channel,
+ x, y,
+ w, h,
+ dst_x, dst_y);
DBG(("%s -- fixup, gradient\n", __FUNCTION__));
ret = -1;
@@ -2273,7 +1760,8 @@ gen6_composite_picture(struct sna *sna,
channel->card_format = gen6_get_card_format(picture->format);
if (channel->card_format == (unsigned)-1)
return sna_render_picture_convert(sna, picture, channel, pixmap,
- x, y, w, h, dst_x, dst_y);
+ x, y, w, h, dst_x, dst_y,
+ false);
if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
DBG(("%s: extracting from pixmap %dx%d\n", __FUNCTION__,
@@ -2300,8 +1788,9 @@ static void gen6_render_composite_done(struct sna *sna,
{
DBG(("%s\n", __FUNCTION__));
- if (sna->render_state.gen6.vertex_offset) {
- gen6_vertex_flush(sna);
+ assert(!sna->render.active);
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
gen6_magic_ca_pass(sna, op);
}
@@ -2360,17 +1849,11 @@ gen6_composite_set_target(struct sna *sna,
return true;
}
-static bool prefer_blt_ring(struct sna *sna)
-{
- if (PREFER_RENDER)
- return PREFER_RENDER < 0;
-
- return sna->kgem.ring != KGEM_RENDER;
-}
-
-static bool can_switch_to_blt(struct sna *sna)
+inline static bool can_switch_to_blt(struct sna *sna,
+ struct kgem_bo *bo,
+ unsigned flags)
{
- if (sna->kgem.ring == KGEM_BLT)
+ if (sna->kgem.ring != KGEM_RENDER)
return true;
if (NO_RING_SWITCH)
@@ -2379,7 +1862,13 @@ static bool can_switch_to_blt(struct sna *sna)
if (!sna->kgem.has_semaphores)
return false;
- return sna->kgem.mode == KGEM_NONE || kgem_is_idle(&sna->kgem);
+ if (flags & COPY_LAST)
+ return true;
+
+ if (bo && RQ_IS_BLT(bo->rq))
+ return true;
+
+ return kgem_ring_is_idle(&sna->kgem, KGEM_BLT);
}
static inline bool untiled_tlb_miss(struct kgem_bo *bo)
@@ -2387,9 +1876,19 @@ static inline bool untiled_tlb_miss(struct kgem_bo *bo)
return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096;
}
-static bool prefer_blt_bo(struct sna *sna, struct kgem_bo *bo)
+static int prefer_blt_bo(struct sna *sna, struct kgem_bo *bo)
{
- return untiled_tlb_miss(bo) && bo->pitch < MAXSHORT;
+ if (bo->rq)
+ return RQ_IS_BLT(bo->rq) ? 1 : -1;
+
+ return bo->tiling == I915_TILING_NONE || bo->scanout;
+}
+
+inline static bool prefer_blt_ring(struct sna *sna,
+ struct kgem_bo *bo,
+ unsigned flags)
+{
+ return can_switch_to_blt(sna, bo, flags);
}
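
/* The heuristics above replace the old "is the GPU idle?" test with cheaper,
 * more targeted questions: stay put if the batch is already on the BLT ring,
 * follow a buffer to whichever ring it is still busy on (crossing rings
 * costs a semaphore wait), and otherwise switch only when the BLT ring is
 * idle. A compressed sketch of that decision (hypothetical enum; RQ_IS_BLT
 * and kgem_ring_is_idle are the driver's real tests): */

enum ring { RING_RENDER, RING_BLT };

static int switch_to_blt_p(enum ring current, int bo_busy,
			   enum ring bo_ring, int copy_last, int blt_idle)
{
	if (current != RING_RENDER)
		return 1;	/* already off the render ring */
	if (copy_last)
		return 1;	/* final op: the switch cost is hidden */
	if (bo_busy)
		return bo_ring == RING_BLT;	/* avoid a cross-ring wait */
	return blt_idle;
}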
static bool
@@ -2397,7 +1896,7 @@ try_blt(struct sna *sna,
PicturePtr dst, PicturePtr src,
int width, int height)
{
- if (prefer_blt_ring(sna)) {
+ if (sna->kgem.ring == KGEM_BLT) {
DBG(("%s: already performing BLT\n", __FUNCTION__));
return true;
}
@@ -2408,7 +1907,7 @@ try_blt(struct sna *sna,
return true;
}
- if (can_switch_to_blt(sna) && sna_picture_is_solid(src, NULL))
+ if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, NULL, 0))
return true;
return false;
@@ -2436,12 +1935,6 @@ has_alphamap(PicturePtr p)
}
static bool
-untransformed(PicturePtr p)
-{
- return !p->transform || pixman_transform_is_int_translate(p->transform);
-}
-
-static bool
need_upload(PicturePtr p)
{
return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
@@ -2487,7 +1980,6 @@ gen6_composite_fallback(struct sna *sna,
PicturePtr mask,
PicturePtr dst)
{
- struct sna_pixmap *priv;
PixmapPtr src_pixmap;
PixmapPtr mask_pixmap;
PixmapPtr dst_pixmap;
@@ -2526,10 +2018,7 @@ gen6_composite_fallback(struct sna *sna,
}
/* If anything is on the GPU, push everything out to the GPU */
- priv = sna_pixmap(dst_pixmap);
- if (priv &&
- ((priv->gpu_damage && !priv->clear) ||
- (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)))) {
+ if (dst_use_gpu(dst_pixmap)) {
DBG(("%s: dst is already on the GPU, try to use GPU\n",
__FUNCTION__));
return false;
@@ -2564,14 +2053,14 @@ gen6_composite_fallback(struct sna *sna,
if (too_large(dst_pixmap->drawable.width,
dst_pixmap->drawable.height) &&
- (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage))) {
+ dst_is_cpu(dst_pixmap)) {
DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
return true;
}
DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
__FUNCTION__));
- return false;
+ return dst_use_cpu(dst_pixmap);
}
static int
@@ -2592,7 +2081,7 @@ reuse_source(struct sna *sna,
}
if (sna_picture_is_solid(mask, &color))
- return gen6_composite_solid_init(sna, mc, color);
+ return gen4_channel_init_solid(sna, mc, color);
if (sc->is_solid)
return false;
@@ -2635,11 +2124,14 @@ prefer_blt_composite(struct sna *sna, struct sna_composite_op *tmp)
if (sna->kgem.ring == KGEM_BLT)
return true;
- if (!prefer_blt_ring(sna))
+ if (untiled_tlb_miss(tmp->dst.bo) ||
+ untiled_tlb_miss(tmp->src.bo))
+ return true;
+
+ if (!prefer_blt_ring(sna, tmp->dst.bo, 0))
return false;
- return (prefer_blt_bo(sna, tmp->dst.bo) ||
- prefer_blt_bo(sna, tmp->src.bo));
+ return (prefer_blt_bo(sna, tmp->dst.bo) | prefer_blt_bo(sna, tmp->src.bo)) > 0;
}
static bool
@@ -2696,7 +2188,7 @@ gen6_render_composite(struct sna *sna,
case -1:
goto cleanup_dst;
case 0:
- if (!gen6_composite_solid_init(sna, &tmp->src, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->src, 0))
goto cleanup_dst;
/* fall through to fixup */
case 1:
@@ -2720,7 +2212,6 @@ gen6_render_composite(struct sna *sna,
tmp->mask.filter = SAMPLER_FILTER_NEAREST;
tmp->mask.repeat = SAMPLER_EXTEND_NONE;
- tmp->prim_emit = gen6_emit_composite_primitive;
if (mask) {
if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
tmp->has_component_alpha = true;
@@ -2750,7 +2241,7 @@ gen6_render_composite(struct sna *sna,
case -1:
goto cleanup_src;
case 0:
- if (!gen6_composite_solid_init(sna, &tmp->mask, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
goto cleanup_src;
/* fall through to fixup */
case 1:
@@ -2760,40 +2251,7 @@ gen6_render_composite(struct sna *sna,
}
tmp->is_affine &= tmp->mask.is_affine;
-
- if (tmp->src.transform == NULL && tmp->mask.transform == NULL)
- tmp->prim_emit = gen6_emit_composite_primitive_identity_source_mask;
-
- tmp->floats_per_vertex = 5 + 2 * !tmp->is_affine;
- } else {
- if (tmp->src.is_solid) {
- DBG(("%s: choosing gen6_emit_composite_primitive_solid\n",
- __FUNCTION__));
- tmp->prim_emit = gen6_emit_composite_primitive_solid;
- if (tmp->src.is_opaque && op == PictOpOver)
- tmp->op = PictOpSrc;
- } else if (tmp->src.transform == NULL) {
- DBG(("%s: choosing gen6_emit_composite_primitive_identity_source\n",
- __FUNCTION__));
- tmp->prim_emit = gen6_emit_composite_primitive_identity_source;
- } else if (tmp->src.is_affine) {
- if (tmp->src.transform->matrix[0][1] == 0 &&
- tmp->src.transform->matrix[1][0] == 0) {
- tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
- tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
- DBG(("%s: choosing gen6_emit_composite_primitive_simple_source\n",
- __FUNCTION__));
- tmp->prim_emit = gen6_emit_composite_primitive_simple_source;
- } else {
- DBG(("%s: choosing gen6_emit_composite_primitive_affine_source\n",
- __FUNCTION__));
- tmp->prim_emit = gen6_emit_composite_primitive_affine_source;
- }
- }
-
- tmp->floats_per_vertex = 3 + !tmp->is_affine;
}
- tmp->floats_per_rect = 3 * tmp->floats_per_vertex;
tmp->u.gen6.flags =
GEN6_SET_FLAGS(SAMPLER_OFFSET(tmp->src.filter,
@@ -2807,14 +2265,18 @@ gen6_render_composite(struct sna *sna,
tmp->mask.bo != NULL,
tmp->has_component_alpha,
tmp->is_affine),
- gen6_choose_composite_vertex_buffer(tmp));
+ gen4_choose_composite_emitter(tmp));
tmp->blt = gen6_render_composite_blt;
tmp->box = gen6_render_composite_box;
- tmp->boxes = gen6_render_composite_boxes;
+ tmp->boxes = gen6_render_composite_boxes__blt;
+ if (tmp->emit_boxes) {
+ tmp->boxes = gen6_render_composite_boxes;
+ tmp->thread_boxes = gen6_render_composite_boxes__thread;
+ }
tmp->done = gen6_render_composite_done;
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo);
if (!kgem_check_bo(&sna->kgem,
tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
NULL)) {
@@ -2843,167 +2305,6 @@ cleanup_dst:
}
#if !NO_COMPOSITE_SPANS
-inline static void
-gen6_emit_composite_texcoord_affine(struct sna *sna,
- const struct sna_composite_channel *channel,
- int16_t x, int16_t y)
-{
- float t[2];
-
- sna_get_transformed_coordinates(x + channel->offset[0],
- y + channel->offset[1],
- channel->transform,
- &t[0], &t[1]);
- OUT_VERTEX_F(t[0] * channel->scale[0]);
- OUT_VERTEX_F(t[1] * channel->scale[1]);
-}
-
-inline static void
-gen6_emit_composite_spans_vertex(struct sna *sna,
- const struct sna_composite_spans_op *op,
- int16_t x, int16_t y)
-{
- OUT_VERTEX(x, y);
- gen6_emit_composite_texcoord(sna, &op->base.src, x, y);
-}
-
-fastcall static void
-gen6_emit_composite_spans_primitive(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- gen6_emit_composite_spans_vertex(sna, op, box->x2, box->y2);
- OUT_VERTEX_F(opacity);
-
- gen6_emit_composite_spans_vertex(sna, op, box->x1, box->y2);
- OUT_VERTEX_F(opacity);
-
- gen6_emit_composite_spans_vertex(sna, op, box->x1, box->y1);
- OUT_VERTEX_F(opacity);
-}
-
-fastcall static void
-gen6_emit_composite_spans_solid(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- OUT_VERTEX(box->x2, box->y2);
- OUT_VERTEX_F(1); OUT_VERTEX_F(1);
- OUT_VERTEX_F(opacity);
-
- OUT_VERTEX(box->x1, box->y2);
- OUT_VERTEX_F(0); OUT_VERTEX_F(1);
- OUT_VERTEX_F(opacity);
-
- OUT_VERTEX(box->x1, box->y1);
- OUT_VERTEX_F(0); OUT_VERTEX_F(0);
- OUT_VERTEX_F(opacity);
-}
-
-fastcall static void
-gen6_emit_composite_spans_identity(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- float sx = op->base.src.scale[0];
- float sy = op->base.src.scale[1];
- int16_t tx = op->base.src.offset[0];
- int16_t ty = op->base.src.offset[1];
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 3*4;
- assert(sna->render.vertex_used <= sna->render.vertex_size);
-
- dst.p.x = box->x2;
- dst.p.y = box->y2;
- v[0] = dst.f;
- v[1] = (box->x2 + tx) * sx;
- v[6] = v[2] = (box->y2 + ty) * sy;
-
- dst.p.x = box->x1;
- v[4] = dst.f;
- v[9] = v[5] = (box->x1 + tx) * sx;
-
- dst.p.y = box->y1;
- v[8] = dst.f;
- v[10] = (box->y1 + ty) * sy;
-
- v[11] = v[7] = v[3] = opacity;
-}
-
-fastcall static void
-gen6_emit_composite_spans_simple(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- float xx = op->base.src.transform->matrix[0][0];
- float x0 = op->base.src.transform->matrix[0][2];
- float yy = op->base.src.transform->matrix[1][1];
- float y0 = op->base.src.transform->matrix[1][2];
- float sx = op->base.src.scale[0];
- float sy = op->base.src.scale[1];
- int16_t tx = op->base.src.offset[0];
- int16_t ty = op->base.src.offset[1];
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 3*4;
- assert(sna->render.vertex_used <= sna->render.vertex_size);
-
- dst.p.x = box->x2;
- dst.p.y = box->y2;
- v[0] = dst.f;
- v[1] = ((box->x2 + tx) * xx + x0) * sx;
- v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
-
- dst.p.x = box->x1;
- v[4] = dst.f;
- v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
-
- dst.p.y = box->y1;
- v[8] = dst.f;
- v[10] = ((box->y1 + ty) * yy + y0) * sy;
-
- v[11] = v[7] = v[3] = opacity;
-}
-
-fastcall static void
-gen6_emit_composite_spans_affine(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- OUT_VERTEX(box->x2, box->y2);
- gen6_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x2, box->y2);
- OUT_VERTEX_F(opacity);
-
- OUT_VERTEX(box->x1, box->y2);
- gen6_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x1, box->y2);
- OUT_VERTEX_F(opacity);
-
- OUT_VERTEX(box->x1, box->y1);
- gen6_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x1, box->y1);
- OUT_VERTEX_F(opacity);
-}
-
fastcall static void
gen6_render_composite_spans_box(struct sna *sna,
const struct sna_composite_spans_op *op,
@@ -3053,13 +2354,50 @@ gen6_render_composite_spans_boxes(struct sna *sna,
}
fastcall static void
+gen6_render_composite_spans_boxes__thread(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *box,
+ int nbox)
+{
+ DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+ __FUNCTION__, nbox,
+ op->base.src.offset[0], op->base.src.offset[1],
+ op->base.dst.x, op->base.dst.y));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen6_get_rectangles(sna, &op->base, nbox,
+ gen6_emit_composite_state);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
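
Editorial note: the __thread box emitter above is the first gen6 consumer of the new vertex lock. A sketch of the helpers it leans on follows; the shape is inferred from these call sites and from the sna_vertex.c added elsewhere in this import, so treat the names and struct fields as assumptions rather than quotations:

/* Assumed shape of the vertex lock protocol: a mutex serializes
 * allocation of space in the vbo, an "active" counter tracks writers
 * filling their slice outside the lock, and a condition variable lets
 * the flush path drain all writers before recycling the vbo. */
static inline void sna_vertex_acquire__locked(struct sna_render *r)
{
	r->active++;				/* writer owns its slice */
}

static inline void sna_vertex_release__locked(struct sna_render *r)
{
	if (--r->active == 0)
		pthread_cond_signal(&r->wait);	/* last writer out */
}

static inline bool sna_vertex_wait__locked(struct sna_render *r)
{
	bool was_active = r->active;
	while (r->active)			/* drain concurrent writers */
		pthread_cond_wait(&r->wait, &r->lock);
	return was_active;			/* true if we had to block */
}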
+
+fastcall static void
gen6_render_composite_spans_done(struct sna *sna,
const struct sna_composite_spans_op *op)
{
DBG(("%s()\n", __FUNCTION__));
+ assert(!sna->render.active);
- if (sna->render_state.gen6.vertex_offset)
- gen6_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
if (op->base.src.bo)
kgem_bo_destroy(&sna->kgem, op->base.src.bo);
@@ -3070,23 +2408,38 @@ gen6_render_composite_spans_done(struct sna *sna,
static bool
gen6_check_composite_spans(struct sna *sna,
uint8_t op, PicturePtr src, PicturePtr dst,
- int16_t width, int16_t height, unsigned flags)
+ int16_t width, int16_t height,
+ unsigned flags)
{
- if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0)
- return false;
+ DBG(("%s: op=%d, width=%d, height=%d, flags=%x\n",
+ __FUNCTION__, op, width, height, flags));
if (op >= ARRAY_SIZE(gen6_blend_op))
return false;
- if (gen6_composite_fallback(sna, src, NULL, dst))
+ if (gen6_composite_fallback(sna, src, NULL, dst)) {
+ DBG(("%s: operation would fallback\n", __FUNCTION__));
return false;
+ }
- if (need_tiling(sna, width, height)) {
- if (!is_gpu(dst->pDrawable)) {
- DBG(("%s: fallback, tiled operation not on GPU\n",
- __FUNCTION__));
+ if (need_tiling(sna, width, height) &&
+ !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
+ DBG(("%s: fallback, tiled operation not on GPU\n",
+ __FUNCTION__));
+ return false;
+ }
+
+ if ((flags & COMPOSITE_SPANS_RECTILINEAR) == 0) {
+ struct sna_pixmap *priv = sna_pixmap_from_drawable(dst->pDrawable);
+ assert(priv);
+
+ if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+ return true;
+
+ if (flags & COMPOSITE_SPANS_INPLACE_HINT)
return false;
- }
+
+ return priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo);
}
return true;
@@ -3129,7 +2482,7 @@ gen6_render_composite_spans(struct sna *sna,
case -1:
goto cleanup_dst;
case 0:
- if (!gen6_composite_solid_init(sna, &tmp->base.src, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
goto cleanup_dst;
/* fall through to fixup */
case 1:
@@ -3141,23 +2494,6 @@ gen6_render_composite_spans(struct sna *sna,
tmp->base.is_affine = tmp->base.src.is_affine;
tmp->base.need_magic_ca_pass = false;
- tmp->prim_emit = gen6_emit_composite_spans_primitive;
- if (tmp->base.src.is_solid) {
- tmp->prim_emit = gen6_emit_composite_spans_solid;
- } else if (tmp->base.src.transform == NULL) {
- tmp->prim_emit = gen6_emit_composite_spans_identity;
- } else if (tmp->base.is_affine) {
- if (tmp->base.src.transform->matrix[0][1] == 0 &&
- tmp->base.src.transform->matrix[1][0] == 0) {
- tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
- tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
- tmp->prim_emit = gen6_emit_composite_spans_simple;
- } else
- tmp->prim_emit = gen6_emit_composite_spans_affine;
- }
- tmp->base.floats_per_vertex = 4 + !tmp->base.is_affine;
- tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;
-
tmp->base.u.gen6.flags =
GEN6_SET_FLAGS(SAMPLER_OFFSET(tmp->base.src.filter,
tmp->base.src.repeat,
@@ -3165,13 +2501,15 @@ gen6_render_composite_spans(struct sna *sna,
SAMPLER_EXTEND_PAD),
gen6_get_blend(tmp->base.op, false, tmp->base.dst.format),
GEN6_WM_KERNEL_OPACITY | !tmp->base.is_affine,
- 1 << 2 | (2+!tmp->base.is_affine));
+ gen4_choose_spans_emitter(tmp));
tmp->box = gen6_render_composite_spans_box;
tmp->boxes = gen6_render_composite_spans_boxes;
+ if (tmp->emit_boxes)
+ tmp->thread_boxes = gen6_render_composite_spans_boxes__thread;
tmp->done = gen6_render_composite_spans_done;
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->base.dst.bo);
if (!kgem_check_bo(&sna->kgem,
tmp->base.dst.bo, tmp->base.src.bo,
NULL)) {
@@ -3205,8 +2543,7 @@ gen6_emit_copy_state(struct sna *sna,
uint16_t offset;
bool dirty;
- gen6_get_batch(sna);
- dirty = kgem_bo_is_dirty(op->dst.bo);
+ dirty = gen6_get_batch(sna, op);
binding_table = gen6_composite_get_binding_table(sna, &offset);
@@ -3235,13 +2572,27 @@ static inline bool prefer_blt_copy(struct sna *sna,
struct kgem_bo *dst_bo,
unsigned flags)
{
+ if (flags & COPY_SYNC)
+ return false;
+
if (PREFER_RENDER)
return PREFER_RENDER > 0;
- return (sna->kgem.ring == KGEM_BLT ||
- (flags & COPY_LAST && sna->kgem.mode == KGEM_NONE) ||
- prefer_blt_bo(sna, src_bo) ||
- prefer_blt_bo(sna, dst_bo));
+ if (sna->kgem.ring == KGEM_BLT)
+ return true;
+
+ if (src_bo == dst_bo && can_switch_to_blt(sna, dst_bo, flags))
+ return true;
+
+ if (untiled_tlb_miss(src_bo) ||
+ untiled_tlb_miss(dst_bo))
+ return true;
+
+ if (!prefer_blt_ring(sna, dst_bo, flags))
+ return false;
+
+ return (prefer_blt_bo(sna, src_bo) >= 0 &&
+ prefer_blt_bo(sna, dst_bo) > 0);
}
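
The rewritten heuristic is a chain of early returns; for the reader, the decision order (modulo the compile-time PREFER_RENDER override) is summarized here:

/* Editorial summary, not patch content:
 * 1. COPY_SYNC never moves to the blitter.
 * 2. Already on the BLT ring -> stay there.
 * 3. Self-copy (src_bo == dst_bo) and a ring switch is cheap -> switch.
 * 4. Either bo is untiled with a TLB-hostile pitch -> switch.
 * 5. Otherwise consult per-bo ring affinity: the source must merely
 *    not object (>= 0) while the destination must actively prefer
 *    the blitter (> 0). */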
inline static void boxes_extents(const BoxRec *box, int n, BoxRec *extents)
@@ -3322,7 +2673,7 @@ fallback_blt:
if (too_large(extents.x2-extents.x1, extents.y2-extents.y1))
goto fallback_blt;
- if ((flags & COPY_LAST || can_switch_to_blt(sna)) &&
+ if (can_switch_to_blt(sna, dst_bo, flags) &&
sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
sna_blt_copy_boxes(sna, alu,
src_bo, src_dx, src_dy,
@@ -3429,7 +2780,7 @@ fallback_blt:
assert(GEN6_SAMPLER(tmp.u.gen6.flags) == COPY_SAMPLER);
assert(GEN6_VERTEX(tmp.u.gen6.flags) == COPY_VERTEX);
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
kgem_submit(&sna->kgem);
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
@@ -3472,7 +2823,7 @@ fallback_blt:
} while (--n_this_time);
} while (n);
- gen6_vertex_flush(sna);
+ gen4_vertex_flush(sna);
sna_render_composite_redirect_done(sna, &tmp);
if (tmp.src.bo != src_bo)
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
@@ -3485,6 +2836,14 @@ fallback_tiled_dst:
if (tmp.redirect.real_bo)
kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
fallback_tiled:
+ if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+ sna_blt_copy_boxes(sna, alu,
+ src_bo, src_dx, src_dy,
+ dst_bo, dst_dx, dst_dy,
+ dst->drawable.bitsPerPixel,
+ box, n))
+ return true;
+
return sna_tiling_copy_boxes(sna, alu,
src, src_bo, src_dx, src_dy,
dst, dst_bo, dst_dx, dst_dy,
@@ -3519,8 +2878,9 @@ gen6_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
{
DBG(("%s()\n", __FUNCTION__));
- if (sna->render_state.gen6.vertex_offset)
- gen6_vertex_flush(sna);
+ assert(!sna->render.active);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
}
static bool
@@ -3585,7 +2945,7 @@ fallback:
assert(GEN6_SAMPLER(op->base.u.gen6.flags) == COPY_SAMPLER);
assert(GEN6_VERTEX(op->base.u.gen6.flags) == COPY_VERTEX);
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
kgem_submit(&sna->kgem);
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
@@ -3608,8 +2968,7 @@ gen6_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
uint16_t offset;
bool dirty;
- gen6_get_batch(sna);
- dirty = kgem_bo_is_dirty(op->dst.bo);
+ dirty = gen6_get_batch(sna, op);
binding_table = gen6_composite_get_binding_table(sna, &offset);
@@ -3640,9 +2999,10 @@ static inline bool prefer_blt_fill(struct sna *sna,
if (PREFER_RENDER)
return PREFER_RENDER < 0;
- return (can_switch_to_blt(sna) ||
- prefer_blt_ring(sna) ||
- untiled_tlb_miss(bo));
+ if (untiled_tlb_miss(bo))
+ return true;
+
+ return prefer_blt_ring(sna, bo, 0) || prefer_blt_bo(sna, bo) >= 0;
}
static bool
@@ -3773,7 +3133,7 @@ gen6_render_fill_boxes(struct sna *sna,
} while (--n_this_time);
} while (n);
- gen6_vertex_flush(sna);
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
sna_render_composite_redirect_done(sna, &tmp);
return true;
@@ -3866,8 +3226,9 @@ gen6_render_op_fill_done(struct sna *sna, const struct sna_fill_op *op)
{
DBG(("%s()\n", __FUNCTION__));
- if (sna->render_state.gen6.vertex_offset)
- gen6_vertex_flush(sna);
+ assert(!sna->render.active);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, op->base.src.bo);
}
@@ -3999,8 +3360,11 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
assert(GEN6_VERTEX(tmp.u.gen6.flags) == FILL_VERTEX);
if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
- _kgem_submit(&sna->kgem);
- assert(kgem_check_bo(&sna->kgem, bo, NULL));
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
+ kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+ return false;
+ }
}
gen6_emit_fill_state(sna, &tmp);
@@ -4021,7 +3385,7 @@ gen6_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
v[7] = v[2] = v[3] = 1;
v[6] = v[10] = v[11] = 0;
- gen6_vertex_flush(sna);
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
return true;
@@ -4082,8 +3446,11 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
assert(GEN6_VERTEX(tmp.u.gen6.flags) == FILL_VERTEX);
if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
- _kgem_submit(&sna->kgem);
- assert(kgem_check_bo(&sna->kgem, bo, NULL));
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
+ kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+ return false;
+ }
}
gen6_emit_fill_state(sna, &tmp);
@@ -4103,7 +3470,7 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
v[7] = v[2] = v[3] = 1;
v[6] = v[10] = v[11] = 0;
- gen6_vertex_flush(sna);
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
return true;
@@ -4111,20 +3478,20 @@ gen6_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
static void gen6_render_flush(struct sna *sna)
{
- gen6_vertex_close(sna);
+ gen4_vertex_close(sna);
+
+ assert(sna->render.vb_id == 0);
+ assert(sna->render.vertex_offset == 0);
}
static void
gen6_render_context_switch(struct kgem *kgem,
int new_mode)
{
- if (!new_mode)
- return;
-
- DBG(("%s: from %d to %d\n", __FUNCTION__, kgem->mode, new_mode));
-
- if (kgem->mode)
- kgem_submit(kgem);
+ if (kgem->nbatch) {
+ DBG(("%s: from %d to %d\n", __FUNCTION__, kgem->mode, new_mode));
+ _kgem_submit(kgem);
+ }
kgem->ring = new_mode;
}
@@ -4154,6 +3521,7 @@ gen6_render_expire(struct kgem *kgem)
if (sna->render.vbo && !sna->render.vertex_used) {
DBG(("%s: discarding vbo handle=%d\n", __FUNCTION__, sna->render.vbo->handle));
kgem_bo_destroy(kgem, sna->render.vbo);
+ assert(!sna->render.active);
sna->render.vbo = NULL;
sna->render.vertices = sna->render.vertex_data;
sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
@@ -4166,7 +3534,6 @@ static void gen6_render_reset(struct sna *sna)
{
sna->render_state.gen6.needs_invariant = true;
sna->render_state.gen6.first_state_packet = true;
- sna->render_state.gen6.vb_id = 0;
sna->render_state.gen6.ve_id = 3 << 2;
sna->render_state.gen6.last_primitive = -1;
@@ -4177,6 +3544,10 @@ static void gen6_render_reset(struct sna *sna)
sna->render_state.gen6.drawrect_offset = -1;
sna->render_state.gen6.drawrect_limit = -1;
sna->render_state.gen6.surface_table = -1;
+
+ sna->render.vertex_offset = 0;
+ sna->render.nvertex_reloc = 0;
+ sna->render.vb_id = 0;
}
static void gen6_render_fini(struct sna *sna)
@@ -4184,6 +3555,16 @@ static void gen6_render_fini(struct sna *sna)
kgem_bo_destroy(&sna->kgem, sna->render_state.gen6.general_bo);
}
+static bool is_gt2(struct sna *sna)
+{
+ return DEVICE_ID(sna->PciInfo) & 0x30;
+}
+
+static bool is_mobile(struct sna *sna)
+{
+ return (DEVICE_ID(sna->PciInfo) & 0xf) == 0x6;
+}
+
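A worked example of the two predicates against well-known Sandy Bridge PCI IDs (illustrative values supplied by the editor, not taken from the patch):

/* is_gt2: DEVICE_ID & 0x30 -- GT2/GT2+ SKUs set bit 4 or 5.
 * is_mobile: (DEVICE_ID & 0xf) == 0x6 -- mobile SKUs end in 6.
 *
 *   0x0102 GT1 desktop    0x0102 & 0x30 == 0x00  -> gt1
 *   0x0112 GT2 desktop    0x0112 & 0x30 == 0x10  -> gt2
 *   0x0122 GT2+ desktop   0x0122 & 0x30 == 0x20  -> gt2
 *   0x0106 GT1 mobile     0x0106 & 0x0f == 0x06  -> mobile
 *   0x0116 GT2 mobile     gt2 and mobile
 */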
static bool gen6_render_setup(struct sna *sna)
{
struct gen6_render_state *state = &sna->render_state.gen6;
@@ -4192,7 +3573,7 @@ static bool gen6_render_setup(struct sna *sna)
int i, j, k, l, m;
state->info = &gt1_info;
- if (DEVICE_ID(sna->PciInfo) & 0x20)
+ if (is_gt2(sna))
state->info = &gt2_info; /* XXX requires GT_MODE WiZ disabled */
sna_static_stream_init(&general);
@@ -4256,7 +3637,6 @@ static bool gen6_render_setup(struct sna *sna)
}
}
- state->cc_vp = gen6_create_cc_viewport(&general);
state->cc_blend = gen6_composite_create_blend_state(&general);
state->general_bo = sna_static_stream_fini(sna, &general);
@@ -4274,10 +3654,14 @@ bool gen6_render_init(struct sna *sna)
#if !NO_COMPOSITE
sna->render.composite = gen6_render_composite;
+ sna->render.prefer_gpu |= PREFER_GPU_RENDER;
+
#endif
#if !NO_COMPOSITE_SPANS
sna->render.check_composite_spans = gen6_check_composite_spans;
sna->render.composite_spans = gen6_render_composite_spans;
+ if (is_mobile(sna))
+ sna->render.prefer_gpu |= PREFER_GPU_SPANS;
#endif
sna->render.video = gen6_render_video;
diff --git a/src/sna/gen7_render.c b/src/sna/gen7_render.c
index 4d94c803c..f05d6f926 100644
--- a/src/sna/gen7_render.c
+++ b/src/sna/gen7_render.c
@@ -42,6 +42,8 @@
#include "brw/brw.h"
#include "gen7_render.h"
+#include "gen4_source.h"
+#include "gen4_vertex.h"
#define NO_COMPOSITE 0
#define NO_COMPOSITE_SPANS 0
@@ -113,6 +115,24 @@ static const struct gt_info hsw_gt_info = {
.urb = { 128, 64, 64 },
};
+static const struct gt_info hsw_gt1_info = {
+ .max_vs_threads = 70,
+ .max_gs_threads = 70,
+ .max_wm_threads =
+ (102 - 1) << HSW_PS_MAX_THREADS_SHIFT |
+ 1 << HSW_PS_SAMPLE_MASK_SHIFT,
+ .urb = { 128, 640, 256 },
+};
+
+static const struct gt_info hsw_gt2_info = {
+ .max_vs_threads = 280,
+ .max_gs_threads = 280,
+ .max_wm_threads =
+ (204 - 1) << HSW_PS_MAX_THREADS_SHIFT |
+ 1 << HSW_PS_SAMPLE_MASK_SHIFT,
+ .urb = { 256, 1664, 640 },
+};
+
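Worth noting about the tables above (an editorial observation, not new patch content):

/* max_wm_threads is stored pre-packed: the thread count is biased by
 * one ((n - 1) << HSW_PS_MAX_THREADS_SHIFT) with the sample mask ORed
 * into the same dword, so the value can presumably be ORed straight
 * into 3DSTATE_PS at emission time rather than recomputed per batch.
 * The split into hsw_gt1_info/hsw_gt2_info replaces the single
 * hsw_gt_info placeholder with per-SKU thread and URB limits. */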
static const uint32_t ps_kernel_packed[][4] = {
#include "exa_wm_src_affine.g7b"
#include "exa_wm_src_sample_argb.g7b"
@@ -209,10 +229,6 @@ static const struct blendinfo {
#define FILL_FLAGS(op, format) GEN7_SET_FLAGS(FILL_SAMPLER, gen7_get_blend((op), false, (format)), GEN7_WM_KERNEL_NOMASK, FILL_VERTEX)
#define FILL_FLAGS_NOBLEND GEN7_SET_FLAGS(FILL_SAMPLER, NO_BLEND, GEN7_WM_KERNEL_NOMASK, FILL_VERTEX)
-#define VIDEO_SAMPLER \
- SAMPLER_OFFSET(SAMPLER_FILTER_BILINEAR, SAMPLER_EXTEND_PAD, \
- SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE)
-
#define GEN7_SAMPLER(f) (((f) >> 16) & 0xfff0)
#define GEN7_BLEND(f) (((f) >> 0) & 0x7ff0)
#define GEN7_READS_DST(f) (((f) >> 15) & 1)
@@ -627,9 +643,9 @@ gen7_emit_cc_invariant(struct sna *sna)
OUT_BATCH(0);
#endif
- assert(is_aligned(sna->render_state.gen7.cc_vp, 32));
+ /* XXX clear to be safe */
OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC | (2 - 2));
- OUT_BATCH(sna->render_state.gen7.cc_vp);
+ OUT_BATCH(0);
}
static void
@@ -865,7 +881,7 @@ gen7_emit_vertex_elements(struct sna *sna,
* texture coordinate 1 if (has_mask is true): same as above
*/
struct gen7_render_state *render = &sna->render_state.gen7;
- uint32_t src_format, dw, offset;
+ uint32_t src_format, dw;
int id = GEN7_VERTEX(op->u.gen7.flags);
bool has_mask;
@@ -875,39 +891,6 @@ gen7_emit_vertex_elements(struct sna *sna,
return;
render->ve_id = id;
- if (id == VERTEX_2s2s) {
- DBG(("%s: setup COPY\n", __FUNCTION__));
-
- OUT_BATCH(GEN7_3DSTATE_VERTEX_ELEMENTS |
- ((2 * (1 + 2)) + 1 - 2));
-
- OUT_BATCH(VERTEX_2s2s << GEN7_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN7_VE0_VALID |
- GEN7_SURFACEFORMAT_R32G32B32A32_FLOAT << GEN7_VE0_FORMAT_SHIFT |
- 0 << GEN7_VE0_OFFSET_SHIFT);
- OUT_BATCH(GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_0_SHIFT |
- GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_1_SHIFT |
- GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT |
- GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_3_SHIFT);
-
- /* x,y */
- OUT_BATCH(VERTEX_2s2s << GEN7_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN7_VE0_VALID |
- GEN7_SURFACEFORMAT_R16G16_SSCALED << GEN7_VE0_FORMAT_SHIFT |
- 0 << GEN7_VE0_OFFSET_SHIFT); /* offsets vb in bytes */
- OUT_BATCH(GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT |
- GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT |
- GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT |
- GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT);
-
- OUT_BATCH(VERTEX_2s2s << GEN7_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN7_VE0_VALID |
- GEN7_SURFACEFORMAT_R16G16_SSCALED << GEN7_VE0_FORMAT_SHIFT |
- 4 << GEN7_VE0_OFFSET_SHIFT); /* offset vb in bytes */
- OUT_BATCH(GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT |
- GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT |
- GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT |
- GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT);
- return;
- }
-
/* The VUE layout
	 * dword 0-3: pad (0.0, 0.0, 0.0, 0.0)
* dword 4-7: position (x, y, 1.0, 1.0),
@@ -936,20 +919,25 @@ gen7_emit_vertex_elements(struct sna *sna,
GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT |
GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT |
GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT);
- offset = 4;
/* u0, v0, w0 */
- DBG(("%s: first channel %d floats, offset=%d\n", __FUNCTION__, id & 3, offset));
+ DBG(("%s: first channel %d floats, offset=4b\n", __FUNCTION__, id & 3));
dw = GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT;
switch (id & 3) {
+ default:
+ assert(0);
+ case 0:
+ src_format = GEN7_SURFACEFORMAT_R16G16_SSCALED;
+ dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
+ dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_1_SHIFT;
+ dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT;
+ break;
case 1:
src_format = GEN7_SURFACEFORMAT_R32_FLOAT;
dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_1_SHIFT;
dw |= GEN7_VFCOMPONENT_STORE_0 << GEN7_VE1_VFCOMPONENT_2_SHIFT;
break;
- default:
- assert(0);
case 2:
src_format = GEN7_SURFACEFORMAT_R32G32_FLOAT;
dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
@@ -965,15 +953,15 @@ gen7_emit_vertex_elements(struct sna *sna,
}
OUT_BATCH(id << GEN7_VE0_VERTEX_BUFFER_INDEX_SHIFT | GEN7_VE0_VALID |
src_format << GEN7_VE0_FORMAT_SHIFT |
- offset << GEN7_VE0_OFFSET_SHIFT);
+ 4 << GEN7_VE0_OFFSET_SHIFT);
OUT_BATCH(dw);
- offset += (id & 3) * sizeof(float);
/* u1, v1, w1 */
if (has_mask) {
- DBG(("%s: second channel %d floats, offset=%d\n", __FUNCTION__, (id >> 2) & 3, offset));
+ unsigned offset = 4 + ((id & 3) ?: 1) * sizeof(float);
+ DBG(("%s: second channel %d floats, offset=%db\n", __FUNCTION__, id >> 2, offset));
dw = GEN7_VFCOMPONENT_STORE_1_FLT << GEN7_VE1_VFCOMPONENT_3_SHIFT;
- switch ((id >> 2) & 3) {
+ switch (id >> 2) {
case 1:
src_format = GEN7_SURFACEFORMAT_R32_FLOAT;
dw |= GEN7_VFCOMPONENT_STORE_SRC << GEN7_VE1_VFCOMPONENT_0_SHIFT;
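
An editorial gloss on the offset expression a few lines up, since it is subtle:

/* The position element is two SSCALED int16s, i.e. 4 bytes, so the
 * mask channel starts at byte 4 plus the size of the first texcoord
 * channel.  (id & 3) is that channel's float count; the GNU ?:
 * operator maps the VERTEX_2s2s case (id & 3 == 0, two int16s) to one
 * float-sized slot, keeping the arithmetic uniform now that the
 * dedicated COPY setup block has been folded into the common path. */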
@@ -1064,20 +1052,22 @@ gen7_emit_state(struct sna *sna,
sna->render_state.gen7.emit_flush = GEN7_READS_DST(op->u.gen7.flags);
}
-static void gen7_magic_ca_pass(struct sna *sna,
+static bool gen7_magic_ca_pass(struct sna *sna,
const struct sna_composite_op *op)
{
struct gen7_render_state *state = &sna->render_state.gen7;
if (!op->need_magic_ca_pass)
- return;
+ return false;
DBG(("%s: CA fixup (%d -> %d)\n", __FUNCTION__,
sna->render.vertex_start, sna->render.vertex_index));
gen7_emit_pipe_invalidate(sna);
- gen7_emit_cc(sna, gen7_get_blend(PictOpAdd, true, op->dst.format));
+ gen7_emit_cc(sna,
+ GEN7_BLEND(gen7_get_blend(PictOpAdd, true,
+ op->dst.format)));
gen7_emit_wm(sna,
gen7_choose_composite_kernel(PictOpAdd,
true, true,
@@ -1092,155 +1082,7 @@ static void gen7_magic_ca_pass(struct sna *sna,
OUT_BATCH(0); /* index buffer offset, ignored */
state->last_primitive = sna->kgem.nbatch;
-}
-
-static void gen7_vertex_flush(struct sna *sna)
-{
- assert(sna->render_state.gen7.vertex_offset);
-
- DBG(("%s[%x] = %d\n", __FUNCTION__,
- 4*sna->render_state.gen7.vertex_offset,
- sna->render.vertex_index - sna->render.vertex_start));
- sna->kgem.batch[sna->render_state.gen7.vertex_offset] =
- sna->render.vertex_index - sna->render.vertex_start;
- sna->render_state.gen7.vertex_offset = 0;
-}
-
-static int gen7_vertex_finish(struct sna *sna)
-{
- struct kgem_bo *bo;
- unsigned int i;
-
- assert(sna->render.vertex_used);
- assert(sna->render.nvertex_reloc);
-
- /* Note: we only need dword alignment (currently) */
-
- bo = sna->render.vbo;
- if (bo) {
- if (sna->render_state.gen7.vertex_offset)
- gen7_vertex_flush(sna);
-
- for (i = 0; i < sna->render.nvertex_reloc; i++) {
- DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
- i, sna->render.vertex_reloc[i]));
-
- sna->kgem.batch[sna->render.vertex_reloc[i]] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i], bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- 0);
- sna->kgem.batch[sna->render.vertex_reloc[i]+1] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i]+1, bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- sna->render.vertex_used * 4 - 1);
- }
-
- sna->render.nvertex_reloc = 0;
- sna->render.vertex_used = 0;
- sna->render.vertex_index = 0;
- sna->render_state.gen7.vb_id = 0;
-
- kgem_bo_destroy(&sna->kgem, bo);
- }
-
- sna->render.vertices = NULL;
- sna->render.vbo = kgem_create_linear(&sna->kgem,
- 256*1024, CREATE_GTT_MAP);
- if (sna->render.vbo)
- sna->render.vertices = kgem_bo_map(&sna->kgem, sna->render.vbo);
- if (sna->render.vertices == NULL) {
- if (sna->render.vbo)
- kgem_bo_destroy(&sna->kgem, sna->render.vbo);
- sna->render.vbo = NULL;
- return 0;
- }
-
- kgem_bo_sync__cpu(&sna->kgem, sna->render.vbo);
- if (sna->render.vertex_used) {
- memcpy(sna->render.vertices,
- sna->render.vertex_data,
- sizeof(float)*sna->render.vertex_used);
- }
- sna->render.vertex_size = 64 * 1024 - 1;
- return sna->render.vertex_size - sna->render.vertex_used;
-}
-
-static void gen7_vertex_close(struct sna *sna)
-{
- struct kgem_bo *bo, *free_bo = NULL;
- unsigned int i, delta = 0;
-
- assert(sna->render_state.gen7.vertex_offset == 0);
-
- if (!sna->render_state.gen7.vb_id)
- return;
-
- DBG(("%s: used=%d, vbo active? %d\n",
- __FUNCTION__, sna->render.vertex_used, sna->render.vbo ? sna->render.vbo->handle : 0));
-
- bo = sna->render.vbo;
- if (bo) {
- if (sna->render.vertex_size - sna->render.vertex_used < 64) {
- DBG(("%s: discarding vbo (full), handle=%d\n", __FUNCTION__, sna->render.vbo->handle));
- sna->render.vbo = NULL;
- sna->render.vertices = sna->render.vertex_data;
- sna->render.vertex_size = ARRAY_SIZE(sna->render.vertex_data);
- free_bo = bo;
- }
- } else {
- if (sna->kgem.nbatch + sna->render.vertex_used <= sna->kgem.surface) {
- DBG(("%s: copy to batch: %d @ %d\n", __FUNCTION__,
- sna->render.vertex_used, sna->kgem.nbatch));
- memcpy(sna->kgem.batch + sna->kgem.nbatch,
- sna->render.vertex_data,
- sna->render.vertex_used * 4);
- delta = sna->kgem.nbatch * 4;
- bo = NULL;
- sna->kgem.nbatch += sna->render.vertex_used;
- } else {
- bo = kgem_create_linear(&sna->kgem,
- 4*sna->render.vertex_used, 0);
- if (bo && !kgem_bo_write(&sna->kgem, bo,
- sna->render.vertex_data,
- 4*sna->render.vertex_used)) {
- kgem_bo_destroy(&sna->kgem, bo);
- bo = NULL;
- }
- DBG(("%s: new vbo: %d\n", __FUNCTION__,
- sna->render.vertex_used));
- free_bo = bo;
- }
- }
-
- assert(sna->render.nvertex_reloc);
- for (i = 0; i < sna->render.nvertex_reloc; i++) {
- DBG(("%s: reloc[%d] = %d\n", __FUNCTION__,
- i, sna->render.vertex_reloc[i]));
-
- sna->kgem.batch[sna->render.vertex_reloc[i]] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i], bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- delta);
- sna->kgem.batch[sna->render.vertex_reloc[i]+1] =
- kgem_add_reloc(&sna->kgem,
- sna->render.vertex_reloc[i]+1, bo,
- I915_GEM_DOMAIN_VERTEX << 16,
- delta + sna->render.vertex_used * 4 - 1);
- }
- sna->render.nvertex_reloc = 0;
-
- if (sna->render.vbo == NULL) {
- sna->render.vertex_used = 0;
- sna->render.vertex_index = 0;
- assert(sna->render.vertices == sna->render.vertex_data);
- assert(sna->render.vertex_size == ARRAY_SIZE(sna->render.vertex_data));
- }
-
- if (free_bo)
- kgem_bo_destroy(&sna->kgem, free_bo);
+ return true;
}
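
Since the bool return is the visible change here, a short note on why the pass exists:

/* Editorial note: component-alpha OVER cannot be expressed as a single
 * fixed-function blend, so SNA draws the batched rectangles twice --
 * the normal pass, then this "magic" replay of the same vertex range
 * with a PictOpAdd blend.  Returning whether the replay happened lets
 * callers that keep batching (see gen7_get_rectangles__flush later in
 * this patch) know the cc/wm state was clobbered and must be
 * re-emitted. */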
static void null_create(struct sna_static_stream *stream)
@@ -1315,16 +1157,6 @@ sampler_fill_init(struct gen7_sampler_state *ss)
sampler_state_init(ss+1, SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE);
}
-static uint32_t gen7_create_cc_viewport(struct sna_static_stream *stream)
-{
- struct gen7_cc_viewport vp;
-
- vp.min_depth = -1.e35;
- vp.max_depth = 1.e35;
-
- return sna_static_stream_add(stream, &vp, sizeof(vp), 32);
-}
-
static uint32_t
gen7_tiling_bits(uint32_t tiling)
{
@@ -1351,11 +1183,12 @@ gen7_bind_bo(struct sna *sna,
uint32_t *ss;
uint32_t domains;
int offset;
+ uint32_t is_scanout = is_dst && bo->scanout;
COMPILE_TIME_ASSERT(sizeof(struct gen7_surface_state) == 32);
/* After the first bind, we manage the cache domains within the batch */
- offset = kgem_bo_get_binding(bo, format);
+ offset = kgem_bo_get_binding(bo, format | is_scanout << 31);
if (offset) {
if (is_dst)
kgem_bo_mark_dirty(bo);
@@ -1377,13 +1210,13 @@ gen7_bind_bo(struct sna *sna,
(height - 1) << GEN7_SURFACE_HEIGHT_SHIFT);
ss[3] = (bo->pitch - 1) << GEN7_SURFACE_PITCH_SHIFT;
ss[4] = 0;
- ss[5] = 0;
+ ss[5] = is_scanout ? 0 : 3 << 16;
ss[6] = 0;
ss[7] = 0;
- if (sna->kgem.gen == 75)
+ if (sna->kgem.gen == 075)
ss[7] |= HSW_SURFACE_SWIZZLE(RED, GREEN, BLUE, ALPHA);
- kgem_bo_set_binding(bo, format, offset);
+ kgem_bo_set_binding(bo, format | is_scanout << 31, offset);
DBG(("[%x] bind bo(handle=%d, addr=%d), format=%d, width=%d, height=%d, pitch=%d, tiling=%d -> %s\n",
offset, bo->handle, ss[1],
@@ -1393,251 +1226,6 @@ gen7_bind_bo(struct sna *sna,
return offset * sizeof(uint32_t);
}
-fastcall static void
-gen7_emit_composite_primitive_solid(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
- assert(sna->render.vertex_used <= sna->render.vertex_size);
- assert(!too_large(op->dst.x + r->dst.x + r->width,
- op->dst.y + r->dst.y + r->height));
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- dst.p.y = r->dst.y;
- v[6] = dst.f;
-
- v[5] = v[2] = v[1] = 1.;
- v[8] = v[7] = v[4] = 0.;
-}
-
-fastcall static void
-gen7_emit_composite_primitive_identity_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float *v;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- dst.p.y = r->dst.y;
- v[6] = dst.f;
-
- v[7] = v[4] = (r->src.x + op->src.offset[0]) * op->src.scale[0];
- v[1] = v[4] + r->width * op->src.scale[0];
-
- v[8] = (r->src.y + op->src.offset[1]) * op->src.scale[1];
- v[5] = v[2] = v[8] + r->height * op->src.scale[1];
-}
-
-fastcall static void
-gen7_emit_composite_primitive_simple_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- float xx = op->src.transform->matrix[0][0];
- float x0 = op->src.transform->matrix[0][2];
- float yy = op->src.transform->matrix[1][1];
- float y0 = op->src.transform->matrix[1][2];
- float sx = op->src.scale[0];
- float sy = op->src.scale[1];
- int16_t tx = op->src.offset[0];
- int16_t ty = op->src.offset[1];
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 3*3;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = ((r->src.x + r->width + tx) * xx + x0) * sx;
- v[5] = v[2] = ((r->src.y + r->height + ty) * yy + y0) * sy;
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- v[7] = v[4] = ((r->src.x + tx) * xx + x0) * sx;
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- v[8] = ((r->src.y + ty) * yy + y0) * sy;
-}
-
-fastcall static void
-gen7_emit_composite_primitive_affine_source(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float *v;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 9;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x + r->width,
- op->src.offset[1] + r->src.y + r->height,
- op->src.transform,
- &v[1], &v[2]);
- v[1] *= op->src.scale[0];
- v[2] *= op->src.scale[1];
-
- dst.p.x = r->dst.x;
- v[3] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x,
- op->src.offset[1] + r->src.y + r->height,
- op->src.transform,
- &v[4], &v[5]);
- v[4] *= op->src.scale[0];
- v[5] *= op->src.scale[1];
-
- dst.p.y = r->dst.y;
- v[6] = dst.f;
- _sna_get_transformed_coordinates(op->src.offset[0] + r->src.x,
- op->src.offset[1] + r->src.y,
- op->src.transform,
- &v[7], &v[8]);
- v[7] *= op->src.scale[0];
- v[8] *= op->src.scale[1];
-}
-
-fastcall static void
-gen7_emit_composite_primitive_identity_source_mask(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- union {
- struct sna_coordinate p;
- float f;
- } dst;
- float src_x, src_y;
- float msk_x, msk_y;
- float w, h;
- float *v;
-
- src_x = r->src.x + op->src.offset[0];
- src_y = r->src.y + op->src.offset[1];
- msk_x = r->mask.x + op->mask.offset[0];
- msk_y = r->mask.y + op->mask.offset[1];
- w = r->width;
- h = r->height;
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 15;
-
- dst.p.x = r->dst.x + r->width;
- dst.p.y = r->dst.y + r->height;
- v[0] = dst.f;
- v[1] = (src_x + w) * op->src.scale[0];
- v[2] = (src_y + h) * op->src.scale[1];
- v[3] = (msk_x + w) * op->mask.scale[0];
- v[4] = (msk_y + h) * op->mask.scale[1];
-
- dst.p.x = r->dst.x;
- v[5] = dst.f;
- v[6] = src_x * op->src.scale[0];
- v[7] = v[2];
- v[8] = msk_x * op->mask.scale[0];
- v[9] = v[4];
-
- dst.p.y = r->dst.y;
- v[10] = dst.f;
- v[11] = v[6];
- v[12] = src_y * op->src.scale[1];
- v[13] = v[8];
- v[14] = msk_y * op->mask.scale[1];
-}
-
-inline static void
-gen7_emit_composite_texcoord(struct sna *sna,
- const struct sna_composite_channel *channel,
- int16_t x, int16_t y)
-{
- x += channel->offset[0];
- y += channel->offset[1];
-
- if (channel->is_affine) {
- float s, t;
-
- sna_get_transformed_coordinates(x, y,
- channel->transform,
- &s, &t);
- OUT_VERTEX_F(s * channel->scale[0]);
- OUT_VERTEX_F(t * channel->scale[1]);
- } else {
- float s, t, w;
-
- sna_get_transformed_coordinates_3d(x, y,
- channel->transform,
- &s, &t, &w);
- OUT_VERTEX_F(s * channel->scale[0]);
- OUT_VERTEX_F(t * channel->scale[1]);
- OUT_VERTEX_F(w);
- }
-}
-
-static void
-gen7_emit_composite_vertex(struct sna *sna,
- const struct sna_composite_op *op,
- int16_t srcX, int16_t srcY,
- int16_t mskX, int16_t mskY,
- int16_t dstX, int16_t dstY)
-{
- OUT_VERTEX(dstX, dstY);
- gen7_emit_composite_texcoord(sna, &op->src, srcX, srcY);
- gen7_emit_composite_texcoord(sna, &op->mask, mskX, mskY);
-}
-
-fastcall static void
-gen7_emit_composite_primitive(struct sna *sna,
- const struct sna_composite_op *op,
- const struct sna_composite_rectangles *r)
-{
- gen7_emit_composite_vertex(sna, op,
- r->src.x + r->width, r->src.y + r->height,
- r->mask.x + r->width, r->mask.y + r->height,
- r->dst.x + r->width, r->dst.y + r->height);
- gen7_emit_composite_vertex(sna, op,
- r->src.x, r->src.y + r->height,
- r->mask.x, r->mask.y + r->height,
- r->dst.x, r->dst.y + r->height);
- gen7_emit_composite_vertex(sna, op,
- r->src.x, r->src.y,
- r->mask.x, r->mask.y,
- r->dst.x, r->dst.y);
-}
-
static void gen7_emit_vertex_buffer(struct sna *sna,
const struct sna_composite_op *op)
{
@@ -1650,22 +1238,22 @@ static void gen7_emit_vertex_buffer(struct sna *sna,
4*op->floats_per_vertex << GEN7_VB0_BUFFER_PITCH_SHIFT);
sna->render.vertex_reloc[sna->render.nvertex_reloc++] = sna->kgem.nbatch;
OUT_BATCH(0);
- OUT_BATCH(0);
+ OUT_BATCH(~0); /* max address: disabled */
OUT_BATCH(0);
- sna->render_state.gen7.vb_id |= 1 << id;
+ sna->render.vb_id |= 1 << id;
}
static void gen7_emit_primitive(struct sna *sna)
{
if (sna->kgem.nbatch == sna->render_state.gen7.last_primitive) {
- sna->render_state.gen7.vertex_offset = sna->kgem.nbatch - 5;
+ sna->render.vertex_offset = sna->kgem.nbatch - 5;
return;
}
OUT_BATCH(GEN7_3DPRIMITIVE | (7- 2));
OUT_BATCH(GEN7_3DPRIMITIVE_VERTEX_SEQUENTIAL | _3DPRIM_RECTLIST);
- sna->render_state.gen7.vertex_offset = sna->kgem.nbatch;
+ sna->render.vertex_offset = sna->kgem.nbatch;
OUT_BATCH(0); /* vertex count, to be filled in later */
OUT_BATCH(sna->render.vertex_index);
OUT_BATCH(1); /* single instance */
@@ -1682,13 +1270,16 @@ static bool gen7_rectangle_begin(struct sna *sna,
int id = 1 << GEN7_VERTEX(op->u.gen7.flags);
int ndwords;
+ if (sna_vertex_wait__locked(&sna->render) && sna->render.vertex_offset)
+ return true;
+
ndwords = op->need_magic_ca_pass ? 60 : 6;
- if ((sna->render_state.gen7.vb_id & id) == 0)
+ if ((sna->render.vb_id & id) == 0)
ndwords += 5;
if (!kgem_check_batch(&sna->kgem, ndwords))
return false;
- if ((sna->render_state.gen7.vb_id & id) == 0)
+ if ((sna->render.vb_id & id) == 0)
gen7_emit_vertex_buffer(sna, op);
gen7_emit_primitive(sna);
@@ -1698,17 +1289,28 @@ static bool gen7_rectangle_begin(struct sna *sna,
static int gen7_get_rectangles__flush(struct sna *sna,
const struct sna_composite_op *op)
{
+	/* Prevent discarding a new vbo filled while we waited on the lock */
+ if (sna_vertex_wait__locked(&sna->render)) {
+ int rem = vertex_space(sna);
+ if (rem > op->floats_per_rect)
+ return rem;
+ }
+
if (!kgem_check_batch(&sna->kgem, op->need_magic_ca_pass ? 65 : 6))
return 0;
- if (!kgem_check_exec(&sna->kgem, 1))
- return 0;
- if (!kgem_check_reloc(&sna->kgem, 2))
+ if (!kgem_check_reloc_and_exec(&sna->kgem, 2))
return 0;
- if (op->need_magic_ca_pass && sna->render.vbo)
- return 0;
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
+ if (gen7_magic_ca_pass(sna, op)) {
+ gen7_emit_pipe_invalidate(sna);
+ gen7_emit_cc(sna, GEN7_BLEND(op->u.gen7.flags));
+ gen7_emit_wm(sna, GEN7_KERNEL(op->u.gen7.flags));
+ }
+ }
- return gen7_vertex_finish(sna);
+ return gen4_vertex_finish(sna);
}
inline static int gen7_get_rectangles(struct sna *sna,
@@ -1720,7 +1322,7 @@ inline static int gen7_get_rectangles(struct sna *sna,
start:
rem = vertex_space(sna);
- if (rem < op->floats_per_rect) {
+ if (unlikely(rem < op->floats_per_rect)) {
DBG(("flushing vbo for %s: %d < %d\n",
__FUNCTION__, rem, op->floats_per_rect));
rem = gen7_get_rectangles__flush(sna, op);
@@ -1728,7 +1330,7 @@ start:
goto flush;
}
- if (unlikely(sna->render_state.gen7.vertex_offset == 0 &&
+ if (unlikely(sna->render.vertex_offset == 0 &&
!gen7_rectangle_begin(sna, op)))
goto flush;
@@ -1740,10 +1342,11 @@ start:
return want;
flush:
- if (sna->render_state.gen7.vertex_offset) {
- gen7_vertex_flush(sna);
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
gen7_magic_ca_pass(sna, op);
}
+ sna_vertex_wait__locked(&sna->render);
_kgem_submit(&sna->kgem);
emit_state(sna, op);
goto start;
@@ -1766,20 +1369,10 @@ inline static uint32_t *gen7_composite_get_binding_table(struct sna *sna,
return table;
}
-static uint32_t
-gen7_choose_composite_vertex_buffer(const struct sna_composite_op *op)
-{
- int id = 2 + !op->is_affine;
- if (op->mask.bo)
- id |= id << 2;
- assert(id > 0 && id < 16);
- return id;
-}
-
static void
-gen7_get_batch(struct sna *sna)
+gen7_get_batch(struct sna *sna, const struct sna_composite_op *op)
{
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, op->dst.bo);
if (!kgem_check_batch_with_surfaces(&sna->kgem, 150, 4)) {
DBG(("%s: flushing batch: %d < %d+%d\n",
@@ -1802,7 +1395,7 @@ static void gen7_emit_composite_state(struct sna *sna,
uint32_t *binding_table;
uint16_t offset;
- gen7_get_batch(sna);
+ gen7_get_batch(sna, op);
binding_table = gen7_composite_get_binding_table(sna, &offset);
@@ -1842,7 +1435,7 @@ gen7_align_vertex(struct sna *sna, const struct sna_composite_op *op)
{
if (op->floats_per_vertex != sna->render_state.gen7.floats_per_vertex) {
if (sna->render.vertex_size - sna->render.vertex_used < 2*op->floats_per_rect)
- gen7_vertex_finish(sna);
+ gen4_vertex_finish(sna);
DBG(("aligning vertex: was %d, now %d floats per vertex, %d->%d\n",
sna->render_state.gen7.floats_per_vertex,
@@ -1887,9 +1480,9 @@ gen7_render_composite_box(struct sna *sna,
}
static void
-gen7_render_composite_boxes(struct sna *sna,
- const struct sna_composite_op *op,
- const BoxRec *box, int nbox)
+gen7_render_composite_boxes__blt(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
{
DBG(("composite_boxes(%d)\n", nbox));
@@ -1919,6 +1512,62 @@ gen7_render_composite_boxes(struct sna *sna,
} while (nbox);
}
+static void
+gen7_render_composite_boxes(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen7_get_rectangles(sna, op, nbox,
+ gen7_emit_composite_state);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+ } while (nbox);
+}
+
+static void
+gen7_render_composite_boxes__thread(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen7_get_rectangles(sna, op, nbox,
+ gen7_emit_composite_state);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
+
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
@@ -1998,7 +1647,7 @@ static void gen7_emit_video_state(struct sna *sna,
uint16_t offset;
int n_src, n;
- gen7_get_batch(sna);
+ gen7_get_batch(sna, op);
src_surf_base[0] = 0;
src_surf_base[1] = 0;
@@ -2059,12 +1708,14 @@ gen7_render_video(struct sna *sna,
RegionPtr dstRegion,
short src_w, short src_h,
short drw_w, short drw_h,
+ short dx, short dy,
PixmapPtr pixmap)
{
struct sna_composite_op tmp;
- int nbox, dxo, dyo, pix_xoff, pix_yoff;
+ int nbox, pix_xoff, pix_yoff;
float src_scale_x, src_scale_y;
struct sna_pixmap *priv;
+ unsigned filter;
BoxPtr box;
DBG(("%s: src=(%d, %d), dst=(%d, %d), %dx[(%d, %d), (%d, %d)...]\n",
@@ -2093,15 +1744,22 @@ gen7_render_video(struct sna *sna,
tmp.floats_per_vertex = 3;
tmp.floats_per_rect = 9;
+ if (src_w == drw_w && src_h == drw_h)
+ filter = SAMPLER_FILTER_NEAREST;
+ else
+ filter = SAMPLER_FILTER_BILINEAR;
+
tmp.u.gen7.flags =
- GEN7_SET_FLAGS(VIDEO_SAMPLER, NO_BLEND,
+ GEN7_SET_FLAGS(SAMPLER_OFFSET(filter, SAMPLER_EXTEND_PAD,
+ SAMPLER_FILTER_NEAREST, SAMPLER_EXTEND_NONE),
+ NO_BLEND,
is_planar_fourcc(frame->id) ?
GEN7_WM_KERNEL_VIDEO_PLANAR :
GEN7_WM_KERNEL_VIDEO_PACKED,
2);
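Two related simplifications in this hunk, noted for the reader:

/* Editorial note: the fixed VIDEO_SAMPLER (always bilinear on the
 * source plane) becomes a per-operation choice -- nearest when the
 * video is unscaled (src_w == drw_w && src_h == drw_h), bilinear
 * otherwise -- skipping filter cost on the 1:1 path.  Likewise the
 * dxo/dyo pair previously derived from dstRegion->extents is replaced
 * by caller-supplied dx/dy in the new signature. */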
tmp.priv = frame;
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL)) {
kgem_submit(&sna->kgem);
assert(kgem_check_bo(&sna->kgem, tmp.dst.bo, frame->bo, NULL));
@@ -2122,9 +1780,6 @@ gen7_render_video(struct sna *sna,
pix_yoff = 0;
#endif
- dxo = dstRegion->extents.x1;
- dyo = dstRegion->extents.y1;
-
/* Use normalized texture coordinates */
src_scale_x = ((float)src_w / frame->width) / (float)drw_w;
src_scale_y = ((float)src_h / frame->height) / (float)drw_h;
@@ -2142,16 +1797,16 @@ gen7_render_video(struct sna *sna,
gen7_get_rectangles(sna, &tmp, 1, gen7_emit_video_state);
OUT_VERTEX(r.x2, r.y2);
- OUT_VERTEX_F((box->x2 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y2 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x2 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y2 - dy) * src_scale_y);
OUT_VERTEX(r.x1, r.y2);
- OUT_VERTEX_F((box->x1 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y2 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x1 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y2 - dy) * src_scale_y);
OUT_VERTEX(r.x1, r.y1);
- OUT_VERTEX_F((box->x1 - dxo) * src_scale_x);
- OUT_VERTEX_F((box->y1 - dyo) * src_scale_y);
+ OUT_VERTEX_F((box->x1 - dx) * src_scale_x);
+ OUT_VERTEX_F((box->y1 - dy) * src_scale_y);
if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
sna_damage_add_box(&priv->gpu_damage, &r);
@@ -2161,148 +1816,10 @@ gen7_render_video(struct sna *sna,
}
priv->clear = false;
- gen7_vertex_flush(sna);
+ gen4_vertex_flush(sna);
return true;
}
-static bool
-gen7_composite_solid_init(struct sna *sna,
- struct sna_composite_channel *channel,
- uint32_t color)
-{
- DBG(("%s: color=%x\n", __FUNCTION__, color));
-
- channel->filter = PictFilterNearest;
- channel->repeat = RepeatNormal;
- channel->is_affine = true;
- channel->is_solid = true;
- channel->is_opaque = (color >> 24) == 0xff;
- channel->transform = NULL;
- channel->width = 1;
- channel->height = 1;
- channel->card_format = GEN7_SURFACEFORMAT_B8G8R8A8_UNORM;
-
- channel->bo = sna_render_get_solid(sna, color);
-
- channel->scale[0] = channel->scale[1] = 1;
- channel->offset[0] = channel->offset[1] = 0;
- return channel->bo != NULL;
-}
-
-static bool
-gen7_composite_linear_init(struct sna *sna,
- PicturePtr picture,
- struct sna_composite_channel *channel,
- int x, int y,
- int w, int h,
- int dst_x, int dst_y)
-{
- PictLinearGradient *linear =
- (PictLinearGradient *)picture->pSourcePict;
- pixman_fixed_t tx, ty;
- float x0, y0, sf;
- float dx, dy;
-
- DBG(("%s: p1=(%f, %f), p2=(%f, %f), src=(%d, %d), dst=(%d, %d), size=(%d, %d)\n",
- __FUNCTION__,
- pixman_fixed_to_double(linear->p1.x), pixman_fixed_to_double(linear->p1.y),
- pixman_fixed_to_double(linear->p2.x), pixman_fixed_to_double(linear->p2.y),
- x, y, dst_x, dst_y, w, h));
-
- if (linear->p2.x == linear->p1.x && linear->p2.y == linear->p1.y)
- return 0;
-
- if (!sna_transform_is_affine(picture->transform)) {
- DBG(("%s: fallback due to projective transform\n",
- __FUNCTION__));
- return sna_render_picture_fixup(sna, picture, channel,
- x, y, w, h, dst_x, dst_y);
- }
-
- channel->bo = sna_render_get_gradient(sna, (PictGradient *)linear);
- if (!channel->bo)
- return 0;
-
- channel->filter = PictFilterNearest;
- channel->repeat = picture->repeat ? picture->repeatType : RepeatNone;
- channel->width = channel->bo->pitch / 4;
- channel->height = 1;
- channel->pict_format = PICT_a8r8g8b8;
-
- channel->scale[0] = channel->scale[1] = 1;
- channel->offset[0] = channel->offset[1] = 0;
-
- if (sna_transform_is_translation(picture->transform, &tx, &ty)) {
- dx = pixman_fixed_to_double(linear->p2.x - linear->p1.x);
- dy = pixman_fixed_to_double(linear->p2.y - linear->p1.y);
-
- x0 = pixman_fixed_to_double(linear->p1.x);
- y0 = pixman_fixed_to_double(linear->p1.y);
-
- if (tx | ty) {
- x0 -= pixman_fixed_to_double(tx);
- y0 -= pixman_fixed_to_double(ty);
- }
- } else {
- struct pixman_f_vector p1, p2;
- struct pixman_f_transform m, inv;
-
- pixman_f_transform_from_pixman_transform(&m, picture->transform);
- DBG(("%s: transform = [%f %f %f, %f %f %f, %f %f %f]\n",
- __FUNCTION__,
- m.m[0][0], m.m[0][1], m.m[0][2],
- m.m[1][0], m.m[1][1], m.m[1][2],
- m.m[2][0], m.m[2][1], m.m[2][2]));
- if (!pixman_f_transform_invert(&inv, &m))
- return 0;
-
- p1.v[0] = pixman_fixed_to_double(linear->p1.x);
- p1.v[1] = pixman_fixed_to_double(linear->p1.y);
- p1.v[2] = 1.;
- pixman_f_transform_point(&inv, &p1);
-
- p2.v[0] = pixman_fixed_to_double(linear->p2.x);
- p2.v[1] = pixman_fixed_to_double(linear->p2.y);
- p2.v[2] = 1.;
- pixman_f_transform_point(&inv, &p2);
-
- DBG(("%s: untransformed: p1=(%f, %f, %f), p2=(%f, %f, %f)\n",
- __FUNCTION__,
- p1.v[0], p1.v[1], p1.v[2],
- p2.v[0], p2.v[1], p2.v[2]));
-
- dx = p2.v[0] - p1.v[0];
- dy = p2.v[1] - p1.v[1];
-
- x0 = p1.v[0];
- y0 = p1.v[1];
- }
-
- sf = dx*dx + dy*dy;
- dx /= sf;
- dy /= sf;
-
- channel->embedded_transform.matrix[0][0] = pixman_double_to_fixed(dx);
- channel->embedded_transform.matrix[0][1] = pixman_double_to_fixed(dy);
- channel->embedded_transform.matrix[0][2] = -pixman_double_to_fixed(dx*(x0+dst_x-x) + dy*(y0+dst_y-y));
-
- channel->embedded_transform.matrix[1][0] = 0;
- channel->embedded_transform.matrix[1][1] = 0;
- channel->embedded_transform.matrix[1][2] = pixman_double_to_fixed(.5);
-
- channel->embedded_transform.matrix[2][0] = 0;
- channel->embedded_transform.matrix[2][1] = 0;
- channel->embedded_transform.matrix[2][2] = pixman_fixed_1;
-
- channel->transform = &channel->embedded_transform;
- channel->is_affine = 1;
-
- DBG(("%s: dx=%f, dy=%f, offset=%f\n",
- __FUNCTION__, dx, dy, -dx*(x0-x+dst_x) + -dy*(y0-y+dst_y)));
-
- return channel->bo != NULL;
-}
-
static int
gen7_composite_picture(struct sna *sna,
PicturePtr picture,
@@ -2323,16 +1840,16 @@ gen7_composite_picture(struct sna *sna,
channel->card_format = -1;
if (sna_picture_is_solid(picture, &color))
- return gen7_composite_solid_init(sna, channel, color);
+ return gen4_channel_init_solid(sna, channel, color);
if (picture->pDrawable == NULL) {
int ret;
if (picture->pSourcePict->type == SourcePictTypeLinear)
- return gen7_composite_linear_init(sna, picture, channel,
- x, y,
- w, h,
- dst_x, dst_y);
+ return gen4_channel_init_linear(sna, picture, channel,
+ x, y,
+ w, h,
+ dst_x, dst_y);
DBG(("%s -- fixup, gradient\n", __FUNCTION__));
ret = -1;
@@ -2383,7 +1900,8 @@ gen7_composite_picture(struct sna *sna,
channel->card_format = gen7_get_card_format(picture->format);
if (channel->card_format == (unsigned)-1)
return sna_render_picture_convert(sna, picture, channel, pixmap,
- x, y, w, h, dst_x, dst_y);
+ x, y, w, h, dst_x, dst_y,
+ false);
if (too_large(pixmap->drawable.width, pixmap->drawable.height)) {
DBG(("%s: extracting from pixmap %dx%d\n", __FUNCTION__,
@@ -2408,8 +1926,8 @@ static void gen7_composite_channel_convert(struct sna_composite_channel *channel
static void gen7_render_composite_done(struct sna *sna,
const struct sna_composite_op *op)
{
- if (sna->render_state.gen7.vertex_offset) {
- gen7_vertex_flush(sna);
+ if (sna->render.vertex_offset) {
+ gen4_vertex_flush(sna);
gen7_magic_ca_pass(sna, op);
}
@@ -2469,9 +1987,11 @@ gen7_composite_set_target(struct sna *sna,
return true;
}
-inline static bool can_switch_to_blt(struct sna *sna)
+inline static bool can_switch_to_blt(struct sna *sna,
+ struct kgem_bo *bo,
+ unsigned flags)
{
- if (sna->kgem.ring == KGEM_BLT)
+ if (sna->kgem.ring != KGEM_RENDER)
return true;
if (NO_RING_SWITCH)
@@ -2480,7 +2000,13 @@ inline static bool can_switch_to_blt(struct sna *sna)
if (!sna->kgem.has_semaphores)
return false;
- return sna->kgem.mode == KGEM_NONE || kgem_is_idle(&sna->kgem);
+ if (flags & COPY_LAST)
+ return true;
+
+ if (bo && RQ_IS_BLT(bo->rq))
+ return true;
+
+ return kgem_ring_is_idle(&sna->kgem, KGEM_BLT);
}
static inline bool untiled_tlb_miss(struct kgem_bo *bo)
@@ -2488,14 +2014,19 @@ static inline bool untiled_tlb_miss(struct kgem_bo *bo)
return bo->tiling == I915_TILING_NONE && bo->pitch >= 4096;
}
-static bool prefer_blt_bo(struct sna *sna, struct kgem_bo *bo)
+static int prefer_blt_bo(struct sna *sna, struct kgem_bo *bo)
{
- return untiled_tlb_miss(bo) && bo->pitch < MAXSHORT;
+ if (bo->rq)
+ return RQ_IS_BLT(bo->rq) ? 1 : -1;
+
+ return bo->tiling == I915_TILING_NONE || bo->scanout;
}
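
The int return deserves a gloss; a hedged reading of the tri-state and of how callers combine it:

/* Editorial note: prefer_blt_bo is now a tri-state hint rather than a
 * bool -- -1 (busy on the render ring, keep it there), 0 (idle and
 * tiled, indifferent), +1 (busy on BLT, or idle and linear/scanout).
 * Callers combine the hints, e.g. prefer_blt_composite below uses
 *   (prefer_blt_bo(sna, dst) | prefer_blt_bo(sna, src)) > 0
 * where a single -1 forces the ORed value negative (two's complement),
 * so one render-busy bo vetoes the switch to the blitter. */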
-inline static bool prefer_blt_ring(struct sna *sna)
+inline static bool prefer_blt_ring(struct sna *sna,
+ struct kgem_bo *bo,
+ unsigned flags)
{
- return sna->kgem.ring != KGEM_RENDER || can_switch_to_blt(sna);
+ return can_switch_to_blt(sna, bo, flags);
}
static bool
@@ -2514,17 +2045,8 @@ try_blt(struct sna *sna,
return true;
}
- if (can_switch_to_blt(sna)) {
- if (sna_picture_is_solid(src, NULL))
- return true;
-
- if (dst->pDrawable == src->pDrawable)
- return true;
-
- if (src->pDrawable &&
- get_drawable_pixmap(dst->pDrawable) == get_drawable_pixmap(src->pDrawable))
- return true;
- }
+ if (sna_picture_is_solid(src, NULL) && can_switch_to_blt(sna, NULL, 0))
+ return true;
return false;
}
@@ -2551,12 +2073,6 @@ has_alphamap(PicturePtr p)
}
static bool
-untransformed(PicturePtr p)
-{
- return !p->transform || pixman_transform_is_int_translate(p->transform);
-}
-
-static bool
need_upload(PicturePtr p)
{
return p->pDrawable && unattached(p->pDrawable) && untransformed(p);
@@ -2602,7 +2118,6 @@ gen7_composite_fallback(struct sna *sna,
PicturePtr mask,
PicturePtr dst)
{
- struct sna_pixmap *priv;
PixmapPtr src_pixmap;
PixmapPtr mask_pixmap;
PixmapPtr dst_pixmap;
@@ -2641,10 +2156,7 @@ gen7_composite_fallback(struct sna *sna,
}
/* If anything is on the GPU, push everything out to the GPU */
- priv = sna_pixmap(dst_pixmap);
- if (priv &&
- ((priv->gpu_damage && !priv->clear) ||
- (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)))) {
+ if (dst_use_gpu(dst_pixmap)) {
DBG(("%s: dst is already on the GPU, try to use GPU\n",
__FUNCTION__));
return false;
@@ -2679,14 +2191,14 @@ gen7_composite_fallback(struct sna *sna,
if (too_large(dst_pixmap->drawable.width,
dst_pixmap->drawable.height) &&
- (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage))) {
+ dst_is_cpu(dst_pixmap)) {
DBG(("%s: dst is on the CPU and too large\n", __FUNCTION__));
return true;
}
DBG(("%s: dst is not on the GPU and the operation should not fallback\n",
__FUNCTION__));
- return false;
+ return dst_use_cpu(dst_pixmap);
}
static int
@@ -2707,7 +2219,7 @@ reuse_source(struct sna *sna,
}
if (sna_picture_is_solid(mask, &color))
- return gen7_composite_solid_init(sna, mc, color);
+ return gen4_channel_init_solid(sna, mc, color);
if (sc->is_solid)
return false;
@@ -2750,11 +2262,14 @@ prefer_blt_composite(struct sna *sna, struct sna_composite_op *tmp)
if (sna->kgem.ring == KGEM_BLT)
return true;
- if (!prefer_blt_ring(sna))
+ if (untiled_tlb_miss(tmp->dst.bo) ||
+ untiled_tlb_miss(tmp->src.bo))
+ return true;
+
+ if (!prefer_blt_ring(sna, tmp->dst.bo, 0))
return false;
- return (prefer_blt_bo(sna, tmp->dst.bo) ||
- prefer_blt_bo(sna, tmp->src.bo));
+ return (prefer_blt_bo(sna, tmp->dst.bo) | prefer_blt_bo(sna, tmp->src.bo)) > 0;
}
static bool
@@ -2811,7 +2326,7 @@ gen7_render_composite(struct sna *sna,
case -1:
goto cleanup_dst;
case 0:
- if (!gen7_composite_solid_init(sna, &tmp->src, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->src, 0))
goto cleanup_dst;
/* fall through to fixup */
case 1:
@@ -2835,7 +2350,6 @@ gen7_render_composite(struct sna *sna,
tmp->mask.filter = SAMPLER_FILTER_NEAREST;
tmp->mask.repeat = SAMPLER_EXTEND_NONE;
- tmp->prim_emit = gen7_emit_composite_primitive;
if (mask) {
if (mask->componentAlpha && PICT_FORMAT_RGB(mask->format)) {
tmp->has_component_alpha = true;
@@ -2865,7 +2379,7 @@ gen7_render_composite(struct sna *sna,
case -1:
goto cleanup_src;
case 0:
- if (!gen7_composite_solid_init(sna, &tmp->mask, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->mask, 0))
goto cleanup_src;
/* fall through to fixup */
case 1:
@@ -2875,31 +2389,7 @@ gen7_render_composite(struct sna *sna,
}
tmp->is_affine &= tmp->mask.is_affine;
-
- if (tmp->src.transform == NULL && tmp->mask.transform == NULL)
- tmp->prim_emit = gen7_emit_composite_primitive_identity_source_mask;
-
- tmp->floats_per_vertex = 5 + 2 * !tmp->is_affine;
- } else {
- if (tmp->src.is_solid) {
- tmp->prim_emit = gen7_emit_composite_primitive_solid;
- if (tmp->src.is_opaque && op == PictOpOver)
- tmp->op = PictOpSrc;
- } else if (tmp->src.transform == NULL)
- tmp->prim_emit = gen7_emit_composite_primitive_identity_source;
- else if (tmp->src.is_affine) {
- if (tmp->src.transform->matrix[0][1] == 0 &&
- tmp->src.transform->matrix[1][0] == 0) {
- tmp->src.scale[0] /= tmp->src.transform->matrix[2][2];
- tmp->src.scale[1] /= tmp->src.transform->matrix[2][2];
- tmp->prim_emit = gen7_emit_composite_primitive_simple_source;
- } else
- tmp->prim_emit = gen7_emit_composite_primitive_affine_source;
- }
-
- tmp->floats_per_vertex = 3 + !tmp->is_affine;
}
- tmp->floats_per_rect = 3 * tmp->floats_per_vertex;
tmp->u.gen7.flags =
GEN7_SET_FLAGS(SAMPLER_OFFSET(tmp->src.filter,
@@ -2913,14 +2403,18 @@ gen7_render_composite(struct sna *sna,
tmp->mask.bo != NULL,
tmp->has_component_alpha,
tmp->is_affine),
- gen7_choose_composite_vertex_buffer(tmp));
+ gen4_choose_composite_emitter(tmp));
tmp->blt = gen7_render_composite_blt;
tmp->box = gen7_render_composite_box;
- tmp->boxes = gen7_render_composite_boxes;
+ tmp->boxes = gen7_render_composite_boxes__blt;
+	if (tmp->emit_boxes) {
+ tmp->boxes = gen7_render_composite_boxes;
+ tmp->thread_boxes = gen7_render_composite_boxes__thread;
+ }
tmp->done = gen7_render_composite_done;
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->dst.bo);
if (!kgem_check_bo(&sna->kgem,
tmp->dst.bo, tmp->src.bo, tmp->mask.bo,
NULL)) {
@@ -2949,167 +2443,6 @@ cleanup_dst:
}
#if !NO_COMPOSITE_SPANS
-inline static void
-gen7_emit_composite_texcoord_affine(struct sna *sna,
- const struct sna_composite_channel *channel,
- int16_t x, int16_t y)
-{
- float t[2];
-
- sna_get_transformed_coordinates(x + channel->offset[0],
- y + channel->offset[1],
- channel->transform,
- &t[0], &t[1]);
- OUT_VERTEX_F(t[0] * channel->scale[0]);
- OUT_VERTEX_F(t[1] * channel->scale[1]);
-}
-
-inline static void
-gen7_emit_composite_spans_vertex(struct sna *sna,
- const struct sna_composite_spans_op *op,
- int16_t x, int16_t y)
-{
- OUT_VERTEX(x, y);
- gen7_emit_composite_texcoord(sna, &op->base.src, x, y);
-}
-
-fastcall static void
-gen7_emit_composite_spans_primitive(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- gen7_emit_composite_spans_vertex(sna, op, box->x2, box->y2);
- OUT_VERTEX_F(opacity);
-
- gen7_emit_composite_spans_vertex(sna, op, box->x1, box->y2);
- OUT_VERTEX_F(opacity);
-
- gen7_emit_composite_spans_vertex(sna, op, box->x1, box->y1);
- OUT_VERTEX_F(opacity);
-}
-
-fastcall static void
-gen7_emit_composite_spans_solid(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- OUT_VERTEX(box->x2, box->y2);
- OUT_VERTEX_F(1); OUT_VERTEX_F(1);
- OUT_VERTEX_F(opacity);
-
- OUT_VERTEX(box->x1, box->y2);
- OUT_VERTEX_F(0); OUT_VERTEX_F(1);
- OUT_VERTEX_F(opacity);
-
- OUT_VERTEX(box->x1, box->y1);
- OUT_VERTEX_F(0); OUT_VERTEX_F(0);
- OUT_VERTEX_F(opacity);
-}
-
-fastcall static void
-gen7_emit_composite_spans_identity(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- float sx = op->base.src.scale[0];
- float sy = op->base.src.scale[1];
- int16_t tx = op->base.src.offset[0];
- int16_t ty = op->base.src.offset[1];
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 3*4;
- assert(sna->render.vertex_used <= sna->render.vertex_size);
-
- dst.p.x = box->x2;
- dst.p.y = box->y2;
- v[0] = dst.f;
- v[1] = (box->x2 + tx) * sx;
- v[6] = v[2] = (box->y2 + ty) * sy;
-
- dst.p.x = box->x1;
- v[4] = dst.f;
- v[9] = v[5] = (box->x1 + tx) * sx;
-
- dst.p.y = box->y1;
- v[8] = dst.f;
- v[10] = (box->y1 + ty) * sy;
-
- v[11] = v[7] = v[3] = opacity;
-}
-
-fastcall static void
-gen7_emit_composite_spans_simple(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- float *v;
- union {
- struct sna_coordinate p;
- float f;
- } dst;
-
- float xx = op->base.src.transform->matrix[0][0];
- float x0 = op->base.src.transform->matrix[0][2];
- float yy = op->base.src.transform->matrix[1][1];
- float y0 = op->base.src.transform->matrix[1][2];
- float sx = op->base.src.scale[0];
- float sy = op->base.src.scale[1];
- int16_t tx = op->base.src.offset[0];
- int16_t ty = op->base.src.offset[1];
-
- v = sna->render.vertices + sna->render.vertex_used;
- sna->render.vertex_used += 3*4;
- assert(sna->render.vertex_used <= sna->render.vertex_size);
-
- dst.p.x = box->x2;
- dst.p.y = box->y2;
- v[0] = dst.f;
- v[1] = ((box->x2 + tx) * xx + x0) * sx;
- v[6] = v[2] = ((box->y2 + ty) * yy + y0) * sy;
-
- dst.p.x = box->x1;
- v[4] = dst.f;
- v[9] = v[5] = ((box->x1 + tx) * xx + x0) * sx;
-
- dst.p.y = box->y1;
- v[8] = dst.f;
- v[10] = ((box->y1 + ty) * yy + y0) * sy;
-
- v[11] = v[7] = v[3] = opacity;
-}
-
-fastcall static void
-gen7_emit_composite_spans_affine(struct sna *sna,
- const struct sna_composite_spans_op *op,
- const BoxRec *box,
- float opacity)
-{
- OUT_VERTEX(box->x2, box->y2);
- gen7_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x2, box->y2);
- OUT_VERTEX_F(opacity);
-
- OUT_VERTEX(box->x1, box->y2);
- gen7_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x1, box->y2);
- OUT_VERTEX_F(opacity);
-
- OUT_VERTEX(box->x1, box->y1);
- gen7_emit_composite_texcoord_affine(sna, &op->base.src,
- box->x1, box->y1);
- OUT_VERTEX_F(opacity);
-}
-
fastcall static void
gen7_render_composite_spans_box(struct sna *sna,
const struct sna_composite_spans_op *op,
@@ -3159,11 +2492,47 @@ gen7_render_composite_spans_boxes(struct sna *sna,
}
fastcall static void
+gen7_render_composite_spans_boxes__thread(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *box,
+ int nbox)
+{
+ DBG(("%s: nbox=%d, src=+(%d, %d), dst=+(%d, %d)\n",
+ __FUNCTION__, nbox,
+ op->base.src.offset[0], op->base.src.offset[1],
+ op->base.dst.x, op->base.dst.y));
+
+ sna_vertex_lock(&sna->render);
+ do {
+ int nbox_this_time;
+ float *v;
+
+ nbox_this_time = gen7_get_rectangles(sna, &op->base, nbox,
+ gen7_emit_composite_state);
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ v = sna->render.vertices + sna->render.vertex_used;
+ sna->render.vertex_used += nbox_this_time * op->base.floats_per_rect;
+
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ op->emit_boxes(op, box, nbox_this_time, v);
+ box += nbox_this_time;
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ } while (nbox);
+ sna_vertex_unlock(&sna->render);
+}
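
The threaded-boxes path above reserves space in the shared vertex buffer under the render lock, pins the buffer, then emits the vertices with the lock dropped so other threads can reserve concurrently. A minimal sketch of that protocol, assuming the sna_vertex_* primitives behave as their names suggest:

static float *reserve_vertices(struct sna *sna, int floats)
{
	float *v;

	sna_vertex_lock(&sna->render);
	v = sna->render.vertices + sna->render.vertex_used;
	sna->render.vertex_used += floats;
	sna_vertex_acquire__locked(&sna->render); /* pin vbo against a flush */
	sna_vertex_unlock(&sna->render);
	return v; /* fill v[0..floats) without holding the lock */
}

static void release_vertices(struct sna *sna)
{
	sna_vertex_lock(&sna->render);
	sna_vertex_release__locked(&sna->render); /* unpin once written */
	sna_vertex_unlock(&sna->render);
}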
+
+fastcall static void
gen7_render_composite_spans_done(struct sna *sna,
const struct sna_composite_spans_op *op)
{
- if (sna->render_state.gen7.vertex_offset)
- gen7_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
DBG(("%s()\n", __FUNCTION__));
@@ -3184,12 +2553,11 @@ gen7_check_composite_spans(struct sna *sna,
if (gen7_composite_fallback(sna, src, NULL, dst))
return false;
- if (need_tiling(sna, width, height)) {
- if (!is_gpu(dst->pDrawable)) {
- DBG(("%s: fallback, tiled operation not on GPU\n",
- __FUNCTION__));
- return false;
- }
+ if (need_tiling(sna, width, height) &&
+ !is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
+ DBG(("%s: fallback, tiled operation not on GPU\n",
+ __FUNCTION__));
+ return false;
}
return true;
@@ -3232,7 +2600,7 @@ gen7_render_composite_spans(struct sna *sna,
case -1:
goto cleanup_dst;
case 0:
- if (!gen7_composite_solid_init(sna, &tmp->base.src, 0))
+ if (!gen4_channel_init_solid(sna, &tmp->base.src, 0))
goto cleanup_dst;
/* fall through to fixup */
case 1:
@@ -3244,23 +2612,6 @@ gen7_render_composite_spans(struct sna *sna,
tmp->base.is_affine = tmp->base.src.is_affine;
tmp->base.need_magic_ca_pass = false;
- tmp->prim_emit = gen7_emit_composite_spans_primitive;
- if (tmp->base.src.is_solid) {
- tmp->prim_emit = gen7_emit_composite_spans_solid;
- } else if (tmp->base.src.transform == NULL) {
- tmp->prim_emit = gen7_emit_composite_spans_identity;
- } else if (tmp->base.is_affine) {
- if (tmp->base.src.transform->matrix[0][1] == 0 &&
- tmp->base.src.transform->matrix[1][0] == 0) {
- tmp->base.src.scale[0] /= tmp->base.src.transform->matrix[2][2];
- tmp->base.src.scale[1] /= tmp->base.src.transform->matrix[2][2];
- tmp->prim_emit = gen7_emit_composite_spans_simple;
- } else
- tmp->prim_emit = gen7_emit_composite_spans_affine;
- }
- tmp->base.floats_per_vertex = 4 + !tmp->base.is_affine;
- tmp->base.floats_per_rect = 3 * tmp->base.floats_per_vertex;
-
tmp->base.u.gen7.flags =
GEN7_SET_FLAGS(SAMPLER_OFFSET(tmp->base.src.filter,
tmp->base.src.repeat,
@@ -3268,13 +2619,15 @@ gen7_render_composite_spans(struct sna *sna,
SAMPLER_EXTEND_PAD),
gen7_get_blend(tmp->base.op, false, tmp->base.dst.format),
GEN7_WM_KERNEL_OPACITY | !tmp->base.is_affine,
- 1 << 2 | (2+!tmp->base.is_affine));
+ gen4_choose_spans_emitter(tmp));
tmp->box = gen7_render_composite_spans_box;
tmp->boxes = gen7_render_composite_spans_boxes;
+ if (tmp->emit_boxes)
+ tmp->thread_boxes = gen7_render_composite_spans_boxes__thread;
tmp->done = gen7_render_composite_spans_done;
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp->base.dst.bo);
if (!kgem_check_bo(&sna->kgem,
tmp->base.dst.bo, tmp->base.src.bo,
NULL)) {
@@ -3307,7 +2660,7 @@ gen7_emit_copy_state(struct sna *sna,
uint32_t *binding_table;
uint16_t offset;
- gen7_get_batch(sna);
+ gen7_get_batch(sna, op);
binding_table = gen7_composite_get_binding_table(sna, &offset);
@@ -3337,10 +2690,23 @@ static inline bool prefer_blt_copy(struct sna *sna,
struct kgem_bo *dst_bo,
unsigned flags)
{
- return (sna->kgem.ring == KGEM_BLT ||
- (flags & COPY_LAST && sna->kgem.mode == KGEM_NONE) ||
- prefer_blt_bo(sna, src_bo) ||
- prefer_blt_bo(sna, dst_bo));
+ if (sna->kgem.ring == KGEM_BLT)
+ return true;
+
+ assert((flags & COPY_SYNC) == 0);
+
+ if (src_bo == dst_bo && can_switch_to_blt(sna, dst_bo, flags))
+ return true;
+
+ if (untiled_tlb_miss(src_bo) ||
+ untiled_tlb_miss(dst_bo))
+ return true;
+
+ if (!prefer_blt_ring(sna, dst_bo, flags))
+ return false;
+
+ return (prefer_blt_bo(sna, src_bo) >= 0 &&
+ prefer_blt_bo(sna, dst_bo) > 0);
}
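
The rewritten heuristic treats prefer_blt_bo() as tri-state rather than boolean. Reading the final test under that assumption (>0 favours BLT, 0 neutral, <0 favours RENDER):

/* src >= 0 && dst > 0 -> take the BLT ring: the destination actively
 *                        prefers the blitter and the source tolerates it
 * src < 0             -> stay on RENDER: the source objects
 * dst <= 0            -> stay on RENDER: the destination is neutral at best
 */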
inline static void boxes_extents(const BoxRec *box, int n, BoxRec *extents)
@@ -3386,8 +2752,8 @@ gen7_render_copy_boxes(struct sna *sna, uint8_t alu,
struct sna_composite_op tmp;
BoxRec extents;
- DBG(("%s (%d, %d)->(%d, %d) x %d, alu=%x, self-copy=%d, overlaps? %d\n",
- __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n, alu,
+ DBG(("%s (%d, %d)->(%d, %d) x %d, alu=%x, flags=%x, self-copy=%d, overlaps? %d\n",
+ __FUNCTION__, src_dx, src_dy, dst_dx, dst_dy, n, alu, flags,
src_bo == dst_bo,
overlaps(sna,
src_bo, src_dx, src_dy,
@@ -3421,7 +2787,7 @@ fallback_blt:
if (too_large(extents.x2-extents.x1, extents.y2-extents.y1))
goto fallback_blt;
- if ((flags & COPY_LAST || can_switch_to_blt(sna)) &&
+ if (can_switch_to_blt(sna, dst_bo, flags) &&
sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
sna_blt_copy_boxes(sna, alu,
src_bo, src_dx, src_dy,
@@ -3523,7 +2889,7 @@ fallback_blt:
tmp.u.gen7.flags = COPY_FLAGS(alu);
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, tmp.dst.bo);
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL)) {
kgem_submit(&sna->kgem);
if (!kgem_check_bo(&sna->kgem, tmp.dst.bo, tmp.src.bo, NULL))
@@ -3563,7 +2929,7 @@ fallback_blt:
} while (--n_this_time);
} while (n);
- gen7_vertex_flush(sna);
+ gen4_vertex_flush(sna);
sna_render_composite_redirect_done(sna, &tmp);
if (tmp.src.bo != src_bo)
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
@@ -3576,6 +2942,14 @@ fallback_tiled_dst:
if (tmp.redirect.real_bo)
kgem_bo_destroy(&sna->kgem, tmp.dst.bo);
fallback_tiled:
+ if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
+ sna_blt_copy_boxes(sna, alu,
+ src_bo, src_dx, src_dy,
+ dst_bo, dst_dx, dst_dy,
+ dst->drawable.bitsPerPixel,
+ box, n))
+ return true;
+
return sna_tiling_copy_boxes(sna, alu,
src, src_bo, src_dx, src_dy,
dst, dst_bo, dst_dx, dst_dy,
@@ -3608,8 +2982,8 @@ gen7_render_copy_blt(struct sna *sna,
static void
gen7_render_copy_done(struct sna *sna, const struct sna_copy_op *op)
{
- if (sna->render_state.gen7.vertex_offset)
- gen7_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
}
static bool
@@ -3671,7 +3045,7 @@ fallback:
op->base.u.gen7.flags = COPY_FLAGS(alu);
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL)) {
kgem_submit(&sna->kgem);
if (!kgem_check_bo(&sna->kgem, dst_bo, src_bo, NULL))
@@ -3699,7 +3073,7 @@ gen7_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
* specific kernel.
*/
- gen7_get_batch(sna);
+ gen7_get_batch(sna, op);
binding_table = gen7_composite_get_binding_table(sna, &offset);
@@ -3727,7 +3101,10 @@ gen7_emit_fill_state(struct sna *sna, const struct sna_composite_op *op)
static inline bool prefer_blt_fill(struct sna *sna,
struct kgem_bo *bo)
{
- return prefer_blt_ring(sna) || untiled_tlb_miss(bo);
+ if (untiled_tlb_miss(bo))
+ return true;
+
+ return prefer_blt_ring(sna, bo, 0) || prefer_blt_bo(sna, bo) >= 0;
}
static bool
@@ -3822,6 +3199,7 @@ gen7_render_fill_boxes(struct sna *sna,
tmp.u.gen7.flags = FILL_FLAGS(op, format);
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
kgem_submit(&sna->kgem);
assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
@@ -3855,7 +3233,7 @@ gen7_render_fill_boxes(struct sna *sna,
} while (--n_this_time);
} while (n);
- gen7_vertex_flush(sna);
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
sna_render_composite_redirect_done(sna, &tmp);
return true;
@@ -3946,8 +3324,8 @@ gen7_render_fill_op_boxes(struct sna *sna,
static void
gen7_render_fill_op_done(struct sna *sna, const struct sna_fill_op *op)
{
- if (sna->render_state.gen7.vertex_offset)
- gen7_vertex_flush(sna);
+ if (sna->render.vertex_offset)
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, op->base.src.bo);
}
@@ -3995,6 +3373,7 @@ gen7_render_fill(struct sna *sna, uint8_t alu,
op->base.u.gen7.flags = FILL_FLAGS_NOBLEND;
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, dst_bo);
if (!kgem_check_bo(&sna->kgem, dst_bo, NULL)) {
kgem_submit(&sna->kgem);
assert(kgem_check_bo(&sna->kgem, dst_bo, NULL));
@@ -4072,9 +3451,13 @@ gen7_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
tmp.u.gen7.flags = FILL_FLAGS_NOBLEND;
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
- _kgem_submit(&sna->kgem);
- assert(kgem_check_bo(&sna->kgem, bo, NULL));
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
+ kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+ return false;
+ }
}
gen7_emit_fill_state(sna, &tmp);
@@ -4095,7 +3478,7 @@ gen7_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
v[7] = v[2] = v[3] = 1;
v[6] = v[10] = v[11] = 0;
- gen7_vertex_flush(sna);
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
return true;
@@ -4152,9 +3535,13 @@ gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
tmp.u.gen7.flags = FILL_FLAGS_NOBLEND;
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, bo);
if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
- _kgem_submit(&sna->kgem);
- assert(kgem_check_bo(&sna->kgem, bo, NULL));
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo(&sna->kgem, bo, NULL)) {
+ kgem_bo_destroy(&sna->kgem, tmp.src.bo);
+ return false;
+ }
}
gen7_emit_fill_state(sna, &tmp);
@@ -4174,7 +3561,7 @@ gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
v[7] = v[2] = v[3] = 1;
v[6] = v[10] = v[11] = 0;
- gen7_vertex_flush(sna);
+ gen4_vertex_flush(sna);
kgem_bo_destroy(&sna->kgem, tmp.src.bo);
return true;
@@ -4182,20 +3569,20 @@ gen7_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
static void gen7_render_flush(struct sna *sna)
{
- gen7_vertex_close(sna);
+ gen4_vertex_close(sna);
+
+ assert(sna->render.vb_id == 0);
+ assert(sna->render.vertex_offset == 0);
}
static void
gen7_render_context_switch(struct kgem *kgem,
int new_mode)
{
- if (!new_mode)
- return;
-
- if (kgem->mode) {
+ if (kgem->nbatch) {
DBG(("%s: switch rings %d -> %d\n",
__FUNCTION__, kgem->mode, new_mode));
- kgem_submit(kgem);
+ _kgem_submit(kgem);
}
kgem->ring = new_mode;
@@ -4238,7 +3625,6 @@ static void gen7_render_reset(struct sna *sna)
{
sna->render_state.gen7.emit_flush = false;
sna->render_state.gen7.needs_invariant = true;
- sna->render_state.gen7.vb_id = 0;
sna->render_state.gen7.ve_id = 3 << 2;
sna->render_state.gen7.last_primitive = -1;
@@ -4249,6 +3635,10 @@ static void gen7_render_reset(struct sna *sna)
sna->render_state.gen7.drawrect_offset = -1;
sna->render_state.gen7.drawrect_limit = -1;
sna->render_state.gen7.surface_table = -1;
+
+ sna->render.vertex_offset = 0;
+ sna->render.nvertex_reloc = 0;
+ sna->render.vb_id = 0;
}
static void gen7_render_fini(struct sna *sna)
@@ -4256,6 +3646,16 @@ static void gen7_render_fini(struct sna *sna)
kgem_bo_destroy(&sna->kgem, sna->render_state.gen7.general_bo);
}
+static bool is_gt2(struct sna *sna)
+{
+ return DEVICE_ID(sna->PciInfo) & 0x20;
+}
+
+static bool is_mobile(struct sna *sna)
+{
+ return (DEVICE_ID(sna->PciInfo) & 0xf) == 0x6;
+}
+
static bool gen7_render_setup(struct sna *sna)
{
struct gen7_render_state *state = &sna->render_state.gen7;
@@ -4263,15 +3663,20 @@ static bool gen7_render_setup(struct sna *sna)
struct gen7_sampler_state *ss;
int i, j, k, l, m;
- if (sna->kgem.gen == 70) {
+ if (sna->kgem.gen == 070) {
state->info = &ivb_gt_info;
if (DEVICE_ID(sna->PciInfo) & 0xf) {
state->info = &ivb_gt1_info;
- if (DEVICE_ID(sna->PciInfo) & 0x20)
+ if (is_gt2(sna))
state->info = &ivb_gt2_info; /* XXX requires GT_MODE WiZ disabled */
}
- } else if (sna->kgem.gen == 75) {
+ } else if (sna->kgem.gen == 075) {
state->info = &hsw_gt_info;
+ if (DEVICE_ID(sna->PciInfo) & 0xf) {
+ state->info = &hsw_gt1_info;
+ if (is_gt2(sna))
+ state->info = &hsw_gt2_info;
+ }
} else
return false;
@@ -4331,7 +3736,6 @@ static bool gen7_render_setup(struct sna *sna)
}
}
- state->cc_vp = gen7_create_cc_viewport(&general);
state->cc_blend = gen7_composite_create_blend_state(&general);
state->general_bo = sna_static_stream_fini(sna, &general);
@@ -4349,10 +3753,13 @@ bool gen7_render_init(struct sna *sna)
#if !NO_COMPOSITE
sna->render.composite = gen7_render_composite;
+ sna->render.prefer_gpu |= PREFER_GPU_RENDER;
#endif
#if !NO_COMPOSITE_SPANS
sna->render.check_composite_spans = gen7_check_composite_spans;
sna->render.composite_spans = gen7_render_composite_spans;
+ if (is_mobile(sna))
+ sna->render.prefer_gpu |= PREFER_GPU_SPANS;
#endif
sna->render.video = gen7_render_video;
diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index 46c898f79..86a2dfcde 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -69,10 +69,26 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
#define DBG_NO_UPLOAD_ACTIVE 0
#define DBG_NO_MAP_UPLOAD 0
#define DBG_NO_RELAXED_FENCING 0
+#define DBG_NO_SECURE_BATCHES 0
+#define DBG_NO_PINNED_BATCHES 0
+#define DBG_NO_FAST_RELOC 0
+#define DBG_NO_HANDLE_LUT 0
#define DBG_DUMP 0
+#ifndef DEBUG_SYNC
+#define DEBUG_SYNC 0
+#endif
+
#define SHOW_BATCH 0
+#if 0
+#define ASSERT_IDLE(kgem__, handle__) assert(!__kgem_busy(kgem__, handle__))
+#define ASSERT_MAYBE_IDLE(kgem__, handle__, expect__) assert(!(expect__) || !__kgem_busy(kgem__, handle__))
+#else
+#define ASSERT_IDLE(kgem__, handle__)
+#define ASSERT_MAYBE_IDLE(kgem__, handle__, expect__)
+#endif
+
/* Worst case seems to be 965gm where we cannot write within a cacheline that
* is simultaneously being read by the GPU, or within the sampler
* prefetch. In general, the chipsets seem to have a requirement that sampler
@@ -93,7 +109,20 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags);
#define IS_USER_MAP(ptr) ((uintptr_t)(ptr) & 2)
#define __MAP_TYPE(ptr) ((uintptr_t)(ptr) & 3)
-#define LOCAL_I915_PARAM_HAS_SEMAPHORES 20
+#define MAKE_REQUEST(rq, ring) ((struct kgem_request *)((uintptr_t)(rq) | (ring)))
+
+#define LOCAL_I915_PARAM_HAS_BLT 11
+#define LOCAL_I915_PARAM_HAS_RELAXED_FENCING 12
+#define LOCAL_I915_PARAM_HAS_RELAXED_DELTA 15
+#define LOCAL_I915_PARAM_HAS_SEMAPHORES 20
+#define LOCAL_I915_PARAM_HAS_SECURE_BATCHES 23
+#define LOCAL_I915_PARAM_HAS_PINNED_BATCHES 24
+#define LOCAL_I915_PARAM_HAS_NO_RELOC 25
+#define LOCAL_I915_PARAM_HAS_HANDLE_LUT 26
+
+#define LOCAL_I915_EXEC_IS_PINNED (1<<10)
+#define LOCAL_I915_EXEC_NO_RELOC (1<<11)
+#define LOCAL_I915_EXEC_HANDLE_LUT (1<<12)
#define LOCAL_I915_GEM_USERPTR 0x32
#define LOCAL_IOCTL_I915_GEM_USERPTR DRM_IOWR (DRM_COMMAND_BASE + LOCAL_I915_GEM_USERPTR, struct local_i915_gem_userptr)
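
MAKE_REQUEST() packs the submission ring into the low bits of the request pointer, relying on the pointer's natural alignment; the RQ() accessor seen later in this patch strips the tag again. A sketch of the idiom (the untag/ring helpers written out here are illustrative; the real definitions live in the kgem headers):

#define RQ(rq)      ((struct kgem_request *)((uintptr_t)(rq) & ~3))
#define RQ_RING(rq) ((uintptr_t)(rq) & 3)

bo->rq = MAKE_REQUEST(kgem->next_request, kgem->ring);
assert(RQ(bo->rq) == kgem->next_request);
assert(RQ_RING(bo->rq) == kgem->ring);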
@@ -101,7 +130,8 @@ struct local_i915_gem_userptr {
uint64_t user_ptr;
uint32_t user_size;
uint32_t flags;
-#define I915_USERPTR_READ_ONLY 0x1
+#define I915_USERPTR_READ_ONLY (1<<0)
+#define I915_USERPTR_UNSYNCHRONIZED (1<<31)
uint32_t handle;
};
@@ -170,13 +200,13 @@ static void kgem_sna_flush(struct kgem *kgem)
sna_render_flush_solid(sna);
}
-static int gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
+static bool gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
{
struct drm_i915_gem_set_tiling set_tiling;
int ret;
if (DBG_NO_TILING)
- return I915_TILING_NONE;
+ return false;
VG_CLEAR(set_tiling);
do {
@@ -186,7 +216,7 @@ static int gem_set_tiling(int fd, uint32_t handle, int tiling, int stride)
ret = ioctl(fd, DRM_IOCTL_I915_GEM_SET_TILING, &set_tiling);
} while (ret == -1 && (errno == EINTR || errno == EAGAIN));
- return set_tiling.tiling_mode;
+ return ret == 0;
}
static bool gem_set_cacheing(int fd, uint32_t handle, int cacheing)
@@ -206,14 +236,17 @@ static uint32_t gem_userptr(int fd, void *ptr, int size, int read_only)
VG_CLEAR(arg);
arg.user_ptr = (uintptr_t)ptr;
arg.user_size = size;
- arg.flags = 0;
+ arg.flags = I915_USERPTR_UNSYNCHRONIZED;
if (read_only)
arg.flags |= I915_USERPTR_READ_ONLY;
if (drmIoctl(fd, LOCAL_IOCTL_I915_GEM_USERPTR, &arg)) {
- DBG(("%s: failed to map %p + %d bytes: %d\n",
- __FUNCTION__, ptr, size, errno));
- return 0;
+ arg.flags &= ~I915_USERPTR_UNSYNCHRONIZED;
+ if (drmIoctl(fd, LOCAL_IOCTL_I915_GEM_USERPTR, &arg)) {
+ DBG(("%s: failed to map %p + %d bytes: %d\n",
+ __FUNCTION__, ptr, size, errno));
+ return 0;
+ }
}
return arg.handle;
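
gem_userptr() now asks for an unsynchronized mapping first and silently retries without the flag when the kernel rejects it (the unsynchronized variant is restricted on some kernels). A hypothetical caller wrapping a page-aligned allocation:

void *ptr = NULL;
uint32_t handle;

if (posix_memalign(&ptr, PAGE_SIZE, PAGE_SIZE) == 0) {
	handle = gem_userptr(kgem->fd, ptr, PAGE_SIZE, false);
	if (handle) {
		/* ... use handle like any other bo ... */
		gem_close(kgem->fd, handle);
	}
	free(ptr); /* only once the GPU has finished with the pages */
}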
@@ -262,6 +295,11 @@ retry_gtt:
if (kgem_expire_cache(kgem))
goto retry_gtt;
+ if (kgem->need_expire) {
+ kgem_cleanup_cache(kgem);
+ goto retry_gtt;
+ }
+
return NULL;
}
@@ -274,6 +312,11 @@ retry_mmap:
if (__kgem_throttle_retire(kgem, 0))
goto retry_mmap;
+ if (kgem->need_expire) {
+ kgem_cleanup_cache(kgem);
+ goto retry_mmap;
+ }
+
ptr = NULL;
}
@@ -345,8 +388,7 @@ static int gem_read(int fd, uint32_t handle, const void *dst,
return 0;
}
-static bool
-kgem_busy(struct kgem *kgem, int handle)
+bool __kgem_busy(struct kgem *kgem, int handle)
{
struct drm_i915_gem_busy busy;
@@ -360,26 +402,23 @@ kgem_busy(struct kgem *kgem, int handle)
return busy.busy;
}
-void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo)
+static void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo)
{
- DBG(("%s: handle=%d, domain=%d\n",
- __FUNCTION__, bo->handle, bo->domain));
- assert(bo->flush || !kgem_busy(kgem, bo->handle));
-
- if (bo->rq)
- kgem_retire(kgem);
-
- if (bo->exec == NULL) {
- DBG(("%s: retiring bo handle=%d (needed flush? %d), rq? %d\n",
- __FUNCTION__, bo->handle, bo->needs_flush, bo->rq != NULL));
- assert(list_is_empty(&bo->vma));
- bo->rq = NULL;
- list_del(&bo->request);
+ DBG(("%s: retiring bo handle=%d (needed flush? %d), rq? %d [busy?=%d]\n",
+ __FUNCTION__, bo->handle, bo->needs_flush, bo->rq != NULL,
+ __kgem_busy(kgem, bo->handle)));
+ assert(bo->exec == NULL);
+ assert(list_is_empty(&bo->vma));
- bo->needs_flush = false;
+ if (bo->rq) {
+ if (!__kgem_busy(kgem, bo->handle)) {
+ __kgem_bo_clear_busy(bo);
+ kgem_retire(kgem);
+ }
+ } else {
+ assert(!bo->needs_flush);
+ ASSERT_IDLE(kgem, bo->handle);
}
-
- bo->domain = DOMAIN_NONE;
}
bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
@@ -387,15 +426,18 @@ bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
{
assert(bo->refcnt);
assert(!bo->purged);
- assert(bo->flush || !kgem_busy(kgem, bo->handle));
assert(bo->proxy == NULL);
+ ASSERT_IDLE(kgem, bo->handle);
assert(length <= bytes(bo));
if (gem_write(kgem->fd, bo->handle, 0, length, data))
return false;
DBG(("%s: flush=%d, domain=%d\n", __FUNCTION__, bo->flush, bo->domain));
- kgem_bo_retire(kgem, bo);
+ if (bo->exec == NULL) {
+ kgem_bo_retire(kgem, bo);
+ bo->domain = DOMAIN_NONE;
+ }
return true;
}
@@ -490,10 +532,19 @@ static void gem_close(int fd, uint32_t handle)
constant inline static unsigned long __fls(unsigned long word)
{
+#if defined(__GNUC__) && (defined(__i386__) || defined(__x86__) || defined(__x86_64__))
asm("bsr %1,%0"
: "=r" (word)
: "rm" (word));
return word;
+#else
+ unsigned int v = 0;
+
+ while (word >>= 1)
+ v++;
+
+ return v;
+#endif
}
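
The non-x86 branch makes __fls() (index of the most significant set bit, i.e. floor(log2)) portable. Spot checks of the contract, illustration only:

assert(__fls(1) == 0);
assert(__fls(2) == 1 && __fls(3) == 1);
assert(__fls(1ul << 12) == 12); /* one page of bytes */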
constant inline static int cache_bucket(int num_pages)
@@ -509,6 +560,7 @@ static struct kgem_bo *__kgem_bo_init(struct kgem_bo *bo,
bo->refcnt = 1;
bo->handle = handle;
+ bo->target_handle = -1;
num_pages(bo) = num_pages;
bucket(bo) = cache_bucket(num_pages);
bo->reusable = true;
@@ -536,9 +588,7 @@ static struct kgem_bo *__kgem_bo_alloc(int handle, int num_pages)
return __kgem_bo_init(bo, handle, num_pages);
}
-static struct kgem_request _kgem_static_request;
-
-static struct kgem_request *__kgem_request_alloc(void)
+static struct kgem_request *__kgem_request_alloc(struct kgem *kgem)
{
struct kgem_request *rq;
@@ -548,7 +598,7 @@ static struct kgem_request *__kgem_request_alloc(void)
} else {
rq = malloc(sizeof(*rq));
if (rq == NULL)
- rq = &_kgem_static_request;
+ rq = &kgem->static_request;
}
list_init(&rq->buffers);
@@ -567,11 +617,15 @@ static void __kgem_request_free(struct kgem_request *rq)
static struct list *inactive(struct kgem *kgem, int num_pages)
{
+ assert(num_pages < MAX_CACHE_SIZE / PAGE_SIZE);
+ assert(cache_bucket(num_pages) < NUM_CACHE_BUCKETS);
return &kgem->inactive[cache_bucket(num_pages)];
}
static struct list *active(struct kgem *kgem, int num_pages, int tiling)
{
+ assert(num_pages < MAX_CACHE_SIZE / PAGE_SIZE);
+ assert(cache_bucket(num_pages) < NUM_CACHE_BUCKETS);
return &kgem->active[cache_bucket(num_pages)][tiling];
}
@@ -581,7 +635,7 @@ agp_aperture_size(struct pci_device *dev, unsigned gen)
/* XXX assume that only future chipsets are unknown and follow
* the post gen2 PCI layout.
*/
- return dev->regions[gen < 30 ? 0 : 2].size;
+ return dev->regions[gen < 030 ? 0 : 2].size;
}
static size_t
@@ -636,6 +690,35 @@ static int gem_param(struct kgem *kgem, int name)
return v;
}
+static bool test_has_execbuffer2(struct kgem *kgem)
+{
+ struct drm_i915_gem_execbuffer2 execbuf;
+
+ memset(&execbuf, 0, sizeof(execbuf));
+ execbuf.buffer_count = 1;
+
+ return (drmIoctl(kgem->fd,
+ DRM_IOCTL_I915_GEM_EXECBUFFER2,
+ &execbuf) == -1 &&
+ errno == EFAULT);
+}
+
+static bool test_has_no_reloc(struct kgem *kgem)
+{
+ if (DBG_NO_FAST_RELOC)
+ return false;
+
+ return gem_param(kgem, LOCAL_I915_PARAM_HAS_NO_RELOC) > 0;
+}
+
+static bool test_has_handle_lut(struct kgem *kgem)
+{
+ if (DBG_NO_HANDLE_LUT)
+ return false;
+
+ return gem_param(kgem, LOCAL_I915_PARAM_HAS_HANDLE_LUT) > 0;
+}
+
static bool test_has_semaphores_enabled(struct kgem *kgem)
{
FILE *file;
@@ -674,6 +757,9 @@ static bool is_hw_supported(struct kgem *kgem,
if (DBG_NO_HW)
return false;
+ if (!test_has_execbuffer2(kgem))
+ return false;
+
if (kgem->gen == (unsigned)-1) /* unknown chipset, assume future gen */
return kgem->has_blt;
@@ -682,12 +768,12 @@ static bool is_hw_supported(struct kgem *kgem,
* hw acceleration.
*/
- if (kgem->gen == 60 && dev->revision < 8) {
+ if (kgem->gen == 060 && dev->revision < 8) {
/* pre-production SNB with dysfunctional BLT */
return false;
}
- if (kgem->gen >= 60) /* Only if the kernel supports the BLT ring */
+ if (kgem->gen >= 060) /* Only if the kernel supports the BLT ring */
return kgem->has_blt;
return true;
@@ -695,11 +781,11 @@ static bool is_hw_supported(struct kgem *kgem,
static bool test_has_relaxed_fencing(struct kgem *kgem)
{
- if (kgem->gen < 40) {
+ if (kgem->gen < 040) {
if (DBG_NO_RELAXED_FENCING)
return false;
- return gem_param(kgem, I915_PARAM_HAS_RELAXED_FENCING) > 0;
+ return gem_param(kgem, LOCAL_I915_PARAM_HAS_RELAXED_FENCING) > 0;
} else
return true;
}
@@ -716,7 +802,7 @@ static bool test_has_llc(struct kgem *kgem)
#endif
if (has_llc == -1) {
DBG(("%s: no kernel/drm support for HAS_LLC, assuming support for LLC based on GPU generation\n", __FUNCTION__));
- has_llc = kgem->gen >= 60;
+ has_llc = kgem->gen >= 060;
}
return has_llc;
@@ -731,7 +817,7 @@ static bool test_has_cacheing(struct kgem *kgem)
return false;
/* Incoherent blt and sampler hangs the GPU */
- if (kgem->gen == 40)
+ if (kgem->gen == 040)
return false;
handle = gem_create(kgem->fd, 1);
@@ -753,7 +839,7 @@ static bool test_has_userptr(struct kgem *kgem)
return false;
/* Incoherent blt and sampler hangs the GPU */
- if (kgem->gen == 40)
+ if (kgem->gen == 040)
return false;
ptr = malloc(PAGE_SIZE);
@@ -767,13 +853,101 @@ static bool test_has_userptr(struct kgem *kgem)
#endif
}
+static bool test_has_secure_batches(struct kgem *kgem)
+{
+ if (DBG_NO_SECURE_BATCHES)
+ return false;
+
+ return gem_param(kgem, LOCAL_I915_PARAM_HAS_SECURE_BATCHES) > 0;
+}
+
+static bool test_has_pinned_batches(struct kgem *kgem)
+{
+ if (DBG_NO_PINNED_BATCHES)
+ return false;
+
+ return gem_param(kgem, LOCAL_I915_PARAM_HAS_PINNED_BATCHES) > 0;
+}
+
static int kgem_get_screen_index(struct kgem *kgem)
{
struct sna *sna = container_of(kgem, struct sna, kgem);
return sna->scrn->scrnIndex;
}
-void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
+static bool kgem_init_pinned_batches(struct kgem *kgem)
+{
+ int count[2] = { 16, 4 };
+ int size[2] = { 1, 4 };
+ int n, i;
+
+ if (kgem->wedged)
+ return true;
+
+ for (n = 0; n < ARRAY_SIZE(count); n++) {
+ for (i = 0; i < count[n]; i++) {
+ struct drm_i915_gem_pin pin;
+ struct kgem_bo *bo;
+
+ VG_CLEAR(pin);
+
+ pin.handle = gem_create(kgem->fd, size[n]);
+ if (pin.handle == 0)
+ goto err;
+
+ DBG(("%s: new handle=%d, num_pages=%d\n",
+ __FUNCTION__, pin.handle, size[n]));
+
+ bo = __kgem_bo_alloc(pin.handle, size[n]);
+ if (bo == NULL) {
+ gem_close(kgem->fd, pin.handle);
+ goto err;
+ }
+
+ pin.alignment = 0;
+ if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_PIN, &pin)) {
+ gem_close(kgem->fd, pin.handle);
+ goto err;
+ }
+ bo->presumed_offset = pin.offset;
+ debug_alloc__bo(kgem, bo);
+ list_add(&bo->list, &kgem->pinned_batches[n]);
+ }
+ }
+
+ return true;
+
+err:
+ for (n = 0; n < ARRAY_SIZE(kgem->pinned_batches); n++) {
+ while (!list_is_empty(&kgem->pinned_batches[n])) {
+ kgem_bo_destroy(kgem,
+ list_first_entry(&kgem->pinned_batches[n],
+ struct kgem_bo, list));
+ }
+ }
+
+ /* For simplicity populate the lists with a single unpinned bo */
+ for (n = 0; n < ARRAY_SIZE(count); n++) {
+ struct kgem_bo *bo;
+ uint32_t handle;
+
+ handle = gem_create(kgem->fd, size[n]);
+ if (handle == 0)
+ break;
+
+ bo = __kgem_bo_alloc(handle, size[n]);
+ if (bo == NULL) {
+ gem_close(kgem->fd, handle);
+ break;
+ }
+
+ debug_alloc__bo(kgem, bo);
+ list_add(&bo->list, &kgem->pinned_batches[n]);
+ }
+ return false;
+}
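+
+The function above pre-pins two pools of batch bos, sixteen 1-page and four 4-page ones per the count[]/size[] tables, so gen2-class (830/845) hardware can execute batches from fixed offsets; on failure it degrades to ordinary unpinned bos, and gen2 is then wedged in kgem_init() below. A sketch of how a pool entry might be rotated out at submit time (illustrative only; the real selection logic sits elsewhere in kgem.c):
+
+struct kgem_bo *batch;
+int class = npages > 1; /* 0: 1-page pool, 1: 4-page pool */
+
+batch = list_first_entry(&kgem->pinned_batches[class],
+			 struct kgem_bo, list);
+list_move_tail(&batch->list, &kgem->pinned_batches[class]); /* LRU */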
+
+void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen)
{
struct drm_i915_gem_get_aperture aperture;
size_t totalram;
@@ -787,12 +961,36 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
kgem->fd = fd;
kgem->gen = gen;
- kgem->has_blt = gem_param(kgem, I915_PARAM_HAS_BLT) > 0;
+ list_init(&kgem->requests[0]);
+ list_init(&kgem->requests[1]);
+ list_init(&kgem->batch_buffers);
+ list_init(&kgem->active_buffers);
+ list_init(&kgem->flushing);
+ list_init(&kgem->large);
+ list_init(&kgem->large_inactive);
+ list_init(&kgem->snoop);
+ list_init(&kgem->scanout);
+ for (i = 0; i < ARRAY_SIZE(kgem->pinned_batches); i++)
+ list_init(&kgem->pinned_batches[i]);
+ for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
+ list_init(&kgem->inactive[i]);
+ for (i = 0; i < ARRAY_SIZE(kgem->active); i++) {
+ for (j = 0; j < ARRAY_SIZE(kgem->active[i]); j++)
+ list_init(&kgem->active[i][j]);
+ }
+ for (i = 0; i < ARRAY_SIZE(kgem->vma); i++) {
+ for (j = 0; j < ARRAY_SIZE(kgem->vma[i].inactive); j++)
+ list_init(&kgem->vma[i].inactive[j]);
+ }
+ kgem->vma[MAP_GTT].count = -MAX_GTT_VMA_CACHE;
+ kgem->vma[MAP_CPU].count = -MAX_CPU_VMA_CACHE;
+
+ kgem->has_blt = gem_param(kgem, LOCAL_I915_PARAM_HAS_BLT) > 0;
DBG(("%s: has BLT ring? %d\n", __FUNCTION__,
kgem->has_blt));
kgem->has_relaxed_delta =
- gem_param(kgem, I915_PARAM_HAS_RELAXED_DELTA) > 0;
+ gem_param(kgem, LOCAL_I915_PARAM_HAS_RELAXED_DELTA) > 0;
DBG(("%s: has relaxed delta? %d\n", __FUNCTION__,
kgem->has_relaxed_delta));
@@ -812,16 +1010,32 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
DBG(("%s: has userptr? %d\n", __FUNCTION__,
kgem->has_userptr));
+ kgem->has_no_reloc = test_has_no_reloc(kgem);
+ DBG(("%s: has no-reloc? %d\n", __FUNCTION__,
+ kgem->has_no_reloc));
+
+ kgem->has_handle_lut = test_has_handle_lut(kgem);
+ DBG(("%s: has handle-lut? %d\n", __FUNCTION__,
+ kgem->has_handle_lut));
+
kgem->has_semaphores = false;
if (kgem->has_blt && test_has_semaphores_enabled(kgem))
kgem->has_semaphores = true;
DBG(("%s: semaphores enabled? %d\n", __FUNCTION__,
kgem->has_semaphores));
- kgem->can_blt_cpu = gen >= 30;
+ kgem->can_blt_cpu = gen >= 030;
DBG(("%s: can blt to cpu? %d\n", __FUNCTION__,
kgem->can_blt_cpu));
+ kgem->has_secure_batches = test_has_secure_batches(kgem);
+ DBG(("%s: can use privileged batchbuffers? %d\n", __FUNCTION__,
+ kgem->has_secure_batches));
+
+ kgem->has_pinned_batches = test_has_pinned_batches(kgem);
+ DBG(("%s: can use pinned batchbuffers (to avoid CS w/a)? %d\n", __FUNCTION__,
+ kgem->has_pinned_batches));
+
if (!is_hw_supported(kgem, dev)) {
xf86DrvMsg(kgem_get_screen_index(kgem), X_WARNING,
"Detected unsupported/dysfunctional hardware, disabling acceleration.\n");
@@ -833,47 +1047,35 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
}
kgem->batch_size = ARRAY_SIZE(kgem->batch);
- if (gen == 22)
+ if (gen == 020 && !kgem->has_pinned_batches)
+ /* Limited to what we can pin */
+ kgem->batch_size = 4*1024;
+ if (gen == 022)
/* 865g cannot handle a batch spanning multiple pages */
kgem->batch_size = PAGE_SIZE / sizeof(uint32_t);
- if (gen >= 70 && gen < 80)
+ if ((gen >> 3) == 7)
kgem->batch_size = 16*1024;
if (!kgem->has_relaxed_delta && kgem->batch_size > 4*1024)
kgem->batch_size = 4*1024;
+ if (!kgem_init_pinned_batches(kgem) && gen == 020) {
+ xf86DrvMsg(kgem_get_screen_index(kgem), X_WARNING,
+ "Unable to reserve memory for GPU, disabling acceleration.\n");
+ kgem->wedged = 1;
+ }
+
DBG(("%s: maximum batch size? %d\n", __FUNCTION__,
kgem->batch_size));
kgem->min_alignment = 4;
- if (gen < 40)
+ if (gen < 040)
kgem->min_alignment = 64;
kgem->half_cpu_cache_pages = cpu_cache_size() >> 13;
DBG(("%s: half cpu cache %d pages\n", __FUNCTION__,
kgem->half_cpu_cache_pages));
- list_init(&kgem->requests[0]);
- list_init(&kgem->requests[1]);
- list_init(&kgem->batch_buffers);
- list_init(&kgem->active_buffers);
- list_init(&kgem->flushing);
- list_init(&kgem->large);
- list_init(&kgem->large_inactive);
- list_init(&kgem->snoop);
- for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++)
- list_init(&kgem->inactive[i]);
- for (i = 0; i < ARRAY_SIZE(kgem->active); i++) {
- for (j = 0; j < ARRAY_SIZE(kgem->active[i]); j++)
- list_init(&kgem->active[i][j]);
- }
- for (i = 0; i < ARRAY_SIZE(kgem->vma); i++) {
- for (j = 0; j < ARRAY_SIZE(kgem->vma[i].inactive); j++)
- list_init(&kgem->vma[i].inactive[j]);
- }
- kgem->vma[MAP_GTT].count = -MAX_GTT_VMA_CACHE;
- kgem->vma[MAP_CPU].count = -MAX_CPU_VMA_CACHE;
-
- kgem->next_request = __kgem_request_alloc();
+ kgem->next_request = __kgem_request_alloc(kgem);
DBG(("%s: cpu bo enabled %d: llc? %d, set-cache-level? %d, userptr? %d\n", __FUNCTION__,
!DBG_NO_CPU && (kgem->has_llc | kgem->has_userptr | kgem->has_cacheing),
@@ -885,10 +1087,15 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
if (aperture.aper_size == 0)
aperture.aper_size = 64*1024*1024;
+ DBG(("%s: aperture size %lld, available now %lld\n",
+ __FUNCTION__,
+ (long long)aperture.aper_size,
+ (long long)aperture.aper_available_size));
+
kgem->aperture_total = aperture.aper_size;
kgem->aperture_high = aperture.aper_size * 3/4;
kgem->aperture_low = aperture.aper_size * 1/3;
- if (gen < 33) {
+ if (gen < 033) {
/* Severe alignment penalties */
kgem->aperture_high /= 2;
kgem->aperture_low /= 2;
@@ -907,21 +1114,15 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
kgem->buffer_size = 64 * 1024;
while (kgem->buffer_size < kgem->aperture_mappable >> 10)
kgem->buffer_size *= 2;
+ if (kgem->buffer_size >> 12 > kgem->half_cpu_cache_pages)
+ kgem->buffer_size = kgem->half_cpu_cache_pages << 12;
DBG(("%s: buffer size=%d [%d KiB]\n", __FUNCTION__,
kgem->buffer_size, kgem->buffer_size / 1024));
- kgem->max_object_size = 2 * aperture.aper_size / 3;
+ kgem->max_object_size = 3 * (kgem->aperture_high >> 12) << 10;
kgem->max_gpu_size = kgem->max_object_size;
if (!kgem->has_llc)
kgem->max_gpu_size = MAX_CACHE_SIZE;
- if (gen < 40) {
- /* If we have to use fences for blitting, we have to make
- * sure we can fit them into the aperture.
- */
- kgem->max_gpu_size = kgem->aperture_mappable / 2;
- if (kgem->max_gpu_size > kgem->aperture_low)
- kgem->max_gpu_size = kgem->aperture_low;
- }
totalram = total_ram_size();
if (totalram == 0) {
@@ -935,12 +1136,9 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
if (kgem->max_gpu_size > totalram / 4)
kgem->max_gpu_size = totalram / 4;
- half_gpu_max = kgem->max_gpu_size / 2;
- if (kgem->gen >= 40)
- kgem->max_cpu_size = half_gpu_max;
- else
- kgem->max_cpu_size = kgem->max_object_size;
+ kgem->max_cpu_size = kgem->max_object_size;
+ half_gpu_max = kgem->max_gpu_size / 2;
kgem->max_copy_tile_size = (MAX_CACHE_SIZE + 1)/2;
if (kgem->max_copy_tile_size > half_gpu_max)
kgem->max_copy_tile_size = half_gpu_max;
@@ -981,6 +1179,14 @@ void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen)
if ((int)kgem->fence_max < 0)
kgem->fence_max = 5; /* minimum safe value for all hw */
DBG(("%s: max fences=%d\n", __FUNCTION__, kgem->fence_max));
+
+ kgem->batch_flags_base = 0;
+ if (kgem->has_no_reloc)
+ kgem->batch_flags_base |= LOCAL_I915_EXEC_NO_RELOC;
+ if (kgem->has_handle_lut)
+ kgem->batch_flags_base |= LOCAL_I915_EXEC_HANDLE_LUT;
+ if (kgem->has_pinned_batches)
+ kgem->batch_flags_base |= LOCAL_I915_EXEC_IS_PINNED;
}
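
batch_flags_base gathers, once at init, the execbuffer2 flags implied by the probed kernel features; each submission then only ORs in per-batch bits. Roughly how it is consumed at submit time (field names are those of struct drm_i915_gem_execbuffer2; the exact code lives in kgem_submit()):

struct drm_i915_gem_execbuffer2 execbuf;

memset(&execbuf, 0, sizeof(execbuf));
execbuf.buffers_ptr = (uintptr_t)kgem->exec;
execbuf.buffer_count = kgem->nexec;
execbuf.batch_len = kgem->nbatch * sizeof(uint32_t);
execbuf.flags = kgem->ring | kgem->batch_flags_base;
drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);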
/* XXX hopefully a good approximation */
@@ -1013,9 +1219,9 @@ static uint32_t kgem_untiled_pitch(struct kgem *kgem,
void kgem_get_tile_size(struct kgem *kgem, int tiling,
int *tile_width, int *tile_height, int *tile_size)
{
- if (kgem->gen <= 30) {
+ if (kgem->gen <= 030) {
if (tiling) {
- if (kgem->gen < 30) {
+ if (kgem->gen < 030) {
*tile_width = 128;
*tile_height = 16;
*tile_size = 2048;
@@ -1064,14 +1270,14 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
assert(width <= MAXSHORT);
assert(height <= MAXSHORT);
- if (kgem->gen <= 30) {
+ if (kgem->gen <= 030) {
if (tiling) {
- if (kgem->gen < 30) {
+ if (kgem->gen < 030) {
tile_width = 128;
- tile_height = 16;
+ tile_height = 32;
} else {
tile_width = 512;
- tile_height = 8;
+ tile_height = 16;
}
} else {
tile_width = 2 * bpp >> 3;
@@ -1087,19 +1293,21 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
kgem_pitch_alignment(kgem, flags));
tile_height = 2;
break;
+
+ /* XXX align to an even tile row */
case I915_TILING_X:
tile_width = 512;
- tile_height = 8;
+ tile_height = 16;
break;
case I915_TILING_Y:
tile_width = 128;
- tile_height = 32;
+ tile_height = 64;
break;
}
*pitch = ALIGN(width * bpp / 8, tile_width);
height = ALIGN(height, tile_height);
- if (kgem->gen >= 40)
+ if (kgem->gen >= 040)
return PAGE_ALIGN(*pitch * height);
/* If it is too wide for the blitter, don't even bother. */
@@ -1120,7 +1328,7 @@ static uint32_t kgem_surface_size(struct kgem *kgem,
return PAGE_ALIGN(size);
/* We need to allocate a pot fence region for a tiled buffer. */
- if (kgem->gen < 30)
+ if (kgem->gen < 030)
tile_width = 512 * 1024;
else
tile_width = 1024 * 1024;
@@ -1134,18 +1342,19 @@ static uint32_t kgem_aligned_height(struct kgem *kgem,
{
uint32_t tile_height;
- if (kgem->gen <= 30) {
- tile_height = tiling ? kgem->gen < 30 ? 16 : 8 : 1;
+ if (kgem->gen <= 030) {
+ tile_height = tiling ? kgem->gen < 030 ? 32 : 16 : 1;
} else switch (tiling) {
+ /* XXX align to an even tile row */
default:
case I915_TILING_NONE:
- tile_height = 2;
+ tile_height = 1;
break;
case I915_TILING_X:
- tile_height = 8;
+ tile_height = 16;
break;
case I915_TILING_Y:
- tile_height = 32;
+ tile_height = 64;
break;
}
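
The doubled tile heights pad every surface to an even number of tile rows (see the XXX notes above). Worked examples of the new rounding on gen4+, illustration only:

/* kgem_aligned_height(kgem, 100, I915_TILING_X) -> ALIGN(100, 16) == 112
 * kgem_aligned_height(kgem, 100, I915_TILING_Y) -> ALIGN(100, 64) == 128
 * (previously 8 and 32 rows, i.e. 104 and 128) */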
@@ -1161,6 +1370,7 @@ kgem_add_handle(struct kgem *kgem, struct kgem_bo *bo)
__FUNCTION__, bo->handle, kgem->nexec));
assert(kgem->nexec < ARRAY_SIZE(kgem->exec));
+ bo->target_handle = kgem->has_handle_lut ? kgem->nexec : bo->handle;
exec = memset(&kgem->exec[kgem->nexec++], 0, sizeof(*exec));
exec->handle = bo->handle;
exec->offset = bo->presumed_offset;
@@ -1170,10 +1380,10 @@ kgem_add_handle(struct kgem *kgem, struct kgem_bo *bo)
return exec;
}
-void _kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo)
+static void kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo)
{
bo->exec = kgem_add_handle(kgem, bo);
- bo->rq = kgem->next_request;
+ bo->rq = MAKE_REQUEST(kgem->next_request, kgem->ring);
list_move_tail(&bo->request, &kgem->next_request->buffers);
@@ -1194,14 +1404,30 @@ static void kgem_fixup_self_relocs(struct kgem *kgem, struct kgem_bo *bo)
{
int n;
- for (n = 0; n < kgem->nreloc; n++) {
- if (kgem->reloc[n].target_handle == 0) {
- kgem->reloc[n].target_handle = bo->handle;
- kgem->reloc[n].presumed_offset = bo->presumed_offset;
- kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
- kgem->reloc[n].delta + bo->presumed_offset;
+ if (kgem->nreloc__self == 0)
+ return;
+
+ for (n = 0; n < kgem->nreloc__self; n++) {
+ int i = kgem->reloc__self[n];
+ assert(kgem->reloc[i].target_handle == ~0U);
+ kgem->reloc[i].target_handle = bo->target_handle;
+ kgem->reloc[i].presumed_offset = bo->presumed_offset;
+ kgem->batch[kgem->reloc[i].offset/sizeof(kgem->batch[0])] =
+ kgem->reloc[i].delta + bo->presumed_offset;
+ }
+
+ if (n == 256) {
+ for (n = kgem->reloc__self[255]; n < kgem->nreloc; n++) {
+ if (kgem->reloc[n].target_handle == ~0U) {
+ kgem->reloc[n].target_handle = bo->target_handle;
+ kgem->reloc[n].presumed_offset = bo->presumed_offset;
+ kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
+ kgem->reloc[n].delta + bo->presumed_offset;
+ }
}
+
}
+
}
static void kgem_bo_binding_free(struct kgem *kgem, struct kgem_bo *bo)
@@ -1284,11 +1510,12 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
assert(bo->rq == NULL);
assert(bo->exec == NULL);
assert(bo->domain != DOMAIN_GPU);
- assert(!kgem_busy(kgem, bo->handle));
assert(!bo->proxy);
assert(!bo->io);
+ assert(!bo->scanout);
assert(!bo->needs_flush);
assert(list_is_empty(&bo->vma));
+ ASSERT_IDLE(kgem, bo->handle);
kgem->need_expire = true;
@@ -1302,7 +1529,7 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
if (bo->map) {
int type = IS_CPU_MAP(bo->map);
if (bucket(bo) >= NUM_CACHE_BUCKETS ||
- (!type && !kgem_bo_is_mappable(kgem, bo))) {
+ (!type && !__kgem_bo_is_mappable(kgem, bo))) {
munmap(MAP(bo->map), bytes(bo));
bo->map = NULL;
}
@@ -1313,6 +1540,32 @@ inline static void kgem_bo_move_to_inactive(struct kgem *kgem,
}
}
+static struct kgem_bo *kgem_bo_replace_io(struct kgem_bo *bo)
+{
+ struct kgem_bo *base;
+
+ if (!bo->io)
+ return bo;
+
+ assert(!bo->snoop);
+ base = malloc(sizeof(*base));
+ if (base) {
+ DBG(("%s: transferring io handle=%d to bo\n",
+ __FUNCTION__, bo->handle));
+ /* transfer the handle to a minimum bo */
+ memcpy(base, bo, sizeof(*base));
+ base->io = false;
+ list_init(&base->list);
+ list_replace(&bo->request, &base->request);
+ list_replace(&bo->vma, &base->vma);
+ free(bo);
+ bo = base;
+ } else
+ bo->reusable = false;
+
+ return bo;
+}
+
inline static void kgem_bo_remove_from_inactive(struct kgem *kgem,
struct kgem_bo *bo)
{
@@ -1335,16 +1588,14 @@ inline static void kgem_bo_remove_from_active(struct kgem *kgem,
list_del(&bo->list);
assert(bo->rq != NULL);
- if (bo->rq == &_kgem_static_request)
+ if (bo->rq == (void *)kgem)
list_del(&bo->request);
assert(list_is_empty(&bo->vma));
}
static void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo)
{
- if (!bo->scanout)
- return;
-
+ assert(bo->scanout);
assert(bo->proxy == NULL);
DBG(("%s: handle=%d, fb=%d (reusable=%d)\n",
@@ -1356,7 +1607,6 @@ static void kgem_bo_clear_scanout(struct kgem *kgem, struct kgem_bo *bo)
}
bo->scanout = false;
- bo->needs_flush = true;
bo->flush = false;
bo->reusable = true;
@@ -1376,6 +1626,20 @@ static void _kgem_bo_delete_buffer(struct kgem *kgem, struct kgem_bo *bo)
io->used = bo->delta;
}
+static void kgem_bo_move_to_scanout(struct kgem *kgem, struct kgem_bo *bo)
+{
+ assert(bo->refcnt == 0);
+ assert(bo->exec == NULL);
+ assert(bo->scanout);
+ assert(bo->delta);
+ assert(!bo->snoop);
+ assert(!bo->io);
+
+ DBG(("%s: moving %d [fb %d] to scanout cachee\n", __FUNCTION__,
+ bo->handle, bo->delta));
+ list_move(&bo->list, &kgem->scanout);
+}
+
static void kgem_bo_move_to_snoop(struct kgem *kgem, struct kgem_bo *bo)
{
assert(bo->refcnt == 0);
@@ -1416,6 +1680,7 @@ search_snoop_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
list_for_each_entry(bo, &kgem->snoop, list) {
assert(bo->refcnt == 0);
assert(bo->snoop);
+ assert(!bo->scanout);
assert(bo->proxy == NULL);
assert(bo->tiling == I915_TILING_NONE);
assert(bo->rq == NULL);
@@ -1462,7 +1727,6 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
assert(bo->proxy == NULL);
bo->binding.offset = 0;
- kgem_bo_clear_scanout(kgem, bo);
if (DBG_NO_CACHE)
goto destroy;
@@ -1471,39 +1735,22 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
DBG(("%s: handle=%d is snooped\n", __FUNCTION__, bo->handle));
assert(!bo->flush);
assert(list_is_empty(&bo->list));
+ if (bo->exec == NULL && bo->rq && !__kgem_busy(kgem, bo->handle))
+ __kgem_bo_clear_busy(bo);
if (bo->rq == NULL) {
- if (bo->needs_flush && kgem_busy(kgem, bo->handle)) {
- DBG(("%s: handle=%d is snooped, tracking until free\n",
- __FUNCTION__, bo->handle));
- list_add(&bo->request, &kgem->flushing);
- bo->rq = &_kgem_static_request;
- }
- }
- if (bo->rq == NULL)
+ assert(!bo->needs_flush);
kgem_bo_move_to_snoop(kgem, bo);
+ }
return;
}
- if (bo->io) {
- struct kgem_bo *base;
-
- assert(!bo->snoop);
- base = malloc(sizeof(*base));
- if (base) {
- DBG(("%s: transferring io handle=%d to bo\n",
- __FUNCTION__, bo->handle));
- /* transfer the handle to a minimum bo */
- memcpy(base, bo, sizeof(*base));
- base->io = false;
- list_init(&base->list);
- list_replace(&bo->request, &base->request);
- list_replace(&bo->vma, &base->vma);
- free(bo);
- bo = base;
- } else
- bo->reusable = false;
+ if (bo->scanout) {
+ kgem_bo_move_to_scanout(kgem, bo);
+ return;
}
+ if (bo->io)
+ bo = kgem_bo_replace_io(bo);
if (!bo->reusable) {
DBG(("%s: handle=%d, not reusable\n",
__FUNCTION__, bo->handle));
@@ -1519,6 +1766,20 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
assert(bo->io == false);
assert(bo->scanout == false);
+ if (bo->exec && kgem->nexec == 1) {
+ DBG(("%s: only handle in batch, discarding last operations\n",
+ __FUNCTION__));
+ assert(bo->exec == &kgem->exec[0]);
+ assert(kgem->exec[0].handle == bo->handle);
+ assert(RQ(bo->rq) == kgem->next_request);
+ bo->refcnt = 1;
+ kgem_reset(kgem);
+ bo->refcnt = 0;
+ }
+
+ if (bo->rq && bo->exec == NULL && !__kgem_busy(kgem, bo->handle))
+ __kgem_bo_clear_busy(bo);
+
if (bo->rq) {
struct list *cache;
@@ -1534,26 +1795,6 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
assert(bo->exec == NULL);
assert(list_is_empty(&bo->request));
- if (bo->needs_flush) {
- if ((bo->needs_flush = kgem_busy(kgem, bo->handle))) {
- struct list *cache;
-
- DBG(("%s: handle=%d -> flushing\n",
- __FUNCTION__, bo->handle));
-
- list_add(&bo->request, &kgem->flushing);
- if (bucket(bo) < NUM_CACHE_BUCKETS)
- cache = &kgem->active[bucket(bo)][bo->tiling];
- else
- cache = &kgem->large;
- list_add(&bo->list, cache);
- bo->rq = &_kgem_static_request;
- return;
- }
-
- bo->domain = DOMAIN_NONE;
- }
-
if (!IS_CPU_MAP(bo->map)) {
if (!kgem_bo_set_purgeable(kgem, bo))
goto destroy;
@@ -1627,27 +1868,27 @@ static bool kgem_retire__flushing(struct kgem *kgem)
bool retired = false;
list_for_each_entry_safe(bo, next, &kgem->flushing, request) {
- assert(bo->rq == &_kgem_static_request);
+ assert(bo->rq == (void *)kgem);
assert(bo->exec == NULL);
- if (kgem_busy(kgem, bo->handle))
+ if (__kgem_busy(kgem, bo->handle))
break;
- bo->needs_flush = false;
- bo->domain = DOMAIN_NONE;
- bo->rq = NULL;
- list_del(&bo->request);
+ __kgem_bo_clear_busy(bo);
- if (!bo->refcnt) {
- if (bo->snoop) {
- kgem_bo_move_to_snoop(kgem, bo);
- } else if (kgem_bo_set_purgeable(kgem, bo)) {
- assert(bo->reusable);
- kgem_bo_move_to_inactive(kgem, bo);
- retired = true;
- } else
- kgem_bo_free(kgem, bo);
- }
+ if (bo->refcnt)
+ continue;
+
+ if (bo->snoop) {
+ kgem_bo_move_to_snoop(kgem, bo);
+ } else if (bo->scanout) {
+ kgem_bo_move_to_scanout(kgem, bo);
+ } else if ((bo = kgem_bo_replace_io(bo))->reusable &&
+ kgem_bo_set_purgeable(kgem, bo)) {
+ kgem_bo_move_to_inactive(kgem, bo);
+ retired = true;
+ } else
+ kgem_bo_free(kgem, bo);
}
#if HAS_DEBUG_FULL
{
@@ -1658,149 +1899,143 @@ static bool kgem_retire__flushing(struct kgem *kgem)
}
#endif
+ kgem->need_retire |= !list_is_empty(&kgem->flushing);
+
return retired;
}
-static bool kgem_retire__requests(struct kgem *kgem)
+
+static bool __kgem_retire_rq(struct kgem *kgem, struct kgem_request *rq)
{
- struct kgem_bo *bo;
bool retired = false;
- int n;
- for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) {
- while (!list_is_empty(&kgem->requests[n])) {
- struct kgem_request *rq;
+ DBG(("%s: request %d complete\n",
+ __FUNCTION__, rq->bo->handle));
- rq = list_first_entry(&kgem->requests[n],
- struct kgem_request,
- list);
- if (kgem_busy(kgem, rq->bo->handle))
- break;
-
- DBG(("%s: request %d complete\n",
- __FUNCTION__, rq->bo->handle));
+ while (!list_is_empty(&rq->buffers)) {
+ struct kgem_bo *bo;
- while (!list_is_empty(&rq->buffers)) {
- bo = list_first_entry(&rq->buffers,
- struct kgem_bo,
- request);
+ bo = list_first_entry(&rq->buffers,
+ struct kgem_bo,
+ request);
- assert(bo->rq == rq);
- assert(bo->exec == NULL);
- assert(bo->domain == DOMAIN_GPU);
-
- list_del(&bo->request);
-
- if (bo->needs_flush)
- bo->needs_flush = kgem_busy(kgem, bo->handle);
- if (bo->needs_flush) {
- DBG(("%s: moving %d to flushing\n",
- __FUNCTION__, bo->handle));
- list_add(&bo->request, &kgem->flushing);
- bo->rq = &_kgem_static_request;
- } else {
- bo->domain = DOMAIN_NONE;
- bo->rq = NULL;
- }
+ assert(RQ(bo->rq) == rq);
+ assert(bo->exec == NULL);
+ assert(bo->domain == DOMAIN_GPU || bo->domain == DOMAIN_NONE);
- if (bo->refcnt)
- continue;
+ list_del(&bo->request);
- if (bo->snoop) {
- if (bo->needs_flush) {
- list_add(&bo->request, &kgem->flushing);
- bo->rq = &_kgem_static_request;
- } else {
- kgem_bo_move_to_snoop(kgem, bo);
- }
- continue;
- }
+ if (bo->needs_flush)
+ bo->needs_flush = __kgem_busy(kgem, bo->handle);
+ if (bo->needs_flush) {
+ DBG(("%s: moving %d to flushing\n",
+ __FUNCTION__, bo->handle));
+ list_add(&bo->request, &kgem->flushing);
+ bo->rq = (void *)kgem;
+ continue;
+ }
- if (!bo->reusable) {
- DBG(("%s: closing %d\n",
- __FUNCTION__, bo->handle));
- kgem_bo_free(kgem, bo);
- continue;
- }
+ bo->domain = DOMAIN_NONE;
+ bo->rq = NULL;
+ if (bo->refcnt)
+ continue;
- if (!bo->needs_flush) {
- if (kgem_bo_set_purgeable(kgem, bo)) {
- kgem_bo_move_to_inactive(kgem, bo);
- retired = true;
- } else {
- DBG(("%s: closing %d\n",
- __FUNCTION__, bo->handle));
- kgem_bo_free(kgem, bo);
- }
- }
- }
+ if (bo->snoop) {
+ kgem_bo_move_to_snoop(kgem, bo);
+ } else if (bo->scanout) {
+ kgem_bo_move_to_scanout(kgem, bo);
+ } else if ((bo = kgem_bo_replace_io(bo))->reusable &&
+ kgem_bo_set_purgeable(kgem, bo)) {
+ kgem_bo_move_to_inactive(kgem, bo);
+ retired = true;
+ } else {
+ DBG(("%s: closing %d\n",
+ __FUNCTION__, bo->handle));
+ kgem_bo_free(kgem, bo);
+ }
+ }
- assert(rq->bo->rq == NULL);
- assert(list_is_empty(&rq->bo->request));
-
- if (--rq->bo->refcnt == 0) {
- if (kgem_bo_set_purgeable(kgem, rq->bo)) {
- kgem_bo_move_to_inactive(kgem, rq->bo);
- retired = true;
- } else {
- DBG(("%s: closing %d\n",
- __FUNCTION__, rq->bo->handle));
- kgem_bo_free(kgem, rq->bo);
- }
- }
+ assert(rq->bo->rq == NULL);
+ assert(list_is_empty(&rq->bo->request));
- __kgem_request_free(rq);
- kgem->num_requests--;
+ if (--rq->bo->refcnt == 0) {
+ if (kgem_bo_set_purgeable(kgem, rq->bo)) {
+ kgem_bo_move_to_inactive(kgem, rq->bo);
+ retired = true;
+ } else {
+ DBG(("%s: closing %d\n",
+ __FUNCTION__, rq->bo->handle));
+ kgem_bo_free(kgem, rq->bo);
}
+ }
-#if HAS_DEBUG_FULL
- {
- int count = 0;
+ __kgem_request_free(rq);
+ return retired;
+}
- list_for_each_entry(bo, &kgem->requests[n], request)
- count++;
+static bool kgem_retire__requests_ring(struct kgem *kgem, int ring)
+{
+ bool retired = false;
- bo = NULL;
- if (!list_is_empty(&kgem->requests[n]))
- bo = list_first_entry(&kgem->requests[n],
- struct kgem_request,
- list)->bo;
+ while (!list_is_empty(&kgem->requests[ring])) {
+ struct kgem_request *rq;
- ErrorF("%s: ring=%d, %d outstanding requests, oldest=%d\n",
- __FUNCTION__, n, count, bo ? bo->handle : 0);
- }
-#endif
+ rq = list_first_entry(&kgem->requests[ring],
+ struct kgem_request,
+ list);
+ if (__kgem_busy(kgem, rq->bo->handle))
+ break;
+
+ retired |= __kgem_retire_rq(kgem, rq);
}
#if HAS_DEBUG_FULL
{
+ struct kgem_bo *bo;
int count = 0;
- for (n = 0; n < ARRAY_SIZE(kgem->requests); n++)
- list_for_each_entry(bo, &kgem->requests[n], request)
- count++;
+ list_for_each_entry(bo, &kgem->requests[ring], request)
+ count++;
- assert(count == kgem->num_requests);
+ bo = NULL;
+ if (!list_is_empty(&kgem->requests[ring]))
+ bo = list_first_entry(&kgem->requests[ring],
+ struct kgem_request,
+ list)->bo;
+
+ ErrorF("%s: ring=%d, %d outstanding requests, oldest=%d\n",
+ __FUNCTION__, ring, count, bo ? bo->handle : 0);
}
#endif
return retired;
}
+static bool kgem_retire__requests(struct kgem *kgem)
+{
+ bool retired = false;
+ int n;
+
+ for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) {
+ retired |= kgem_retire__requests_ring(kgem, n);
+ kgem->need_retire |= !list_is_empty(&kgem->requests[n]);
+ }
+
+ return retired;
+}
+
bool kgem_retire(struct kgem *kgem)
{
bool retired = false;
DBG(("%s\n", __FUNCTION__));
+ kgem->need_retire = false;
+
retired |= kgem_retire__flushing(kgem);
- if (kgem->num_requests)
- retired |= kgem_retire__requests(kgem);
+ retired |= kgem_retire__requests(kgem);
retired |= kgem_retire__buffers(kgem);
- kgem->need_retire =
- kgem->num_requests ||
- !list_is_empty(&kgem->flushing);
DBG(("%s -- retired=%d, need_retire=%d\n",
__FUNCTION__, retired, kgem->need_retire));
@@ -1809,31 +2044,25 @@ bool kgem_retire(struct kgem *kgem)
return retired;
}
-bool __kgem_is_idle(struct kgem *kgem)
+bool __kgem_ring_is_idle(struct kgem *kgem, int ring)
{
- int n;
+ struct kgem_request *rq;
- assert(kgem->num_requests);
+ assert(!list_is_empty(&kgem->requests[ring]));
- for (n = 0; n < ARRAY_SIZE(kgem->requests); n++) {
- struct kgem_request *rq;
-
- if (list_is_empty(&kgem->requests[n]))
- continue;
+ rq = list_last_entry(&kgem->requests[ring],
+ struct kgem_request, list);
+ if (__kgem_busy(kgem, rq->bo->handle)) {
+ DBG(("%s: last requests handle=%d still busy\n",
+ __FUNCTION__, rq->bo->handle));
+ return false;
+ }
- rq = list_last_entry(&kgem->requests[n],
- struct kgem_request, list);
- if (kgem_busy(kgem, rq->bo->handle)) {
- DBG(("%s: last requests handle=%d still busy\n",
- __FUNCTION__, rq->bo->handle));
- return false;
- }
+ DBG(("%s: ring=%d idle (handle=%d)\n",
+ __FUNCTION__, ring, rq->bo->handle));
- DBG(("%s: ring=%d idle (handle=%d)\n",
- __FUNCTION__, n, rq->bo->handle));
- }
- kgem_retire__requests(kgem);
- assert(kgem->num_requests == 0);
+ kgem_retire__requests_ring(kgem, ring);
+ assert(list_is_empty(&kgem->requests[ring]));
return true;
}
@@ -1853,10 +2082,11 @@ static void kgem_commit(struct kgem *kgem)
assert(!bo->purged);
assert(bo->exec);
assert(bo->proxy == NULL || bo->exec == &_kgem_dummy_exec);
- assert(bo->rq == rq || (bo->proxy->rq == rq));
+ assert(RQ(bo->rq) == rq || (RQ(bo->proxy->rq) == rq));
bo->presumed_offset = bo->exec->offset;
bo->exec = NULL;
+ bo->target_handle = -1;
if (!bo->refcnt && !bo->reusable) {
assert(!bo->snoop);
@@ -1870,13 +2100,14 @@ static void kgem_commit(struct kgem *kgem)
if (bo->proxy) {
/* proxies are not used for domain tracking */
- list_del(&bo->request);
- bo->rq = NULL;
bo->exec = NULL;
+ __kgem_bo_clear_busy(bo);
}
+
+ kgem->scanout_busy |= bo->scanout;
}
- if (rq == &_kgem_static_request) {
+ if (rq == &kgem->static_request) {
struct drm_i915_gem_set_domain set_domain;
DBG(("%s: syncing due to allocation failure\n", __FUNCTION__));
@@ -1894,10 +2125,10 @@ static void kgem_commit(struct kgem *kgem)
assert(list_is_empty(&rq->buffers));
gem_close(kgem->fd, rq->bo->handle);
+ kgem_cleanup_cache(kgem);
} else {
list_add_tail(&rq->list, &kgem->requests[rq->ring]);
kgem->need_throttle = kgem->need_retire = 1;
- kgem->num_requests++;
}
kgem->next_request = NULL;
@@ -1946,13 +2177,12 @@ static void kgem_finish_buffers(struct kgem *kgem)
assert(!bo->need_io);
- used = ALIGN(bo->used + PAGE_SIZE-1, PAGE_SIZE);
+ used = ALIGN(bo->used, PAGE_SIZE);
if (!DBG_NO_UPLOAD_ACTIVE &&
used + PAGE_SIZE <= bytes(&bo->base) &&
- (kgem->has_llc || !IS_CPU_MAP(bo->base.map))) {
+ (kgem->has_llc || !IS_CPU_MAP(bo->base.map) || bo->base.snoop)) {
DBG(("%s: retaining upload buffer (%d/%d)\n",
__FUNCTION__, bo->used, bytes(&bo->base)));
- assert(!bo->base.snoop);
bo->used = used;
list_move(&bo->base.list,
&kgem->active_buffers);
@@ -1973,16 +2203,65 @@ static void kgem_finish_buffers(struct kgem *kgem)
}
assert(bo->need_io);
- assert(bo->base.rq == kgem->next_request);
+ assert(bo->base.rq == MAKE_REQUEST(kgem->next_request, kgem->ring));
assert(bo->base.domain != DOMAIN_GPU);
if (bo->base.refcnt == 1 &&
bo->base.size.pages.count > 1 &&
bo->used < bytes(&bo->base) / 2) {
struct kgem_bo *shrink;
+ unsigned alloc = NUM_PAGES(bo->used);
+
+ shrink = search_snoop_cache(kgem, alloc,
+ CREATE_INACTIVE | CREATE_NO_RETIRE);
+ if (shrink) {
+ void *map;
+ int n;
- shrink = search_linear_cache(kgem,
- PAGE_ALIGN(bo->used),
+ DBG(("%s: used=%d, shrinking %d to %d, handle %d to %d\n",
+ __FUNCTION__,
+ bo->used, bytes(&bo->base), bytes(shrink),
+ bo->base.handle, shrink->handle));
+
+ assert(bo->used <= bytes(shrink));
+ map = kgem_bo_map__cpu(kgem, shrink);
+ if (map) {
+ kgem_bo_sync__cpu(kgem, shrink);
+ memcpy(map, bo->mem, bo->used);
+
+ shrink->target_handle =
+ kgem->has_handle_lut ? bo->base.target_handle : shrink->handle;
+ for (n = 0; n < kgem->nreloc; n++) {
+ if (kgem->reloc[n].target_handle == bo->base.target_handle) {
+ kgem->reloc[n].target_handle = shrink->target_handle;
+ kgem->reloc[n].presumed_offset = shrink->presumed_offset;
+ kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
+ kgem->reloc[n].delta + shrink->presumed_offset;
+ }
+ }
+
+ bo->base.exec->handle = shrink->handle;
+ bo->base.exec->offset = shrink->presumed_offset;
+ shrink->exec = bo->base.exec;
+ shrink->rq = bo->base.rq;
+ list_replace(&bo->base.request,
+ &shrink->request);
+ list_init(&bo->base.request);
+ shrink->needs_flush = bo->base.dirty;
+
+ bo->base.exec = NULL;
+ bo->base.rq = NULL;
+ bo->base.dirty = false;
+ bo->base.needs_flush = false;
+ bo->used = 0;
+
+ goto decouple;
+ }
+
+ __kgem_bo_destroy(kgem, shrink);
+ }
+
+ shrink = search_linear_cache(kgem, alloc,
CREATE_INACTIVE | CREATE_NO_RETIRE);
if (shrink) {
int n;
@@ -1993,40 +2272,44 @@ static void kgem_finish_buffers(struct kgem *kgem)
bo->base.handle, shrink->handle));
assert(bo->used <= bytes(shrink));
- gem_write(kgem->fd, shrink->handle,
- 0, bo->used, bo->mem);
-
- for (n = 0; n < kgem->nreloc; n++) {
- if (kgem->reloc[n].target_handle == bo->base.handle) {
- kgem->reloc[n].target_handle = shrink->handle;
- kgem->reloc[n].presumed_offset = shrink->presumed_offset;
- kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
- kgem->reloc[n].delta + shrink->presumed_offset;
+ if (gem_write(kgem->fd, shrink->handle,
+ 0, bo->used, bo->mem) == 0) {
+ shrink->target_handle =
+ kgem->has_handle_lut ? bo->base.target_handle : shrink->handle;
+ for (n = 0; n < kgem->nreloc; n++) {
+ if (kgem->reloc[n].target_handle == bo->base.target_handle) {
+ kgem->reloc[n].target_handle = shrink->target_handle;
+ kgem->reloc[n].presumed_offset = shrink->presumed_offset;
+ kgem->batch[kgem->reloc[n].offset/sizeof(kgem->batch[0])] =
+ kgem->reloc[n].delta + shrink->presumed_offset;
+ }
}
+
+ bo->base.exec->handle = shrink->handle;
+ bo->base.exec->offset = shrink->presumed_offset;
+ shrink->exec = bo->base.exec;
+ shrink->rq = bo->base.rq;
+ list_replace(&bo->base.request,
+ &shrink->request);
+ list_init(&bo->base.request);
+ shrink->needs_flush = bo->base.dirty;
+
+ bo->base.exec = NULL;
+ bo->base.rq = NULL;
+ bo->base.dirty = false;
+ bo->base.needs_flush = false;
+ bo->used = 0;
+
+ goto decouple;
}
- bo->base.exec->handle = shrink->handle;
- bo->base.exec->offset = shrink->presumed_offset;
- shrink->exec = bo->base.exec;
- shrink->rq = bo->base.rq;
- list_replace(&bo->base.request,
- &shrink->request);
- list_init(&bo->base.request);
- shrink->needs_flush = bo->base.dirty;
-
- bo->base.exec = NULL;
- bo->base.rq = NULL;
- bo->base.dirty = false;
- bo->base.needs_flush = false;
- bo->used = 0;
-
- goto decouple;
+ __kgem_bo_destroy(kgem, shrink);
}
}
DBG(("%s: handle=%d, uploading %d/%d\n",
__FUNCTION__, bo->base.handle, bo->used, bytes(&bo->base)));
- assert(!kgem_busy(kgem, bo->base.handle));
+ ASSERT_IDLE(kgem, bo->base.handle);
assert(bo->used <= bytes(&bo->base));
gem_write(kgem->fd, bo->base.handle,
0, bo->used, bo->mem);
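The two shrink paths above (snoopable cache first, then the linear cache) migrate a half-empty upload buffer into a smaller bo; the subtle step is retargeting every relocation that referenced the old buffer so the already-emitted batch stays valid. A condensed restatement of that fix-up, where old_target is a hypothetical stand-in for bo->base.target_handle:

	for (n = 0; n < kgem->nreloc; n++) {
		if (kgem->reloc[n].target_handle != old_target)
			continue;

		/* point the reloc at the replacement bo and refresh the
		 * presumed offset already written into the batch */
		kgem->reloc[n].target_handle = shrink->target_handle;
		kgem->reloc[n].presumed_offset = shrink->presumed_offset;
		kgem->batch[kgem->reloc[n].offset / sizeof(kgem->batch[0])] =
			kgem->reloc[n].delta + shrink->presumed_offset;
	}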
@@ -2058,11 +2341,9 @@ static void kgem_cleanup(struct kgem *kgem)
struct kgem_bo,
request);
- list_del(&bo->request);
- bo->rq = NULL;
bo->exec = NULL;
- bo->domain = DOMAIN_NONE;
bo->dirty = false;
+ __kgem_bo_clear_busy(bo);
if (bo->refcnt == 0)
kgem_bo_free(kgem, bo);
}
@@ -2071,7 +2352,6 @@ static void kgem_cleanup(struct kgem *kgem)
}
}
- kgem->num_requests = 0;
kgem_close_inactive(kgem);
}
@@ -2079,7 +2359,7 @@ static int kgem_batch_write(struct kgem *kgem, uint32_t handle, uint32_t size)
{
int ret;
- assert(!kgem_busy(kgem, handle));
+ ASSERT_IDLE(kgem, handle);
/* If there is no surface data, just upload the batch */
if (kgem->surface == kgem->batch_size)
@@ -2122,34 +2402,46 @@ void kgem_reset(struct kgem *kgem)
request);
list_del(&bo->request);
+ assert(RQ(bo->rq) == rq);
+
bo->binding.offset = 0;
bo->exec = NULL;
+ bo->target_handle = -1;
bo->dirty = false;
- bo->rq = NULL;
- bo->domain = DOMAIN_NONE;
- if (!bo->refcnt) {
+ if (bo->needs_flush && __kgem_busy(kgem, bo->handle)) {
+ list_add(&bo->request, &kgem->flushing);
+ bo->rq = (void *)kgem;
+ } else
+ __kgem_bo_clear_busy(bo);
+
+ if (!bo->refcnt && !bo->reusable) {
+ assert(!bo->snoop);
DBG(("%s: discarding handle=%d\n",
__FUNCTION__, bo->handle));
kgem_bo_free(kgem, bo);
}
}
- if (kgem->next_request != &_kgem_static_request)
- free(kgem->next_request);
+ if (rq != &kgem->static_request) {
+ list_init(&rq->list);
+ __kgem_request_free(rq);
+ }
}
kgem->nfence = 0;
kgem->nexec = 0;
kgem->nreloc = 0;
+ kgem->nreloc__self = 0;
kgem->aperture = 0;
kgem->aperture_fenced = 0;
kgem->nbatch = 0;
kgem->surface = kgem->batch_size;
kgem->mode = KGEM_NONE;
kgem->flush = 0;
+ kgem->batch_flags = kgem->batch_flags_base;
- kgem->next_request = __kgem_request_alloc();
+ kgem->next_request = __kgem_request_alloc(kgem);
kgem_sna_reset(kgem);
}
@@ -2173,7 +2465,7 @@ static int compact_batch_surface(struct kgem *kgem)
shrink *= sizeof(uint32_t);
for (n = 0; n < kgem->nreloc; n++) {
if (kgem->reloc[n].read_domains == I915_GEM_DOMAIN_INSTRUCTION &&
- kgem->reloc[n].target_handle == 0)
+ kgem->reloc[n].target_handle == ~0U)
kgem->reloc[n].delta -= shrink;
if (kgem->reloc[n].offset >= sizeof(uint32_t)*kgem->nbatch)
@@ -2184,6 +2476,74 @@ static int compact_batch_surface(struct kgem *kgem)
return size * sizeof(uint32_t);
}
+static struct kgem_bo *
+kgem_create_batch(struct kgem *kgem, int size)
+{
+ struct drm_i915_gem_set_domain set_domain;
+ struct kgem_bo *bo;
+
+ if (size <= 4096) {
+ bo = list_first_entry(&kgem->pinned_batches[0],
+ struct kgem_bo,
+ list);
+ if (!bo->rq) {
+out_4096:
+ list_move_tail(&bo->list, &kgem->pinned_batches[0]);
+ return kgem_bo_reference(bo);
+ }
+
+ if (!__kgem_busy(kgem, bo->handle)) {
+ assert(RQ(bo->rq)->bo == bo);
+ __kgem_retire_rq(kgem, RQ(bo->rq));
+ goto out_4096;
+ }
+ }
+
+ if (size <= 16384) {
+ bo = list_first_entry(&kgem->pinned_batches[1],
+ struct kgem_bo,
+ list);
+ if (!bo->rq) {
+out_16384:
+ list_move_tail(&bo->list, &kgem->pinned_batches[1]);
+ return kgem_bo_reference(bo);
+ }
+
+ if (!__kgem_busy(kgem, bo->handle)) {
+ assert(RQ(bo->rq)->bo == bo);
+ __kgem_retire_rq(kgem, RQ(bo->rq));
+ goto out_16384;
+ }
+ }
+
+ if (kgem->gen == 020 && !kgem->has_pinned_batches) {
+ assert(size <= 16384);
+
+ bo = list_first_entry(&kgem->pinned_batches[size > 4096],
+ struct kgem_bo,
+ list);
+ list_move_tail(&bo->list, &kgem->pinned_batches[size > 4096]);
+
+ DBG(("%s: syncing due to busy batches\n", __FUNCTION__));
+
+ VG_CLEAR(set_domain);
+ set_domain.handle = bo->handle;
+ set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+ set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+ if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) {
+ DBG(("%s: sync: GPU hang detected\n", __FUNCTION__));
+ kgem_throttle(kgem);
+ return NULL;
+ }
+
+ kgem_retire(kgem);
+ assert(bo->rq == NULL);
+ return kgem_bo_reference(bo);
+ }
+
+ return kgem_create_linear(kgem, size, CREATE_NO_THROTTLE);
+}
+
void _kgem_submit(struct kgem *kgem)
{
struct kgem_request *rq;
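kgem_create_batch() above prefers recycling a pinned batch buffer and only allocates afresh when the matching pool is busy (gen2 without pinned-batch support instead stalls on set-domain). A compressed sketch of the selection policy, eliding the gen2 path and the fallback from the 4KiB to the 16KiB pool:

	static struct kgem_bo *pick_batch_sketch(struct kgem *kgem, int size)
	{
		int pool = size > 4096; /* 0: 4KiB batches, 1: 16KiB batches */
		struct kgem_bo *bo;

		bo = list_first_entry(&kgem->pinned_batches[pool],
				      struct kgem_bo, list);
		if (!bo->rq || !__kgem_busy(kgem, bo->handle)) {
			if (bo->rq) /* idle, but not yet retired */
				__kgem_retire_rq(kgem, RQ(bo->rq));
			list_move_tail(&bo->list, &kgem->pinned_batches[pool]);
			return kgem_bo_reference(bo);
		}

		/* all pinned batches in flight: fall back to a fresh bo */
		return kgem_create_linear(kgem, size, CREATE_NO_THROTTLE);
	}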
@@ -2212,7 +2572,7 @@ void _kgem_submit(struct kgem *kgem)
kgem_finish_buffers(kgem);
-#if HAS_DEBUG_FULL && SHOW_BATCH
+#if SHOW_BATCH
__kgem_batch_debug(kgem, batch_end);
#endif
@@ -2221,7 +2581,7 @@ void _kgem_submit(struct kgem *kgem)
size = compact_batch_surface(kgem);
else
size = kgem->nbatch * sizeof(kgem->batch[0]);
- rq->bo = kgem_create_linear(kgem, size, CREATE_NO_THROTTLE);
+ rq->bo = kgem_create_batch(kgem, size);
if (rq->bo) {
uint32_t handle = rq->bo->handle;
int i;
@@ -2233,13 +2593,14 @@ void _kgem_submit(struct kgem *kgem)
kgem->exec[i].relocation_count = kgem->nreloc;
kgem->exec[i].relocs_ptr = (uintptr_t)kgem->reloc;
kgem->exec[i].alignment = 0;
- kgem->exec[i].offset = 0;
+ kgem->exec[i].offset = rq->bo->presumed_offset;
kgem->exec[i].flags = 0;
kgem->exec[i].rsvd1 = 0;
kgem->exec[i].rsvd2 = 0;
+ rq->bo->target_handle = kgem->has_handle_lut ? i : handle;
rq->bo->exec = &kgem->exec[i];
- rq->bo->rq = rq; /* useful sanity check */
+ rq->bo->rq = MAKE_REQUEST(rq, kgem->ring); /* useful sanity check */
list_add(&rq->bo->request, &rq->buffers);
rq->ring = kgem->ring == KGEM_BLT;
@@ -2258,7 +2619,7 @@ void _kgem_submit(struct kgem *kgem)
execbuf.num_cliprects = 0;
execbuf.DR1 = 0;
execbuf.DR4 = 0;
- execbuf.flags = kgem->ring;
+ execbuf.flags = kgem->ring | kgem->batch_flags;
execbuf.rsvd1 = 0;
execbuf.rsvd2 = 0;
@@ -2281,13 +2642,23 @@ void _kgem_submit(struct kgem *kgem)
DRM_IOCTL_I915_GEM_EXECBUFFER2,
&execbuf);
}
- if (ret == -1 && (errno == EIO || errno == EBUSY)) {
- DBG(("%s: GPU hang detected\n", __FUNCTION__));
- kgem_throttle(kgem);
- ret = 0;
+ if (DEBUG_SYNC && ret == 0) {
+ struct drm_i915_gem_set_domain set_domain;
+
+ VG_CLEAR(set_domain);
+ set_domain.handle = handle;
+ set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+ set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+
+ ret = drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
}
+ if (ret == -1) {
+ DBG(("%s: GPU hang detected [%d]\n",
+ __FUNCTION__, errno));
+ kgem_throttle(kgem);
+ kgem->wedged = true;
+
#if !NDEBUG
- if (ret < 0) {
ret = errno;
ErrorF("batch[%d/%d]: %d %d %d, nreloc=%d, nexec=%d, nfence=%d, aperture=%d: errno=%d\n",
kgem->mode, kgem->ring, batch_end, kgem->nbatch, kgem->surface,
@@ -2323,33 +2694,16 @@ void _kgem_submit(struct kgem *kgem)
(int)kgem->reloc[i].presumed_offset);
}
- i = open("/tmp/batchbuffer", O_WRONLY | O_CREAT | O_APPEND, 0666);
- if (i != -1) {
- i = write(i, kgem->batch, batch_end*sizeof(uint32_t));
- (void)i;
- }
-
- FatalError("SNA: failed to submit batchbuffer, errno=%d\n", ret);
- }
-#endif
-
- if (DEBUG_FLUSH_SYNC) {
- struct drm_i915_gem_set_domain set_domain;
-
- DBG(("%s: debug sync, starting\n", __FUNCTION__));
-
- VG_CLEAR(set_domain);
- set_domain.handle = handle;
- set_domain.read_domains = I915_GEM_DOMAIN_GTT;
- set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+ if (DEBUG_SYNC) {
+ int fd = open("/tmp/batchbuffer", O_WRONLY | O_CREAT | O_APPEND, 0666);
+ if (fd != -1) {
+ write(fd, kgem->batch, batch_end*sizeof(uint32_t));
+ close(fd);
+ }
- ret = drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain);
- if (ret == -1) {
- DBG(("%s: sync: GPU hang detected\n", __FUNCTION__));
- kgem_throttle(kgem);
+ FatalError("SNA: failed to submit batchbuffer, errno=%d\n", ret);
}
-
- DBG(("%s: debug sync, completed\n", __FUNCTION__));
+#endif
}
}
@@ -2425,6 +2779,13 @@ bool kgem_expire_cache(struct kgem *kgem)
}
+ while (!list_is_empty(&kgem->scanout)) {
+ bo = list_first_entry(&kgem->scanout, struct kgem_bo, list);
+ list_del(&bo->list);
+ kgem_bo_clear_scanout(kgem, bo);
+ __kgem_bo_destroy(kgem, bo);
+ }
+
expire = 0;
list_for_each_entry(bo, &kgem->snoop, list) {
if (bo->delta) {
@@ -2619,7 +2980,7 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
return NULL;
}
- if (!__kgem_throttle_retire(kgem, 0)) {
+ if (!__kgem_throttle_retire(kgem, flags)) {
DBG(("%s: nothing retired\n", __FUNCTION__));
return NULL;
}
@@ -2642,6 +3003,7 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
assert(bo->proxy == NULL);
assert(bo->rq == NULL);
assert(bo->exec == NULL);
+ assert(!bo->scanout);
if (num_pages > num_pages(bo)) {
DBG(("inactive too small: %d < %d\n",
@@ -2655,8 +3017,8 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
}
if (I915_TILING_NONE != bo->tiling &&
- gem_set_tiling(kgem->fd, bo->handle,
- I915_TILING_NONE, 0) != I915_TILING_NONE)
+ !gem_set_tiling(kgem->fd, bo->handle,
+ I915_TILING_NONE, 0))
continue;
kgem_bo_remove_from_inactive(kgem, bo);
@@ -2668,12 +3030,15 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
__FUNCTION__, bo->handle, num_pages(bo)));
assert(use_active || bo->domain != DOMAIN_GPU);
assert(!bo->needs_flush);
- //assert(!kgem_busy(kgem, bo->handle));
+ ASSERT_MAYBE_IDLE(kgem, bo->handle, !use_active);
return bo;
}
if (flags & CREATE_EXACT)
return NULL;
+
+ if (flags & CREATE_CPU_MAP && !kgem->has_llc)
+ return NULL;
}
cache = use_active ? active(kgem, num_pages, I915_TILING_NONE) : inactive(kgem, num_pages);
@@ -2682,12 +3047,13 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
assert(bo->reusable);
assert(!!bo->rq == !!use_active);
assert(bo->proxy == NULL);
+ assert(!bo->scanout);
if (num_pages > num_pages(bo))
continue;
if (use_active &&
- kgem->gen <= 40 &&
+ kgem->gen <= 040 &&
bo->tiling != I915_TILING_NONE)
continue;
@@ -2703,11 +3069,12 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
if (first)
continue;
- if (gem_set_tiling(kgem->fd, bo->handle,
- I915_TILING_NONE, 0) != I915_TILING_NONE)
+ if (!gem_set_tiling(kgem->fd, bo->handle,
+ I915_TILING_NONE, 0))
continue;
bo->tiling = I915_TILING_NONE;
+ bo->pitch = 0;
}
if (bo->map) {
@@ -2751,7 +3118,7 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
assert(list_is_empty(&bo->list));
assert(use_active || bo->domain != DOMAIN_GPU);
assert(!bo->needs_flush || use_active);
- //assert(use_active || !kgem_busy(kgem, bo->handle));
+ ASSERT_MAYBE_IDLE(kgem, bo->handle, !use_active);
return bo;
}
@@ -2771,7 +3138,7 @@ search_linear_cache(struct kgem *kgem, unsigned int num_pages, unsigned flags)
assert(list_is_empty(&first->list));
assert(use_active || first->domain != DOMAIN_GPU);
assert(!first->needs_flush || use_active);
- //assert(use_active || !kgem_busy(kgem, first->handle));
+ ASSERT_MAYBE_IDLE(kgem, first->handle, !use_active);
return first;
}
@@ -2878,10 +3245,15 @@ struct kgem_bo *kgem_create_linear(struct kgem *kgem, int size, unsigned flags)
size = (size + PAGE_SIZE - 1) / PAGE_SIZE;
bo = search_linear_cache(kgem, size, CREATE_INACTIVE | flags);
if (bo) {
+ assert(bo->domain != DOMAIN_GPU);
+ ASSERT_IDLE(kgem, bo->handle);
bo->refcnt = 1;
return bo;
}
+ if (flags & CREATE_CACHED)
+ return NULL;
+
handle = gem_create(kgem->fd, size);
if (handle == 0)
return NULL;
@@ -2902,7 +3274,7 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
if (DBG_NO_TILING)
return tiling < 0 ? tiling : I915_TILING_NONE;
- if (kgem->gen < 40) {
+ if (kgem->gen < 040) {
if (tiling && width * bpp > 8192 * 8) {
DBG(("%s: pitch too large for tliing [%d]\n",
__FUNCTION__, width*bpp/8));
@@ -2910,13 +3282,17 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
goto done;
}
} else {
+ /* XXX rendering to I915_TILING_Y seems broken? */
+ if (kgem->gen < 050 && tiling == I915_TILING_Y)
+ tiling = I915_TILING_X;
+
if (width*bpp > (MAXSHORT-512) * 8) {
- DBG(("%s: large pitch [%d], forcing TILING_X\n",
- __FUNCTION__, width*bpp/8));
if (tiling > 0)
tiling = -tiling;
else if (tiling == 0)
tiling = -I915_TILING_X;
+ DBG(("%s: large pitch [%d], forcing TILING [%d]\n",
+ __FUNCTION__, width*bpp/8, tiling));
} else if (tiling && (width|height) > 8192) {
DBG(("%s: large tiled buffer [%dx%d], forcing TILING_X\n",
__FUNCTION__, width, height));
@@ -2927,9 +3303,9 @@ int kgem_choose_tiling(struct kgem *kgem, int tiling, int width, int height, int
if (tiling < 0)
return tiling;
- if (tiling && height == 1) {
- DBG(("%s: disabling tiling [%d] for single row\n",
- __FUNCTION__,height));
+ if (tiling && (height == 1 || width == 1)) {
+ DBG(("%s: disabling tiling [%dx%d] for single row/col\n",
+ __FUNCTION__, width, height));
tiling = I915_TILING_NONE;
goto done;
}
@@ -3004,6 +3380,7 @@ unsigned kgem_can_create_2d(struct kgem *kgem,
{
uint32_t pitch, size;
unsigned flags = 0;
+ int tiling;
int bpp;
DBG(("%s: %dx%d @ %d\n", __FUNCTION__, width, height, depth));
@@ -3023,33 +3400,41 @@ unsigned kgem_can_create_2d(struct kgem *kgem,
size = kgem_surface_size(kgem, false, 0,
width, height, bpp,
I915_TILING_NONE, &pitch);
- if (size > 0 && size <= kgem->max_cpu_size)
- flags |= KGEM_CAN_CREATE_CPU | KGEM_CAN_CREATE_GPU;
- if (size > 0 && size <= kgem->aperture_mappable/4)
- flags |= KGEM_CAN_CREATE_GTT;
- if (size > kgem->large_object_size)
- flags |= KGEM_CAN_CREATE_LARGE;
- if (size > kgem->max_object_size) {
- DBG(("%s: too large (untiled) %d > %d\n",
- __FUNCTION__, size, kgem->max_object_size));
- return 0;
+ DBG(("%s: untiled size=%d\n", __FUNCTION__, size));
+ if (size > 0) {
+ if (size <= kgem->max_cpu_size)
+ flags |= KGEM_CAN_CREATE_CPU;
+ if (size <= kgem->max_gpu_size)
+ flags |= KGEM_CAN_CREATE_GPU;
+ if (size <= kgem->aperture_mappable/4)
+ flags |= KGEM_CAN_CREATE_GTT;
+ if (size > kgem->large_object_size)
+ flags |= KGEM_CAN_CREATE_LARGE;
+ if (size > kgem->max_object_size) {
+ DBG(("%s: too large (untiled) %d > %d\n",
+ __FUNCTION__, size, kgem->max_object_size));
+ return 0;
+ }
}
- size = kgem_surface_size(kgem, false, 0,
- width, height, bpp,
- kgem_choose_tiling(kgem, I915_TILING_X,
- width, height, bpp),
- &pitch);
- if (size > 0 && size <= kgem->max_gpu_size)
- flags |= KGEM_CAN_CREATE_GPU;
- if (size > 0 && size <= kgem->aperture_mappable/4)
- flags |= KGEM_CAN_CREATE_GTT;
- if (size > kgem->large_object_size)
- flags |= KGEM_CAN_CREATE_LARGE;
- if (size > kgem->max_object_size) {
- DBG(("%s: too large (tiled) %d > %d\n",
- __FUNCTION__, size, kgem->max_object_size));
- return 0;
+ tiling = kgem_choose_tiling(kgem, I915_TILING_X,
+ width, height, bpp);
+ if (tiling != I915_TILING_NONE) {
+ size = kgem_surface_size(kgem, false, 0,
+ width, height, bpp, tiling,
+ &pitch);
+ DBG(("%s: tiled[%d] size=%d\n", __FUNCTION__, tiling, size));
+ if (size > 0 && size <= kgem->max_gpu_size)
+ flags |= KGEM_CAN_CREATE_GPU;
+ if (size > 0 && size <= kgem->aperture_mappable/4)
+ flags |= KGEM_CAN_CREATE_GTT;
+ if (size > kgem->large_object_size)
+ flags |= KGEM_CAN_CREATE_LARGE;
+ if (size > kgem->max_object_size) {
+ DBG(("%s: too large (tiled) %d > %d\n",
+ __FUNCTION__, size, kgem->max_object_size));
+ return 0;
+ }
}
return flags;
@@ -3060,9 +3445,9 @@ inline int kgem_bo_fenced_size(struct kgem *kgem, struct kgem_bo *bo)
unsigned int size;
assert(bo->tiling);
- assert(kgem->gen < 40);
+ assert(kgem->gen < 040);
- if (kgem->gen < 30)
+ if (kgem->gen < 030)
size = 512 * 1024;
else
size = 1024 * 1024;
@@ -3104,6 +3489,36 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
size /= PAGE_SIZE;
bucket = cache_bucket(size);
+ if (flags & CREATE_SCANOUT) {
+ list_for_each_entry(bo, &kgem->scanout, list) {
+ assert(bo->scanout);
+ assert(bo->delta);
+ assert(!bo->purged);
+
+ if (size > num_pages(bo) || num_pages(bo) > 2*size)
+ continue;
+
+ if (bo->tiling != tiling ||
+ (tiling != I915_TILING_NONE && bo->pitch != pitch)) {
+ if (!gem_set_tiling(kgem->fd, bo->handle,
+ tiling, pitch))
+ continue;
+
+ bo->tiling = tiling;
+ bo->pitch = pitch;
+ }
+
+ list_del(&bo->list);
+
+ bo->unique_id = kgem_get_unique_id(kgem);
+ DBG((" 1:from scanout: pitch=%d, tiling=%d, handle=%d, id=%d\n",
+ bo->pitch, bo->tiling, bo->handle, bo->unique_id));
+ assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
+ bo->refcnt = 1;
+ return bo;
+ }
+ }
+
if (bucket >= NUM_CACHE_BUCKETS) {
DBG(("%s: large bo num pages=%d, bucket=%d\n",
__FUNCTION__, size, bucket));
@@ -3116,10 +3531,12 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
list_for_each_entry(bo, &kgem->large, list) {
assert(!bo->purged);
+ assert(!bo->scanout);
assert(bo->refcnt == 0);
assert(bo->reusable);
+ assert(bo->flush == true);
- if (kgem->gen < 40) {
+ if (kgem->gen < 040) {
if (bo->pitch < pitch) {
DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n",
bo->tiling, tiling,
@@ -3134,11 +3551,12 @@ struct kgem_bo *kgem_create_2d(struct kgem *kgem,
continue;
if (bo->pitch != pitch || bo->tiling != tiling) {
- if (gem_set_tiling(kgem->fd, bo->handle,
- tiling, pitch) != tiling)
+ if (!gem_set_tiling(kgem->fd, bo->handle,
+ tiling, pitch))
continue;
bo->pitch = pitch;
+ bo->tiling = tiling;
}
}
@@ -3157,16 +3575,19 @@ large_inactive:
list_for_each_entry(bo, &kgem->large_inactive, list) {
assert(bo->refcnt == 0);
assert(bo->reusable);
+ assert(!bo->scanout);
if (size > num_pages(bo))
continue;
if (bo->tiling != tiling ||
(tiling != I915_TILING_NONE && bo->pitch != pitch)) {
- if (tiling != gem_set_tiling(kgem->fd,
- bo->handle,
- tiling, pitch))
+ if (!gem_set_tiling(kgem->fd, bo->handle,
+ tiling, pitch))
continue;
+
+ bo->tiling = tiling;
+ bo->pitch = pitch;
}
if (bo->purged && !kgem_bo_clear_purgeable(kgem, bo)) {
@@ -3201,10 +3622,12 @@ large_inactive:
list_for_each_entry(bo, cache, vma) {
assert(bucket(bo) == bucket);
assert(bo->refcnt == 0);
+ assert(!bo->scanout);
assert(bo->map);
assert(IS_CPU_MAP(bo->map) == for_cpu);
assert(bo->rq == NULL);
assert(list_is_empty(&bo->request));
+ assert(bo->flush == false);
if (size > num_pages(bo)) {
DBG(("inactive too small: %d < %d\n",
@@ -3233,13 +3656,17 @@ large_inactive:
DBG((" from inactive vma: pitch=%d, tiling=%d: handle=%d, id=%d\n",
bo->pitch, bo->tiling, bo->handle, bo->unique_id));
assert(bo->reusable);
- assert(bo->domain != DOMAIN_GPU && !kgem_busy(kgem, bo->handle));
+ assert(bo->domain != DOMAIN_GPU);
+ ASSERT_IDLE(kgem, bo->handle);
assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
bo->refcnt = 1;
return bo;
}
} while (!list_is_empty(cache) &&
__kgem_throttle_retire(kgem, flags));
+
+ if (flags & CREATE_CPU_MAP && !kgem->has_llc)
+ goto create;
}
if (flags & CREATE_INACTIVE)
@@ -3260,8 +3687,10 @@ search_again:
assert(bucket(bo) == bucket);
assert(bo->reusable);
assert(bo->tiling == tiling);
+ assert(bo->flush == false);
+ assert(!bo->scanout);
- if (kgem->gen < 40) {
+ if (kgem->gen < 040) {
if (bo->pitch < pitch) {
DBG(("tiled and pitch too small: tiling=%d, (want %d), pitch=%d, need %d\n",
bo->tiling, tiling,
@@ -3276,9 +3705,10 @@ search_again:
continue;
if (bo->pitch != pitch) {
- gem_set_tiling(kgem->fd,
- bo->handle,
- tiling, pitch);
+ if (!gem_set_tiling(kgem->fd,
+ bo->handle,
+ tiling, pitch))
+ continue;
bo->pitch = pitch;
}
@@ -3300,7 +3730,9 @@ search_again:
assert(!bo->purged);
assert(bo->refcnt == 0);
assert(bo->reusable);
+ assert(!bo->scanout);
assert(bo->tiling == tiling);
+ assert(bo->flush == false);
if (num_pages(bo) < size)
continue;
@@ -3319,7 +3751,7 @@ search_again:
}
if (--retry && flags & CREATE_EXACT) {
- if (kgem->gen >= 40) {
+ if (kgem->gen >= 040) {
for (i = I915_TILING_NONE; i <= I915_TILING_Y; i++) {
if (i == tiling)
continue;
@@ -3329,13 +3761,15 @@ search_again:
assert(!bo->purged);
assert(bo->refcnt == 0);
assert(bo->reusable);
+ assert(!bo->scanout);
+ assert(bo->flush == false);
if (num_pages(bo) < size)
continue;
- if (tiling != gem_set_tiling(kgem->fd,
- bo->handle,
- tiling, pitch))
+ if (!gem_set_tiling(kgem->fd,
+ bo->handle,
+ tiling, pitch))
continue;
kgem_bo_remove_from_active(kgem, bo);
@@ -3369,6 +3803,8 @@ search_again:
assert(!bo->purged);
assert(bo->refcnt == 0);
assert(bo->reusable);
+ assert(!bo->scanout);
+ assert(bo->flush == false);
if (bo->tiling) {
if (bo->pitch < pitch) {
@@ -3408,6 +3844,8 @@ search_inactive:
list_for_each_entry(bo, cache, list) {
assert(bucket(bo) == bucket);
assert(bo->reusable);
+ assert(!bo->scanout);
+ assert(bo->flush == false);
if (size > num_pages(bo)) {
DBG(("inactive too small: %d < %d\n",
@@ -3417,9 +3855,8 @@ search_inactive:
if (bo->tiling != tiling ||
(tiling != I915_TILING_NONE && bo->pitch != pitch)) {
- if (tiling != gem_set_tiling(kgem->fd,
- bo->handle,
- tiling, pitch))
+ if (!gem_set_tiling(kgem->fd, bo->handle,
+ tiling, pitch))
continue;
if (bo->map)
@@ -3444,7 +3881,7 @@ search_inactive:
assert(bo->refcnt == 0);
assert(bo->reusable);
assert((flags & CREATE_INACTIVE) == 0 || bo->domain != DOMAIN_GPU);
- assert((flags & CREATE_INACTIVE) == 0 || !kgem_busy(kgem, bo->handle));
+ ASSERT_MAYBE_IDLE(kgem, bo->handle, flags & CREATE_INACTIVE);
assert(bo->pitch*kgem_aligned_height(kgem, height, bo->tiling) <= kgem_bo_size(bo));
bo->refcnt = 1;
return bo;
@@ -3479,8 +3916,9 @@ create:
bo->domain = DOMAIN_CPU;
bo->unique_id = kgem_get_unique_id(kgem);
bo->pitch = pitch;
- if (tiling != I915_TILING_NONE)
- bo->tiling = gem_set_tiling(kgem->fd, handle, tiling, pitch);
+ if (tiling != I915_TILING_NONE &&
+ gem_set_tiling(kgem->fd, handle, tiling, pitch))
+ bo->tiling = tiling;
if (bucket >= NUM_CACHE_BUCKETS) {
DBG(("%s: marking large bo for automatic flushing\n",
__FUNCTION__));
@@ -3611,16 +4049,23 @@ void _kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
__kgem_bo_destroy(kgem, bo);
}
-bool __kgem_flush(struct kgem *kgem, struct kgem_bo *bo)
+void __kgem_flush(struct kgem *kgem, struct kgem_bo *bo)
{
+ assert(bo->rq);
+ assert(bo->exec == NULL);
+ assert(bo->needs_flush);
+
/* The kernel will emit a flush *and* update its own flushing lists. */
- if (!bo->needs_flush)
- return false;
+ if (!__kgem_busy(kgem, bo->handle))
+ __kgem_bo_clear_busy(bo);
- bo->needs_flush = kgem_busy(kgem, bo->handle);
DBG(("%s: handle=%d, busy?=%d\n",
- __FUNCTION__, bo->handle, bo->needs_flush));
- return bo->needs_flush;
+ __FUNCTION__, bo->handle, bo->rq != NULL));
+}
+
+inline static bool needs_semaphore(struct kgem *kgem, struct kgem_bo *bo)
+{
+ return kgem->nreloc && bo->rq && RQ_RING(bo->rq) != kgem->ring;
}
bool kgem_check_bo(struct kgem *kgem, ...)
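needs_semaphore() gates the batch-room checks that follow: adding a bo whose request is still live on the other ring would make the kernel insert a ring-to-ring semaphore wait, so the checks report failure and let the caller flush instead. A sketch of the caller-side shape, mirroring kgem_check_bo() below:

	static bool check_bo_sketch(struct kgem *kgem, struct kgem_bo *bo)
	{
		while (bo->proxy)
			bo = bo->proxy; /* busy state lives on the real bo */

		if (bo->exec)
			return true; /* already in this batch, no new cost */

		if (needs_semaphore(kgem, bo))
			return false; /* would stall on a cross-ring semaphore */

		return true; /* aperture/exec-count checks elided */
	}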
@@ -3629,22 +4074,22 @@ bool kgem_check_bo(struct kgem *kgem, ...)
struct kgem_bo *bo;
int num_exec = 0;
int num_pages = 0;
-
- if (kgem_flush(kgem))
- return false;
+ bool flush = false;
va_start(ap, kgem);
while ((bo = va_arg(ap, struct kgem_bo *))) {
+ while (bo->proxy)
+ bo = bo->proxy;
if (bo->exec)
continue;
- while (bo->proxy) {
- bo = bo->proxy;
- if (bo->exec)
- continue;
- }
+ if (needs_semaphore(kgem, bo))
+ return false;
+
num_pages += num_pages(bo);
num_exec++;
+
+ flush |= bo->flush;
}
va_end(ap);
@@ -3654,7 +4099,11 @@ bool kgem_check_bo(struct kgem *kgem, ...)
if (!num_pages)
return true;
- if (kgem->aperture > kgem->aperture_low && kgem_is_idle(kgem)) {
+ if (kgem_flush(kgem, flush))
+ return false;
+
+ if (kgem->aperture > kgem->aperture_low &&
+ kgem_ring_is_idle(kgem, kgem->ring)) {
DBG(("%s: current aperture usage (%d) is greater than low water mark (%d)\n",
__FUNCTION__, kgem->aperture, kgem->aperture_low));
return false;
@@ -3679,46 +4128,55 @@ bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo)
{
uint32_t size;
- if (kgem_flush(kgem))
- return false;
-
while (bo->proxy)
bo = bo->proxy;
if (bo->exec) {
- if (kgem->gen < 40 &&
+ if (kgem->gen < 040 &&
bo->tiling != I915_TILING_NONE &&
(bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) {
if (kgem->nfence >= kgem->fence_max)
return false;
+ if (3*kgem->aperture_fenced > kgem->aperture_mappable &&
+ kgem_ring_is_idle(kgem, kgem->ring))
+ return false;
+
size = kgem->aperture_fenced;
size += kgem_bo_fenced_size(kgem, bo);
- if (4*size > 3*kgem->aperture_mappable)
+ if (3*size > 2*kgem->aperture_mappable)
return false;
}
return true;
}
+ if (needs_semaphore(kgem, bo))
+ return false;
+
+ if (kgem_flush(kgem, bo->flush))
+ return false;
+
if (kgem->nexec >= KGEM_EXEC_SIZE(kgem) - 1)
return false;
- if (kgem->aperture > kgem->aperture_low)
+ if (kgem->aperture > kgem->aperture_low &&
+ kgem_ring_is_idle(kgem, kgem->ring))
return false;
if (kgem->aperture + num_pages(bo) > kgem->aperture_high)
return false;
- if (kgem->gen < 40 && bo->tiling != I915_TILING_NONE) {
+ if (kgem->gen < 040 && bo->tiling != I915_TILING_NONE) {
if (kgem->nfence >= kgem->fence_max)
return false;
- if (2*kgem->aperture_fenced > kgem->aperture_mappable)
+ if (3*kgem->aperture_fenced > kgem->aperture_mappable &&
+ kgem_ring_is_idle(kgem, kgem->ring))
return false;
size = kgem->aperture_fenced;
size += kgem_bo_fenced_size(kgem, bo);
- if (4*size > 3*kgem->aperture_mappable)
+ if (3*size > 2*kgem->aperture_mappable)
return false;
}
@@ -3733,16 +4191,14 @@ bool kgem_check_many_bo_fenced(struct kgem *kgem, ...)
int num_exec = 0;
int num_pages = 0;
int fenced_size = 0;
-
- if (kgem_flush(kgem))
- return false;
+ bool flush = false;
va_start(ap, kgem);
while ((bo = va_arg(ap, struct kgem_bo *))) {
while (bo->proxy)
bo = bo->proxy;
if (bo->exec) {
- if (kgem->gen >= 40 || bo->tiling == I915_TILING_NONE)
+ if (kgem->gen >= 040 || bo->tiling == I915_TILING_NONE)
continue;
if ((bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) {
@@ -3753,12 +4209,17 @@ bool kgem_check_many_bo_fenced(struct kgem *kgem, ...)
continue;
}
+ if (needs_semaphore(kgem, bo))
+ return false;
+
num_pages += num_pages(bo);
num_exec++;
- if (kgem->gen < 40 && bo->tiling) {
+ if (kgem->gen < 040 && bo->tiling) {
fenced_size += kgem_bo_fenced_size(kgem, bo);
num_fence++;
}
+
+ flush |= bo->flush;
}
va_end(ap);
@@ -3766,15 +4227,20 @@ bool kgem_check_many_bo_fenced(struct kgem *kgem, ...)
if (kgem->nfence + num_fence > kgem->fence_max)
return false;
- if (2*kgem->aperture_fenced > kgem->aperture_mappable)
+ if (3*kgem->aperture_fenced > kgem->aperture_mappable &&
+ kgem_ring_is_idle(kgem, kgem->ring))
return false;
- if (4*(fenced_size + kgem->aperture_fenced) > 3*kgem->aperture_mappable)
+ if (3*(fenced_size + kgem->aperture_fenced) > 2*kgem->aperture_mappable)
return false;
}
if (num_pages) {
- if (kgem->aperture > kgem->aperture_low)
+ if (kgem_flush(kgem, flush))
+ return false;
+
+ if (kgem->aperture > kgem->aperture_low &&
+ kgem_ring_is_idle(kgem, kgem->ring))
return false;
if (num_pages + kgem->aperture > kgem->aperture_high)
@@ -3816,20 +4282,25 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
if (bo->exec == NULL) {
list_move_tail(&bo->request,
&kgem->next_request->buffers);
- bo->rq = kgem->next_request;
+ bo->rq = MAKE_REQUEST(kgem->next_request,
+ kgem->ring);
bo->exec = &_kgem_dummy_exec;
}
+ if (read_write_domain & 0x7fff && !bo->dirty)
+ __kgem_bo_mark_dirty(bo);
+
bo = bo->proxy;
assert(bo->refcnt);
assert(!bo->purged);
}
if (bo->exec == NULL)
- _kgem_add_bo(kgem, bo);
- assert(bo->rq == kgem->next_request);
+ kgem_add_bo(kgem, bo);
+ assert(bo->rq == MAKE_REQUEST(kgem->next_request, kgem->ring));
+ assert(RQ_RING(bo->rq) == kgem->ring);
- if (kgem->gen < 40 && read_write_domain & KGEM_RELOC_FENCED) {
+ if (kgem->gen < 040 && read_write_domain & KGEM_RELOC_FENCED) {
if (bo->tiling &&
(bo->exec->flags & EXEC_OBJECT_NEEDS_FENCE) == 0) {
assert(kgem->nfence < kgem->fence_max);
@@ -3841,19 +4312,21 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
}
kgem->reloc[index].delta = delta;
- kgem->reloc[index].target_handle = bo->handle;
+ kgem->reloc[index].target_handle = bo->target_handle;
kgem->reloc[index].presumed_offset = bo->presumed_offset;
- if (read_write_domain & 0x7ff) {
+ if (read_write_domain & 0x7fff && !bo->dirty) {
assert(!bo->snoop || kgem->can_blt_cpu);
- kgem_bo_mark_dirty(bo);
+ __kgem_bo_mark_dirty(bo);
}
delta += bo->presumed_offset;
} else {
kgem->reloc[index].delta = delta;
- kgem->reloc[index].target_handle = 0;
+ kgem->reloc[index].target_handle = ~0U;
kgem->reloc[index].presumed_offset = 0;
+ if (kgem->nreloc__self < 256)
+ kgem->reloc__self[kgem->nreloc__self++] = index;
}
kgem->reloc[index].read_domains = read_write_domain >> 16;
kgem->reloc[index].write_domain = read_write_domain & 0x7fff;
@@ -3984,7 +4457,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
ptr = bo->map;
if (ptr == NULL) {
assert(kgem_bo_size(bo) <= kgem->aperture_mappable / 2);
- assert(kgem->gen != 21 || bo->tiling != I915_TILING_Y);
+ assert(kgem->gen != 021 || bo->tiling != I915_TILING_Y);
kgem_trim_vma_cache(kgem, MAP_GTT, bucket(bo));
@@ -4005,7 +4478,7 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo)
struct drm_i915_gem_set_domain set_domain;
DBG(("%s: sync: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__,
- bo->needs_flush, bo->domain, kgem_busy(kgem, bo->handle)));
+ bo->needs_flush, bo->domain, __kgem_busy(kgem, bo->handle)));
/* XXX use PROT_READ to avoid the write flush? */
@@ -4097,6 +4570,11 @@ retry:
if (__kgem_throttle_retire(kgem, 0))
goto retry;
+ if (kgem->need_expire) {
+ kgem_cleanup_cache(kgem);
+ goto retry;
+ }
+
return NULL;
}
@@ -4132,6 +4610,11 @@ retry:
if (__kgem_throttle_retire(kgem, 0))
goto retry;
+ if (kgem->need_expire) {
+ kgem_cleanup_cache(kgem);
+ goto retry;
+ }
+
return NULL;
}
@@ -4180,10 +4663,7 @@ uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo)
*/
bo->reusable = false;
- /* The bo is outside of our control, so presume it is written to */
- bo->needs_flush = true;
- if (bo->domain != DOMAIN_GPU)
- bo->domain = DOMAIN_NONE;
+ kgem_bo_unclean(kgem, bo);
/* Henceforth, we need to broadcast all updates to clients and
* flush our rendering before doing so.
@@ -4231,8 +4711,8 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
if (bo->domain != DOMAIN_CPU) {
struct drm_i915_gem_set_domain set_domain;
- DBG(("%s: sync: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__,
- bo->needs_flush, bo->domain, kgem_busy(kgem, bo->handle)));
+ DBG(("%s: SYNC: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__,
+ bo->needs_flush, bo->domain, __kgem_busy(kgem, bo->handle)));
VG_CLEAR(set_domain);
set_domain.handle = bo->handle;
@@ -4246,6 +4726,30 @@ void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo)
}
}
+void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write)
+{
+ assert(bo->proxy == NULL);
+ kgem_bo_submit(kgem, bo);
+
+ if (bo->domain != DOMAIN_CPU) {
+ struct drm_i915_gem_set_domain set_domain;
+
+ DBG(("%s: SYNC: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__,
+ bo->needs_flush, bo->domain, __kgem_busy(kgem, bo->handle)));
+
+ VG_CLEAR(set_domain);
+ set_domain.handle = bo->handle;
+ set_domain.read_domains = I915_GEM_DOMAIN_CPU;
+ set_domain.write_domain = write ? I915_GEM_DOMAIN_CPU : 0;
+
+ if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain) == 0) {
+ if (write || bo->needs_flush)
+ kgem_bo_retire(kgem, bo);
+ bo->domain = write ? DOMAIN_CPU : DOMAIN_NONE;
+ }
+ }
+}
+
void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo)
{
assert(bo->proxy == NULL);
@@ -4254,8 +4758,8 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo)
if (bo->domain != DOMAIN_GTT) {
struct drm_i915_gem_set_domain set_domain;
- DBG(("%s: sync: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__,
- bo->needs_flush, bo->domain, kgem_busy(kgem, bo->handle)));
+ DBG(("%s: SYNC: needs_flush? %d, domain? %d, busy? %d\n", __FUNCTION__,
+ bo->needs_flush, bo->domain, __kgem_busy(kgem, bo->handle)));
VG_CLEAR(set_domain);
set_domain.handle = bo->handle;
@@ -4271,10 +4775,10 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo)
void kgem_clear_dirty(struct kgem *kgem)
{
- struct kgem_request *rq = kgem->next_request;
+ struct list * const buffers = &kgem->next_request->buffers;
struct kgem_bo *bo;
- list_for_each_entry(bo, &rq->buffers, request) {
+ list_for_each_entry(bo, buffers, request) {
if (!bo->dirty)
break;
@@ -4305,6 +4809,7 @@ struct kgem_bo *kgem_create_proxy(struct kgem *kgem,
bo->tiling = target->tiling;
bo->pitch = target->pitch;
+ assert(!bo->scanout);
bo->proxy = kgem_bo_reference(target);
bo->delta = offset;
@@ -4351,7 +4856,7 @@ static inline bool
use_snoopable_buffer(struct kgem *kgem, uint32_t flags)
{
if ((flags & KGEM_BUFFER_WRITE) == 0)
- return kgem->gen >= 30;
+ return kgem->gen >= 030;
return true;
}
@@ -4425,8 +4930,6 @@ create_snoopable_buffer(struct kgem *kgem, unsigned alloc)
struct kgem_buffer *bo;
uint32_t handle;
- assert(!kgem->has_llc);
-
if (kgem->has_cacheing) {
struct kgem_bo *old;
@@ -4524,9 +5027,6 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
/* we should never be asked to create anything TOO large */
assert(size <= kgem->max_object_size);
- if (kgem->has_llc)
- flags &= ~KGEM_BUFFER_INPLACE;
-
#if !DBG_NO_UPLOAD_CACHE
list_for_each_entry(bo, &kgem->batch_buffers, base.list) {
assert(bo->base.io);
@@ -4580,8 +5080,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
assert(bo->base.io);
assert(bo->base.refcnt >= 1);
assert(bo->mmapped);
- assert(!bo->base.snoop);
- assert(!IS_CPU_MAP(bo->base.map) || kgem->has_llc);
+ assert(!IS_CPU_MAP(bo->base.map) || kgem->has_llc || bo->base.snoop);
if ((bo->write & ~flags) & KGEM_BUFFER_INPLACE) {
DBG(("%s: skip write %x buffer, need %x\n",
@@ -4608,11 +5107,16 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
alloc = ALIGN(size, kgem->buffer_size);
if (alloc > MAX_CACHE_SIZE)
alloc = PAGE_ALIGN(size);
+
+ if (alloc > kgem->aperture_mappable / 4)
+ flags &= ~KGEM_BUFFER_INPLACE;
alloc /= PAGE_SIZE;
- if (kgem->has_llc) {
+
+ if (kgem->has_llc &&
+ (flags & KGEM_BUFFER_WRITE_INPLACE) != KGEM_BUFFER_WRITE_INPLACE) {
bo = buffer_alloc();
if (bo == NULL)
- return NULL;
+ goto skip_llc;
old = NULL;
if ((flags & KGEM_BUFFER_WRITE) == 0)
@@ -4630,7 +5134,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
uint32_t handle = gem_create(kgem->fd, alloc);
if (handle == 0) {
free(bo);
- return NULL;
+ goto skip_llc;
}
__kgem_bo_init(&bo->base, handle, alloc);
DBG(("%s: created LLC handle=%d for buffer\n",
@@ -4646,17 +5150,14 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
if (bo->mem) {
if (flags & KGEM_BUFFER_WRITE)
kgem_bo_sync__cpu(kgem, &bo->base);
-
- alloc = num_pages(&bo->base);
+ flags &= ~KGEM_BUFFER_INPLACE;
goto init;
} else {
bo->base.refcnt = 0; /* for valgrind */
kgem_bo_free(kgem, &bo->base);
}
}
-
- if (PAGE_SIZE * alloc > kgem->aperture_mappable / 4)
- flags &= ~KGEM_BUFFER_INPLACE;
+skip_llc:
if ((flags & KGEM_BUFFER_WRITE_INPLACE) == KGEM_BUFFER_WRITE_INPLACE) {
/* The issue with using a GTT upload buffer is that we may
@@ -4695,7 +5196,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
CREATE_EXACT | CREATE_INACTIVE | CREATE_GTT_MAP);
if (old == NULL) {
old = search_linear_cache(kgem, alloc, CREATE_INACTIVE);
- if (old && !kgem_bo_is_mappable(kgem, old)) {
+ if (old && !__kgem_bo_is_mappable(kgem, old)) {
_kgem_bo_destroy(kgem, old);
old = NULL;
}
@@ -4703,7 +5204,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
if (old) {
DBG(("%s: reusing handle=%d for buffer\n",
__FUNCTION__, old->handle));
- assert(kgem_bo_is_mappable(kgem, old));
+ assert(__kgem_bo_is_mappable(kgem, old));
assert(!old->snoop);
assert(old->rq == NULL);
@@ -4719,9 +5220,8 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
bo->mem = kgem_bo_map(kgem, &bo->base);
if (bo->mem) {
- alloc = num_pages(&bo->base);
if (IS_CPU_MAP(bo->base.map))
- flags &= ~KGEM_BUFFER_INPLACE;
+ flags &= ~KGEM_BUFFER_INPLACE;
goto init;
} else {
bo->base.refcnt = 0;
@@ -4742,16 +5242,13 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
if (flags & KGEM_BUFFER_WRITE)
kgem_bo_sync__cpu(kgem, &bo->base);
flags &= ~KGEM_BUFFER_INPLACE;
- alloc = num_pages(&bo->base);
goto init;
}
- if ((flags & KGEM_BUFFER_WRITE_INPLACE) != KGEM_BUFFER_WRITE_INPLACE) {
+ if ((flags & KGEM_BUFFER_INPLACE) == 0) {
bo = create_snoopable_buffer(kgem, alloc);
- if (bo) {
- flags &= ~KGEM_BUFFER_INPLACE;
+ if (bo)
goto init;
- }
}
}
@@ -4765,8 +5262,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
if (old) {
DBG(("%s: reusing ordinary handle %d for io\n",
__FUNCTION__, old->handle));
- alloc = num_pages(old);
- bo = buffer_alloc_with_data(alloc);
+ bo = buffer_alloc_with_data(num_pages(old));
if (bo == NULL)
return NULL;
@@ -4793,7 +5289,6 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
DBG(("%s: reusing handle=%d for buffer\n",
__FUNCTION__, old->handle));
- alloc = num_pages(old);
init_buffer_from_bo(bo, old);
} else {
uint32_t handle = gem_create(kgem->fd, alloc);
@@ -4803,7 +5298,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
}
DBG(("%s: created handle=%d for buffer\n",
- __FUNCTION__, bo->base.handle));
+ __FUNCTION__, handle));
__kgem_bo_init(&bo->base, handle, alloc);
debug_alloc(kgem, alloc * PAGE_SIZE);
@@ -4815,16 +5310,18 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
if (flags & KGEM_BUFFER_WRITE) {
bo->mem = kgem_bo_map__cpu(kgem, &bo->base);
- if (bo->mem != NULL)
+ if (bo->mem != NULL) {
kgem_bo_sync__cpu(kgem, &bo->base);
- goto init;
+ goto init;
+ }
}
DBG(("%s: failing back to new pwrite buffer\n", __FUNCTION__));
old = &bo->base;
- bo = buffer_alloc_with_data(alloc);
+ bo = buffer_alloc_with_data(num_pages(old));
if (bo == NULL) {
- free(old);
+ old->refcnt = 0;
+ kgem_bo_free(kgem, old);
return NULL;
}
@@ -4839,7 +5336,7 @@ struct kgem_bo *kgem_create_buffer(struct kgem *kgem,
init:
bo->base.io = true;
assert(bo->base.refcnt == 1);
- assert(num_pages(&bo->base) == alloc);
+ assert(num_pages(&bo->base) >= NUM_PAGES(size));
assert(!bo->need_io || !bo->base.needs_flush);
assert(!bo->need_io || bo->base.domain != DOMAIN_GPU);
assert(bo->mem);
@@ -4852,8 +5349,8 @@ init:
assert(list_is_empty(&bo->base.list));
list_add(&bo->base.list, &kgem->batch_buffers);
- DBG(("%s(pages=%d) new handle=%d, used=%d, write=%d\n",
- __FUNCTION__, alloc, bo->base.handle, bo->used, bo->write));
+ DBG(("%s(pages=%d [%d]) new handle=%d, used=%d, write=%d\n",
+ __FUNCTION__, num_pages(&bo->base), alloc, bo->base.handle, bo->used, bo->write));
done:
bo->used = ALIGN(bo->used, UPLOAD_ALIGNMENT);
@@ -4919,10 +5416,10 @@ struct kgem_bo *kgem_create_buffer_2d(struct kgem *kgem,
struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
const void *data,
- BoxPtr box,
+ const BoxRec *box,
int stride, int bpp)
{
- int width = box->x2 - box->x1;
+ int width = box->x2 - box->x1;
int height = box->y2 - box->y1;
struct kgem_bo *bo;
void *dst;
@@ -4987,7 +5484,7 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
__FUNCTION__,
bo->base.needs_flush,
bo->base.domain,
- kgem_busy(kgem, bo->base.handle)));
+ __kgem_busy(kgem, bo->base.handle)));
assert(!IS_CPU_MAP(bo->base.map) || bo->base.snoop || kgem->has_llc);
@@ -5007,6 +5504,7 @@ void kgem_buffer_read_sync(struct kgem *kgem, struct kgem_bo *_bo)
return;
}
kgem_bo_retire(kgem, &bo->base);
+ bo->base.domain = DOMAIN_NONE;
}
uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format)
@@ -5104,18 +5602,22 @@ kgem_replace_bo(struct kgem *kgem,
dst->unique_id = kgem_get_unique_id(kgem);
dst->refcnt = 1;
- kgem_set_mode(kgem, KGEM_BLT);
+ kgem_set_mode(kgem, KGEM_BLT, dst);
if (!kgem_check_batch(kgem, 8) ||
!kgem_check_reloc(kgem, 2) ||
!kgem_check_many_bo_fenced(kgem, src, dst, NULL)) {
- _kgem_submit(kgem);
+ kgem_submit(kgem);
+ if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL)) {
+ kgem_bo_destroy(kgem, dst);
+ return NULL;
+ }
_kgem_set_mode(kgem, KGEM_BLT);
}
br00 = XY_SRC_COPY_BLT_CMD;
br13 = pitch;
pitch = src->pitch;
- if (kgem->gen >= 40 && src->tiling) {
+ if (kgem->gen >= 040 && src->tiling) {
br00 |= BLT_SRC_TILED;
pitch >>= 2;
}
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index e547215bb..a23194feb 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -43,6 +43,12 @@
#endif
struct kgem_bo {
+ struct kgem_request *rq;
+#define RQ(rq) ((struct kgem_request *)((uintptr_t)(rq) & ~3))
+#define RQ_RING(rq) ((uintptr_t)(rq) & 3)
+#define RQ_IS_BLT(rq) (RQ_RING(rq) == KGEM_BLT)
+ struct drm_i915_gem_exec_object2 *exec;
+
struct kgem_bo *proxy;
struct list list;
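The RQ()/RQ_RING() macros above implement a tagged pointer: the ring a bo was last active on is packed into the two low bits of bo->rq, relying on struct kgem_request allocations being at least 4-byte aligned. kgem_bo_mark_busy() further down ORs the ring in the same way, and MAKE_REQUEST(), used throughout the kgem.c side of this diff but not shown here, is presumably the matching encoder:

	/* sketch of the assumed encoder; decode via RQ()/RQ_RING() above */
	#define MAKE_REQUEST(rq, ring) \
		((struct kgem_request *)((uintptr_t)(rq) | (ring)))

	/* RQ(MAKE_REQUEST(rq, ring))      == rq    */
	/* RQ_RING(MAKE_REQUEST(rq, ring)) == ring  */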
@@ -52,8 +58,6 @@ struct kgem_bo {
void *map;
#define IS_CPU_MAP(ptr) ((uintptr_t)(ptr) & 1)
#define IS_GTT_MAP(ptr) (ptr && ((uintptr_t)(ptr) & 1) == 0)
- struct kgem_request *rq;
- struct drm_i915_gem_exec_object2 *exec;
struct kgem_bo_binding {
struct kgem_bo_binding *next;
@@ -64,6 +68,7 @@ struct kgem_bo {
uint32_t unique_id;
uint32_t refcnt;
uint32_t handle;
+ uint32_t target_handle;
uint32_t presumed_offset;
uint32_t delta;
union {
@@ -126,22 +131,30 @@ struct kgem {
struct list large_inactive;
struct list active[NUM_CACHE_BUCKETS][3];
struct list inactive[NUM_CACHE_BUCKETS];
+ struct list pinned_batches[2];
struct list snoop;
+ struct list scanout;
struct list batch_buffers, active_buffers;
struct list requests[2];
struct kgem_request *next_request;
- uint32_t num_requests;
+ struct kgem_request static_request;
struct {
struct list inactive[NUM_CACHE_BUCKETS];
int16_t count;
} vma[NUM_MAP_TYPES];
+ uint32_t batch_flags;
+ uint32_t batch_flags_base;
+#define I915_EXEC_SECURE (1<<9)
+#define LOCAL_EXEC_OBJECT_WRITE (1<<2)
+
uint16_t nbatch;
uint16_t surface;
uint16_t nexec;
uint16_t nreloc;
+ uint16_t nreloc__self;
uint16_t nfence;
uint16_t batch_size;
uint16_t min_alignment;
@@ -151,6 +164,7 @@ struct kgem {
uint32_t need_purge:1;
uint32_t need_retire:1;
uint32_t need_throttle:1;
+ uint32_t scanout_busy:1;
uint32_t busy:1;
uint32_t has_userptr :1;
@@ -158,8 +172,12 @@ struct kgem {
uint32_t has_relaxed_fencing :1;
uint32_t has_relaxed_delta :1;
uint32_t has_semaphores :1;
+ uint32_t has_secure_batches :1;
+ uint32_t has_pinned_batches :1;
uint32_t has_cacheing :1;
uint32_t has_llc :1;
+ uint32_t has_no_reloc :1;
+ uint32_t has_handle_lut :1;
uint32_t can_blt_cpu :1;
@@ -179,6 +197,7 @@ struct kgem {
uint32_t batch[64*1024-8];
struct drm_i915_gem_exec_object2 exec[256];
struct drm_i915_gem_relocation_entry reloc[4096];
+ uint16_t reloc__self[256];
#ifdef DEBUG_MEMORY
struct {
@@ -200,7 +219,7 @@ struct kgem {
#define KGEM_EXEC_SIZE(K) (int)(ARRAY_SIZE((K)->exec)-KGEM_EXEC_RESERVED)
#define KGEM_RELOC_SIZE(K) (int)(ARRAY_SIZE((K)->reloc)-KGEM_RELOC_RESERVED)
-void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, int gen);
+void kgem_init(struct kgem *kgem, int fd, struct pci_device *dev, unsigned gen);
void kgem_reset(struct kgem *kgem);
struct kgem_bo *kgem_create_map(struct kgem *kgem,
@@ -218,7 +237,7 @@ struct kgem_bo *kgem_create_proxy(struct kgem *kgem,
struct kgem_bo *kgem_upload_source_image(struct kgem *kgem,
const void *data,
- BoxPtr box,
+ const BoxRec *box,
int stride, int bpp);
void kgem_proxy_bo_attach(struct kgem_bo *bo, struct kgem_bo **ptr);
@@ -245,8 +264,9 @@ enum {
CREATE_SCANOUT = 0x10,
CREATE_PRIME = 0x20,
CREATE_TEMPORARY = 0x40,
- CREATE_NO_RETIRE = 0x80,
- CREATE_NO_THROTTLE = 0x100,
+ CREATE_CACHED = 0x80,
+ CREATE_NO_RETIRE = 0x100,
+ CREATE_NO_THROTTLE = 0x200,
};
struct kgem_bo *kgem_create_2d(struct kgem *kgem,
int width,
@@ -264,17 +284,25 @@ uint32_t kgem_bo_get_binding(struct kgem_bo *bo, uint32_t format);
void kgem_bo_set_binding(struct kgem_bo *bo, uint32_t format, uint16_t offset);
int kgem_bo_get_swizzling(struct kgem *kgem, struct kgem_bo *bo);
-void kgem_bo_retire(struct kgem *kgem, struct kgem_bo *bo);
bool kgem_retire(struct kgem *kgem);
-bool __kgem_is_idle(struct kgem *kgem);
+
+bool __kgem_ring_is_idle(struct kgem *kgem, int ring);
+static inline bool kgem_ring_is_idle(struct kgem *kgem, int ring)
+{
+ ring = ring == KGEM_BLT;
+
+ if (list_is_empty(&kgem->requests[ring]))
+ return true;
+
+ return __kgem_ring_is_idle(kgem, ring);
+}
+
static inline bool kgem_is_idle(struct kgem *kgem)
{
- if (kgem->num_requests == 0) {
- DBG(("%s: no outstanding requests\n", __FUNCTION__));
+ if (!kgem->need_retire)
return true;
- }
- return __kgem_is_idle(kgem);
+ return kgem_ring_is_idle(kgem, kgem->ring);
}
void _kgem_submit(struct kgem *kgem);
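kgem_ring_is_idle() first normalises its argument: requests[] holds exactly two lists, and `ring = ring == KGEM_BLT` collapses any ring id to index 1 for the blitter and 0 for everything else, matching `rq->ring = kgem->ring == KGEM_BLT` at submit time. A hypothetical helper spelling that out:

	static inline int ring_index(int ring)
	{
		return ring == KGEM_BLT; /* 1: blitter, 0: render/default */
	}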
@@ -284,9 +312,12 @@ static inline void kgem_submit(struct kgem *kgem)
_kgem_submit(kgem);
}
-static inline bool kgem_flush(struct kgem *kgem)
+static inline bool kgem_flush(struct kgem *kgem, bool flush)
{
- return kgem->flush && kgem_is_idle(kgem);
+ if (kgem->nreloc == 0)
+ return false;
+
+ return (kgem->flush ^ flush) && kgem_ring_is_idle(kgem, kgem->ring);
}
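A non-authoritative reading of the reworked kgem_flush(): kgem->flush records whether the pending batch already contains a flush-requiring bo (e.g. scanout or shared), while the new argument describes the bo about to be used; the XOR asks for an early submit only when the two disagree, and only when the ring is idle so the submit is useful work rather than a stall.

	/* batch flush | bo flush | early submit (given an idle ring)?
	 *    false    |  false   |  no  -- nothing to separate
	 *    false    |  true    |  yes -- isolate the flushy bo
	 *    true     |  false   |  yes -- keep this bo out of that batch
	 *    true     |  true    |  no  -- already accounted for
	 */
	static inline bool flush_state_differs(struct kgem *kgem, bool flush)
	{
		return kgem->flush ^ flush; /* hypothetical helper */
	}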
static inline void kgem_bo_submit(struct kgem *kgem, struct kgem_bo *bo)
@@ -295,7 +326,7 @@ static inline void kgem_bo_submit(struct kgem *kgem, struct kgem_bo *bo)
_kgem_submit(kgem);
}
-bool __kgem_flush(struct kgem *kgem, struct kgem_bo *bo);
+void __kgem_flush(struct kgem *kgem, struct kgem_bo *bo);
static inline void kgem_bo_flush(struct kgem *kgem, struct kgem_bo *bo)
{
kgem_bo_submit(kgem, bo);
@@ -307,7 +338,7 @@ static inline void kgem_bo_flush(struct kgem *kgem, struct kgem_bo *bo)
* we assume direct access. And as the usual failure is EIO, we do
* not actually care.
*/
- (void)__kgem_flush(kgem, bo);
+ __kgem_flush(kgem, bo);
}
static inline struct kgem_bo *kgem_bo_reference(struct kgem_bo *bo)
@@ -327,7 +358,9 @@ static inline void kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
void kgem_clear_dirty(struct kgem *kgem);
-static inline void kgem_set_mode(struct kgem *kgem, enum kgem_mode mode)
+static inline void kgem_set_mode(struct kgem *kgem,
+ enum kgem_mode mode,
+ struct kgem_bo *bo)
{
assert(!kgem->wedged);
@@ -335,6 +368,9 @@ static inline void kgem_set_mode(struct kgem *kgem, enum kgem_mode mode)
kgem_submit(kgem);
#endif
+ if (kgem->nreloc && bo->exec == NULL && kgem_ring_is_idle(kgem, kgem->ring))
+ _kgem_submit(kgem);
+
if (kgem->mode == mode)
return;
@@ -346,6 +382,7 @@ static inline void _kgem_set_mode(struct kgem *kgem, enum kgem_mode mode)
{
assert(kgem->mode == KGEM_NONE);
assert(kgem->nbatch == 0);
+ assert(!kgem->wedged);
kgem->context_switch(kgem, mode);
kgem->mode = mode;
}
@@ -384,33 +421,21 @@ static inline bool kgem_check_batch_with_surfaces(struct kgem *kgem,
kgem_check_exec(kgem, num_surfaces);
}
-static inline uint32_t *kgem_get_batch(struct kgem *kgem, int num_dwords)
+static inline uint32_t *kgem_get_batch(struct kgem *kgem)
{
- if (!kgem_check_batch(kgem, num_dwords))
+ if (kgem->nreloc) {
+ unsigned mode = kgem->mode;
_kgem_submit(kgem);
+ _kgem_set_mode(kgem, mode);
+ }
return kgem->batch + kgem->nbatch;
}
-static inline void kgem_advance_batch(struct kgem *kgem, int num_dwords)
-{
- kgem->nbatch += num_dwords;
-}
-
bool kgem_check_bo(struct kgem *kgem, ...) __attribute__((sentinel(0)));
bool kgem_check_bo_fenced(struct kgem *kgem, struct kgem_bo *bo);
bool kgem_check_many_bo_fenced(struct kgem *kgem, ...) __attribute__((sentinel(0)));
-void _kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo);
-static inline void kgem_add_bo(struct kgem *kgem, struct kgem_bo *bo)
-{
- if (bo->proxy)
- bo = bo->proxy;
-
- if (bo->exec == NULL)
- _kgem_add_bo(kgem, bo);
-}
-
#define KGEM_RELOC_FENCED 0x8000
uint32_t kgem_add_reloc(struct kgem *kgem,
uint32_t pos,
@@ -425,6 +450,7 @@ void kgem_bo_sync__gtt(struct kgem *kgem, struct kgem_bo *bo);
void *kgem_bo_map__debug(struct kgem *kgem, struct kgem_bo *bo);
void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
void kgem_bo_sync__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void kgem_bo_sync__cpu_full(struct kgem *kgem, struct kgem_bo *bo, bool write);
void *__kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
void __kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr);
uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
@@ -460,7 +486,7 @@ static inline bool kgem_bo_blt_pitch_is_ok(struct kgem *kgem,
struct kgem_bo *bo)
{
int pitch = bo->pitch;
- if (kgem->gen >= 40 && bo->tiling)
+ if (kgem->gen >= 040 && bo->tiling)
pitch /= 4;
if (pitch > MAXSHORT) {
DBG(("%s: can not blt to handle=%d, adjusted pitch=%d\n",
@@ -483,16 +509,13 @@ static inline bool kgem_bo_can_blt(struct kgem *kgem,
return kgem_bo_blt_pitch_is_ok(kgem, bo);
}
-static inline bool kgem_bo_is_mappable(struct kgem *kgem,
- struct kgem_bo *bo)
+static inline bool __kgem_bo_is_mappable(struct kgem *kgem,
+ struct kgem_bo *bo)
{
- DBG(("%s: domain=%d, offset: %d size: %d\n",
- __FUNCTION__, bo->domain, bo->presumed_offset, kgem_bo_size(bo)));
-
if (bo->domain == DOMAIN_GTT)
return true;
- if (kgem->gen < 40 && bo->tiling &&
+ if (kgem->gen < 040 && bo->tiling &&
bo->presumed_offset & (kgem_bo_fenced_size(kgem, bo) - 1))
return false;
@@ -502,17 +525,24 @@ static inline bool kgem_bo_is_mappable(struct kgem *kgem,
return bo->presumed_offset + kgem_bo_size(bo) <= kgem->aperture_mappable;
}
+static inline bool kgem_bo_is_mappable(struct kgem *kgem,
+ struct kgem_bo *bo)
+{
+ DBG(("%s: domain=%d, offset: %d size: %d\n",
+ __FUNCTION__, bo->domain, bo->presumed_offset, kgem_bo_size(bo)));
+ assert(bo->refcnt);
+ return __kgem_bo_is_mappable(kgem, bo);
+}
+
static inline bool kgem_bo_mapped(struct kgem *kgem, struct kgem_bo *bo)
{
DBG(("%s: map=%p, tiling=%d, domain=%d\n",
__FUNCTION__, bo->map, bo->tiling, bo->domain));
+ assert(bo->refcnt);
if (bo->map == NULL)
return bo->tiling == I915_TILING_NONE && bo->domain == DOMAIN_CPU;
- if (bo->tiling == I915_TILING_X && !bo->scanout && kgem->has_llc)
- return IS_CPU_MAP(bo->map);
-
return IS_CPU_MAP(bo->map) == !bo->tiling;
}
@@ -524,7 +554,7 @@ static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo)
if (!bo->tiling && kgem->has_llc)
return true;
- if (kgem->gen == 21 && bo->tiling == I915_TILING_Y)
+ if (kgem->gen == 021 && bo->tiling == I915_TILING_Y)
return false;
return kgem_bo_size(bo) <= kgem->aperture_mappable / 4;
@@ -532,15 +562,32 @@ static inline bool kgem_bo_can_map(struct kgem *kgem, struct kgem_bo *bo)
static inline bool kgem_bo_is_snoop(struct kgem_bo *bo)
{
+ assert(bo->refcnt);
while (bo->proxy)
bo = bo->proxy;
return bo->snoop;
}
+bool __kgem_busy(struct kgem *kgem, int handle);
+
+static inline void kgem_bo_mark_busy(struct kgem_bo *bo, int ring)
+{
+ bo->rq = (struct kgem_request *)((uintptr_t)bo->rq | ring);
+}
+
+inline static void __kgem_bo_clear_busy(struct kgem_bo *bo)
+{
+ bo->needs_flush = false;
+ list_del(&bo->request);
+ bo->rq = NULL;
+ bo->domain = DOMAIN_NONE;
+}
+
static inline bool kgem_bo_is_busy(struct kgem_bo *bo)
{
DBG(("%s: handle=%d, domain: %d exec? %d, rq? %d\n", __FUNCTION__,
bo->handle, bo->domain, bo->exec != NULL, bo->rq != NULL));
+ assert(bo->refcnt);
return bo->rq;
}
@@ -548,10 +595,17 @@ static inline bool __kgem_bo_is_busy(struct kgem *kgem, struct kgem_bo *bo)
{
DBG(("%s: handle=%d, domain: %d exec? %d, rq? %d\n", __FUNCTION__,
bo->handle, bo->domain, bo->exec != NULL, bo->rq != NULL));
- if (kgem_flush(kgem))
+ assert(bo->refcnt);
+
+ if (bo->exec)
+ return true;
+
+ if (kgem_flush(kgem, bo->flush))
kgem_submit(kgem);
- if (bo->rq && !bo->exec)
- kgem_retire(kgem);
+
+ if (bo->rq && !__kgem_busy(kgem, bo->handle))
+ __kgem_bo_clear_busy(bo);
+
return kgem_bo_is_busy(bo);
}
@@ -560,21 +614,42 @@ static inline bool kgem_bo_is_dirty(struct kgem_bo *bo)
if (bo == NULL)
return false;
+ assert(bo->refcnt);
return bo->dirty;
}
+static inline void kgem_bo_unclean(struct kgem *kgem, struct kgem_bo *bo)
+{
+ /* The bo is outside of our control, so presume it is written to */
+ bo->needs_flush = true;
+ if (bo->rq == NULL)
+ bo->rq = (void *)kgem;
+
+ if (bo->domain != DOMAIN_GPU)
+ bo->domain = DOMAIN_NONE;
+}
+
+static inline void __kgem_bo_mark_dirty(struct kgem_bo *bo)
+{
+ DBG(("%s: handle=%d (proxy? %d)\n", __FUNCTION__,
+ bo->handle, bo->proxy != NULL));
+
+ bo->exec->flags |= LOCAL_EXEC_OBJECT_WRITE;
+ bo->needs_flush = bo->dirty = true;
+ list_move(&bo->request, &RQ(bo->rq)->buffers);
+}
+
static inline void kgem_bo_mark_dirty(struct kgem_bo *bo)
{
+ assert(bo->refcnt);
do {
- if (bo->dirty)
- return;
-
- DBG(("%s: handle=%d\n", __FUNCTION__, bo->handle));
assert(bo->exec);
assert(bo->rq);
- bo->needs_flush = bo->dirty = true;
- list_move(&bo->request, &bo->rq->buffers);
+ if (bo->dirty)
+ return;
+
+ __kgem_bo_mark_dirty(bo);
} while ((bo = bo->proxy));
}
@@ -600,7 +675,7 @@ bool kgem_expire_cache(struct kgem *kgem);
void kgem_purge_cache(struct kgem *kgem);
void kgem_cleanup_cache(struct kgem *kgem);
-#if HAS_EXTRA_DEBUG
+#if HAS_DEBUG_FULL
void __kgem_batch_debug(struct kgem *kgem, uint32_t nbatch);
#else
static inline void __kgem_batch_debug(struct kgem *kgem, uint32_t nbatch)
diff --git a/src/sna/kgem_debug.c b/src/sna/kgem_debug.c
index 2dc1b4564..48c75889c 100644
--- a/src/sna/kgem_debug.c
+++ b/src/sna/kgem_debug.c
@@ -62,7 +62,7 @@ kgem_debug_get_bo_for_reloc_entry(struct kgem *kgem,
return NULL;
list_for_each_entry(bo, &kgem->next_request->buffers, request)
- if (bo->handle == reloc->target_handle && bo->proxy == NULL)
+ if (bo->target_handle == reloc->target_handle && bo->proxy == NULL)
break;
assert(&bo->request != &kgem->next_request->buffers);
@@ -74,6 +74,9 @@ static int kgem_debug_handle_is_fenced(struct kgem *kgem, uint32_t handle)
{
int i;
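+	/* with I915_EXEC_HANDLE_LUT the reloc target_handle is an index into exec[] */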
+ if (kgem->has_handle_lut)
+ return kgem->exec[handle].flags & EXEC_OBJECT_NEEDS_FENCE;
+
for (i = 0; i < kgem->nexec; i++)
if (kgem->exec[i].handle == handle)
return kgem->exec[i].flags & EXEC_OBJECT_NEEDS_FENCE;
@@ -86,7 +89,7 @@ static int kgem_debug_handle_tiling(struct kgem *kgem, uint32_t handle)
struct kgem_bo *bo;
list_for_each_entry(bo, &kgem->next_request->buffers, request)
- if (bo->handle == handle)
+ if (bo->target_handle == handle)
return bo->tiling;
return 0;
@@ -95,7 +98,7 @@ static int kgem_debug_handle_tiling(struct kgem *kgem, uint32_t handle)
void
kgem_debug_print(const uint32_t *data,
uint32_t offset, unsigned int index,
- char *fmt, ...)
+ const char *fmt, ...)
{
va_list va;
char buf[240];
@@ -273,7 +276,7 @@ decode_2d(struct kgem *kgem, uint32_t offset)
kgem_debug_handle_is_fenced(kgem, reloc->target_handle),
kgem_debug_handle_tiling(kgem, reloc->target_handle));
kgem_debug_print(data, offset, 5, "color\n");
- assert(kgem->gen >= 40 ||
+ assert(kgem->gen >= 040 ||
kgem_debug_handle_is_fenced(kgem, reloc->target_handle));
return len;
@@ -321,7 +324,7 @@ decode_2d(struct kgem *kgem, uint32_t offset)
reloc->read_domains, reloc->write_domain,
kgem_debug_handle_is_fenced(kgem, reloc->target_handle),
kgem_debug_handle_tiling(kgem, reloc->target_handle));
- assert(kgem->gen >= 40 ||
+ assert(kgem->gen >= 040 ||
kgem_debug_handle_is_fenced(kgem, reloc->target_handle));
kgem_debug_print(data, offset, 5, "src (%d,%d)\n",
@@ -336,7 +339,7 @@ decode_2d(struct kgem *kgem, uint32_t offset)
reloc->read_domains, reloc->write_domain,
kgem_debug_handle_is_fenced(kgem, reloc->target_handle),
kgem_debug_handle_tiling(kgem, reloc->target_handle));
- assert(kgem->gen >= 40 ||
+ assert(kgem->gen >= 040 ||
kgem_debug_handle_is_fenced(kgem, reloc->target_handle));
return len;
@@ -368,18 +371,18 @@ decode_2d(struct kgem *kgem, uint32_t offset)
static int (*decode_3d(int gen))(struct kgem*, uint32_t)
{
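+	/* gen is an octal literal: 030 == gen3, 070 == gen7, 0100 == gen8 */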
- if (gen >= 80) {
- } else if (gen >= 70) {
+ if (gen >= 0100) {
+ } else if (gen >= 070) {
return kgem_gen7_decode_3d;
- } else if (gen >= 60) {
+ } else if (gen >= 060) {
return kgem_gen6_decode_3d;
- } else if (gen >= 50) {
+ } else if (gen >= 050) {
return kgem_gen5_decode_3d;
- } else if (gen >= 40) {
+ } else if (gen >= 040) {
return kgem_gen4_decode_3d;
- } else if (gen >= 30) {
+ } else if (gen >= 030) {
return kgem_gen3_decode_3d;
- } else if (gen >= 20) {
+ } else if (gen >= 020) {
return kgem_gen2_decode_3d;
}
assert(0);
@@ -387,18 +390,18 @@ static int (*decode_3d(int gen))(struct kgem*, uint32_t)
static void (*finish_state(int gen))(struct kgem*)
{
- if (gen >= 80) {
- } else if (gen >= 70) {
+ if (gen >= 0100) {
+ } else if (gen >= 070) {
return kgem_gen7_finish_state;
- } else if (gen >= 60) {
+ } else if (gen >= 060) {
return kgem_gen6_finish_state;
- } else if (gen >= 50) {
+ } else if (gen >= 050) {
return kgem_gen5_finish_state;
- } else if (gen >= 40) {
+ } else if (gen >= 040) {
return kgem_gen4_finish_state;
- } else if (gen >= 30) {
+ } else if (gen >= 030) {
return kgem_gen3_finish_state;
- } else if (gen >= 20) {
+ } else if (gen >= 020) {
return kgem_gen2_finish_state;
}
assert(0);
diff --git a/src/sna/kgem_debug.h b/src/sna/kgem_debug.h
index 82d6f6664..a0c9fc177 100644
--- a/src/sna/kgem_debug.h
+++ b/src/sna/kgem_debug.h
@@ -4,7 +4,7 @@
void
kgem_debug_print(const uint32_t *data,
uint32_t offset, unsigned int index,
- char *fmt, ...);
+ const char *fmt, ...);
struct drm_i915_gem_relocation_entry *
kgem_debug_get_reloc_entry(struct kgem *kgem, uint32_t offset);
diff --git a/src/sna/kgem_debug_gen5.c b/src/sna/kgem_debug_gen5.c
index e23ceb1fa..8b55dd919 100644
--- a/src/sna/kgem_debug_gen5.c
+++ b/src/sna/kgem_debug_gen5.c
@@ -73,7 +73,7 @@ static void gen5_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
int i, size;
reloc = kgem_debug_get_reloc_entry(kgem, &data[1] - kgem->batch);
- if (reloc->target_handle == 0) {
+ if (reloc->target_handle == -1) {
base = kgem->batch;
size = kgem->nbatch * sizeof(uint32_t);
} else {
@@ -529,20 +529,19 @@ int kgem_gen5_decode_3d(struct kgem *kgem, uint32_t offset)
for (i = 1; i < len;) {
gen5_update_vertex_elements(kgem, (i - 1)/2, data + i);
- kgem_debug_print(data, offset, i, "buffer %d: %svalid, type 0x%04x, "
- "src offset 0x%04x bytes\n",
- data[i] >> 27,
- data[i] & (1 << 26) ? "" : "in",
- (data[i] >> 16) & 0x1ff,
- data[i] & 0x07ff);
+ kgem_debug_print(data, offset, i,
+ "buffer %d: %svalid, type 0x%04x, "
+ "src offset 0x%04x bytes\n",
+ data[i] >> 27,
+ data[i] & (1 << 26) ? "" : "in",
+ (data[i] >> 16) & 0x1ff,
+ data[i] & 0x07ff);
i++;
- kgem_debug_print(data, offset, i, "(%s, %s, %s, %s), "
- "dst offset 0x%02x bytes\n",
+ kgem_debug_print(data, offset, i, "(%s, %s, %s, %s)\n",
get_965_element_component(data[i], 0),
get_965_element_component(data[i], 1),
get_965_element_component(data[i], 2),
- get_965_element_component(data[i], 3),
- (data[i] & 0xff) * 4);
+ get_965_element_component(data[i], 3));
i++;
}
state.num_ve = (len - 1) / 2; /* XXX? */
diff --git a/src/sna/kgem_debug_gen6.c b/src/sna/kgem_debug_gen6.c
index e0b09d558..7ef55d38f 100644
--- a/src/sna/kgem_debug_gen6.c
+++ b/src/sna/kgem_debug_gen6.c
@@ -75,11 +75,11 @@ static void gen6_update_vertex_buffer(struct kgem *kgem, const uint32_t *data)
assert(i < kgem->nreloc);
reloc = kgem->reloc[i].target_handle;
- if (reloc == 0) {
+ if (reloc == -1) {
base = kgem->batch;
} else {
list_for_each_entry(bo, &kgem->next_request->buffers, request)
- if (bo->handle == reloc)
+ if (bo->target_handle == reloc)
break;
assert(&bo->request != &kgem->next_request->buffers);
base = kgem_bo_map__debug(kgem, bo);
@@ -643,7 +643,7 @@ int kgem_gen6_decode_3d(struct kgem *kgem, uint32_t offset)
case 0x6101:
i = 0;
kgem_debug_print(data, offset, i++, "STATE_BASE_ADDRESS\n");
- if (kgem->gen >= 60) {
+ if (kgem->gen >= 060) {
assert(len == 10);
state_base_out(data, offset, i++, "general");
@@ -658,7 +658,7 @@ int kgem_gen6_decode_3d(struct kgem *kgem, uint32_t offset)
state_max_out(data, offset, i++, "instruction");
gen6_update_dynamic_buffer(kgem, offset + 3);
- } else if (kgem->gen >= 50) {
+ } else if (kgem->gen >= 050) {
assert(len == 8);
state_base_out(data, offset, i++, "general");
@@ -674,7 +674,7 @@ int kgem_gen6_decode_3d(struct kgem *kgem, uint32_t offset)
return len;
case 0x7801:
- if (kgem->gen >= 60) {
+ if (kgem->gen >= 060) {
assert(len == 4);
kgem_debug_print(data, offset, 0,
@@ -686,7 +686,7 @@ int kgem_gen6_decode_3d(struct kgem *kgem, uint32_t offset)
kgem_debug_print(data, offset, 1, "VS binding table\n");
kgem_debug_print(data, offset, 2, "GS binding table\n");
kgem_debug_print(data, offset, 3, "WM binding table\n");
- } else if (kgem->gen >= 40) {
+ } else if (kgem->gen >= 040) {
assert(len == 6);
kgem_debug_print(data, offset, 0,
diff --git a/src/sna/sna.h b/src/sna/sna.h
index 031be7287..b470c48a0 100644
--- a/src/sna/sna.h
+++ b/src/sna/sna.h
@@ -42,6 +42,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#endif
#include <stdint.h>
+
#include "compiler.h"
#include <xorg-server.h>
@@ -79,11 +80,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#define DBG(x)
#endif
-#define DEBUG_NO_RENDER 0
#define DEBUG_NO_BLT 0
#define DEBUG_FLUSH_BATCH 0
-#define DEBUG_FLUSH_SYNC 0
#define TEST_ALL 0
#define TEST_ACCEL (TEST_ALL || 0)
@@ -112,9 +111,9 @@ struct sna_pixmap {
struct kgem_bo *gpu_bo, *cpu_bo;
struct sna_damage *gpu_damage, *cpu_damage;
void *ptr;
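+/* bit 0 of ptr tags storage the driver does not own; PTR() strips the tag */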
+#define PTR(ptr) ((void*)((uintptr_t)(ptr) & ~1))
struct list list;
- struct list inactive;
uint32_t stride;
uint32_t clear_color;
@@ -127,11 +126,10 @@ struct sna_pixmap {
#define PIN_SCANOUT 0x1
#define PIN_DRI 0x2
#define PIN_PRIME 0x4
+ uint8_t create :4;
uint8_t mapped :1;
uint8_t shm :1;
uint8_t clear :1;
- uint8_t undamaged :1;
- uint8_t create :3;
uint8_t header :1;
uint8_t cpu :1;
};
@@ -143,6 +141,15 @@ struct sna_glyph {
uint16_t size, pos;
};
+static inline WindowPtr root(ScreenPtr screen)
+{
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,10,0,0,0)
+ return screen->root;
+#else
+ return WindowTable[screen->myNum];
+#endif
+}
+
static inline PixmapPtr get_window_pixmap(WindowPtr window)
{
return fbGetWindowPixmap(window);
@@ -160,7 +167,7 @@ extern DevPrivateKeyRec sna_pixmap_key;
constant static inline struct sna_pixmap *sna_pixmap(PixmapPtr pixmap)
{
- return ((void **)dixGetPrivateAddr(&pixmap->devPrivates, &sna_pixmap_key))[1];
+ return ((void **)__get_private(pixmap, sna_pixmap_key))[1];
}
static inline struct sna_pixmap *sna_pixmap_from_drawable(DrawablePtr drawable)
@@ -178,14 +185,13 @@ struct sna_gc {
static inline struct sna_gc *sna_gc(GCPtr gc)
{
- return dixGetPrivateAddr(&gc->devPrivates, &sna_gc_key);
+ return (struct sna_gc *)__get_private(gc, sna_gc_key);
}
enum {
FLUSH_TIMER = 0,
THROTTLE_TIMER,
EXPIRE_TIMER,
- INACTIVE_TIMER,
#if DEBUG_MEMORY
DEBUG_MEMORY_TIMER,
#endif
@@ -196,10 +202,9 @@ struct sna {
ScrnInfoPtr scrn;
unsigned flags;
-#define SNA_NO_THROTTLE 0x1
-#define SNA_NO_DELAYED_FLUSH 0x2
-#define SNA_NO_WAIT 0x4
-#define SNA_NO_FLIP 0x8
+#define SNA_NO_WAIT 0x1
+#define SNA_NO_FLIP 0x2
+#define SNA_TRIPLE_BUFFER 0x4
#define SNA_TEAR_FREE 0x10
#define SNA_FORCE_SHADOW 0x20
@@ -213,7 +218,6 @@ struct sna {
struct list flush_pixmaps;
struct list active_pixmaps;
- struct list inactive_clock[2];
PixmapPtr front;
PixmapPtr freed_pixmap;
@@ -237,7 +241,6 @@ struct sna {
unsigned int tiling;
#define SNA_TILING_FB 0x1
#define SNA_TILING_2D 0x2
-#define SNA_TILING_3D 0x4
#define SNA_TILING_ALL (~0)
EntityInfoPtr pEnt;
@@ -262,7 +265,6 @@ struct sna {
struct gen6_render_state gen6;
struct gen7_render_state gen7;
} render_state;
- uint32_t have_render;
bool dri_available;
bool dri_open;
@@ -298,6 +300,7 @@ extern void sna_mode_update(struct sna *sna);
extern void sna_mode_disable_unused(struct sna *sna);
extern void sna_mode_wakeup(struct sna *sna);
extern void sna_mode_redisplay(struct sna *sna);
+extern void sna_mode_close(struct sna *sna);
extern void sna_mode_fini(struct sna *sna);
extern int sna_page_flip(struct sna *sna,
@@ -320,7 +323,7 @@ to_sna_from_screen(ScreenPtr screen)
constant static inline struct sna *
to_sna_from_pixmap(PixmapPtr pixmap)
{
- return ((void **)dixGetPrivateAddr(&pixmap->devPrivates, &sna_pixmap_key))[0];
+ return ((void **)__get_private(pixmap, sna_pixmap_key))[0];
}
constant static inline struct sna *
@@ -371,10 +374,11 @@ static inline void sna_dri_vblank_handler(struct sna *sna, struct drm_event_vbla
static inline void sna_dri_destroy_window(WindowPtr win) { }
static inline void sna_dri_close(struct sna *sna, ScreenPtr pScreen) { }
#endif
+void sna_dri_pixmap_update_bo(struct sna *sna, PixmapPtr pixmap);
extern int sna_crtc_to_pipe(xf86CrtcPtr crtc);
-extern int sna_crtc_to_plane(xf86CrtcPtr crtc);
-extern int sna_crtc_id(xf86CrtcPtr crtc);
+extern uint32_t sna_crtc_to_plane(xf86CrtcPtr crtc);
+extern uint32_t sna_crtc_id(xf86CrtcPtr crtc);
CARD32 sna_format_for_depth(int depth);
CARD32 sna_render_format_for_depth(int depth);
@@ -438,6 +442,9 @@ void sna_pixmap_destroy(PixmapPtr pixmap);
#define __MOVE_FORCE 0x40
#define __MOVE_DRI 0x80
+bool
+sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int flags);
+
struct sna_pixmap *sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags);
static inline struct sna_pixmap *
sna_pixmap_force_to_gpu(PixmapPtr pixmap, unsigned flags)
@@ -483,6 +490,24 @@ struct kgem_bo *
sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box,
struct sna_damage ***damage);
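+/* saturating int16 addition helpers, hoisted here from sna_accel.c */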
+inline static int16_t bound(int16_t a, uint16_t b)
+{
+ int v = (int)a + (int)b;
+ if (v > MAXSHORT)
+ return MAXSHORT;
+ return v;
+}
+
+inline static int16_t clamp(int16_t a, int16_t b)
+{
+ int v = (int)a + (int)b;
+ if (v > MAXSHORT)
+ return MAXSHORT;
+ if (v < MINSHORT)
+ return MINSHORT;
+ return v;
+}
+
static inline bool
box_inplace(PixmapPtr pixmap, const BoxRec *box)
{
@@ -587,6 +612,20 @@ _sna_get_transformed_coordinates(int x, int y,
*y_out = result[1] / (double)result[2];
}
+static inline void
+_sna_get_transformed_scaled(int x, int y,
+ const PictTransform *transform, const float *sf,
+ float *x_out, float *y_out)
+{
+ *x_out = sf[0] * (transform->matrix[0][0] * x +
+ transform->matrix[0][1] * y +
+ transform->matrix[0][2]);
+
+ *y_out = sf[1] * (transform->matrix[1][0] * x +
+ transform->matrix[1][1] * y +
+ transform->matrix[1][2]);
+}
+
void
sna_get_transformed_coordinates(int x, int y,
const PictTransform *transform,
@@ -602,6 +641,12 @@ bool sna_transform_is_integer_translation(const PictTransform *t,
int16_t *tx, int16_t *ty);
bool sna_transform_is_translation(const PictTransform *t,
pixman_fixed_t *tx, pixman_fixed_t *ty);
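+/* any non-zero off-diagonal term qualifies, so shears also count as rotations */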
+static inline bool
+sna_affine_transform_is_rotation(const PictTransform *t)
+{
+ assert(sna_transform_is_affine(t));
+ return t->matrix[0][1] | t->matrix[1][0];
+}
static inline bool
sna_transform_equal(const PictTransform *a, const PictTransform *b)
@@ -635,7 +680,7 @@ static inline bool wedged(struct sna *sna)
static inline bool can_render(struct sna *sna)
{
- return likely(!sna->kgem.wedged && sna->have_render);
+ return likely(!sna->kgem.wedged && sna->render.prefer_gpu & PREFER_GPU_RENDER);
}
static inline uint32_t pixmap_size(PixmapPtr pixmap)
@@ -665,6 +710,15 @@ void sna_composite(CARD8 op,
INT16 mask_x, INT16 mask_y,
INT16 dst_x, INT16 dst_y,
CARD16 width, CARD16 height);
+void sna_composite_fb(CARD8 op,
+ PicturePtr src,
+ PicturePtr mask,
+ PicturePtr dst,
+ RegionPtr region,
+ INT16 src_x, INT16 src_y,
+ INT16 mask_x, INT16 mask_y,
+ INT16 dst_x, INT16 dst_y,
+ CARD16 width, CARD16 height);
void sna_composite_rectangles(CARD8 op,
PicturePtr dst,
xRenderColor *color,
@@ -787,6 +841,7 @@ memcpy_xor(const void *src, void *dst, int bpp,
#define SNA_CREATE_FB 0x10
#define SNA_CREATE_SCRATCH 0x11
+#define SNA_CREATE_GLYPHS 0x12
inline static bool is_power_of_two(unsigned x)
{
@@ -801,4 +856,22 @@ inline static bool is_clipped(const RegionRec *r,
r->extents.y2 - r->extents.y1 != d->height);
}
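+/* worker-thread helpers used by the software rasterisation paths */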
+void sna_threads_init(void);
+int sna_use_threads(int width, int height, int threshold);
+void sna_threads_run(void (*func)(void *arg), void *arg);
+void sna_threads_wait(void);
+
+void sna_image_composite(pixman_op_t op,
+ pixman_image_t *src,
+ pixman_image_t *mask,
+ pixman_image_t *dst,
+ int16_t src_x,
+ int16_t src_y,
+ int16_t mask_x,
+ int16_t mask_y,
+ int16_t dst_x,
+ int16_t dst_y,
+ uint16_t width,
+ uint16_t height);
+
#endif /* _SNA_H */
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index a8a0c931a..827dcf4ac 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -29,6 +29,7 @@
#include "config.h"
#endif
+#include "intel_options.h"
#include "sna.h"
#include "sna_reg.h"
#include "rop.h"
@@ -51,6 +52,8 @@
#include <sys/mman.h>
#include <unistd.h>
+#define FAULT_INJECTION 0
+
#define FORCE_INPLACE 0
#define FORCE_FALLBACK 0
#define FORCE_FLUSH 0
@@ -60,8 +63,9 @@
#define USE_INPLACE 1
#define USE_WIDE_SPANS 0 /* -1 force CPU, 1 force GPU */
#define USE_ZERO_SPANS 1 /* -1 force CPU, 1 force GPU */
-#define USE_INACTIVE 0
#define USE_CPU_BO 1
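+/* userptr allows kgem to wrap client memory in a bo directly, kernel permitting */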
+#define USE_USERPTR_UPLOADS 1
+#define USE_USERPTR_DOWNLOADS 1
#define MIGRATE_ALL 0
#define DBG_NO_CPU_UPLOAD 0
@@ -92,6 +96,9 @@
#define NO_TILE_8x8 0
#define NO_STIPPLE_8x8 0
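+/* bit 0 of priv->ptr marks storage we must not free (e.g. SHM segments) */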
+#define IS_STATIC_PTR(ptr) ((uintptr_t)(ptr) & 1)
+#define MAKE_STATIC_PTR(ptr) ((void*)((uintptr_t)(ptr) | 1))
+
#if 0
static void __sna_fallback_flush(DrawablePtr d)
{
@@ -318,6 +325,8 @@ static void assert_pixmap_damage(PixmapPtr p)
if (priv == NULL)
return;
+ assert(priv->gpu_damage == NULL || priv->gpu_bo);
+
if (priv->clear) {
assert(DAMAGE_IS_ALL(priv->gpu_damage));
assert(priv->cpu_damage == NULL);
@@ -345,7 +354,7 @@ static void assert_pixmap_damage(PixmapPtr p)
_sna_damage_debug_get_region(DAMAGE_PTR(priv->cpu_damage), &cpu);
RegionIntersect(&reg, &cpu, &gpu);
- assert(!RegionNotEmpty(&reg));
+ assert(RegionNil(&reg));
RegionUninit(&reg);
RegionUninit(&gpu);
@@ -405,8 +414,6 @@ static void sna_pixmap_free_gpu(struct sna *sna, struct sna_pixmap *priv)
priv->mapped = false;
}
- list_del(&priv->inactive);
-
/* and reset the upload counter */
priv->source_count = SOURCE_BIAS;
}
@@ -433,20 +440,20 @@ sna_pixmap_alloc_cpu(struct sna *sna,
pixmap->drawable.width,
pixmap->drawable.height,
pixmap->drawable.bitsPerPixel,
- from_gpu ? 0 : CREATE_CPU_MAP | CREATE_INACTIVE);
+ from_gpu ? 0 : CREATE_CPU_MAP | CREATE_INACTIVE | CREATE_NO_THROTTLE);
if (priv->cpu_bo) {
priv->ptr = kgem_bo_map__cpu(&sna->kgem, priv->cpu_bo);
- priv->stride = priv->cpu_bo->pitch;
if (priv->ptr) {
DBG(("%s: allocated CPU handle=%d (snooped? %d)\n", __FUNCTION__,
priv->cpu_bo->handle, priv->cpu_bo->snoop));
+ priv->stride = priv->cpu_bo->pitch;
#ifdef DEBUG_MEMORY
sna->debug_memory.cpu_bo_allocs++;
sna->debug_memory.cpu_bo_bytes += kgem_bo_size(priv->cpu_bo);
+#endif
} else {
kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
priv->cpu_bo = NULL;
-#endif
}
}
}
@@ -459,17 +466,15 @@ sna_pixmap_alloc_cpu(struct sna *sna,
assert(priv->ptr);
done:
- pixmap->devPrivate.ptr = priv->ptr;
- pixmap->devKind = priv->stride;
assert(priv->stride);
+ assert(!priv->mapped);
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
+ pixmap->devKind = priv->stride;
return priv->ptr != NULL;
}
-static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
+static void __sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
{
- assert(priv->cpu_damage == NULL);
- assert(list_is_empty(&priv->list));
-
if (priv->cpu_bo) {
DBG(("%s: discarding CPU buffer, handle=%d, size=%d\n",
__FUNCTION__, priv->cpu_bo->handle, kgem_bo_size(priv->cpu_bo)));
@@ -477,17 +482,29 @@ static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
sna->debug_memory.cpu_bo_allocs--;
sna->debug_memory.cpu_bo_bytes -= kgem_bo_size(priv->cpu_bo);
#endif
- if (priv->cpu_bo->flush) {
- assert(priv->cpu_bo->reusable == false);
+ if (!priv->cpu_bo->reusable) {
+ assert(priv->cpu_bo->flush == true);
kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
sna_accel_watch_flush(sna, -1);
}
kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
- priv->cpu_bo = NULL;
- } else
+ } else if (!IS_STATIC_PTR(priv->ptr))
free(priv->ptr);
+}
+
+static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
+{
+ assert(priv->cpu_damage == NULL);
+ assert(list_is_empty(&priv->list));
+
+ if (IS_STATIC_PTR(priv->ptr))
+ return;
+ __sna_pixmap_free_cpu(sna, priv);
+
+ priv->cpu_bo = NULL;
priv->ptr = NULL;
+
if (!priv->mapped)
priv->pixmap->devPrivate.ptr = NULL;
}
@@ -499,14 +516,14 @@ static inline uint32_t default_tiling(PixmapPtr pixmap,
struct sna *sna = to_sna_from_pixmap(pixmap);
/* Try to avoid hitting the Y-tiling GTT mapping bug on 855GM */
- if (sna->kgem.gen == 21)
+ if (sna->kgem.gen == 021)
return I915_TILING_X;
/* Only on later generations was the render pipeline
* more flexible than the BLT. So on gen2/3, prefer to
* keep large objects accessible through the BLT.
*/
- if (sna->kgem.gen < 40 &&
+ if (sna->kgem.gen < 040 &&
(pixmap->drawable.width > sna->render.max_3d_size ||
pixmap->drawable.height > sna->render.max_3d_size))
return I915_TILING_X;
@@ -518,7 +535,6 @@ static inline uint32_t default_tiling(PixmapPtr pixmap,
DBG(("%s: entire source is damaged, using Y-tiling\n",
__FUNCTION__));
sna_damage_destroy(&priv->gpu_damage);
- priv->undamaged = false;
return I915_TILING_Y;
}
@@ -611,7 +627,7 @@ struct kgem_bo *sna_pixmap_change_tiling(PixmapPtr pixmap, uint32_t tiling)
static inline void sna_set_pixmap(PixmapPtr pixmap, struct sna_pixmap *sna)
{
- ((void **)dixGetPrivateAddr(&pixmap->devPrivates, &sna_pixmap_key))[1] = sna;
+ ((void **)__get_private(pixmap, sna_pixmap_key))[1] = sna;
assert(sna_pixmap(pixmap) == sna);
}
@@ -619,7 +635,6 @@ static struct sna_pixmap *
_sna_pixmap_init(struct sna_pixmap *priv, PixmapPtr pixmap)
{
list_init(&priv->list);
- list_init(&priv->inactive);
priv->source_count = SOURCE_BIAS;
priv->pixmap = pixmap;
@@ -664,6 +679,7 @@ bool sna_pixmap_attach_to_bo(PixmapPtr pixmap, struct kgem_bo *bo)
return false;
priv->gpu_bo = kgem_bo_reference(bo);
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
@@ -711,11 +727,13 @@ create_pixmap(struct sna *sna, ScreenPtr screen,
datasize += adjust;
}
+ DBG(("%s: allocating pixmap %dx%d, depth=%d, size=%ld\n",
+ __FUNCTION__, width, height, depth, (long)datasize));
pixmap = AllocatePixmap(screen, datasize);
if (!pixmap)
return NullPixmap;
- ((void **)dixGetPrivateAddr(&pixmap->devPrivates, &sna_pixmap_key))[0] = sna;
+ ((void **)__get_private(pixmap, sna_pixmap_key))[0] = sna;
assert(to_sna_from_pixmap(pixmap) == sna);
pixmap->drawable.type = DRAWABLE_PIXMAP;
@@ -764,7 +782,7 @@ sna_pixmap_create_shm(ScreenPtr screen,
DBG(("%s(%dx%d, depth=%d, bpp=%d, pitch=%d)\n",
__FUNCTION__, width, height, depth, bpp, pitch));
- if (wedged(sna) || bpp == 0 || pitch*height <= 4096) {
+ if (wedged(sna) || bpp == 0 || pitch*height < 4096) {
fallback:
pixmap = sna_pixmap_create_unattached(screen, 0, 0, depth);
if (pixmap == NULL)
@@ -833,6 +851,8 @@ fallback:
priv->cpu = true;
priv->shm = true;
+ priv->stride = pitch;
+ priv->ptr = MAKE_STATIC_PTR(addr);
sna_damage_all(&priv->cpu_damage, width, height);
pixmap->devKind = pitch;
@@ -863,7 +883,8 @@ sna_pixmap_create_scratch(ScreenPtr screen,
width, height, depth, tiling));
bpp = bits_per_pixel(depth);
- if (tiling == I915_TILING_Y && !sna->have_render)
+ if (tiling == I915_TILING_Y &&
+ (sna->render.prefer_gpu & PREFER_GPU_RENDER) == 0)
tiling = I915_TILING_X;
if (tiling == I915_TILING_Y &&
@@ -1108,6 +1129,7 @@ sna_create_pixmap_shared(struct sna *sna, ScreenPtr screen,
assert(priv->gpu_bo->tiling == I915_TILING_NONE);
assert((priv->gpu_bo->pitch & 255) == 0);
+ assert(!priv->mapped);
pixmap->devPrivate.ptr =
kgem_bo_map__async(&sna->kgem, priv->gpu_bo);
if (pixmap->devPrivate.ptr == NULL) {
@@ -1136,8 +1158,10 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
{
struct sna *sna = to_sna_from_screen(screen);
PixmapPtr pixmap;
+ struct sna_pixmap *priv;
unsigned flags;
int pad;
+ void *ptr;
DBG(("%s(%d, %d, %d, usage=%x)\n", __FUNCTION__,
width, height, depth, usage));
@@ -1161,19 +1185,29 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
goto fallback;
}
- if (!can_render(sna))
+ if (unlikely((sna->render.prefer_gpu & PREFER_GPU_RENDER) == 0))
+ flags &= ~KGEM_CAN_CREATE_GPU;
+ if (wedged(sna))
flags = 0;
- if (usage == CREATE_PIXMAP_USAGE_SCRATCH) {
+ switch (usage) {
+ case CREATE_PIXMAP_USAGE_SCRATCH:
if (flags & KGEM_CAN_CREATE_GPU)
return sna_pixmap_create_scratch(screen,
width, height, depth,
I915_TILING_X);
else
goto fallback;
- }
- if (usage == SNA_CREATE_SCRATCH) {
+ case SNA_CREATE_GLYPHS:
+ if (flags & KGEM_CAN_CREATE_GPU)
+ return sna_pixmap_create_scratch(screen,
+ width, height, depth,
+ -I915_TILING_Y);
+ else
+ goto fallback;
+
+ case SNA_CREATE_SCRATCH:
if (flags & KGEM_CAN_CREATE_GPU)
return sna_pixmap_create_scratch(screen,
width, height, depth,
@@ -1188,7 +1222,7 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
usage = 0;
pad = PixmapBytePad(width, depth);
- if (pad * height <= 4096) {
+ if (pad * height < 4096) {
DBG(("%s: small buffer [%d], attaching to shadow pixmap\n",
__FUNCTION__, pad * height));
pixmap = create_pixmap(sna, screen,
@@ -1196,10 +1230,10 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
if (pixmap == NullPixmap)
return NullPixmap;
- sna_pixmap_attach(pixmap);
+ ptr = MAKE_STATIC_PTR(pixmap->devPrivate.ptr);
+ pad = pixmap->devKind;
+ flags &= ~(KGEM_CAN_CREATE_GPU | KGEM_CAN_CREATE_CPU);
} else {
- struct sna_pixmap *priv;
-
DBG(("%s: creating GPU pixmap %dx%d, stride=%d, flags=%x\n",
__FUNCTION__, width, height, pad, flags));
@@ -1212,16 +1246,19 @@ static PixmapPtr sna_create_pixmap(ScreenPtr screen,
pixmap->devKind = pad;
pixmap->devPrivate.ptr = NULL;
- priv = sna_pixmap_attach(pixmap);
- if (priv == NULL) {
- free(pixmap);
- goto fallback;
- }
+ ptr = NULL;
+ }
- priv->stride = pad;
- priv->create = flags;
+ priv = sna_pixmap_attach(pixmap);
+ if (priv == NULL) {
+ free(pixmap);
+ goto fallback;
}
+ priv->stride = pad;
+ priv->create = flags;
+ priv->ptr = ptr;
+
return pixmap;
fallback:
@@ -1235,9 +1272,10 @@ void sna_add_flush_pixmap(struct sna *sna,
DBG(("%s: marking pixmap=%ld for flushing\n",
__FUNCTION__, priv->pixmap->drawable.serialNumber));
assert(bo);
+ assert(bo->flush);
list_move(&priv->list, &sna->flush_pixmaps);
- if (bo->exec == NULL) {
+ if (bo->exec == NULL && kgem_is_idle(&sna->kgem)) {
DBG(("%s: new flush bo, flushin before\n", __FUNCTION__));
kgem_submit(&sna->kgem);
}
@@ -1248,12 +1286,11 @@ static void __sna_free_pixmap(struct sna *sna,
struct sna_pixmap *priv)
{
list_del(&priv->list);
- list_del(&priv->inactive);
sna_damage_destroy(&priv->gpu_damage);
sna_damage_destroy(&priv->cpu_damage);
- sna_pixmap_free_cpu(sna, priv);
+ __sna_pixmap_free_cpu(sna, priv);
if (priv->header) {
assert(!priv->shm);
@@ -1308,7 +1345,8 @@ void sna_pixmap_destroy(PixmapPtr pixmap)
static inline bool pixmap_inplace(struct sna *sna,
PixmapPtr pixmap,
- struct sna_pixmap *priv)
+ struct sna_pixmap *priv,
+ bool write_only)
{
if (FORCE_INPLACE)
return FORCE_INPLACE > 0;
@@ -1317,7 +1355,10 @@ static inline bool pixmap_inplace(struct sna *sna,
return false;
if (priv->mapped)
- return true;
+ return !IS_CPU_MAP(priv->gpu_bo->map);
+
+ if (!write_only && priv->cpu_damage)
+ return false;
return (pixmap->devKind * pixmap->drawable.height >> 12) >
sna->kgem.half_cpu_cache_pages;
@@ -1332,8 +1373,12 @@ sna_pixmap_create_mappable_gpu(PixmapPtr pixmap)
if (wedged(sna))
return false;
+ if ((priv->create & KGEM_CAN_CREATE_GTT) == 0)
+ return false;
+
assert_pixmap_damage(pixmap);
+ assert(priv->gpu_damage == NULL);
assert(priv->gpu_bo == NULL);
priv->gpu_bo =
kgem_create_2d(&sna->kgem,
@@ -1395,21 +1440,43 @@ static inline bool use_cpu_bo_for_upload(struct sna *sna,
kgem_bo_is_busy(priv->gpu_bo),
kgem_bo_is_busy(priv->cpu_bo)));
+ if (!priv->cpu)
+ return true;
+
if (flags & (MOVE_WRITE | MOVE_ASYNC_HINT))
return true;
+ if (priv->gpu_bo->tiling)
+ return true;
+
return kgem_bo_is_busy(priv->gpu_bo) || kgem_bo_is_busy(priv->cpu_bo);
}
static inline bool operate_inplace(struct sna_pixmap *priv, unsigned flags)
{
- if ((flags & MOVE_INPLACE_HINT) == 0 || priv->gpu_bo == NULL)
+ if ((flags & MOVE_INPLACE_HINT) == 0) {
+ DBG(("%s: no, inplace operation not suitable\n", __FUNCTION__));
+ return false;
+ }
+
+ assert((flags & MOVE_ASYNC_HINT) == 0);
+
+ if ((priv->create & KGEM_CAN_CREATE_GTT) == 0) {
+ DBG(("%s: no, not accessible via GTT\n", __FUNCTION__));
return false;
+ }
+
+ if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo)) {
+ DBG(("%s: yes, CPU is busy\n", __FUNCTION__));
+ return true;
+ }
- if (flags & MOVE_WRITE && kgem_bo_is_busy(priv->gpu_bo))
+ if (flags & MOVE_WRITE && priv->gpu_bo&&kgem_bo_is_busy(priv->gpu_bo)) {
+ DBG(("%s: no, GPU is busy, so stage write\n", __FUNCTION__));
return false;
+ }
- return priv->stride != 0;
+ return true;
}
bool
@@ -1437,13 +1504,15 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
priv->gpu_bo ? priv->gpu_bo->handle : 0,
priv->gpu_damage, priv->cpu_damage, priv->clear));
+ assert(priv->gpu_damage == NULL || priv->gpu_bo);
+
if (USE_INPLACE && (flags & MOVE_READ) == 0) {
assert(flags & MOVE_WRITE);
DBG(("%s: no readbck, discarding gpu damage [%d], pending clear[%d]\n",
__FUNCTION__, priv->gpu_damage != NULL, priv->clear));
if (priv->create & KGEM_CAN_CREATE_GPU &&
- pixmap_inplace(sna, pixmap, priv)) {
+ pixmap_inplace(sna, pixmap, priv, true)) {
assert(!priv->shm);
DBG(("%s: write inplace\n", __FUNCTION__));
if (priv->gpu_bo) {
@@ -1460,29 +1529,26 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
!sna_pixmap_create_mappable_gpu(pixmap))
goto skip_inplace_map;
- if (!priv->mapped) {
- pixmap->devPrivate.ptr =
- kgem_bo_map(&sna->kgem, priv->gpu_bo);
- if (pixmap->devPrivate.ptr == NULL)
- goto skip_inplace_map;
+ pixmap->devPrivate.ptr =
+ kgem_bo_map(&sna->kgem, priv->gpu_bo);
+ priv->mapped = pixmap->devPrivate.ptr != NULL;
+ if (!priv->mapped)
+ goto skip_inplace_map;
- priv->mapped = true;
- }
pixmap->devKind = priv->gpu_bo->pitch;
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
sna_damage_destroy(&priv->cpu_damage);
- priv->undamaged = false;
priv->clear = false;
priv->cpu = false;
list_del(&priv->list);
- if (priv->cpu_bo) {
- assert(!priv->shm);
- assert(!priv->cpu_bo->flush);
- sna_pixmap_free_cpu(sna, priv);
- }
+
+ assert(!priv->shm);
+ assert(priv->cpu_bo == NULL || !priv->cpu_bo->flush);
+ sna_pixmap_free_cpu(sna, priv);
assert_pixmap_damage(pixmap);
return true;
@@ -1490,6 +1556,7 @@ _sna_pixmap_move_to_cpu(PixmapPtr pixmap, unsigned int flags)
skip_inplace_map:
sna_damage_destroy(&priv->gpu_damage);
+ priv->clear = false;
if (priv->cpu_bo && !priv->cpu_bo->flush &&
__kgem_bo_is_busy(&sna->kgem, priv->cpu_bo)) {
DBG(("%s: discarding busy CPU bo\n", __FUNCTION__));
@@ -1497,10 +1564,12 @@ skip_inplace_map:
assert(priv->gpu_bo == NULL || priv->gpu_damage == NULL);
sna_damage_destroy(&priv->cpu_damage);
- priv->undamaged = false;
-
- sna_pixmap_free_gpu(sna, priv);
sna_pixmap_free_cpu(sna, priv);
+
+ if (!sna_pixmap_alloc_cpu(sna, pixmap, priv, false))
+ return false;
+
+ goto mark_damage;
}
}
@@ -1512,52 +1581,89 @@ skip_inplace_map:
assert(priv->gpu_bo == NULL || priv->gpu_bo->proxy == NULL);
if (operate_inplace(priv, flags) &&
- pixmap_inplace(sna, pixmap, priv) &&
- sna_pixmap_move_to_gpu(pixmap, flags)) {
+ pixmap_inplace(sna, pixmap, priv, (flags & MOVE_READ) == 0) &&
+ (priv->gpu_bo || sna_pixmap_create_mappable_gpu(pixmap))) {
kgem_bo_submit(&sna->kgem, priv->gpu_bo);
- DBG(("%s: try to operate inplace\n", __FUNCTION__));
- assert(priv->cpu == false);
+ DBG(("%s: try to operate inplace (GTT)\n", __FUNCTION__));
+ assert((flags & MOVE_READ) == 0 || priv->cpu == false);
- pixmap->devPrivate.ptr =
- kgem_bo_map(&sna->kgem, priv->gpu_bo);
- if (pixmap->devPrivate.ptr != NULL) {
- priv->mapped = true;
+ pixmap->devPrivate.ptr = kgem_bo_map(&sna->kgem, priv->gpu_bo);
+ priv->mapped = pixmap->devPrivate.ptr != NULL;
+ if (priv->mapped) {
pixmap->devKind = priv->gpu_bo->pitch;
if (flags & MOVE_WRITE) {
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
sna_damage_destroy(&priv->cpu_damage);
+ sna_pixmap_free_cpu(sna, priv);
list_del(&priv->list);
- priv->undamaged = false;
priv->clear = false;
}
assert_pixmap_damage(pixmap);
- DBG(("%s: operate inplace\n", __FUNCTION__));
+ DBG(("%s: operate inplace (GTT)\n", __FUNCTION__));
return true;
}
-
- priv->mapped = false;
}
if (priv->mapped) {
- assert(!priv->shm);
- pixmap->devPrivate.ptr = NULL;
+ assert(!priv->shm && priv->stride);
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
+ pixmap->devKind = priv->stride;
priv->mapped = false;
}
- if (priv->clear && priv->cpu_bo && !priv->cpu_bo->flush &&
+ if (priv->gpu_damage &&
+ ((flags & MOVE_ASYNC_HINT) == 0 ||
+ !__kgem_bo_is_busy(&sna->kgem, priv->gpu_bo)) &&
+ priv->gpu_bo->tiling == I915_TILING_NONE &&
+ sna_pixmap_move_to_gpu(pixmap, MOVE_READ)) {
+ kgem_bo_submit(&sna->kgem, priv->gpu_bo);
+
+ DBG(("%s: try to operate inplace (CPU)\n", __FUNCTION__));
+
+ assert(!priv->mapped);
+ pixmap->devPrivate.ptr =
+ kgem_bo_map__cpu(&sna->kgem, priv->gpu_bo);
+ if (pixmap->devPrivate.ptr != NULL) {
+ priv->cpu = true;
+ priv->mapped = true;
+ pixmap->devKind = priv->gpu_bo->pitch;
+ if (flags & MOVE_WRITE) {
+ assert(priv->gpu_bo->proxy == NULL);
+ sna_damage_all(&priv->gpu_damage,
+ pixmap->drawable.width,
+ pixmap->drawable.height);
+ sna_damage_destroy(&priv->cpu_damage);
+ sna_pixmap_free_cpu(sna, priv);
+ list_del(&priv->list);
+ priv->clear = false;
+ }
+
+ kgem_bo_sync__cpu_full(&sna->kgem,
+ priv->gpu_bo, flags & MOVE_WRITE);
+ assert_pixmap_damage(pixmap);
+ DBG(("%s: operate inplace (CPU)\n", __FUNCTION__));
+ return true;
+ }
+ }
+
+ if (((flags & MOVE_READ) == 0 || priv->clear) &&
+ priv->cpu_bo && !priv->cpu_bo->flush &&
__kgem_bo_is_busy(&sna->kgem, priv->cpu_bo)) {
assert(!priv->shm);
- assert(DAMAGE_IS_ALL(priv->gpu_damage));
sna_pixmap_free_cpu(sna, priv);
}
if (pixmap->devPrivate.ptr == NULL &&
- !sna_pixmap_alloc_cpu(sna, pixmap, priv, priv->gpu_damage != NULL))
+ !sna_pixmap_alloc_cpu(sna, pixmap, priv,
+ flags & MOVE_READ ? priv->gpu_damage && !priv->clear : 0))
return false;
+ assert(pixmap->devPrivate.ptr);
+ assert(!priv->mapped);
if (priv->clear) {
DBG(("%s: applying clear [%08x]\n",
@@ -1585,7 +1691,7 @@ skip_inplace_map:
pixmap->drawable.width,
pixmap->drawable.height);
sna_pixmap_free_gpu(sna, priv);
- priv->undamaged = false;
+ assert(priv->gpu_damage == NULL);
priv->clear = false;
}
@@ -1594,6 +1700,7 @@ skip_inplace_map:
int n;
DBG(("%s: flushing GPU damage\n", __FUNCTION__));
+ assert(priv->gpu_bo);
n = sna_damage_get_boxes(priv->gpu_damage, &box);
if (n) {
@@ -1615,16 +1722,15 @@ skip_inplace_map:
__sna_damage_destroy(DAMAGE_PTR(priv->gpu_damage));
priv->gpu_damage = NULL;
- priv->undamaged = true;
}
if (flags & MOVE_WRITE || priv->create & KGEM_CAN_CREATE_LARGE) {
+mark_damage:
DBG(("%s: marking as damaged\n", __FUNCTION__));
sna_damage_all(&priv->cpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
sna_pixmap_free_gpu(sna, priv);
- priv->undamaged = false;
if (priv->flush) {
assert(!priv->shm);
@@ -1634,25 +1740,25 @@ skip_inplace_map:
done:
if (flags & MOVE_WRITE) {
+ assert(DAMAGE_IS_ALL(priv->cpu_damage));
priv->source_count = SOURCE_BIAS;
assert(priv->gpu_bo == NULL || priv->gpu_bo->proxy == NULL);
if (priv->gpu_bo && priv->gpu_bo->domain != DOMAIN_GPU) {
DBG(("%s: discarding inactive GPU bo\n", __FUNCTION__));
- assert(DAMAGE_IS_ALL(priv->cpu_damage));
sna_pixmap_free_gpu(sna, priv);
- priv->undamaged = false;
}
}
if (priv->cpu_bo) {
if ((flags & MOVE_ASYNC_HINT) == 0) {
DBG(("%s: syncing CPU bo\n", __FUNCTION__));
- kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
+ kgem_bo_sync__cpu_full(&sna->kgem,
+ priv->cpu_bo, flags & MOVE_WRITE);
+ assert(!priv->shm || !kgem_bo_is_busy(priv->cpu_bo));
}
if (flags & MOVE_WRITE) {
DBG(("%s: discarding GPU bo in favour of CPU bo\n", __FUNCTION__));
sna_pixmap_free_gpu(sna, priv);
- priv->undamaged = false;
}
}
priv->cpu = (flags & MOVE_ASYNC_HINT) == 0;
@@ -1717,29 +1823,30 @@ static inline bool region_inplace(struct sna *sna,
if (wedged(sna) && !priv->pinned)
return false;
- if (priv->cpu) {
- DBG(("%s: no, preferring last action of CPU\n", __FUNCTION__));
- return false;
- }
-
- if (!write_only &&
+ if ((priv->cpu || !write_only) &&
region_overlaps_damage(region, priv->cpu_damage, 0, 0)) {
DBG(("%s: no, uncovered CPU damage pending\n", __FUNCTION__));
return false;
}
- if (priv->flush) {
- DBG(("%s: yes, exported via dri, will flush\n", __FUNCTION__));
- return true;
+ if (priv->cpu) {
+ DBG(("%s: no, preferring last action of CPU\n", __FUNCTION__));
+ return false;
}
if (priv->mapped) {
DBG(("%s: yes, already mapped, continuiung\n", __FUNCTION__));
+ return !IS_CPU_MAP(priv->gpu_bo->map);
+ }
+
+ if (priv->flush) {
+ DBG(("%s: yes, exported via dri, will flush\n", __FUNCTION__));
return true;
}
if (DAMAGE_IS_ALL(priv->gpu_damage)) {
DBG(("%s: yes, already wholly damaged on the GPU\n", __FUNCTION__));
+ assert(priv->gpu_bo);
return true;
}
@@ -1785,6 +1892,8 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
return true;
}
+ assert(priv->gpu_damage == NULL || priv->gpu_bo);
+
if (sna_damage_is_all(&priv->cpu_damage,
pixmap->drawable.width,
pixmap->drawable.height)) {
@@ -1792,7 +1901,6 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
__FUNCTION__, pixmap->drawable.serialNumber));
sna_damage_destroy(&priv->gpu_damage);
- priv->undamaged = false;
if (flags & MOVE_WRITE)
sna_pixmap_free_gpu(sna, priv);
@@ -1804,6 +1912,14 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
goto out;
}
+ if (USE_INPLACE &&
+ (flags & (MOVE_READ | MOVE_ASYNC_HINT)) == 0 &&
+ (priv->flush || box_inplace(pixmap, &region->extents))) {
+ DBG(("%s: marking for inplace hint (%d, %d)\n",
+ __FUNCTION__, priv->flush, box_inplace(pixmap, &region->extents)));
+ flags |= MOVE_INPLACE_HINT;
+ }
+
if (flags & MOVE_WHOLE_HINT)
return _sna_pixmap_move_to_cpu(pixmap, flags);
@@ -1824,132 +1940,40 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
return _sna_pixmap_move_to_cpu(pixmap, flags);
}
- if (USE_INPLACE && (flags & MOVE_READ) == 0) {
- DBG(("%s: no read, checking to see if we can stream the write into the GPU bo\n",
- __FUNCTION__));
- assert(flags & MOVE_WRITE);
-
- if (priv->stride && priv->gpu_bo &&
- kgem_bo_can_map(&sna->kgem, priv->gpu_bo) &&
- region_inplace(sna, pixmap, region, priv, true)) {
- assert(priv->gpu_bo->proxy == NULL);
- if (!__kgem_bo_is_busy(&sna->kgem, priv->gpu_bo)) {
- pixmap->devPrivate.ptr =
- kgem_bo_map(&sna->kgem, priv->gpu_bo);
- if (pixmap->devPrivate.ptr == NULL) {
- if (dx | dy)
- RegionTranslate(region, -dx, -dy);
- return false;
- }
-
- priv->mapped = true;
- pixmap->devKind = priv->gpu_bo->pitch;
-
- sna_damage_subtract(&priv->cpu_damage, region);
- if (priv->cpu_damage == NULL) {
- list_del(&priv->list);
- sna_damage_all(&priv->gpu_damage,
- pixmap->drawable.width,
- pixmap->drawable.height);
- priv->undamaged = false;
- } else
- sna_damage_add(&priv->gpu_damage,
- region);
-
- priv->clear = false;
- priv->cpu = false;
- assert_pixmap_damage(pixmap);
- if (dx | dy)
- RegionTranslate(region, -dx, -dy);
- return true;
- }
- }
-
- if (priv->cpu_bo && !priv->cpu_bo->flush) {
- if (__kgem_bo_is_busy(&sna->kgem, priv->cpu_bo)) {
- sna_damage_subtract(&priv->cpu_damage, region);
- if (!sna_pixmap_move_to_gpu(pixmap, MOVE_WRITE)) {
- if (dx | dy)
- RegionTranslate(region, -dx, -dy);
- return false;
- }
-
- assert(!priv->shm);
- sna_pixmap_free_cpu(sna, priv);
- }
- }
-
- if (priv->gpu_bo == NULL && priv->stride &&
- sna_pixmap_choose_tiling(pixmap, DEFAULT_TILING) != I915_TILING_NONE &&
- region_inplace(sna, pixmap, region, priv, true) &&
- sna_pixmap_create_mappable_gpu(pixmap)) {
- pixmap->devPrivate.ptr =
- kgem_bo_map(&sna->kgem, priv->gpu_bo);
- if (pixmap->devPrivate.ptr == NULL) {
- if (dx | dy)
- RegionTranslate(region, -dx, -dy);
- return false;
- }
-
- priv->mapped = true;
- pixmap->devKind = priv->gpu_bo->pitch;
-
- sna_damage_subtract(&priv->cpu_damage, region);
- if (priv->cpu_damage == NULL) {
- list_del(&priv->list);
- sna_damage_all(&priv->gpu_damage,
- pixmap->drawable.width,
- pixmap->drawable.height);
- priv->undamaged = false;
- } else
- sna_damage_add(&priv->gpu_damage, region);
-
- assert_pixmap_damage(pixmap);
- priv->clear = false;
- priv->cpu = false;
- if (dx | dy)
- RegionTranslate(region, -dx, -dy);
- return true;
- }
- }
-
if (operate_inplace(priv, flags) &&
- kgem_bo_can_map(&sna->kgem, priv->gpu_bo) &&
- region_inplace(sna, pixmap, region, priv, (flags & MOVE_READ) == 0)) {
+ region_inplace(sna, pixmap, region, priv, (flags & MOVE_READ) == 0) &&
+ (priv->gpu_bo || sna_pixmap_create_mappable_gpu(pixmap))) {
kgem_bo_submit(&sna->kgem, priv->gpu_bo);
DBG(("%s: try to operate inplace\n", __FUNCTION__));
- pixmap->devPrivate.ptr =
- kgem_bo_map(&sna->kgem, priv->gpu_bo);
- if (pixmap->devPrivate.ptr != NULL) {
- priv->mapped = true;
+ pixmap->devPrivate.ptr = kgem_bo_map(&sna->kgem, priv->gpu_bo);
+ priv->mapped = pixmap->devPrivate.ptr != NULL;
+ if (priv->mapped) {
pixmap->devKind = priv->gpu_bo->pitch;
- if (flags & MOVE_WRITE &&
- !DAMAGE_IS_ALL(priv->gpu_damage)) {
- sna_damage_add(&priv->gpu_damage, region);
- if (sna_damage_is_all(&priv->gpu_damage,
- pixmap->drawable.width,
- pixmap->drawable.height)) {
- DBG(("%s: replaced entire pixmap, destroying CPU shadow\n",
- __FUNCTION__));
- sna_damage_destroy(&priv->cpu_damage);
- priv->undamaged = false;
- list_del(&priv->list);
- } else
- sna_damage_subtract(&priv->cpu_damage,
- region);
+ if (flags & MOVE_WRITE) {
+ if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
+ sna_damage_add(&priv->gpu_damage, region);
+ if (sna_damage_is_all(&priv->gpu_damage,
+ pixmap->drawable.width,
+ pixmap->drawable.height)) {
+ DBG(("%s: replaced entire pixmap, destroying CPU shadow\n",
+ __FUNCTION__));
+ sna_damage_destroy(&priv->cpu_damage);
+ list_del(&priv->list);
+ } else
+ sna_damage_subtract(&priv->cpu_damage,
+ region);
+ }
+ priv->clear = false;
}
assert_pixmap_damage(pixmap);
- priv->clear = false;
priv->cpu = false;
if (dx | dy)
RegionTranslate(region, -dx, -dy);
DBG(("%s: operate inplace\n", __FUNCTION__));
return true;
}
-
- priv->mapped = false;
}
if (priv->clear && flags & MOVE_WRITE) {
@@ -1965,12 +1989,26 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
priv->mapped = false;
}
+ if ((priv->clear || (flags & MOVE_READ) == 0) &&
+ priv->cpu_bo && !priv->cpu_bo->flush &&
+ __kgem_bo_is_busy(&sna->kgem, priv->cpu_bo)) {
+ sna_damage_subtract(&priv->cpu_damage, region);
+ if (sna_pixmap_move_to_gpu(pixmap, MOVE_READ | MOVE_ASYNC_HINT)) {
+ sna_damage_all(&priv->gpu_damage,
+ pixmap->drawable.width,
+ pixmap->drawable.height);
+ sna_pixmap_free_cpu(sna, priv);
+ }
+ }
+
if (pixmap->devPrivate.ptr == NULL &&
- !sna_pixmap_alloc_cpu(sna, pixmap, priv, priv->gpu_damage != NULL)) {
+ !sna_pixmap_alloc_cpu(sna, pixmap, priv,
+ flags & MOVE_READ ? priv->gpu_damage && !priv->clear : 0)) {
if (dx | dy)
RegionTranslate(region, -dx, -dy);
return false;
}
+ assert(pixmap->devPrivate.ptr);
if (priv->gpu_bo == NULL) {
assert(priv->gpu_damage == NULL);
@@ -1979,8 +2017,8 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
assert(priv->gpu_bo->proxy == NULL);
if (priv->clear) {
- int n = REGION_NUM_RECTS(region);
- BoxPtr box = REGION_RECTS(region);
+ int n = RegionNumRects(region);
+ BoxPtr box = RegionRects(region);
DBG(("%s: pending clear, doing partial fill\n", __FUNCTION__));
if (priv->cpu_bo) {
@@ -2022,6 +2060,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
DBG(("%s: forced migration\n", __FUNCTION__));
assert(pixmap_contains_damage(pixmap, priv->gpu_damage));
+ assert(priv->gpu_bo);
ok = false;
if (use_cpu_bo_for_download(sna, priv, &priv->gpu_damage->extents)) {
@@ -2038,7 +2077,6 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
box, n);
}
sna_damage_destroy(&priv->gpu_damage);
- priv->undamaged = true;
}
if (priv->gpu_damage &&
@@ -2048,6 +2086,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
__FUNCTION__,
region->extents.x2 - region->extents.x1,
region->extents.y2 - region->extents.y1));
+ assert(priv->gpu_bo);
if (priv->cpu_damage == NULL) {
if ((flags & MOVE_WRITE) == 0 &&
@@ -2095,9 +2134,9 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
* reads.
*/
if (flags & MOVE_WRITE) {
- int n = REGION_NUM_RECTS(region), i;
- BoxPtr boxes = REGION_RECTS(region);
- BoxPtr blocks = malloc(sizeof(BoxRec) * REGION_NUM_RECTS(region));
+ int n = RegionNumRects(region), i;
+ BoxPtr boxes = RegionRects(region);
+ BoxPtr blocks = malloc(sizeof(BoxRec) * RegionNumRects(region));
if (blocks) {
for (i = 0; i < n; i++) {
blocks[i].x1 = boxes[i].x1 & ~31;
@@ -2150,12 +2189,11 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
}
sna_damage_destroy(&priv->gpu_damage);
- priv->undamaged = true;
} else if (DAMAGE_IS_ALL(priv->gpu_damage) ||
sna_damage_contains_box__no_reduce(priv->gpu_damage,
&r->extents)) {
- BoxPtr box = REGION_RECTS(r);
- int n = REGION_NUM_RECTS(r);
+ BoxPtr box = RegionRects(r);
+ int n = RegionNumRects(r);
bool ok = false;
DBG(("%s: region wholly inside damage\n",
@@ -2175,14 +2213,13 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
box, n);
sna_damage_subtract(&priv->gpu_damage, r);
- priv->undamaged = true;
} else {
RegionRec need;
pixman_region_init(&need);
if (sna_damage_intersect(priv->gpu_damage, r, &need)) {
- BoxPtr box = REGION_RECTS(&need);
- int n = REGION_NUM_RECTS(&need);
+ BoxPtr box = RegionRects(&need);
+ int n = RegionNumRects(&need);
bool ok = false;
DBG(("%s: region intersects damage\n",
@@ -2202,7 +2239,6 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
box, n);
sna_damage_subtract(&priv->gpu_damage, r);
- priv->undamaged = true;
RegionUninit(&need);
}
}
@@ -2212,7 +2248,7 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
}
done:
- if (flags & MOVE_WRITE) {
+ if ((flags & (MOVE_WRITE | MOVE_ASYNC_HINT)) == MOVE_WRITE) {
DBG(("%s: applying cpu damage\n", __FUNCTION__));
assert(!DAMAGE_IS_ALL(priv->cpu_damage));
assert_pixmap_contains_box(pixmap, RegionExtents(region));
@@ -2226,7 +2262,6 @@ done:
__FUNCTION__));
sna_pixmap_free_gpu(sna, priv);
}
- priv->undamaged = false;
}
if (priv->flush) {
assert(!priv->shm);
@@ -2245,8 +2280,8 @@ out:
}
if ((flags & MOVE_ASYNC_HINT) == 0 && priv->cpu_bo) {
DBG(("%s: syncing cpu bo\n", __FUNCTION__));
- kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
- assert(!kgem_bo_is_busy(priv->cpu_bo));
+ kgem_bo_sync__cpu_full(&sna->kgem,
+ priv->cpu_bo, flags & MOVE_WRITE);
}
priv->cpu = (flags & MOVE_ASYNC_HINT) == 0;
assert(pixmap->devPrivate.ptr);
@@ -2354,31 +2389,30 @@ static inline struct sna_pixmap *
sna_pixmap_mark_active(struct sna *sna, struct sna_pixmap *priv)
{
assert(priv->gpu_bo);
- if (USE_INACTIVE &&
- !priv->pinned && priv->gpu_bo->proxy == NULL &&
- (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
- list_move(&priv->inactive, &sna->active_pixmaps);
return priv;
}
-static bool
+bool
sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int flags)
{
struct sna *sna = to_sna_from_pixmap(pixmap);
struct sna_pixmap *priv = sna_pixmap(pixmap);
RegionRec i, r;
- DBG(("%s()\n", __FUNCTION__));
+ DBG(("%s: pixmap=%ld box=(%d, %d), (%d, %d), flags=%x\n",
+ __FUNCTION__, pixmap->drawable.serialNumber,
+ box->x1, box->y1, box->x2, box->y2, flags));
+ assert(box->x2 > box->x1 && box->y2 > box->y1);
assert_pixmap_damage(pixmap);
assert_pixmap_contains_box(pixmap, box);
assert(!wedged(sna));
+ assert(priv->gpu_damage == NULL || priv->gpu_bo);
if (sna_damage_is_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height)) {
sna_damage_destroy(&priv->cpu_damage);
- priv->undamaged = false;
list_del(&priv->list);
goto done;
}
@@ -2390,7 +2424,6 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
assert_pixmap_damage(pixmap);
if (priv->cpu_damage == NULL) {
- priv->undamaged = false;
list_del(&priv->list);
return sna_pixmap_move_to_gpu(pixmap, flags);
}
@@ -2398,6 +2431,8 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
if (priv->gpu_bo == NULL) {
unsigned create, tiling;
+ assert(priv->gpu_damage == NULL);
+
create = CREATE_INACTIVE;
if (pixmap->usage_hint == SNA_CREATE_FB)
create |= CREATE_EXACT | CREATE_SCANOUT;
@@ -2423,6 +2458,11 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
priv->mapped = false;
}
+ if (priv->shm) {
+ assert(!priv->flush);
+ sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+ }
+
region_set(&r, box);
if (MIGRATE_ALL || region_subsumes_damage(&r, priv->cpu_damage)) {
int n;
@@ -2437,15 +2477,12 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
pixmap, priv->cpu_bo, 0, 0,
pixmap, priv->gpu_bo, 0, 0,
box, n, 0);
- if (ok && priv->shm) {
- assert(!priv->flush);
- sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
- }
}
if (!ok) {
+ assert(!priv->mapped);
if (pixmap->devPrivate.ptr == NULL) {
- assert(priv->stride && priv->ptr);
- pixmap->devPrivate.ptr = priv->ptr;
+ assert(priv->ptr && priv->stride);
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
pixmap->devKind = priv->stride;
}
if (n == 1 && !priv->pinned &&
@@ -2471,7 +2508,6 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
sna_damage_destroy(&priv->cpu_damage);
list_del(&priv->list);
- priv->undamaged = true;
} else if (DAMAGE_IS_ALL(priv->cpu_damage) ||
sna_damage_contains_box__no_reduce(priv->cpu_damage, box)) {
bool ok = false;
@@ -2481,15 +2517,12 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
pixmap, priv->cpu_bo, 0, 0,
pixmap, priv->gpu_bo, 0, 0,
box, 1, 0);
- if (ok && priv->shm) {
- assert(!priv->flush);
- sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
- }
}
if (!ok) {
+ assert(!priv->mapped);
if (pixmap->devPrivate.ptr == NULL) {
- assert(priv->stride && priv->ptr);
- pixmap->devPrivate.ptr = priv->ptr;
+ assert(priv->ptr && priv->stride);
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
pixmap->devKind = priv->stride;
}
ok = sna_write_boxes(sna, pixmap,
@@ -2503,12 +2536,11 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
return false;
sna_damage_subtract(&priv->cpu_damage, &r);
- priv->undamaged = true;
} else if (sna_damage_intersect(priv->cpu_damage, &r, &i)) {
- int n = REGION_NUM_RECTS(&i);
+ int n = RegionNumRects(&i);
bool ok;
- box = REGION_RECTS(&i);
+ box = RegionRects(&i);
ok = false;
if (use_cpu_bo_for_upload(sna, priv, 0)) {
DBG(("%s: using CPU bo for upload to GPU, %d boxes\n", __FUNCTION__, n));
@@ -2516,15 +2548,12 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
pixmap, priv->cpu_bo, 0, 0,
pixmap, priv->gpu_bo, 0, 0,
box, n, 0);
- if (ok && priv->shm) {
- assert(!priv->flush);
- sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
- }
}
if (!ok) {
+ assert(!priv->mapped);
if (pixmap->devPrivate.ptr == NULL) {
- assert(priv->stride && priv->ptr);
- pixmap->devPrivate.ptr = priv->ptr;
+ assert(priv->ptr && priv->stride);
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
pixmap->devKind = priv->stride;
}
ok = sna_write_boxes(sna, pixmap,
@@ -2538,26 +2567,21 @@ sna_pixmap_move_area_to_gpu(PixmapPtr pixmap, const BoxRec *box, unsigned int fl
return false;
sna_damage_subtract(&priv->cpu_damage, &r);
- priv->undamaged = true;
RegionUninit(&i);
}
- if (priv->shm) {
- assert(!priv->flush);
- sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
- }
-
done:
if (flags & MOVE_WRITE) {
priv->clear = false;
priv->cpu = false;
- if (priv->cpu_damage == NULL && box_inplace(pixmap, box)) {
+ if (priv->cpu_damage == NULL &&
+ box_inplace(pixmap, &r.extents)) {
DBG(("%s: large operation on undamaged, promoting to full GPU\n",
__FUNCTION__));
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
- priv->undamaged = false;
}
}
@@ -2582,6 +2606,8 @@ sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box,
box->x1, box->y1, box->x2, box->y2,
flags));
+ assert(box->x2 > box->x1 && box->y2 > box->y1);
+ assert(pixmap->refcnt);
assert_pixmap_damage(pixmap);
assert_drawable_contains_box(drawable, box);
@@ -2604,7 +2630,7 @@ sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box,
flags |= PREFER_GPU;
if (priv->shm)
flags &= ~PREFER_GPU;
- if (priv->cpu && (flags & FORCE_GPU) == 0)
+ if (priv->cpu && (flags & (FORCE_GPU | IGNORE_CPU)) == 0)
flags &= ~PREFER_GPU;
DBG(("%s: flush=%d, shm=%d, cpu=%d => flags=%x\n",
@@ -2619,6 +2645,7 @@ sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box,
if (DAMAGE_IS_ALL(priv->gpu_damage)) {
DBG(("%s: use GPU fast path (all-damaged)\n", __FUNCTION__));
assert(priv->cpu_damage == NULL);
+ assert(priv->gpu_bo);
goto use_gpu_bo;
}
@@ -2692,7 +2719,6 @@ sna_drawable_use_bo(DrawablePtr drawable, unsigned flags, const BoxRec *box,
sna_damage_subtract(&priv->cpu_damage, &region);
if (priv->cpu_damage == NULL) {
list_del(&priv->list);
- priv->undamaged = false;
priv->cpu = false;
}
}
@@ -2721,6 +2747,7 @@ create_gpu_bo:
region.extents.x2, region.extents.y2));
if (priv->gpu_damage) {
+ assert(priv->gpu_bo);
if (!priv->cpu_damage) {
if (sna_damage_contains_box__no_reduce(priv->gpu_damage,
&region.extents)) {
@@ -2779,18 +2806,19 @@ move_to_gpu:
done:
assert(priv->gpu_bo != NULL);
+ assert(priv->gpu_bo->refcnt);
if (sna_damage_is_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height)) {
sna_damage_destroy(&priv->cpu_damage);
list_del(&priv->list);
- priv->undamaged = false;
*damage = NULL;
} else
*damage = &priv->gpu_damage;
DBG(("%s: using GPU bo with damage? %d\n",
__FUNCTION__, *damage != NULL));
+ assert(*damage == NULL || !DAMAGE_IS_ALL(*damage));
assert(priv->gpu_bo->proxy == NULL);
assert(priv->clear == false);
assert(priv->cpu == false);
@@ -2799,22 +2827,34 @@ done:
use_gpu_bo:
DBG(("%s: using whole GPU bo\n", __FUNCTION__));
assert(priv->gpu_bo != NULL);
+ assert(priv->gpu_bo->refcnt);
assert(priv->gpu_bo->proxy == NULL);
+ assert(priv->gpu_damage);
priv->clear = false;
priv->cpu = false;
- if (USE_INACTIVE &&
- !priv->pinned && (priv->create & KGEM_CAN_CREATE_LARGE) == 0)
- list_move(&priv->inactive,
- &to_sna_from_pixmap(pixmap)->active_pixmaps);
*damage = NULL;
return priv->gpu_bo;
use_cpu_bo:
- if (!USE_CPU_BO)
- return NULL;
+ if (!USE_CPU_BO || priv->cpu_bo == NULL) {
+cpu_fail:
+ if ((flags & FORCE_GPU) && priv->gpu_bo) {
+ get_drawable_deltas(drawable, pixmap, &dx, &dy);
+
+ region.extents = *box;
+ region.extents.x1 += dx;
+ region.extents.x2 += dx;
+ region.extents.y1 += dy;
+ region.extents.y2 += dy;
+ region.data = NULL;
+
+ goto move_to_gpu;
+ }
- if (priv->cpu_bo == NULL)
return NULL;
+ }
+
+ assert(priv->cpu_bo->refcnt);
sna = to_sna_from_pixmap(pixmap);
if ((flags & FORCE_GPU) == 0 &&
@@ -2851,12 +2891,25 @@ use_cpu_bo:
}
if (!sna->kgem.can_blt_cpu)
- return NULL;
+ goto cpu_fail;
if (!sna_drawable_move_region_to_cpu(&pixmap->drawable, &region,
- MOVE_READ | MOVE_ASYNC_HINT)) {
+ (flags & IGNORE_CPU ? MOVE_READ : 0) | MOVE_WRITE | MOVE_ASYNC_HINT)) {
DBG(("%s: failed to move-to-cpu, fallback\n", __FUNCTION__));
- return NULL;
+ goto cpu_fail;
+ }
+
+ if (priv->shm) {
+ assert(!priv->flush);
+ sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+
+ /* As we may have flushed and retired, recheck for busy bo */
+ if ((flags & FORCE_GPU) == 0 && !kgem_bo_is_busy(priv->cpu_bo))
+ return NULL;
+ }
+ if (priv->flush) {
+ assert(!priv->shm);
+ sna_add_flush_pixmap(sna, priv, priv->gpu_bo);
}
if (sna_damage_is_all(&priv->cpu_damage,
@@ -2873,21 +2926,9 @@ use_cpu_bo:
*damage = &priv->cpu_damage;
}
- if (priv->shm) {
- assert(!priv->flush);
- sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
-
- /* As we may have flushed and retired,, recheck for busy bo */
- if ((flags & FORCE_GPU) == 0 && !kgem_bo_is_busy(priv->cpu_bo))
- return NULL;
- }
- if (priv->flush) {
- assert(!priv->shm);
- sna_add_flush_pixmap(sna, priv, priv->gpu_bo);
- }
-
DBG(("%s: using CPU bo with damage? %d\n",
__FUNCTION__, *damage != NULL));
+ assert(damage == NULL || !DAMAGE_IS_ALL(*damage));
assert(priv->clear == false);
return priv->cpu_bo;
}
@@ -3001,19 +3042,21 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
pixmap->drawable.height);
}
+ assert(priv->gpu_damage == NULL || priv->gpu_bo);
+
if (sna_damage_is_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height)) {
DBG(("%s: already all-damaged\n", __FUNCTION__));
sna_damage_destroy(&priv->cpu_damage);
list_del(&priv->list);
- priv->undamaged = false;
- assert(priv->cpu == false);
+ assert(priv->cpu == false || IS_CPU_MAP(priv->gpu_bo->map));
goto active;
}
if (flags & MOVE_WRITE && priv->gpu_bo && priv->gpu_bo->proxy) {
DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
+ assert(priv->gpu_damage == NULL);
kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
priv->gpu_bo = NULL;
}
@@ -3045,6 +3088,8 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
create = 0;
if (priv->cpu_damage && priv->cpu_bo == NULL)
create = CREATE_GTT_MAP | CREATE_INACTIVE;
+ if (flags & MOVE_INPLACE_HINT)
+ create = CREATE_GTT_MAP | CREATE_INACTIVE;
priv->gpu_bo =
kgem_create_2d(&sna->kgem,
@@ -3055,6 +3100,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
}
if (priv->gpu_bo == NULL) {
DBG(("%s: not creating GPU bo\n", __FUNCTION__));
+ assert(priv->gpu_damage == NULL);
assert(list_is_empty(&priv->list));
return NULL;
}
@@ -3066,6 +3112,7 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
* synchronisation that takes the most time. This is
* mitigated by avoiding fallbacks in the first place.
*/
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
@@ -3078,17 +3125,16 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
if (priv->gpu_bo->proxy) {
DBG(("%s: reusing cached upload\n", __FUNCTION__));
assert((flags & MOVE_WRITE) == 0);
+ assert(priv->gpu_damage == NULL);
return priv;
}
if (priv->cpu_damage == NULL)
goto done;
- if (priv->mapped) {
- assert(priv->stride);
- pixmap->devPrivate.ptr = priv->ptr;
- pixmap->devKind = priv->stride;
- priv->mapped = false;
+ if (priv->shm) {
+ assert(!priv->flush);
+ sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
}
n = sna_damage_get_boxes(priv->cpu_damage, &box);
@@ -3098,9 +3144,6 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
assert(pixmap_contains_damage(pixmap, priv->cpu_damage));
DBG(("%s: uploading %d damage boxes\n", __FUNCTION__, n));
- if (!priv->cpu)
- flags |= MOVE_ASYNC_HINT;
-
ok = false;
if (use_cpu_bo_for_upload(sna, priv, flags)) {
DBG(("%s: using CPU bo for upload to GPU\n", __FUNCTION__));
@@ -3110,9 +3153,15 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
box, n, 0);
}
if (!ok) {
+ if (priv->mapped) {
+			assert(priv->ptr && priv->stride);
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
+ pixmap->devKind = priv->stride;
+ priv->mapped = false;
+ }
if (pixmap->devPrivate.ptr == NULL) {
- assert(priv->stride && priv->ptr);
- pixmap->devPrivate.ptr = priv->ptr;
+ assert(priv->ptr && priv->stride);
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
pixmap->devKind = priv->stride;
}
if (n == 1 && !priv->pinned &&
@@ -3137,15 +3186,14 @@ sna_pixmap_move_to_gpu(PixmapPtr pixmap, unsigned flags)
__sna_damage_destroy(DAMAGE_PTR(priv->cpu_damage));
priv->cpu_damage = NULL;
- priv->undamaged = true;
-
- if (priv->shm) {
- assert(!priv->flush);
- sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
- }
/* For large bo, try to keep only a single copy around */
- if (priv->create & KGEM_CAN_CREATE_LARGE) {
+ if (priv->create & KGEM_CAN_CREATE_LARGE ||
+ flags & MOVE_SOURCE_HINT) {
+ DBG(("%s: disposing of system copy for large/source\n",
+ __FUNCTION__));
+ assert(!priv->shm);
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
@@ -3157,14 +3205,8 @@ done:
sna_damage_reduce_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
- if (DAMAGE_IS_ALL(priv->gpu_damage)) {
- priv->undamaged = false;
- if (priv->ptr) {
- assert(priv->cpu_bo == NULL || !priv->cpu_bo->flush);
- assert(!priv->shm);
- sna_pixmap_free_cpu(sna, priv);
- }
- }
+ if (DAMAGE_IS_ALL(priv->gpu_damage))
+ sna_pixmap_free_cpu(sna, priv);
active:
if (flags & MOVE_WRITE)
@@ -3209,7 +3251,7 @@ static bool must_check sna_gc_move_to_cpu(GCPtr gc,
if (gc->clientClipType == CT_PIXMAP) {
PixmapPtr clip = gc->clientClip;
- gc->clientClip = BitmapToRegion(gc->pScreen, clip);
+ gc->clientClip = region_from_bitmap(gc->pScreen, clip);
gc->pScreen->DestroyPixmap(clip);
gc->clientClipType = gc->clientClip ? CT_REGION : CT_NONE;
changes |= GCClipMask;
@@ -3346,24 +3388,6 @@ static inline void box_add_pt(BoxPtr box, int16_t x, int16_t y)
box->y2 = y;
}
-static int16_t bound(int16_t a, uint16_t b)
-{
- int v = (int)a + (int)b;
- if (v > MAXSHORT)
- return MAXSHORT;
- return v;
-}
-
-static int16_t clamp(int16_t a, int16_t b)
-{
- int v = (int)a + (int)b;
- if (v > MAXSHORT)
- return MAXSHORT;
- if (v < MINSHORT)
- return MINSHORT;
- return v;
-}
-
static inline bool box32_to_box16(const Box32Rec *b32, BoxRec *b16)
{
b16->x1 = b32->x1;
@@ -3394,141 +3418,10 @@ static inline void box32_add_rect(Box32Rec *box, const xRectangle *r)
}
static bool
-sna_put_image_upload_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
- int x, int y, int w, int h, char *bits, int stride)
-{
- PixmapPtr pixmap = get_drawable_pixmap(drawable);
- struct sna *sna = to_sna_from_pixmap(pixmap);
- struct sna_pixmap *priv = sna_pixmap(pixmap);
- BoxPtr box;
- int nbox;
- int16_t dx, dy;
-
- box = REGION_RECTS(region);
- nbox = REGION_NUM_RECTS(region);
-
- DBG(("%s: %d x [(%d, %d), (%d, %d)...]\n",
- __FUNCTION__, nbox,
- box->x1, box->y1, box->x2, box->y2));
-
- if (gc->alu != GXcopy)
- return false;
-
- if (priv->gpu_bo == NULL &&
- !sna_pixmap_create_mappable_gpu(pixmap))
- return false;
-
- assert(priv->gpu_bo);
- assert(priv->gpu_bo->proxy == NULL);
-
- if (!priv->pinned && nbox == 1 &&
- box->x1 <= 0 && box->y1 <= 0 &&
- box->x2 >= pixmap->drawable.width &&
- box->y2 >= pixmap->drawable.height)
- return sna_replace(sna, pixmap, &priv->gpu_bo, bits, stride);
-
- get_drawable_deltas(drawable, pixmap, &dx, &dy);
- x += dx + drawable->x;
- y += dy + drawable->y;
-
- return sna_write_boxes(sna, pixmap,
- priv->gpu_bo, 0, 0,
- bits, stride, -x, -y,
- box, nbox);
-}
-
-static bool upload_inplace(struct sna *sna,
- PixmapPtr pixmap,
- struct sna_pixmap *priv,
- RegionRec *region)
-{
- if (priv->shm) {
- DBG(("%s: no, SHM Pixmap\n", __FUNCTION__));
- return false;
- }
-
- if (priv->create & KGEM_CAN_CREATE_LARGE) {
- if (priv->gpu_bo) {
- DBG(("%s: yes, large buffer and already have GPU bo\n",
- __FUNCTION__));
- return true;
- }
- if (priv->cpu_bo){
- DBG(("%s: no, large buffer and already have CPU bo\n",
- __FUNCTION__));
- return false;
- }
- }
-
- if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo) &&
- !(priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))) {
- DBG(("%s: yes, CPU bo is busy, but the GPU bo is not\n", __FUNCTION__));
- return true;
- }
-
- if (!region_inplace(sna, pixmap, region, priv, true)) {
- DBG(("%s? no, region not suitable\n", __FUNCTION__));
- return false;
- }
-
- if (sna->kgem.has_llc && !priv->flush) {
- if (priv->cpu_bo) {
- if (priv->cpu_damage &&
- kgem_bo_is_busy(priv->cpu_bo) &&
- !region_subsumes_damage(region, priv->cpu_damage)) {
- DBG(("%s? yes, CPU bo is busy\n", __FUNCTION__));
- return true;
- }
-
- DBG(("%s? no, have CPU bo\n", __FUNCTION__));
- return false;
- }
-
- if (priv->create & KGEM_CAN_CREATE_CPU) {
- DBG(("%s? no, can create CPU bo\n", __FUNCTION__));
- return false;
- }
- }
-
- if (priv->gpu_bo) {
- if (priv->gpu_bo->proxy)
- return false;
-
- if (!kgem_bo_can_map(&sna->kgem, priv->gpu_bo)) {
- DBG(("%s? no, GPU bo not mappable\n", __FUNCTION__));
- return false;
- }
-
- if (!kgem_bo_is_busy(priv->gpu_bo)) {
- DBG(("%s? yes, GPU bo is idle\n", __FUNCTION__));
- return true;
- }
-
- if (!priv->pinned &&
- region_subsumes_drawable(region, &pixmap->drawable)) {
- DBG(("%s? yes, will replace busy GPU\n", __FUNCTION__));
- return true;
- }
- }
-
- if ((priv->create & (KGEM_CAN_CREATE_GPU | KGEM_CAN_CREATE_CPU)) == KGEM_CAN_CREATE_GPU &&
- region_subsumes_drawable(region, &pixmap->drawable)) {
- DBG(("%s? yes, will fill fresh GPU bo\n", __FUNCTION__));
- return true;
- }
-
- return false;
-}
-
-static bool
sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
int x, int y, int w, int h, char *bits, int stride)
{
PixmapPtr pixmap = get_drawable_pixmap(drawable);
- struct sna *sna = to_sna_from_pixmap(pixmap);
- struct sna_pixmap *priv = sna_pixmap(pixmap);
- char *dst_bits;
- int dst_stride;
BoxRec *box;
int16_t dx, dy;
int n;
@@ -3538,170 +3431,22 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
if (gc->alu != GXcopy)
return false;
- if (!priv) {
- if (drawable->depth < 8)
- return false;
-
- goto blt;
- }
-
- /* XXX performing the upload inplace is currently about 20x slower
- * for putimage10 on gen6 -- mostly due to slow page faulting in kernel.
- * So we try again with vma caching and only for pixmaps who will be
- * immediately flushed...
- */
- if (upload_inplace(sna, pixmap, priv, region) &&
- sna_put_image_upload_blt(drawable, gc, region,
- x, y, w, h, bits, stride)) {
- if (!DAMAGE_IS_ALL(priv->gpu_damage)) {
- DBG(("%s: marking damage\n", __FUNCTION__));
- if (region_subsumes_drawable(region, &pixmap->drawable))
- sna_damage_destroy(&priv->cpu_damage);
- else
- sna_damage_subtract(&priv->cpu_damage, region);
- if (priv->cpu_damage == NULL) {
- sna_damage_all(&priv->gpu_damage,
- pixmap->drawable.width,
- pixmap->drawable.height);
- list_del(&priv->list);
- priv->undamaged = false;
- } else
- sna_damage_add(&priv->gpu_damage, region);
- }
-
- /* And mark as having a valid GTT mapping for future uploads */
- if (priv->stride && kgem_bo_can_map(&sna->kgem, priv->gpu_bo)) {
- pixmap->devPrivate.ptr =
- kgem_bo_map__async(&sna->kgem, priv->gpu_bo);
- if (pixmap->devPrivate.ptr) {
- priv->mapped = true;
- pixmap->devKind = priv->gpu_bo->pitch;
- }
- }
-
- assert_pixmap_damage(pixmap);
- priv->clear = false;
- priv->cpu = false;
- return true;
- }
-
- if (priv->gpu_bo && priv->gpu_bo->proxy) {
- DBG(("%s: discarding cached upload buffer\n", __FUNCTION__));
- kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
- priv->gpu_bo = NULL;
- }
-
- if (priv->mapped) {
- assert(!priv->shm);
- pixmap->devPrivate.ptr = NULL;
- priv->mapped = false;
- }
-
- /* If the GPU is currently accessing the CPU pixmap, then
- * we will need to wait for that to finish before we can
- * modify the memory.
- *
- * However, we can queue some writes to the GPU bo to avoid
- * the wait. Or we can try to replace the CPU bo.
- */
- if (!priv->shm && priv->cpu_bo && __kgem_bo_is_busy(&sna->kgem, priv->cpu_bo)) {
- assert(!priv->cpu_bo->flush);
- DBG(("%s: cpu bo will stall, upload damage and discard\n",
- __FUNCTION__));
- if (priv->cpu_damage) {
- if (!region_subsumes_drawable(region, &pixmap->drawable)) {
- sna_damage_subtract(&priv->cpu_damage, region);
- if (!sna_pixmap_move_to_gpu(pixmap, MOVE_READ | MOVE_ASYNC_HINT))
- return false;
- } else {
- sna_damage_destroy(&priv->cpu_damage);
- priv->undamaged = false;
- }
- }
- assert(priv->cpu_damage == NULL);
- sna_damage_all(&priv->gpu_damage,
- pixmap->drawable.width,
- pixmap->drawable.height);
- sna_pixmap_free_cpu(sna, priv);
- assert(pixmap->devPrivate.ptr == NULL);
- }
-
- if (pixmap->devPrivate.ptr == NULL &&
- !sna_pixmap_alloc_cpu(sna, pixmap, priv, false))
- return true;
-
- if (priv->cpu_bo) {
- DBG(("%s: syncing CPU bo\n", __FUNCTION__));
- kgem_bo_sync__cpu(&sna->kgem, priv->cpu_bo);
- }
-
- if (priv->clear) {
- DBG(("%s: applying clear [%08x]\n",
- __FUNCTION__, priv->clear_color));
-
- if (priv->clear_color == 0) {
- memset(pixmap->devPrivate.ptr,
- 0, pixmap->devKind * pixmap->drawable.height);
- } else {
- pixman_fill(pixmap->devPrivate.ptr,
- pixmap->devKind/sizeof(uint32_t),
- pixmap->drawable.bitsPerPixel,
- 0, 0,
- pixmap->drawable.width,
- pixmap->drawable.height,
- priv->clear_color);
- }
-
- sna_damage_all(&priv->cpu_damage,
- pixmap->drawable.width,
- pixmap->drawable.height);
- sna_pixmap_free_gpu(sna, priv);
- priv->undamaged = false;
- }
+ if (drawable->depth < 8)
+ return false;
- if (!DAMAGE_IS_ALL(priv->cpu_damage)) {
- DBG(("%s: marking damage\n", __FUNCTION__));
- if (region_subsumes_drawable(region, &pixmap->drawable)) {
- DBG(("%s: replacing entire pixmap\n", __FUNCTION__));
- sna_damage_all(&priv->cpu_damage,
- pixmap->drawable.width,
- pixmap->drawable.height);
- sna_pixmap_free_gpu(sna, priv);
- priv->undamaged = false;
- assert(priv->gpu_damage == NULL);
- } else {
- sna_damage_subtract(&priv->gpu_damage, region);
- sna_damage_add(&priv->cpu_damage, region);
- if (priv->gpu_bo &&
- sna_damage_is_all(&priv->cpu_damage,
- pixmap->drawable.width,
- pixmap->drawable.height)) {
- DBG(("%s: replaced entire pixmap\n", __FUNCTION__));
- sna_pixmap_free_gpu(sna, priv);
- priv->undamaged = false;
- }
- }
- if (priv->flush) {
- assert(!priv->shm);
- sna_add_flush_pixmap(sna, priv, priv->gpu_bo);
- }
- }
- assert(!priv->flush || !list_is_empty(&priv->list));
- priv->cpu = true;
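+	/* Write through the CPU shadow and let sna_drawable_move_region_to_cpu()
+	 * choose the staging strategy */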
+ if (!sna_drawable_move_region_to_cpu(&pixmap->drawable,
+ region, MOVE_WRITE))
+ return false;
-blt:
get_drawable_deltas(drawable, pixmap, &dx, &dy);
x += dx + drawable->x;
y += dy + drawable->y;
DBG(("%s: upload(%d, %d, %d, %d)\n", __FUNCTION__, x, y, w, h));
- dst_stride = pixmap->devKind;
- dst_bits = pixmap->devPrivate.ptr;
-
/* Region is pre-clipped and translated into pixmap space */
- box = REGION_RECTS(region);
- n = REGION_NUM_RECTS(region);
+ box = RegionRects(region);
+ n = RegionNumRects(region);
do {
DBG(("%s: copy box (%d, %d)->(%d, %d)x(%d, %d)\n",
__FUNCTION__,
@@ -3722,9 +3467,9 @@ blt:
assert(box->x2 - x <= w);
assert(box->y2 - y <= h);
- memcpy_blt(bits, dst_bits,
+ memcpy_blt(bits, pixmap->devPrivate.ptr,
pixmap->drawable.bitsPerPixel,
- stride, dst_stride,
+ stride, pixmap->devKind,
box->x1 - x, box->y1 - y,
box->x1, box->y1,
box->x2 - box->x1, box->y2 - box->y1);
@@ -3790,11 +3535,11 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
x += dx + drawable->x;
y += dy + drawable->y;
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
/* Region is pre-clipped and translated into pixmap space */
- box = REGION_RECTS(region);
- n = REGION_NUM_RECTS(region);
+ box = RegionRects(region);
+ n = RegionNumRects(region);
do {
int bx1 = (box->x1 - x) & ~7;
int bx2 = (box->x2 - x + 7) & ~7;
@@ -3810,7 +3555,9 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
if (!kgem_check_batch(&sna->kgem, 8) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc_and_exec(&sna->kgem, 2)) {
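+			/* flush the batch and retry; if the bo still cannot
+			 * be fenced, report failure so the caller falls back */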
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -3840,7 +3587,7 @@ sna_put_xybitmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
b[0] = XY_MONO_SRC_COPY | 3 << 20;
b[0] |= ((box->x1 - x) & 7) << 17;
b[1] = bo->pitch;
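+		/* kgem.gen is octal-encoded from here on: 040 == gen4 */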
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
b[0] |= BLT_DST_TILED;
b[1] >>= 2;
}
@@ -3912,12 +3659,12 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
x += dx + drawable->x;
y += dy + drawable->y;
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
skip = h * BitmapBytePad(w + left);
for (i = 1 << (gc->depth-1); i; i >>= 1, bits += skip) {
- const BoxRec *box = REGION_RECTS(region);
- int n = REGION_NUM_RECTS(region);
+ const BoxRec *box = RegionRects(region);
+ int n = RegionNumRects(region);
if ((gc->planemask & i) == 0)
continue;
@@ -3938,7 +3685,9 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
if (!kgem_check_batch(&sna->kgem, 12) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc_and_exec(&sna->kgem, 2)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -3968,7 +3717,7 @@ sna_put_xypixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
b[0] = XY_FULL_MONO_PATTERN_MONO_SRC_BLT | 3 << 20;
b[0] |= ((box->x1 - x) & 7) << 17;
b[1] = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
b[0] |= BLT_DST_TILED;
b[1] >>= 2;
}
@@ -4037,7 +3786,7 @@ sna_put_image(DrawablePtr drawable, GCPtr gc, int depth,
gc->pCompositeClip->extents.x2 < region.extents.x2 ||
gc->pCompositeClip->extents.y2 < region.extents.y2) {
RegionIntersect(&region, &region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
}
@@ -4120,8 +3869,10 @@ move_to_gpu(PixmapPtr pixmap, struct sna_pixmap *priv,
int h = box->y2 - box->y1;
int count;
- if (DAMAGE_IS_ALL(priv->gpu_damage))
+ if (DAMAGE_IS_ALL(priv->gpu_damage)) {
+ assert(priv->gpu_bo);
return true;
+ }
if (priv->gpu_bo) {
if (alu != GXcopy)
@@ -4248,15 +3999,17 @@ sna_self_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (dst != src)
get_drawable_deltas(dst, pixmap, &tx, &ty);
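+	/* SHM pixmaps are kept coherent on the CPU, so self-copies take the
+	 * fallback path as well */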
- if (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage))
+ if (priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage) || priv->shm)
goto fallback;
if (priv->gpu_damage) {
+ assert(priv->gpu_bo);
+
if (alu == GXcopy && priv->clear)
goto out;
assert(priv->gpu_bo->proxy == NULL);
- if (!sna_pixmap_move_to_gpu(pixmap, MOVE_WRITE | MOVE_READ)) {
+ if (!sna_pixmap_move_to_gpu(pixmap, MOVE_WRITE | MOVE_READ | MOVE_ASYNC_HINT)) {
DBG(("%s: fallback - not a pure copy and failed to move dst to GPU\n",
__FUNCTION__));
goto fallback;
@@ -4334,7 +4087,7 @@ sna_pixmap_is_gpu(PixmapPtr pixmap)
}
static int
-source_prefer_gpu(struct sna_pixmap *priv)
+source_prefer_gpu(struct sna *sna, struct sna_pixmap *priv)
{
if (priv == NULL) {
DBG(("%s: source unattached, use cpu\n", __FUNCTION__));
@@ -4348,6 +4101,7 @@ source_prefer_gpu(struct sna_pixmap *priv)
if (priv->gpu_damage) {
DBG(("%s: source has gpu damage, force gpu\n", __FUNCTION__));
+ assert(priv->gpu_bo);
return PREFER_GPU | FORCE_GPU;
}
@@ -4357,13 +4111,50 @@ source_prefer_gpu(struct sna_pixmap *priv)
}
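+	/* all-CPU damage: prefer the GPU only if a CPU bo exists to blit
+	 * from and the GPU is otherwise idle */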
if (DAMAGE_IS_ALL(priv->cpu_damage))
- return 0;
+ return priv->cpu_bo && kgem_is_idle(&sna->kgem);
DBG(("%s: source has GPU bo? %d\n",
__FUNCTION__, priv->gpu_bo != NULL));
return priv->gpu_bo != NULL;
}
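+/* Choose whether to blit from the source's CPU bo: an ordinary CPU bo is
+ * always usable, but a SHM bo only pays off when a direct CPU copy would
+ * be worse (complex alu, tiled or busy destination, busy source).
+ */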
+static bool use_shm_bo(struct sna *sna,
+ struct kgem_bo *bo,
+ struct sna_pixmap *priv,
+ int alu)
+{
+ if (priv == NULL || priv->cpu_bo == NULL) {
+ DBG(("%s: no, not attached\n", __FUNCTION__));
+ return false;
+ }
+
+ if (!priv->shm) {
+ DBG(("%s: yes, ordinary CPU bo\n", __FUNCTION__));
+ return true;
+ }
+
+ if (alu != GXcopy) {
+ DBG(("%s: yes, complex alu=%d\n", __FUNCTION__, alu));
+ return true;
+ }
+ if (bo->tiling) {
+		DBG(("%s: yes, dst tiled=%d\n", __FUNCTION__, bo->tiling));
+ return true;
+ }
+
+ if (__kgem_bo_is_busy(&sna->kgem, bo)) {
+ DBG(("%s: yes, dst is busy\n", __FUNCTION__));
+ return true;
+ }
+
+ if (__kgem_bo_is_busy(&sna->kgem, priv->cpu_bo)) {
+ DBG(("%s: yes, src is busy\n", __FUNCTION__));
+ return true;
+ }
+
+ return false;
+}
+
static void
sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
RegionPtr region, int dx, int dy,
@@ -4435,7 +4226,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (dst_priv == NULL)
goto fallback;
- hint = source_prefer_gpu(src_priv) ?:
+ hint = source_prefer_gpu(sna, src_priv) ?:
region_inplace(sna, dst_pixmap, region,
dst_priv, alu_overwrites(alu));
if (dst_priv->cpu_damage && alu_overwrites(alu)) {
@@ -4453,6 +4244,8 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (region->data == NULL)
hint |= IGNORE_CPU;
}
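+	/* a copy that replaces the whole destination can likewise ignore
+	 * any existing CPU damage */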
+ if (replaces)
+ hint |= IGNORE_CPU;
bo = sna_drawable_use_bo(&dst_pixmap->drawable, hint,
&region->extents, &damage);
@@ -4493,7 +4286,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (src_priv &&
move_to_gpu(src_pixmap, src_priv, &region->extents, alu) &&
- sna_pixmap_move_to_gpu(src_pixmap, MOVE_READ)) {
+ sna_pixmap_move_to_gpu(src_pixmap, MOVE_READ | MOVE_ASYNC_HINT)) {
DBG(("%s: move whole src_pixmap to GPU and copy\n",
__FUNCTION__));
if (!sna->render.copy_boxes(sna, alu,
@@ -4525,7 +4318,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
area.y2 += src_dy;
if (!sna_pixmap_move_area_to_gpu(src_pixmap, &area,
- MOVE_READ))
+ MOVE_READ | MOVE_ASYNC_HINT))
goto fallback;
if (!sna->render.copy_boxes(sna, alu,
@@ -4545,11 +4338,11 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (bo != dst_priv->gpu_bo)
goto fallback;
- if (src_priv && src_priv->cpu_bo) {
+ if (use_shm_bo(sna, bo, src_priv, alu)) {
bool ret;
- DBG(("%s: region overlaps CPU damage, copy from CPU bo\n",
- __FUNCTION__));
+ DBG(("%s: region overlaps CPU damage, copy from CPU bo (shm? %d)\n",
+ __FUNCTION__, src_priv->shm));
assert(bo != dst_priv->cpu_bo);
@@ -4561,29 +4354,31 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (!ret)
goto fallback;
+ if (src_priv->shm) {
+ assert(!src_priv->flush);
+ sna_add_flush_pixmap(sna, src_priv, src_priv->cpu_bo);
+ }
+
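+	/* SHM source: COPY_LAST hints this is the final read of the bo
+	 * before it is synced back for the client */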
if (!sna->render.copy_boxes(sna, alu,
src_pixmap, src_priv->cpu_bo, src_dx, src_dy,
dst_pixmap, bo, 0, 0,
- box, n, 0)) {
+ box, n, src_priv->shm ? COPY_LAST : 0)) {
DBG(("%s: fallback - accelerated copy boxes failed\n",
__FUNCTION__));
goto fallback;
}
- if (src_priv->shm) {
- assert(!src_priv->flush);
- sna_add_flush_pixmap(sna, src_priv, src_priv->cpu_bo);
- }
-
if (damage)
sna_damage_add(damage, region);
return;
}
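+	/* For unattached sources, try wrapping the client memory with
+	 * userptr and blitting directly (worthwhile on LLC for tiled
+	 * targets, or whenever the destination is busy) */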
- if (src_priv == NULL &&
+ if (USE_USERPTR_UPLOADS &&
+ src_priv == NULL &&
sna->kgem.has_userptr &&
- __kgem_bo_is_busy(&sna->kgem, bo) &&
- box_inplace(src_pixmap, &region->extents)) {
+ box_inplace(src_pixmap, &region->extents) &&
+ ((sna->kgem.has_llc && bo->tiling && !bo->scanout) ||
+ __kgem_bo_is_busy(&sna->kgem, bo))) {
struct kgem_bo *src_bo;
bool ok = false;
@@ -4664,8 +4459,13 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (n == 1 &&
tmp->drawable.width == src_pixmap->drawable.width &&
- tmp->drawable.height == src_pixmap->drawable.height)
+ tmp->drawable.height == src_pixmap->drawable.height) {
+ DBG(("%s: caching upload for src bo\n",
+ __FUNCTION__));
+ assert(src_priv->gpu_damage == NULL);
+ assert(src_priv->gpu_bo == NULL);
kgem_proxy_bo_attach(src_bo, &src_priv->gpu_bo);
+ }
if (!sna->render.copy_boxes(sna, alu,
tmp, src_bo, dx, dy,
@@ -4695,8 +4495,7 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (src_pixmap->devPrivate.ptr == NULL) {
if (!src_priv->ptr) /* uninitialised!*/
return;
- assert(src_priv->stride);
- src_pixmap->devPrivate.ptr = src_priv->ptr;
+ src_pixmap->devPrivate.ptr = PTR(src_priv->ptr);
src_pixmap->devKind = src_priv->stride;
}
}
@@ -4721,15 +4520,16 @@ sna_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
goto fallback;
}
+ assert(dst_priv->clear == false);
dst_priv->cpu = false;
if (damage) {
+ assert(dst_priv->gpu_bo->proxy == NULL);
if (replaces) {
sna_damage_destroy(&dst_priv->cpu_damage);
sna_damage_all(&dst_priv->gpu_damage,
dst_pixmap->drawable.width,
dst_pixmap->drawable.height);
list_del(&dst_priv->list);
- dst_priv->undamaged = false;
} else
sna_damage_add(&dst_priv->gpu_damage,
region);
@@ -4752,6 +4552,7 @@ fallback:
return;
}
+ assert(dst_pixmap->devPrivate.ptr);
do {
pixman_fill(dst_pixmap->devPrivate.ptr,
dst_pixmap->devKind/sizeof(uint32_t),
@@ -4891,10 +4692,7 @@ sna_do_copy(DrawablePtr src, DrawablePtr dst, GCPtr gc,
return NULL;
}
- if (src->pScreen->SourceValidate)
- src->pScreen->SourceValidate(src, sx, sy,
- width, height,
- gc->subWindowMode);
+ SourceValidate(src, sx, sy, width, height, gc->subWindowMode);
sx += src->x;
sy += src->y;
@@ -4938,7 +4736,7 @@ sna_do_copy(DrawablePtr src, DrawablePtr dst, GCPtr gc,
* VT is inactive, make sure the region isn't empty
*/
if (((WindowPtr)src)->parent ||
- !RegionNotEmpty(&((WindowPtr)src)->borderClip)) {
+ RegionNil(&((WindowPtr)src)->borderClip)) {
DBG(("%s: include inferiors\n", __FUNCTION__));
free_clip = clip = NotClippedByChildren((WindowPtr)src);
}
@@ -4973,17 +4771,17 @@ sna_do_copy(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (free_clip)
RegionDestroy(free_clip);
}
- DBG(("%s: src extents (%d, %d), (%d, %d) x %d\n", __FUNCTION__,
+ DBG(("%s: src extents (%d, %d), (%d, %d) x %ld\n", __FUNCTION__,
region.extents.x1, region.extents.y1,
region.extents.x2, region.extents.y2,
- RegionNumRects(&region)));
+ (long)RegionNumRects(&region)));
RegionTranslate(&region, dx-sx, dy-sy);
if (gc->pCompositeClip->data)
RegionIntersect(&region, &region, gc->pCompositeClip);
- DBG(("%s: copy region (%d, %d), (%d, %d) x %d\n", __FUNCTION__,
+ DBG(("%s: copy region (%d, %d), (%d, %d) x %ld\n", __FUNCTION__,
region.extents.x1, region.extents.y1,
region.extents.x2, region.extents.y2,
- RegionNumRects(&region)));
+ (long)RegionNumRects(&region)));
if (RegionNotEmpty(&region))
copy(src, dst, gc, &region, sx-dx, sy-dy, bitPlane, closure);
@@ -5005,8 +4803,8 @@ sna_fallback_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
RegionPtr region, int dx, int dy,
Pixel bitplane, void *closure)
{
- DBG(("%s (boxes=%dx[(%d, %d), (%d, %d)...], src=+(%d, %d), alu=%d\n",
- __FUNCTION__, RegionNumRects(region),
+ DBG(("%s (boxes=%ldx[(%d, %d), (%d, %d)...], src=+(%d, %d), alu=%d\n",
+ __FUNCTION__, (long)RegionNumRects(region),
region->extents.x1, region->extents.y1,
region->extents.x2, region->extents.y2,
dx, dy, gc->alu));
@@ -5014,16 +4812,17 @@ sna_fallback_copy_boxes(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (!sna_gc_move_to_cpu(gc, dst, region))
return;
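+	/* read back the source first; it is needed by both the self-copy
+	 * and the two-pixmap paths below */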
+ RegionTranslate(region, dx, dy);
+ if (!sna_drawable_move_region_to_cpu(src, region, MOVE_READ))
+ goto out_gc;
+ RegionTranslate(region, -dx, -dy);
+
if (src == dst ||
get_drawable_pixmap(src) == get_drawable_pixmap(dst)) {
+ DBG(("%s: self-copy\n", __FUNCTION__));
if (!sna_drawable_move_to_cpu(dst, MOVE_WRITE | MOVE_READ))
goto out_gc;
} else {
- RegionTranslate(region, dx, dy);
- if (!sna_drawable_move_region_to_cpu(src, region, MOVE_READ))
- goto out_gc;
- RegionTranslate(region, -dx, -dy);
-
if (!sna_drawable_move_region_to_cpu(dst, region,
drawable_gc_flags(dst, gc, false)))
goto out_gc;
@@ -5049,10 +4848,11 @@ sna_copy_area(DrawablePtr src, DrawablePtr dst, GCPtr gc,
if (gc->planemask == 0)
return NULL;
- DBG(("%s: src=(%d, %d)x(%d, %d)+(%d, %d) -> dst=(%d, %d)+(%d, %d)\n",
+ DBG(("%s: src=(%d, %d)x(%d, %d)+(%d, %d) -> dst=(%d, %d)+(%d, %d); alu=%d, pm=%lx\n",
__FUNCTION__,
src_x, src_y, width, height, src->x, src->y,
- dst_x, dst_y, dst->x, dst->y));
+ dst_x, dst_y, dst->x, dst->y,
+ gc->alu, gc->planemask));
if (FORCE_FALLBACK || !ACCEL_COPY_AREA || wedged(sna) ||
!PM_IS_SOLID(dst, gc->planemask))
@@ -5599,7 +5399,7 @@ no_damage_clipped:
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
return true;
assert(dx + clip.extents.x1 >= 0);
@@ -5607,9 +5407,9 @@ no_damage_clipped:
assert(dx + clip.extents.x2 <= pixmap->drawable.width);
assert(dy + clip.extents.y2 <= pixmap->drawable.height);
- DBG(("%s: clip %d x [(%d, %d), (%d, %d)] x %d [(%d, %d)...]\n",
+ DBG(("%s: clip %ld x [(%d, %d), (%d, %d)] x %d [(%d, %d)...]\n",
__FUNCTION__,
- REGION_NUM_RECTS(&clip),
+ (long)RegionNumRects(&clip),
clip.extents.x1, clip.extents.y1, clip.extents.x2, clip.extents.y2,
n, pt->x, pt->y));
@@ -5700,7 +5500,7 @@ damage_clipped:
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
return true;
assert(dx + clip.extents.x1 >= 0);
@@ -5708,9 +5508,9 @@ damage_clipped:
assert(dx + clip.extents.x2 <= pixmap->drawable.width);
assert(dy + clip.extents.y2 <= pixmap->drawable.height);
- DBG(("%s: clip %d x [(%d, %d), (%d, %d)] x %d [(%d, %d)...]\n",
+ DBG(("%s: clip %ld x [(%d, %d), (%d, %d)] x %d [(%d, %d)...]\n",
__FUNCTION__,
- REGION_NUM_RECTS(&clip),
+	     (long)RegionNumRects(&clip),
clip.extents.x1, clip.extents.y1, clip.extents.x2, clip.extents.y2,
n, pt->x, pt->y));
@@ -6021,7 +5821,7 @@ fallback:
DBG(("%s: fallback\n", __FUNCTION__));
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
if (!sna_gc_move_to_cpu(gc, drawable, &region))
@@ -6061,7 +5861,7 @@ sna_set_spans(DrawablePtr drawable, GCPtr gc, char *src,
fallback:
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
if (!sna_gc_move_to_cpu(gc, drawable, &region))
@@ -6098,10 +5898,11 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
BoxPtr box;
int n;
- DBG(("%s: plane=%x (%d,%d),(%d,%d)x%d\n",
- __FUNCTION__, (unsigned)bitplane, RegionNumRects(region),
+ DBG(("%s: plane=%x (%d,%d),(%d,%d)x%ld\n",
+ __FUNCTION__, (unsigned)bitplane,
region->extents.x1, region->extents.y1,
- region->extents.x2, region->extents.y2));
+ region->extents.x2, region->extents.y2,
+ (long)RegionNumRects(region)));
box = RegionRects(region);
n = RegionNumRects(region);
@@ -6112,14 +5913,14 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
br00 = 3 << 20;
br13 = arg->bo->pitch;
- if (sna->kgem.gen >= 40 && arg->bo->tiling) {
+ if (sna->kgem.gen >= 040 && arg->bo->tiling) {
br00 |= BLT_DST_TILED;
br13 >>= 2;
}
br13 |= blt_depth(drawable->depth) << 24;
br13 |= copy_ROP[gc->alu] << 16;
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, arg->bo);
do {
int bx1 = (box->x1 + sx) & ~7;
int bx2 = (box->x2 + sx + 7) & ~7;
@@ -6142,7 +5943,9 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
!kgem_check_bo_fenced(&sna->kgem, arg->bo) ||
!kgem_check_reloc(&sna->kgem, 1)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, arg->bo))
+ return; /* XXX fallback? */
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -6184,7 +5987,9 @@ sna_copy_bitmap_blt(DrawablePtr _bitmap, DrawablePtr drawable, GCPtr gc,
if (!kgem_check_batch(&sna->kgem, 8) ||
!kgem_check_bo_fenced(&sna->kgem, arg->bo) ||
!kgem_check_reloc_and_exec(&sna->kgem, 2)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, arg->bo))
+ return; /* XXX fallback? */
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -6276,14 +6081,14 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
br00 = XY_MONO_SRC_COPY | 3 << 20;
br13 = arg->bo->pitch;
- if (sna->kgem.gen >= 40 && arg->bo->tiling) {
+ if (sna->kgem.gen >= 040 && arg->bo->tiling) {
br00 |= BLT_DST_TILED;
br13 >>= 2;
}
br13 |= blt_depth(drawable->depth) << 24;
br13 |= copy_ROP[gc->alu] << 16;
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, arg->bo);
do {
int bx1 = (box->x1 + sx) & ~7;
int bx2 = (box->x2 + sx + 7) & ~7;
@@ -6303,7 +6108,9 @@ sna_copy_plane_blt(DrawablePtr source, DrawablePtr drawable, GCPtr gc,
if (!kgem_check_batch(&sna->kgem, 8) ||
!kgem_check_bo_fenced(&sna->kgem, arg->bo) ||
!kgem_check_reloc_and_exec(&sna->kgem, 2)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, arg->bo))
+ return; /* XXX fallback? */
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -6504,7 +6311,7 @@ sna_copy_plane(DrawablePtr src, DrawablePtr dst, GCPtr gc,
__FUNCTION__,
region.extents.x1, region.extents.y1,
region.extents.x2, region.extents.y2));
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
goto empty;
RegionTranslate(&region,
@@ -6750,7 +6557,7 @@ sna_poly_point(DrawablePtr drawable, GCPtr gc,
DBG(("%s: trying solid fill [%08lx] blt paths\n",
__FUNCTION__, gc->fgPixel));
- if ((bo = sna_drawable_use_bo(drawable, 0,
+ if ((bo = sna_drawable_use_bo(drawable, PREFER_GPU,
&region.extents, &damage)) &&
sna_poly_point_blt(drawable, bo, damage,
gc, mode, n, pt, flags & 2))
@@ -6761,7 +6568,7 @@ fallback:
DBG(("%s: fallback\n", __FUNCTION__));
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
if (!sna_gc_move_to_cpu(gc, drawable, &region))
@@ -6816,7 +6623,7 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
region_set(&clip, extents);
if (clipped) {
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
return true;
}
@@ -6827,8 +6634,8 @@ sna_poly_zero_line_blt(DrawablePtr drawable,
clip.extents.x2, clip.extents.y2,
dx, dy, damage));
- extents = REGION_RECTS(&clip);
- last_extents = extents + REGION_NUM_RECTS(&clip);
+ extents = RegionRects(&clip);
+ last_extents = extents + RegionNumRects(&clip);
b = box;
do {
@@ -7223,6 +7030,8 @@ sna_poly_line_blt(DrawablePtr drawable,
b->y1 = p.y;
b->y2 = last.y;
}
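+		/* extend by one pixel along the line's axis so the
+		 * endpoint is drawn */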
+ b->y2 += last.x == p.x;
+ b->x2 += last.y == p.y;
DBG(("%s: blt (%d, %d), (%d, %d)\n",
__FUNCTION__,
b->x1, b->y1, b->x2, b->y2));
@@ -7241,7 +7050,7 @@ sna_poly_line_blt(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
return true;
last.x = pt->x + drawable->x;
@@ -7280,6 +7089,8 @@ sna_poly_line_blt(DrawablePtr drawable,
b->y1 = p.y;
b->y2 = last.y;
}
+ b->y2 += last.x == p.x;
+ b->x2 += last.y == p.y;
DBG(("%s: blt (%d, %d), (%d, %d)\n",
__FUNCTION__,
b->x1, b->y1, b->x2, b->y2));
@@ -7336,6 +7147,8 @@ sna_poly_line_blt(DrawablePtr drawable,
box.y1 = p.y;
box.y2 = last.y;
}
+			box.y2 += last.x == p.x;
+			box.x2 += last.y == p.y;
DBG(("%s: blt (%d, %d), (%d, %d)\n",
__FUNCTION__,
box.x1, box.y1, box.x2, box.y2));
@@ -7683,7 +7496,7 @@ spans_fallback:
} else {
region_maybe_clip(&data.region,
gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (region_is_singular(&data.region))
@@ -7708,7 +7521,7 @@ spans_fallback:
} else {
region_maybe_clip(&data.region,
gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (region_is_singular(&data.region))
@@ -7785,7 +7598,7 @@ spans_fallback:
fallback:
DBG(("%s: fallback\n", __FUNCTION__));
region_maybe_clip(&data.region, gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (!sna_gc_move_to_cpu(gc, drawable, &data.region))
@@ -7915,7 +7728,7 @@ sna_poly_segment_blt(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
goto done;
if (clip.data) {
@@ -8022,7 +7835,7 @@ sna_poly_zero_segment_blt(DrawablePtr drawable,
region_set(&clip, extents);
if (clipped) {
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
return true;
}
DBG(("%s: [clipped] extents=(%d, %d), (%d, %d), delta=(%d, %d)\n",
@@ -8034,8 +7847,8 @@ sna_poly_zero_segment_blt(DrawablePtr drawable,
jump = _jump[(damage != NULL) | !!(dx|dy) << 1];
b = box;
- extents = REGION_RECTS(&clip);
- last_extents = extents + REGION_NUM_RECTS(&clip);
+ extents = RegionRects(&clip);
+ last_extents = extents + RegionNumRects(&clip);
do {
int n = _n;
const xSegment *s = _s;
@@ -8590,7 +8403,7 @@ spans_fallback:
} else {
region_maybe_clip(&data.region,
gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (region_is_singular(&data.region))
@@ -8629,7 +8442,7 @@ spans_fallback:
fallback:
DBG(("%s: fallback\n", __FUNCTION__));
region_maybe_clip(&data.region, gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (!sna_gc_move_to_cpu(gc, drawable, &data.region))
@@ -8657,6 +8470,7 @@ sna_poly_rectangle_extents(DrawablePtr drawable, GCPtr gc,
Box32Rec box;
int extra = gc->lineWidth >> 1;
bool clipped;
+ bool zero = false;
if (n == 0)
return 0;
@@ -8665,9 +8479,13 @@ sna_poly_rectangle_extents(DrawablePtr drawable, GCPtr gc,
box.y1 = r->y;
box.x2 = box.x1 + r->width;
box.y2 = box.y1 + r->height;
+ zero |= (r->width | r->height) == 0;
- while (--n)
- box32_add_rect(&box, ++r);
+ while (--n) {
+ r++;
+ zero |= (r->width | r->height) == 0;
+ box32_add_rect(&box, r);
+ }
box.x2++;
box.y2++;
@@ -8677,13 +8495,15 @@ sna_poly_rectangle_extents(DrawablePtr drawable, GCPtr gc,
box.x2 += extra;
box.y1 -= extra;
box.y2 += extra;
- }
+ zero = !zero;
+ } else
+ zero = true;
clipped = box32_trim_and_translate(&box, drawable, gc);
if (!box32_to_box16(&box, out))
return 0;
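+	/* bit 0: non-empty, bit 1: clipped, bit 2: outline expressible
+	 * as solid blts */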
- return 1 | clipped << 1;
+ return 1 | clipped << 1 | zero << 2;
}
static bool
@@ -8722,7 +8542,7 @@ zero:
xRectangle rr = *r++;
if ((rr.width | rr.height) == 0)
- continue;
+ continue; /* XXX -> PolyLine */
DBG(("%s - zero : r[%d] = (%d, %d) x (%d, %d)\n", __FUNCTION__,
n, rr.x, rr.y, rr.width, rr.height));
@@ -8777,7 +8597,7 @@ zero_clipped:
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
goto done;
if (clip.data) {
@@ -8791,7 +8611,7 @@ zero_clipped:
n, rr.x, rr.y, rr.width, rr.height));
if ((rr.width | rr.height) == 0)
- continue;
+ continue; /* XXX -> PolyLine */
rr.x += drawable->x;
rr.y += drawable->y;
@@ -8855,7 +8675,7 @@ zero_clipped:
n, rr.x, rr.y, rr.width, rr.height));
if ((rr.width | rr.height) == 0)
- continue;
+ continue; /* XXX -> PolyLine */
rr.x += drawable->x;
rr.y += drawable->y;
@@ -8923,7 +8743,7 @@ wide_clipped:
__FUNCTION__,
clip.extents.x1, clip.extents.y1,
clip.extents.x2, clip.extents.y2));
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
goto done;
if (clip.data) {
@@ -8935,7 +8755,7 @@ wide_clipped:
int count;
if ((rr.width | rr.height) == 0)
- continue;
+ continue; /* XXX -> PolyLine */
rr.x += drawable->x;
rr.y += drawable->y;
@@ -9100,7 +8920,7 @@ wide:
xRectangle rr = *r++;
if ((rr.width | rr.height) == 0)
- continue;
+ continue; /* XXX -> PolyLine */
rr.x += dx;
rr.y += dy;
@@ -9198,8 +9018,9 @@ sna_poly_rectangle(DrawablePtr drawable, GCPtr gc, int n, xRectangle *r)
goto fallback;
}
- DBG(("%s: line=%d [%d], join=%d [%d], mask=%lu [%d]\n",
+	DBG(("%s: fill=%d [%d], line=%d [%d], join=%d [%d], mask=%lu [%d]\n",
__FUNCTION__,
+ gc->fillStyle, gc->fillStyle == FillSolid,
gc->lineStyle, gc->lineStyle == LineSolid,
gc->joinStyle, gc->joinStyle == JoinMiter,
gc->planemask, PM_IS_SOLID(drawable, gc->planemask)));
@@ -9207,7 +9028,7 @@ sna_poly_rectangle(DrawablePtr drawable, GCPtr gc, int n, xRectangle *r)
if (!PM_IS_SOLID(drawable, gc->planemask))
goto fallback;
- if (gc->lineStyle == LineSolid && gc->joinStyle == JoinMiter) {
+ if (flags & 4 && gc->fillStyle == FillSolid && gc->lineStyle == LineSolid && gc->joinStyle == JoinMiter) {
DBG(("%s: trying blt solid fill [%08lx] paths\n",
__FUNCTION__, gc->fgPixel));
if ((bo = sna_drawable_use_bo(drawable, PREFER_GPU,
@@ -9231,7 +9052,7 @@ fallback:
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
if (!sna_gc_move_to_cpu(gc, drawable, &region))
@@ -9370,7 +9191,7 @@ sna_poly_arc(DrawablePtr drawable, GCPtr gc, int n, xArc *arc)
} else {
region_maybe_clip(&data.region,
gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (region_is_singular(&data.region)) {
@@ -9394,7 +9215,7 @@ sna_poly_arc(DrawablePtr drawable, GCPtr gc, int n, xArc *arc)
} else {
region_maybe_clip(&data.region,
gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
sna_gc_ops__tmp.FillSpans = sna_fill_spans__gpu;
@@ -9430,7 +9251,7 @@ sna_poly_arc(DrawablePtr drawable, GCPtr gc, int n, xArc *arc)
fallback:
DBG(("%s -- fallback\n", __FUNCTION__));
region_maybe_clip(&data.region, gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (!sna_gc_move_to_cpu(gc, drawable, &data.region))
@@ -9502,12 +9323,12 @@ sna_poly_fill_rect_blt(DrawablePtr drawable,
r.y2 - r.y1 == pixmap->drawable.height) {
struct sna_pixmap *priv = sna_pixmap(pixmap);
if (bo == priv->gpu_bo) {
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
sna_damage_destroy(&priv->cpu_damage);
list_del(&priv->list);
- priv->undamaged = false;
priv->clear = true;
priv->clear_color = gc->alu == GXcopy ? pixel : 0;
@@ -9573,7 +9394,7 @@ sna_poly_fill_rect_blt(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
goto done;
if (clip.data == NULL) {
@@ -9744,7 +9565,7 @@ sna_poly_fill_polygon(DrawablePtr draw, GCPtr gc,
} else {
region_maybe_clip(&data.region,
gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (region_is_singular(&data.region))
@@ -9781,7 +9602,7 @@ fallback:
data.region.extents.x1, data.region.extents.y1,
data.region.extents.x2, data.region.extents.y2));
region_maybe_clip(&data.region, gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region)) {
+ if (RegionNil(&data.region)) {
DBG(("%s: nothing to do, all clipped\n", __FUNCTION__));
return;
}
@@ -9831,7 +9652,8 @@ sna_pixmap_get_source_bo(PixmapPtr pixmap)
return upload;
}
- if (priv->gpu_damage && !sna_pixmap_move_to_gpu(pixmap, MOVE_READ))
+ if (priv->gpu_damage &&
+ !sna_pixmap_move_to_gpu(pixmap, MOVE_READ | MOVE_ASYNC_HINT))
return NULL;
if (priv->cpu_damage && priv->cpu_bo)
@@ -9873,17 +9695,19 @@ sna_poly_fill_rect_tiled_8x8_blt(DrawablePtr drawable,
DBG(("%s x %d [(%d, %d)x(%d, %d)...], clipped=%x\n",
__FUNCTION__, n, r->x, r->y, r->width, r->height, clipped));
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
if (!kgem_check_batch(&sna->kgem, 8+2*3) ||
!kgem_check_reloc(&sna->kgem, 2) ||
!kgem_check_bo_fenced(&sna->kgem, bo)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
br00 = XY_SCANLINE_BLT;
br13 = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
br00 |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -9988,7 +9812,7 @@ sna_poly_fill_rect_tiled_8x8_blt(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
goto done;
b = sna->kgem.batch + sna->kgem.nbatch;
@@ -10315,7 +10139,7 @@ sna_poly_fill_rect_tiled_blt(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
goto done;
if (clip.data == NULL) {
@@ -10393,8 +10217,8 @@ sna_poly_fill_rect_tiled_blt(DrawablePtr drawable,
region.data = NULL;
RegionIntersect(&region, &region, &clip);
- nbox = REGION_NUM_RECTS(&region);
- box = REGION_RECTS(&region);
+ nbox = RegionNumRects(&region);
+ box = RegionRects(&region);
while (nbox--) {
int height = box->y2 - box->y1;
int dst_y = box->y1;
@@ -10487,7 +10311,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
DBG(("%s: pat offset (%d, %d)\n", __FUNCTION__ ,px, py));
br00 = XY_SCANLINE_BLT | px << 12 | py << 8 | 3 << 20;
br13 = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
br00 |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -10507,11 +10331,13 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
} while (--j);
}
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
if (!kgem_check_batch(&sna->kgem, 9 + 2*3) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc(&sna->kgem, 1)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -10592,7 +10418,7 @@ sna_poly_fill_rect_stippled_8x8_blt(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
return true;
b = sna->kgem.batch + sna->kgem.nbatch;
@@ -10783,11 +10609,11 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
origin->x, origin->y));
get_drawable_deltas(drawable, pixmap, &dx, &dy);
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
br00 = 3 << 20;
br13 = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
br00 |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -10821,7 +10647,9 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc(&sna->kgem, 1)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -10863,7 +10691,9 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
if (!kgem_check_batch(&sna->kgem, 8) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc_and_exec(&sna->kgem, 2)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -10920,7 +10750,7 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip))
+ if (RegionNil(&clip))
return true;
pat.x = origin->x + drawable->x;
@@ -10963,7 +10793,9 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc(&sna->kgem, 1)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -11002,7 +10834,9 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
if (!kgem_check_batch(&sna->kgem, 8) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc_and_exec(&sna->kgem, 2)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -11103,7 +10937,9 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
if (!kgem_check_batch(&sna->kgem, 7+src_stride) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc(&sna->kgem, 1)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -11142,7 +10978,9 @@ sna_poly_fill_rect_stippled_1_blt(DrawablePtr drawable,
if (!kgem_check_batch(&sna->kgem, 8) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc_and_exec(&sna->kgem, 2)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -11252,7 +11090,9 @@ sna_poly_fill_rect_stippled_n_box__imm(struct sna *sna,
if (!kgem_check_batch(&sna->kgem, 7+len) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc(&sna->kgem, 1)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return; /* XXX fallback? */
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -11356,7 +11196,9 @@ sna_poly_fill_rect_stippled_n_box(struct sna *sna,
if (!kgem_check_batch(&sna->kgem, 7+len) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc(&sna->kgem, 2)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return; /* XXX fallback? */
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -11479,11 +11321,11 @@ sna_poly_fill_rect_stippled_n_blt__imm(DrawablePtr drawable,
clipped, gc->alu, gc->fillStyle == FillOpaqueStippled));
get_drawable_deltas(drawable, pixmap, &dx, &dy);
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
br00 = XY_MONO_SRC_COPY_IMM | 3 << 20;
br13 = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
br00 |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -11517,7 +11359,7 @@ sna_poly_fill_rect_stippled_n_blt__imm(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip)) {
+ if (RegionNil(&clip)) {
DBG(("%s: all clipped\n", __FUNCTION__));
return true;
}
@@ -11624,11 +11466,11 @@ sna_poly_fill_rect_stippled_n_blt(DrawablePtr drawable,
extents, clipped);
get_drawable_deltas(drawable, pixmap, &dx, &dy);
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
br00 = XY_MONO_SRC_COPY | 3 << 20;
br13 = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
br00 |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -11662,7 +11504,7 @@ sna_poly_fill_rect_stippled_n_blt(DrawablePtr drawable,
region_set(&clip, extents);
region_maybe_clip(&clip, gc->pCompositeClip);
- if (!RegionNotEmpty(&clip)) {
+ if (RegionNil(&clip)) {
DBG(("%s: all clipped\n", __FUNCTION__));
return true;
}
@@ -11943,10 +11785,10 @@ sna_poly_fill_rect(DrawablePtr draw, GCPtr gc, int n, xRectangle *rect)
box_inplace(pixmap, &region.extents))) {
DBG(("%s: promoting to full GPU\n", __FUNCTION__));
if (priv->gpu_bo) {
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
- priv->undamaged = false;
}
}
if (priv->cpu_damage == NULL) {
@@ -12000,7 +11842,7 @@ fallback:
region.extents.x2, region.extents.y2));
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region)) {
+ if (RegionNil(&region)) {
DBG(("%s: nothing to do, all clipped\n", __FUNCTION__));
return;
}
@@ -12093,7 +11935,7 @@ sna_poly_fill_arc(DrawablePtr draw, GCPtr gc, int n, xArc *arc)
} else {
region_maybe_clip(&data.region,
gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region))
+ if (RegionNil(&data.region))
return;
if (region_is_singular(&data.region))
@@ -12130,7 +11972,7 @@ fallback:
data.region.extents.x1, data.region.extents.y1,
data.region.extents.x2, data.region.extents.y2));
region_maybe_clip(&data.region, gc->pCompositeClip);
- if (!RegionNotEmpty(&data.region)) {
+ if (RegionNil(&data.region)) {
DBG(("%s: nothing to do, all clipped\n", __FUNCTION__));
return;
}
@@ -12256,19 +12098,21 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
_y += drawable->y + dy;
RegionTranslate(clip, dx, dy);
- extents = REGION_RECTS(clip);
- last_extents = extents + REGION_NUM_RECTS(clip);
+ extents = RegionRects(clip);
+ last_extents = extents + RegionNumRects(clip);
if (!transparent) /* emulate miImageGlyphBlt */
sna_blt_fill_boxes(sna, GXcopy,
bo, drawable->bitsPerPixel,
- bg, extents, REGION_NUM_RECTS(clip));
+ bg, extents, RegionNumRects(clip));
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
if (!kgem_check_batch(&sna->kgem, 16) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc(&sna->kgem, 1)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -12280,7 +12124,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
b = sna->kgem.batch + sna->kgem.nbatch;
b[0] = XY_SETUP_BLT | 3 << 20;
b[1] = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
b[0] |= BLT_DST_TILED;
b[1] >>= 2;
}
@@ -12298,7 +12142,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
sna->kgem.nbatch += 8;
br00 = XY_TEXT_IMMEDIATE_BLT;
- if (bo->tiling && sna->kgem.gen >= 40)
+ if (bo->tiling && sna->kgem.gen >= 040)
br00 |= BLT_DST_TILED;
do {
@@ -12343,7 +12187,7 @@ sna_glyph_blt(DrawablePtr drawable, GCPtr gc,
b = sna->kgem.batch + sna->kgem.nbatch;
b[0] = XY_SETUP_BLT | 3 << 20;
b[1] = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
b[0] |= BLT_DST_TILED;
b[1] >>= 2;
}
@@ -12585,7 +12429,7 @@ sna_poly_text8(DrawablePtr drawable, GCPtr gc,
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return x + extents.overallRight;
if (FORCE_FALLBACK)
@@ -12659,7 +12503,7 @@ sna_poly_text16(DrawablePtr drawable, GCPtr gc,
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return x + extents.overallRight;
if (FORCE_FALLBACK)
@@ -12740,7 +12584,7 @@ sna_image_text8(DrawablePtr drawable, GCPtr gc,
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
DBG(("%s: clipped extents (%d, %d), (%d, %d)\n",
@@ -12822,7 +12666,7 @@ sna_image_text16(DrawablePtr drawable, GCPtr gc,
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
DBG(("%s: clipped extents (%d, %d), (%d, %d)\n",
@@ -12901,19 +12745,21 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
_y += drawable->y + dy;
RegionTranslate(clip, dx, dy);
- extents = REGION_RECTS(clip);
- last_extents = extents + REGION_NUM_RECTS(clip);
+ extents = RegionRects(clip);
+ last_extents = extents + RegionNumRects(clip);
if (!transparent) /* emulate miImageGlyphBlt */
sna_blt_fill_boxes(sna, GXcopy,
bo, drawable->bitsPerPixel,
- bg, extents, REGION_NUM_RECTS(clip));
+ bg, extents, RegionNumRects(clip));
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
if (!kgem_check_batch(&sna->kgem, 16) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc(&sna->kgem, 1)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -12924,7 +12770,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
b = sna->kgem.batch + sna->kgem.nbatch;
b[0] = XY_SETUP_BLT | 1 << 20;
b[1] = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
b[0] |= BLT_DST_TILED;
b[1] >>= 2;
}
@@ -13005,7 +12851,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
b = sna->kgem.batch + sna->kgem.nbatch;
b[0] = XY_SETUP_BLT | 1 << 20;
b[1] = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
b[0] |= BLT_DST_TILED;
b[1] >>= 2;
}
@@ -13028,7 +12874,7 @@ sna_reversed_glyph_blt(DrawablePtr drawable, GCPtr gc,
sna->kgem.nbatch += 3 + len;
b[0] = XY_TEXT_IMMEDIATE_BLT | (1 + len);
- if (bo->tiling && sna->kgem.gen >= 40)
+ if (bo->tiling && sna->kgem.gen >= 040)
b[0] |= BLT_DST_TILED;
b[1] = (uint16_t)y1 << 16 | (uint16_t)x1;
b[2] = (uint16_t)(y1+h) << 16 | (uint16_t)(x1+w);
@@ -13122,7 +12968,7 @@ sna_image_glyph(DrawablePtr drawable, GCPtr gc,
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
if (FORCE_FALLBACK)
@@ -13200,7 +13046,7 @@ sna_poly_glyph(DrawablePtr drawable, GCPtr gc,
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
if (FORCE_FALLBACK)
@@ -13289,11 +13135,11 @@ sna_push_pixels_solid_blt(GCPtr gc,
region->extents.x1, region->extents.y1,
region->extents.x2, region->extents.y2));
- kgem_set_mode(&sna->kgem, KGEM_BLT);
+ kgem_set_mode(&sna->kgem, KGEM_BLT, bo);
/* Region is pre-clipped and translated into pixmap space */
- box = REGION_RECTS(region);
- n = REGION_NUM_RECTS(region);
+ box = RegionRects(region);
+ n = RegionNumRects(region);
do {
int bx1 = (box->x1 - region->extents.x1) & ~7;
int bx2 = (box->x2 - region->extents.x1 + 7) & ~7;
@@ -13309,7 +13155,9 @@ sna_push_pixels_solid_blt(GCPtr gc,
if (!kgem_check_batch(&sna->kgem, 8) ||
!kgem_check_bo_fenced(&sna->kgem, bo) ||
!kgem_check_reloc_and_exec(&sna->kgem, 2)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -13340,7 +13188,7 @@ sna_push_pixels_solid_blt(GCPtr gc,
b[0] = XY_MONO_SRC_COPY | 3 << 20;
b[0] |= ((box->x1 - region->extents.x1) & 7) << 17;
b[1] = bo->pitch;
- if (sna->kgem.gen >= 40 && bo->tiling) {
+ if (sna->kgem.gen >= 040 && bo->tiling) {
b[0] |= BLT_DST_TILED;
b[1] >>= 2;
}
@@ -13399,7 +13247,7 @@ sna_push_pixels(GCPtr gc, PixmapPtr bitmap, DrawablePtr drawable,
region.data = NULL;
region_maybe_clip(&region, gc->pCompositeClip);
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
switch (gc->fillStyle) {
@@ -13537,6 +13385,10 @@ static int sna_create_gc(GCPtr gc)
gc->miTranslate = 1;
gc->fExpose = 1;
+ gc->freeCompClip = 0;
+ gc->pCompositeClip = 0;
+ gc->pRotatedPixmap = 0;
+
fb_gc(gc)->bpp = bits_per_pixel(gc->depth);
gc->funcs = (GCFuncs *)&sna_gc_funcs;
@@ -13544,6 +13396,82 @@ static int sna_create_gc(GCPtr gc)
return true;
}
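+/* Fast GetImage: replicate a clear colour directly, or, when the pixmap
+ * is wholly on a busy GPU, wrap the caller's buffer with userptr and
+ * blit into it rather than stalling for a full readback.
+ */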
+static bool
+sna_get_image_blt(DrawablePtr drawable,
+ RegionPtr region,
+ char *dst)
+{
+ PixmapPtr pixmap = get_drawable_pixmap(drawable);
+ struct sna_pixmap *priv = sna_pixmap(pixmap);
+ struct sna *sna = to_sna_from_pixmap(pixmap);
+ struct kgem_bo *dst_bo;
+ bool ok = false;
+ int pitch;
+
+ if (!USE_USERPTR_DOWNLOADS)
+ return false;
+
+ if (priv == NULL)
+ return false;
+
+ if (priv->clear) {
+ int w = region->extents.x2 - region->extents.x1;
+ int h = region->extents.y2 - region->extents.y1;
+
+ pitch = PixmapBytePad(w, pixmap->drawable.depth);
+ if (priv->clear_color == 0 ||
+ pixmap->drawable.bitsPerPixel == 8) {
+ memset(dst, priv->clear_color, pitch * h);
+ } else {
+ pixman_fill((uint32_t *)dst,
+ pitch/sizeof(uint32_t),
+ pixmap->drawable.bitsPerPixel,
+ 0, 0,
+ w, h,
+ priv->clear_color);
+ }
+
+ return true;
+ }
+
+ if (!sna->kgem.has_userptr)
+ return false;
+
+ if (!DAMAGE_IS_ALL(priv->gpu_damage) ||
+ !__kgem_bo_is_busy(&sna->kgem, priv->gpu_bo))
+ return false;
+
+ DBG(("%s: download through a temporary map\n", __FUNCTION__));
+
+ pitch = PixmapBytePad(region->extents.x2 - region->extents.x1,
+ drawable->depth);
+ dst_bo = kgem_create_map(&sna->kgem, dst,
+ pitch * (region->extents.y2 - region->extents.y1),
+ false);
+ if (dst_bo) {
+ int16_t dx, dy;
+
+ dst_bo->flush = true;
+ dst_bo->pitch = pitch;
+ dst_bo->reusable = false;
+
+ get_drawable_deltas(drawable, pixmap, &dx, &dy);
+
+ ok = sna->render.copy_boxes(sna, GXcopy,
+ pixmap, priv->gpu_bo, dx, dy,
+ pixmap, dst_bo,
+ -region->extents.x1,
+ -region->extents.y1,
+ &region->extents, 1,
+ COPY_LAST);
+
+ kgem_bo_sync__cpu(&sna->kgem, dst_bo);
+ kgem_bo_destroy(&sna->kgem, dst_bo);
+ }
+
+ return ok;
+}
+
static void
sna_get_image(DrawablePtr drawable,
int x, int y, int w, int h,
@@ -13552,6 +13480,7 @@ sna_get_image(DrawablePtr drawable,
{
RegionRec region;
unsigned int flags;
+ bool can_blt;
if (!fbDrawableEnabled(drawable))
return;
@@ -13564,6 +13493,13 @@ sna_get_image(DrawablePtr drawable,
region.extents.y2 = region.extents.y1 + h;
region.data = NULL;
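+	/* check blt-compatible requests up front so the direct download
+	 * can run before any move-to-cpu */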
+ can_blt = format == ZPixmap &&
+ drawable->bitsPerPixel >= 8 &&
+ PM_IS_SOLID(drawable, mask);
+
+ if (can_blt && sna_get_image_blt(drawable, &region, dst))
+ return;
+
flags = MOVE_READ;
if ((w | h) == 1)
flags |= MOVE_INPLACE_HINT;
@@ -13572,9 +13508,7 @@ sna_get_image(DrawablePtr drawable,
if (!sna_drawable_move_region_to_cpu(drawable, &region, flags))
return;
- if (format == ZPixmap &&
- drawable->bitsPerPixel >= 8 &&
- PM_IS_SOLID(drawable, mask)) {
+ if (can_blt) {
PixmapPtr pixmap = get_drawable_pixmap(drawable);
int16_t dx, dy;
@@ -13629,7 +13563,7 @@ sna_copy_window(WindowPtr win, DDXPointRec origin, RegionPtr src)
RegionNull(&dst);
RegionIntersect(&dst, &win->borderClip, src);
- if (!RegionNotEmpty(&dst))
+ if (RegionNil(&dst))
return;
#ifdef COMPOSITE
@@ -13697,8 +13631,10 @@ sna_accel_flush_callback(CallbackListPtr *list,
list_del(&priv->list);
if (priv->shm) {
- DBG(("%s: syncing SHM pixmap=%ld\n", __FUNCTION__,
- priv->pixmap->drawable.serialNumber));
+ DBG(("%s: syncing SHM pixmap=%ld (refcnt=%d)\n",
+ __FUNCTION__,
+ priv->pixmap->drawable.serialNumber,
+ priv->pixmap->refcnt));
ret = sna_pixmap_move_to_cpu(priv->pixmap,
MOVE_READ | MOVE_WRITE);
assert(!ret || priv->gpu_bo == NULL);
@@ -13707,8 +13643,9 @@ sna_accel_flush_callback(CallbackListPtr *list,
} else {
DBG(("%s: flushing DRI pixmap=%ld\n", __FUNCTION__,
priv->pixmap->drawable.serialNumber));
- ret = sna_pixmap_move_to_gpu(priv->pixmap,
- MOVE_READ | __MOVE_FORCE);
+ if (sna_pixmap_move_to_gpu(priv->pixmap,
+ MOVE_READ | __MOVE_FORCE))
+ kgem_bo_unclean(&sna->kgem, priv->gpu_bo);
}
(void)ret;
}
@@ -13810,6 +13747,15 @@ static bool stop_flush(struct sna *sna, struct sna_pixmap *scanout)
return scanout->cpu_damage || scanout->gpu_bo->needs_flush;
}
+static void timer_enable(struct sna *sna, int whom, int interval)
+{
+ if (!sna->timer_active)
+ UpdateCurrentTimeIf();
+ sna->timer_active |= 1 << whom;
+ sna->timer_expire[whom] = TIME + interval;
+ DBG(("%s (time=%ld), starting timer %d\n", __FUNCTION__, (long)TIME, whom));
+}
+
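timer_enable() factors out the timer-arming boilerplate deleted in the hunks below; UpdateCurrentTimeIf() runs only on the transition from no-timers-active, so TIME is refreshed at most once per arming. Typical call, as in the rewritten sna_accel_do_flush():

if (!start_flush(sna, priv)) {
	if (priv)
		kgem_bo_flush(&sna->kgem, priv->gpu_bo);	/* nothing to defer */
} else
	timer_enable(sna, FLUSH_TIMER, interval / 2);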
static bool sna_accel_do_flush(struct sna *sna)
{
struct sna_pixmap *priv;
@@ -13822,9 +13768,6 @@ static bool sna_accel_do_flush(struct sna *sna)
return false;
}
- if (sna->flags & SNA_NO_DELAYED_FLUSH)
- return true;
-
interval = sna->vblank_interval ?: 20;
if (sna->timer_active & (1<<(FLUSH_TIMER))) {
int32_t delta = sna->timer_expire[FLUSH_TIMER] - TIME;
@@ -13835,26 +13778,18 @@ static bool sna_accel_do_flush(struct sna *sna)
sna->timer_expire[FLUSH_TIMER] = TIME + interval;
return true;
}
- } else {
- if (!start_flush(sna, priv)) {
- DBG(("%s -- no pending write to scanout\n", __FUNCTION__));
- if (priv)
- kgem_bo_flush(&sna->kgem, priv->gpu_bo);
- } else {
- sna->timer_active |= 1 << FLUSH_TIMER;
- sna->timer_expire[FLUSH_TIMER] = TIME + interval / 2;
- DBG(("%s (time=%ld), starting\n", __FUNCTION__, (long)TIME));
- }
- }
+ } else if (!start_flush(sna, priv)) {
+ DBG(("%s -- no pending write to scanout\n", __FUNCTION__));
+ if (priv)
+ kgem_bo_flush(&sna->kgem, priv->gpu_bo);
+ } else
+ timer_enable(sna, FLUSH_TIMER, interval/2);
return false;
}
static bool sna_accel_do_throttle(struct sna *sna)
{
- if (sna->flags & SNA_NO_THROTTLE)
- return false;
-
if (sna->timer_active & (1<<(THROTTLE_TIMER))) {
int32_t delta = sna->timer_expire[THROTTLE_TIMER] - TIME;
if (delta <= 3) {
@@ -13862,15 +13797,10 @@ static bool sna_accel_do_throttle(struct sna *sna)
sna->timer_expire[THROTTLE_TIMER] = TIME + 20;
return true;
}
- } else {
- if (!sna->kgem.need_retire) {
- DBG(("%s -- no pending activity\n", __FUNCTION__));
- } else {
- DBG(("%s (time=%ld), starting\n", __FUNCTION__, (long)TIME));
- sna->timer_active |= 1 << THROTTLE_TIMER;
- sna->timer_expire[THROTTLE_TIMER] = TIME + 20;
- }
- }
+ } else if (!sna->kgem.need_retire) {
+ DBG(("%s -- no pending activity\n", __FUNCTION__));
+ } else
+ timer_enable(sna, THROTTLE_TIMER, 20);
return false;
}
@@ -13885,65 +13815,12 @@ static bool sna_accel_do_expire(struct sna *sna)
TIME + MAX_INACTIVE_TIME * 1000;
return true;
}
- } else {
- if (sna->kgem.need_expire) {
- sna->timer_active |= 1 << EXPIRE_TIMER;
- sna->timer_expire[EXPIRE_TIMER] =
- TIME + MAX_INACTIVE_TIME * 1000;
- DBG(("%s (time=%ld), starting\n", __FUNCTION__, (long)TIME));
- }
- }
+ } else if (sna->kgem.need_expire)
+ timer_enable(sna, EXPIRE_TIMER, MAX_INACTIVE_TIME * 1000);
return false;
}
-static bool sna_accel_do_inactive(struct sna *sna)
-{
- if (!USE_INACTIVE)
- return false;
-
- if (sna->timer_active & (1<<(INACTIVE_TIMER))) {
- int32_t delta = sna->timer_expire[INACTIVE_TIMER] - TIME;
- if (delta <= 3) {
- sna->timer_expire[INACTIVE_TIMER] =
- TIME + 120 * 1000;
- DBG(("%s (time=%ld), triggered\n", __FUNCTION__, (long)TIME));
- return true;
- }
- } else {
- if (!list_is_empty(&sna->active_pixmaps)) {
- sna->timer_active |= 1 << INACTIVE_TIMER;
- sna->timer_expire[INACTIVE_TIMER] =
- TIME + 120 * 1000;
- DBG(("%s (time=%ld), starting\n", __FUNCTION__, (long)TIME));
- }
- }
-
- return false;
-}
-
-static int32_t sna_timeout(struct sna *sna)
-{
- int32_t now = TIME, next = 0;
- int i;
-
- DBG(("%s: now=%d, active=%08x\n",
- __FUNCTION__, (int)now, sna->timer_active));
- for (i = 0; i < NUM_TIMERS; i++) {
- if (sna->timer_active & (1 << i)) {
- int32_t delta = sna->timer_expire[i] - now;
- DBG(("%s: timer[%d] expires in %d [%d]\n",
- __FUNCTION__, i, delta, sna->timer_expire[i]));
- if (next == 0 || delta < next)
- next = delta;
- }
- }
-
- DBG(("%s: active=%08x, next=+%d\n",
- __FUNCTION__, sna->timer_active, next));
- return next;
-}
-
static void sna_accel_post_damage(struct sna *sna)
{
#if HAS_PIXMAP_SHARING
@@ -13958,7 +13835,7 @@ static void sna_accel_post_damage(struct sna *sna)
int n;
damage = DamageRegion(dirty->damage);
- if (!RegionNotEmpty(damage))
+ if (RegionNil(damage))
continue;
src = dirty->src;
@@ -13979,9 +13856,19 @@ static void sna_accel_post_damage(struct sna *sna)
region.extents.x2, region.extents.y2));
RegionIntersect(&region, &region, damage);
+ if (RegionNil(&region))
+ goto skip;
+
+ RegionTranslate(&region, -dirty->x, -dirty->y);
+ DamageRegionAppend(&dirty->slave_dst->drawable, &region);
+
+ DBG(("%s: slave: ((%d, %d), (%d, %d))x%d\n", __FUNCTION__,
+ region.extents.x1, region.extents.y1,
+ region.extents.x2, region.extents.y2,
+		     RegionNumRects(&region)));
- box = REGION_RECTS(&region);
- n = REGION_NUM_RECTS(&region);
+ box = RegionRects(&region);
+ n = RegionNumRects(&region);
if (wedged(sna)) {
fallback:
if (!sna_pixmap_move_to_cpu(src, MOVE_READ))
@@ -13994,53 +13881,52 @@ fallback:
do {
DBG(("%s: copy box (%d, %d)->(%d, %d)x(%d, %d)\n",
__FUNCTION__,
+ box->x1 + dirty->x, box->y1 + dirty->y,
box->x1, box->y1,
- box->x1 - dirty->x, box->y1 - dirty->y,
box->x2 - box->x1, box->y2 - box->y1));
assert(box->x2 > box->x1);
assert(box->y2 > box->y1);
+ assert(box->x1 + dirty->x >= 0);
+ assert(box->y1 + dirty->y >= 0);
+ assert(box->x2 + dirty->x <= src->drawable.width);
+ assert(box->y2 + dirty->y <= src->drawable.height);
+
assert(box->x1 >= 0);
assert(box->y1 >= 0);
assert(box->x2 <= src->drawable.width);
assert(box->y2 <= src->drawable.height);
- assert(box->x1 - dirty->x >= 0);
- assert(box->y1 - dirty->y >= 0);
- assert(box->x2 - dirty->x <= src->drawable.width);
- assert(box->y2 - dirty->y <= src->drawable.height);
-
memcpy_blt(src->devPrivate.ptr,
dst->devPrivate.ptr,
src->drawable.bitsPerPixel,
src->devKind, dst->devKind,
- box->x1, box->y1,
- box->x1 - dirty->x,
- box->y1 - dirty->y,
+ box->x1 + dirty->x,
+ box->y1 + dirty->y,
+ box->x1,
+ box->y1,
box->x2 - box->x1,
box->y2 - box->y1);
box++;
} while (--n);
} else {
- if (!sna_pixmap_move_to_gpu(src, MOVE_READ | __MOVE_FORCE))
+ if (!sna_pixmap_move_to_gpu(src, MOVE_READ | MOVE_ASYNC_HINT | __MOVE_FORCE))
goto fallback;
- if (!sna_pixmap_move_to_gpu(dst, MOVE_READ | MOVE_WRITE | __MOVE_FORCE))
+ if (!sna_pixmap_move_to_gpu(dst, MOVE_READ | MOVE_WRITE | MOVE_ASYNC_HINT | __MOVE_FORCE))
goto fallback;
if (!sna->render.copy_boxes(sna, GXcopy,
- src, sna_pixmap_get_bo(src), 0, 0,
- dst, sna_pixmap_get_bo(dst), -dirty->x, -dirty->y,
+ src, sna_pixmap_get_bo(src), dirty->x, dirty->y,
+					    dst, sna_pixmap_get_bo(dst), 0, 0,
box, n, COPY_LAST))
goto fallback;
flush = true;
}
- RegionTranslate(&region, -dirty->x, -dirty->y);
- DamageRegionAppend(&dirty->slave_dst->drawable, &region);
-
+ DamageRegionProcessPending(&dirty->slave_dst->drawable);
skip:
RegionUninit(&region);
DamageEmpty(dirty->damage);
@@ -14099,105 +13985,6 @@ static void sna_accel_expire(struct sna *sna)
sna_accel_disarm_timer(sna, EXPIRE_TIMER);
}
-static void sna_accel_inactive(struct sna *sna)
-{
- struct sna_pixmap *priv;
- struct list preserve;
-
- DBG(("%s (time=%ld)\n", __FUNCTION__, (long)TIME));
-
-#if HAS_FULL_DEBUG
- {
- unsigned count, bytes;
-
- count = bytes = 0;
- list_for_each_entry(priv, &sna->inactive_clock[1], inactive)
- if (!priv->pinned)
- count++, bytes += kgem_bo_size(priv->gpu_bo);
-
- DBG(("%s: trimming %d inactive GPU buffers, %d bytes\n",
- __FUNCTION__, count, bytes));
-
- count = bytes = 0;
- list_for_each_entry(priv, &sna->active_pixmaps, inactive) {
- if (priv->ptr &&
- sna_damage_is_all(&priv->gpu_damage,
- priv->pixmap->drawable.width,
- priv->pixmap->drawable.height)) {
- count++, bytes += priv->pixmap->devKind * priv->pixmap->drawable.height;
- }
- }
-
- DBG(("%s: trimming %d inactive CPU buffers, %d bytes\n",
- __FUNCTION__, count, bytes));
- }
-#endif
-
- /* clear out the oldest inactive pixmaps */
- list_init(&preserve);
- while (!list_is_empty(&sna->inactive_clock[1])) {
- priv = list_first_entry(&sna->inactive_clock[1],
- struct sna_pixmap,
- inactive);
- assert((priv->create & KGEM_CAN_CREATE_LARGE) == 0);
- assert(priv->gpu_bo);
- assert(!priv->gpu_bo->proxy);
-
- /* XXX Rather than discarding the GPU buffer here, we
- * could mark it purgeable and allow the shrinker to
- * reap its storage only under memory pressure.
- */
- list_del(&priv->inactive);
- if (priv->pinned)
- continue;
-
- if (priv->ptr &&
- sna_damage_is_all(&priv->gpu_damage,
- priv->pixmap->drawable.width,
- priv->pixmap->drawable.height)) {
- DBG(("%s: discarding inactive CPU shadow\n",
- __FUNCTION__));
- sna_damage_destroy(&priv->cpu_damage);
- list_del(&priv->list);
-
- assert(priv->cpu_bo == NULL || !priv->cpu_bo->flush);
- assert(!priv->shm);
- sna_pixmap_free_cpu(sna, priv);
- priv->undamaged = false;
- priv->cpu = false;
-
- list_add(&priv->inactive, &preserve);
- } else {
- DBG(("%s: discarding inactive GPU bo handle=%d\n",
- __FUNCTION__, priv->gpu_bo->handle));
- if (!sna_pixmap_move_to_cpu(priv->pixmap,
- MOVE_READ | MOVE_WRITE | MOVE_ASYNC_HINT))
- list_add(&priv->inactive, &preserve);
- }
- }
-
- /* Age the current inactive pixmaps */
- sna->inactive_clock[1].next = sna->inactive_clock[0].next;
- sna->inactive_clock[0].next->prev = &sna->inactive_clock[1];
- sna->inactive_clock[0].prev->next = &sna->inactive_clock[1];
- sna->inactive_clock[1].prev = sna->inactive_clock[0].prev;
-
- sna->inactive_clock[0].next = sna->active_pixmaps.next;
- sna->active_pixmaps.next->prev = &sna->inactive_clock[0];
- sna->active_pixmaps.prev->next = &sna->inactive_clock[0];
- sna->inactive_clock[0].prev = sna->active_pixmaps.prev;
-
- sna->active_pixmaps.next = preserve.next;
- preserve.next->prev = &sna->active_pixmaps;
- preserve.prev->next = &sna->active_pixmaps;
- sna->active_pixmaps.prev = preserve.prev;
-
- if (list_is_empty(&sna->inactive_clock[1]) &&
- list_is_empty(&sna->inactive_clock[0]) &&
- list_is_empty(&sna->active_pixmaps))
- sna_accel_disarm_timer(sna, INACTIVE_TIMER);
-}
-
#ifdef DEBUG_MEMORY
static bool sna_accel_do_debug_memory(struct sna *sna)
{
@@ -14236,7 +14023,7 @@ sna_get_window_pixmap(WindowPtr window)
static void
sna_set_window_pixmap(WindowPtr window, PixmapPtr pixmap)
{
- *(PixmapPtr *)dixGetPrivateAddr(&window->devPrivates, &sna_window_key) = pixmap;
+ *(PixmapPtr *)__get_private(window, sna_window_key) = pixmap;
}
static Bool
@@ -14306,11 +14093,15 @@ static bool sna_picture_init(ScreenPtr screen)
{
PictureScreenPtr ps;
+ DBG(("%s\n", __FUNCTION__));
+
if (!miPictureInit(screen, NULL, 0))
return false;
ps = GetPictureScreen(screen);
assert(ps != NULL);
+ assert(ps->CreatePicture != NULL);
+ assert(ps->DestroyPicture != NULL);
ps->Composite = sna_composite;
ps->CompositeRects = sna_composite_rectangles;
@@ -14320,25 +14111,38 @@ static bool sna_picture_init(ScreenPtr screen)
ps->UnrealizeGlyph = sna_glyph_unrealize;
ps->AddTraps = sna_add_traps;
ps->Trapezoids = sna_composite_trapezoids;
+#if HAS_PIXMAN_TRIANGLES
ps->Triangles = sna_composite_triangles;
#if PICTURE_SCREEN_VERSION >= 2
ps->TriStrip = sna_composite_tristrip;
ps->TriFan = sna_composite_trifan;
#endif
+#endif
return true;
}
+static bool sna_option_accel_blt(struct sna *sna)
+{
+ const char *s;
+
+ s = xf86GetOptValString(sna->Options, OPTION_ACCEL_METHOD);
+ if (s == NULL)
+ return false;
+
+ return strcasecmp(s, "blt") == 0;
+}
+
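sna_option_accel_blt() gives users a switch to bypass the 3D render backends and use only the BLT ring. Assuming OPTION_ACCEL_METHOD is the driver's usual AccelMethod option, the matching configuration would be:

Section "Device"
	Identifier "Intel Graphics"
	Driver     "intel"
	Option     "AccelMethod" "blt"
EndSection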
bool sna_accel_init(ScreenPtr screen, struct sna *sna)
{
const char *backend;
+ DBG(("%s\n", __FUNCTION__));
+
sna_font_key = AllocateFontPrivateIndex();
list_init(&sna->flush_pixmaps);
list_init(&sna->active_pixmaps);
- list_init(&sna->inactive_clock[0]);
- list_init(&sna->inactive_clock[1]);
AddGeneralSocket(sna->kgem.fd);
@@ -14404,33 +14208,30 @@ bool sna_accel_init(ScreenPtr screen, struct sna *sna)
return false;
backend = "no";
- sna->have_render = false;
no_render_init(sna);
-#if !DEBUG_NO_RENDER
- if (sna->info->gen >= 80) {
- } else if (sna->info->gen >= 70) {
- if ((sna->have_render = gen7_render_init(sna)))
+ if (sna_option_accel_blt(sna) || sna->info->gen >= 0100) {
+ } else if (sna->info->gen >= 070) {
+ if (gen7_render_init(sna))
backend = "IvyBridge";
- } else if (sna->info->gen >= 60) {
- if ((sna->have_render = gen6_render_init(sna)))
+ } else if (sna->info->gen >= 060) {
+ if (gen6_render_init(sna))
backend = "SandyBridge";
- } else if (sna->info->gen >= 50) {
- if ((sna->have_render = gen5_render_init(sna)))
+ } else if (sna->info->gen >= 050) {
+ if (gen5_render_init(sna))
backend = "Ironlake";
- } else if (sna->info->gen >= 40) {
- if ((sna->have_render = gen4_render_init(sna)))
- backend = "Broadwater";
- } else if (sna->info->gen >= 30) {
- if ((sna->have_render = gen3_render_init(sna)))
+ } else if (sna->info->gen >= 040) {
+ if (gen4_render_init(sna))
+ backend = "Broadwater/Crestline";
+ } else if (sna->info->gen >= 030) {
+ if (gen3_render_init(sna))
backend = "gen3";
- } else if (sna->info->gen >= 20) {
- if ((sna->have_render = gen2_render_init(sna)))
+ } else if (sna->info->gen >= 020) {
+ if (gen2_render_init(sna))
backend = "gen2";
}
-#endif
- DBG(("%s(backend=%s, have_render=%d)\n",
- __FUNCTION__, backend, sna->have_render));
+ DBG(("%s(backend=%s, prefer_gpu=%x)\n",
+ __FUNCTION__, backend, sna->render.prefer_gpu));
kgem_reset(&sna->kgem);
@@ -14443,6 +14244,8 @@ bool sna_accel_init(ScreenPtr screen, struct sna *sna)
void sna_accel_create(struct sna *sna)
{
+ DBG(("%s\n", __FUNCTION__));
+
if (!sna_glyphs_create(sna))
goto fail;
@@ -14457,7 +14260,6 @@ void sna_accel_create(struct sna *sna)
fail:
xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
"Failed to allocate caches, disabling RENDER acceleration\n");
- sna->have_render = false;
no_render_init(sna);
}
@@ -14481,6 +14283,8 @@ void sna_accel_watch_flush(struct sna *sna, int enable)
void sna_accel_close(struct sna *sna)
{
+ DBG(("%s\n", __FUNCTION__));
+
sna_composite_close(sna);
sna_gradients_close(sna);
sna_glyphs_close(sna);
@@ -14500,24 +14304,25 @@ void sna_accel_close(struct sna *sna)
void sna_accel_block_handler(struct sna *sna, struct timeval **tv)
{
- UpdateCurrentTimeIf();
+ if (sna->timer_active)
+ UpdateCurrentTimeIf();
- if (sna->kgem.nbatch && kgem_is_idle(&sna->kgem)) {
+ if (sna->kgem.nbatch &&
+ (sna->kgem.scanout_busy ||
+ kgem_ring_is_idle(&sna->kgem, sna->kgem.ring))) {
DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
_kgem_submit(&sna->kgem);
}
if (sna_accel_do_flush(sna))
sna_accel_flush(sna);
- assert(sna->flags & SNA_NO_DELAYED_FLUSH ||
- sna_accel_scanout(sna) == NULL ||
+ assert(sna_accel_scanout(sna) == NULL ||
sna_accel_scanout(sna)->gpu_bo->exec == NULL ||
sna->timer_active & (1<<(FLUSH_TIMER)));
if (sna_accel_do_throttle(sna))
sna_accel_throttle(sna);
- assert(sna->flags & SNA_NO_THROTTLE ||
- !sna->kgem.need_retire ||
+ assert(!sna->kgem.need_retire ||
sna->timer_active & (1<<(THROTTLE_TIMER)));
if (sna_accel_do_expire(sna))
@@ -14525,9 +14330,6 @@ void sna_accel_block_handler(struct sna *sna, struct timeval **tv)
assert(!sna->kgem.need_expire ||
sna->timer_active & (1<<(EXPIRE_TIMER)));
- if (sna_accel_do_inactive(sna))
- sna_accel_inactive(sna);
-
if (sna_accel_do_debug_memory(sna))
sna_accel_debug_memory(sna);
@@ -14537,24 +14339,28 @@ void sna_accel_block_handler(struct sna *sna, struct timeval **tv)
sna->watch_flush = 0;
}
- if (sna->timer_active) {
+ if (sna->timer_active & 1) {
int32_t timeout;
DBG(("%s: evaluating timers, active=%x\n",
__FUNCTION__, sna->timer_active));
- timeout = sna_timeout(sna);
- if (timeout) {
- if (*tv == NULL) {
- *tv = &sna->timer_tv;
- goto set_tv;
- }
- if ((*tv)->tv_sec * 1000 + (*tv)->tv_usec / 1000 > timeout) {
+
+ timeout = sna->timer_expire[0] - TIME;
+ DBG(("%s: flush timer expires in %d [%d]\n",
+ __FUNCTION__, timeout, sna->timer_expire[0]));
+
+ if (*tv == NULL) {
+ *tv = &sna->timer_tv;
+ goto set_tv;
+ }
+ if ((*tv)->tv_sec * 1000 + (*tv)->tv_usec / 1000 > timeout) {
set_tv:
- (*tv)->tv_sec = timeout / 1000;
- (*tv)->tv_usec = timeout % 1000 * 1000;
- }
+ (*tv)->tv_sec = timeout / 1000;
+ (*tv)->tv_usec = timeout % 1000 * 1000;
}
}
+
+ sna->kgem.scanout_busy = false;
}
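With the throttle/expire timers no longer driving the select() timeout, only the flush timer (bit 0) matters, which is why the generic sna_timeout() scan was deleted above. A sketch of the reduced computation, assuming FLUSH_TIMER == 0 as the `timer_active & 1` test implies:

int32_t timeout = sna->timer_expire[FLUSH_TIMER] - TIME;
bool set = (*tv == NULL);
if (set)
	*tv = &sna->timer_tv;	/* no one else is waiting */
if (set || (*tv)->tv_sec * 1000 + (*tv)->tv_usec / 1000 > timeout) {
	(*tv)->tv_sec = timeout / 1000;
	(*tv)->tv_usec = timeout % 1000 * 1000;
}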
void sna_accel_wakeup_handler(struct sna *sna)
@@ -14563,14 +14369,22 @@ void sna_accel_wakeup_handler(struct sna *sna)
if (sna->kgem.need_retire)
kgem_retire(&sna->kgem);
- if (!sna->mode.shadow_active && !sna->kgem.need_retire) {
+ if (sna->kgem.nbatch && !sna->kgem.need_retire) {
DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
- kgem_submit(&sna->kgem);
+ _kgem_submit(&sna->kgem);
}
if (sna->kgem.need_purge)
kgem_purge_cache(&sna->kgem);
+
+ if (FAULT_INJECTION && (rand() % FAULT_INJECTION) == 0) {
+ ErrorF("%s hardware acceleration\n",
+ sna->kgem.wedged ? "Re-enabling" : "Disabling");
+ kgem_submit(&sna->kgem);
+ sna->kgem.wedged = !sna->kgem.wedged;
+ }
}
void sna_accel_free(struct sna *sna)
{
+ DBG(("%s\n", __FUNCTION__));
}
diff --git a/src/sna/sna_blt.c b/src/sna/sna_blt.c
index 7410eb116..edfcb9ea4 100644
--- a/src/sna/sna_blt.c
+++ b/src/sna/sna_blt.c
@@ -119,7 +119,7 @@ static bool sna_blt_fill_init(struct sna *sna,
blt->br13 = bo->pitch;
blt->cmd = XY_SCANLINE_BLT;
- if (kgem->gen >= 40 && bo->tiling) {
+ if (kgem->gen >= 040 && bo->tiling) {
blt->cmd |= BLT_DST_TILED;
blt->br13 >>= 2;
}
@@ -145,11 +145,12 @@ static bool sna_blt_fill_init(struct sna *sna,
blt->pixel = pixel;
blt->bpp = bpp;
- kgem_set_mode(kgem, KGEM_BLT);
+ kgem_set_mode(kgem, KGEM_BLT, bo);
if (!kgem_check_batch(kgem, 12) ||
!kgem_check_bo_fenced(kgem, bo)) {
- _kgem_submit(kgem);
- assert(kgem_check_bo_fenced(kgem, bo));
+ kgem_submit(kgem);
+ if (!kgem_check_bo_fenced(kgem, bo))
+ return false;
_kgem_set_mode(kgem, KGEM_BLT);
}
@@ -267,14 +268,14 @@ static bool sna_blt_copy_init(struct sna *sna,
blt->cmd |= BLT_WRITE_ALPHA | BLT_WRITE_RGB;
blt->pitch[0] = src->pitch;
- if (kgem->gen >= 40 && src->tiling) {
+ if (kgem->gen >= 040 && src->tiling) {
blt->cmd |= BLT_SRC_TILED;
blt->pitch[0] >>= 2;
}
assert(blt->pitch[0] <= MAXSHORT);
blt->pitch[1] = dst->pitch;
- if (kgem->gen >= 40 && dst->tiling) {
+ if (kgem->gen >= 040 && dst->tiling) {
blt->cmd |= BLT_DST_TILED;
blt->pitch[1] >>= 2;
}
@@ -289,9 +290,9 @@ static bool sna_blt_copy_init(struct sna *sna,
case 8: break;
}
- kgem_set_mode(kgem, KGEM_BLT);
+ kgem_set_mode(kgem, KGEM_BLT, dst);
if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL)) {
- _kgem_submit(kgem);
+ kgem_submit(kgem);
if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL))
return false;
_kgem_set_mode(kgem, KGEM_BLT);
@@ -317,14 +318,14 @@ static bool sna_blt_alpha_fixup_init(struct sna *sna,
blt->cmd = XY_FULL_MONO_PATTERN_BLT;
blt->pitch[0] = src->pitch;
- if (kgem->gen >= 40 && src->tiling) {
+ if (kgem->gen >= 040 && src->tiling) {
blt->cmd |= BLT_SRC_TILED;
blt->pitch[0] >>= 2;
}
assert(blt->pitch[0] <= MAXSHORT);
blt->pitch[1] = dst->pitch;
- if (kgem->gen >= 40 && dst->tiling) {
+ if (kgem->gen >= 040 && dst->tiling) {
blt->cmd |= BLT_DST_TILED;
blt->pitch[1] >>= 2;
}
@@ -341,9 +342,9 @@ static bool sna_blt_alpha_fixup_init(struct sna *sna,
}
blt->pixel = alpha;
- kgem_set_mode(kgem, KGEM_BLT);
+ kgem_set_mode(kgem, KGEM_BLT, dst);
if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL)) {
- _kgem_submit(kgem);
+ kgem_submit(kgem);
if (!kgem_check_many_bo_fenced(kgem, src, dst, NULL))
return false;
_kgem_set_mode(kgem, KGEM_BLT);
@@ -433,7 +434,7 @@ static void sna_blt_copy_one(struct sna *sna,
kgem->batch[kgem->nbatch-6] == (XY_COLOR_BLT | (blt->cmd & (BLT_WRITE_ALPHA | BLT_WRITE_RGB))) &&
kgem->batch[kgem->nbatch-4] == ((uint32_t)dst_y << 16 | (uint16_t)dst_x) &&
kgem->batch[kgem->nbatch-3] == ((uint32_t)(dst_y+height) << 16 | (uint16_t)(dst_x+width)) &&
- kgem->reloc[kgem->nreloc-1].target_handle == blt->bo[1]->handle) {
+ kgem->reloc[kgem->nreloc-1].target_handle == blt->bo[1]->target_handle) {
DBG(("%s: replacing last fill\n", __FUNCTION__));
if (kgem_check_batch(kgem, 8-6)) {
b = kgem->batch + kgem->nbatch - 6;
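The handle -> target_handle comparisons in this file account for execbuffer relocations that may now be stored handle-LUT relative. A comment-only sketch of the assumed relationship (see kgem.c for the authoritative version):

/* with I915_EXEC_HANDLE_LUT the kernel resolves reloc targets through a
 * per-batch table, so kgem stores that table index in bo->target_handle;
 * without the LUT, bo->target_handle == bo->handle. Either way it is the
 * value kgem_add_reloc() wrote, hence the comparison above. */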
@@ -933,6 +934,76 @@ static void blt_composite_fill_boxes_no_offset(struct sna *sna,
_sna_blt_fill_boxes(sna, &op->u.blt, box, n);
}
+static void blt_composite_fill_boxes_no_offset__thread(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ struct kgem *kgem = &sna->kgem;
+ const struct sna_blt_state *blt = &op->u.blt;
+ uint32_t cmd = blt->cmd;
+
+ DBG(("%s: %08x x %d\n", __FUNCTION__, blt->pixel, nbox));
+
+ sna_vertex_lock(&sna->render);
+ if (!kgem_check_batch(kgem, 3)) {
+ sna_vertex_wait__locked(&sna->render);
+ sna_blt_fill_begin(sna, blt);
+ }
+
+ do {
+ uint32_t *b = kgem->batch + kgem->nbatch;
+ int nbox_this_time;
+
+ nbox_this_time = nbox;
+ if (3*nbox_this_time > kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED)
+ nbox_this_time = (kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED) / 3;
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ kgem->nbatch += 3 * nbox_this_time;
+ assert(kgem->nbatch < kgem->surface);
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ while (nbox_this_time >= 8) {
+ b[0] = cmd; *(uint64_t *)(b+1) = *(const uint64_t *)box++;
+ b[3] = cmd; *(uint64_t *)(b+4) = *(const uint64_t *)box++;
+ b[6] = cmd; *(uint64_t *)(b+7) = *(const uint64_t *)box++;
+ b[9] = cmd; *(uint64_t *)(b+10) = *(const uint64_t *)box++;
+ b[12] = cmd; *(uint64_t *)(b+13) = *(const uint64_t *)box++;
+ b[15] = cmd; *(uint64_t *)(b+16) = *(const uint64_t *)box++;
+ b[18] = cmd; *(uint64_t *)(b+19) = *(const uint64_t *)box++;
+ b[21] = cmd; *(uint64_t *)(b+22) = *(const uint64_t *)box++;
+ b += 24;
+ nbox_this_time -= 8;
+ }
+ if (nbox_this_time & 4) {
+ b[0] = cmd; *(uint64_t *)(b+1) = *(const uint64_t *)box++;
+ b[3] = cmd; *(uint64_t *)(b+4) = *(const uint64_t *)box++;
+ b[6] = cmd; *(uint64_t *)(b+7) = *(const uint64_t *)box++;
+ b[9] = cmd; *(uint64_t *)(b+10) = *(const uint64_t *)box++;
+ b += 12;
+ }
+ if (nbox_this_time & 2) {
+ b[0] = cmd; *(uint64_t *)(b+1) = *(const uint64_t *)box++;
+ b[3] = cmd; *(uint64_t *)(b+4) = *(const uint64_t *)box++;
+ b += 6;
+ }
+ if (nbox_this_time & 1) {
+ b[0] = cmd; *(uint64_t *)(b+1) = *(const uint64_t *)box++;
+ }
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ if (!nbox)
+ break;
+
+ sna_vertex_wait__locked(&sna->render);
+ sna_blt_fill_begin(sna, blt);
+ } while (1);
+ sna_vertex_unlock(&sna->render);
+}
+
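The new __thread emitters follow a reserve-then-fill scheme so several threads can pack boxes into one batch: batch space is claimed while the render mutex is held, then the claimed words are written without the lock. The locking skeleton, extracted as a sketch:

sna_vertex_lock(&sna->render);
b = kgem->batch + kgem->nbatch;		/* start of this thread's claim */
kgem->nbatch += 3 * nbox;		/* reserve while serialized */
sna_vertex_acquire__locked(&sna->render);
sna_vertex_unlock(&sna->render);

/* ... fill b[0 .. 3*nbox-1] concurrently with other threads ... */

sna_vertex_lock(&sna->render);
sna_vertex_release__locked(&sna->render);
sna_vertex_unlock(&sna->render);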
fastcall static void blt_composite_fill_box(struct sna *sna,
const struct sna_composite_op *op,
const BoxRec *box)
@@ -956,6 +1027,92 @@ static void blt_composite_fill_boxes(struct sna *sna,
} while (--n);
}
+static inline uint64_t add4(const BoxRec *b, int16_t x, int16_t y)
+{
+ union {
+ uint64_t v;
+ int16_t i[4];
+ } vi;
+ vi.v = *(uint64_t *)b;
+ vi.i[0] += x;
+ vi.i[1] += y;
+ vi.i[2] += x;
+ vi.i[3] += y;
+ return vi.v;
+}
+
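add4() translates both corners of a BoxRec in one shot by treating it as four packed int16 lanes {x1, y1, x2, y2}, so the unrolled loops below cost one 64-bit store per box. For example:

BoxRec box = { 10, 20, 30, 40 };	/* x1, y1, x2, y2 */
uint64_t v = add4(&box, 5, 7);		/* lanes now {15, 27, 35, 47} */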
+static void blt_composite_fill_boxes__thread(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ struct kgem *kgem = &sna->kgem;
+ const struct sna_blt_state *blt = &op->u.blt;
+ uint32_t cmd = blt->cmd;
+ int16_t dx = op->dst.x;
+ int16_t dy = op->dst.y;
+
+ DBG(("%s: %08x x %d\n", __FUNCTION__, blt->pixel, nbox));
+
+ sna_vertex_lock(&sna->render);
+ if (!kgem_check_batch(kgem, 3)) {
+ sna_vertex_wait__locked(&sna->render);
+ sna_blt_fill_begin(sna, blt);
+ }
+
+ do {
+ uint32_t *b = kgem->batch + kgem->nbatch;
+ int nbox_this_time;
+
+ nbox_this_time = nbox;
+ if (3*nbox_this_time > kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED)
+ nbox_this_time = (kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED) / 3;
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ kgem->nbatch += 3 * nbox_this_time;
+ assert(kgem->nbatch < kgem->surface);
+ sna_vertex_acquire__locked(&sna->render);
+ sna_vertex_unlock(&sna->render);
+
+ while (nbox_this_time >= 8) {
+ b[0] = cmd; *(uint64_t *)(b+1) = add4(box++, dx, dy);
+ b[3] = cmd; *(uint64_t *)(b+4) = add4(box++, dx, dy);
+ b[6] = cmd; *(uint64_t *)(b+7) = add4(box++, dx, dy);
+ b[9] = cmd; *(uint64_t *)(b+10) = add4(box++, dx, dy);
+ b[12] = cmd; *(uint64_t *)(b+13) = add4(box++, dx, dy);
+ b[15] = cmd; *(uint64_t *)(b+16) = add4(box++, dx, dy);
+ b[18] = cmd; *(uint64_t *)(b+19) = add4(box++, dx, dy);
+ b[21] = cmd; *(uint64_t *)(b+22) = add4(box++, dx, dy);
+ b += 24;
+ nbox_this_time -= 8;
+ }
+ if (nbox_this_time & 4) {
+ b[0] = cmd; *(uint64_t *)(b+1) = add4(box++, dx, dy);
+ b[3] = cmd; *(uint64_t *)(b+4) = add4(box++, dx, dy);
+ b[6] = cmd; *(uint64_t *)(b+7) = add4(box++, dx, dy);
+ b[9] = cmd; *(uint64_t *)(b+10) = add4(box++, dx, dy);
+ b += 12;
+ }
+ if (nbox_this_time & 2) {
+ b[0] = cmd; *(uint64_t *)(b+1) = add4(box++, dx, dy);
+ b[3] = cmd; *(uint64_t *)(b+4) = add4(box++, dx, dy);
+ b += 6;
+ }
+ if (nbox_this_time & 1) {
+ b[0] = cmd; *(uint64_t *)(b+1) = add4(box++, dx, dy);
+ }
+
+ sna_vertex_lock(&sna->render);
+ sna_vertex_release__locked(&sna->render);
+ if (!nbox)
+ break;
+
+ sna_vertex_wait__locked(&sna->render);
+ sna_blt_fill_begin(sna, blt);
+ } while (1);
+ sna_vertex_unlock(&sna->render);
+}
+
fastcall
static void blt_composite_nop(struct sna *sna,
const struct sna_composite_op *op,
@@ -980,8 +1137,10 @@ begin_blt(struct sna *sna,
struct sna_composite_op *op)
{
if (!kgem_check_bo_fenced(&sna->kgem, op->dst.bo)) {
- _kgem_submit(&sna->kgem);
- assert(kgem_check_bo_fenced(&sna->kgem, op->dst.bo));
+ kgem_submit(&sna->kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, op->dst.bo))
+ return false;
+
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
@@ -1011,6 +1170,7 @@ prepare_blt_clear(struct sna *sna,
op->blt = blt_composite_fill__cpu;
op->box = blt_composite_fill_box__cpu;
op->boxes = blt_composite_fill_boxes__cpu;
+ op->thread_boxes = blt_composite_fill_boxes__cpu;
op->done = nop_done;
op->u.blt.pixel = 0;
return true;
@@ -1020,9 +1180,11 @@ prepare_blt_clear(struct sna *sna,
if (op->dst.x|op->dst.y) {
op->box = blt_composite_fill_box;
op->boxes = blt_composite_fill_boxes;
+ op->thread_boxes = blt_composite_fill_boxes__thread;
} else {
op->box = blt_composite_fill_box_no_offset;
op->boxes = blt_composite_fill_boxes_no_offset;
+ op->thread_boxes = blt_composite_fill_boxes_no_offset__thread;
}
op->done = nop_done;
@@ -1047,6 +1209,7 @@ prepare_blt_fill(struct sna *sna,
op->blt = blt_composite_fill__cpu;
op->box = blt_composite_fill_box__cpu;
op->boxes = blt_composite_fill_boxes__cpu;
+ op->thread_boxes = blt_composite_fill_boxes__cpu;
op->done = nop_done;
return true;
}
@@ -1055,9 +1218,11 @@ prepare_blt_fill(struct sna *sna,
if (op->dst.x|op->dst.y) {
op->box = blt_composite_fill_box;
op->boxes = blt_composite_fill_boxes;
+ op->thread_boxes = blt_composite_fill_boxes__thread;
} else {
op->box = blt_composite_fill_box_no_offset;
op->boxes = blt_composite_fill_boxes_no_offset;
+ op->thread_boxes = blt_composite_fill_boxes_no_offset__thread;
}
op->done = nop_done;
@@ -1148,6 +1313,141 @@ static void blt_composite_copy_boxes(struct sna *sna,
} while(--nbox);
}
+static inline uint32_t add2(uint32_t v, int16_t x, int16_t y)
+{
+ x += v & 0xffff;
+ y += v >> 16;
+	return (uint32_t)y << 16 | (uint16_t)x;
+}
+
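add2() is the single-point analogue (y in the high halfword, x in the low), used below to derive the packed source corner from the packed destination corner:

uint32_t p = 20 << 16 | 10;			/* (x=10, y=20) */
assert(add2(p, 5, 7) == (27u << 16 | 15));	/* (x=15, y=27) */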
+static void blt_composite_copy_boxes__thread(struct sna *sna,
+ const struct sna_composite_op *op,
+ const BoxRec *box, int nbox)
+{
+ struct kgem *kgem = &sna->kgem;
+ int dst_dx = op->dst.x;
+ int dst_dy = op->dst.y;
+ int src_dx = op->src.offset[0];
+ int src_dy = op->src.offset[1];
+ uint32_t cmd = op->u.blt.cmd;
+ uint32_t br13 = op->u.blt.br13;
+ struct kgem_bo *src_bo = op->u.blt.bo[0];
+ struct kgem_bo *dst_bo = op->u.blt.bo[1];
+ int src_pitch = op->u.blt.pitch[0];
+
+ DBG(("%s: nbox=%d\n", __FUNCTION__, nbox));
+
+ sna_vertex_lock(&sna->render);
+
+ if ((dst_dx | dst_dy) == 0) {
+ uint64_t hdr = (uint64_t)br13 << 32 | cmd;
+ do {
+ int nbox_this_time;
+
+ nbox_this_time = nbox;
+ if (8*nbox_this_time > kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED)
+ nbox_this_time = (kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED) / 8;
+ if (2*nbox_this_time > KGEM_RELOC_SIZE(kgem) - kgem->nreloc)
+ nbox_this_time = (KGEM_RELOC_SIZE(kgem) - kgem->nreloc)/2;
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ do {
+ uint32_t *b = kgem->batch + kgem->nbatch;
+
+ DBG((" %s: box=(%d, %d)x(%d, %d)\n",
+ __FUNCTION__,
+ box->x1, box->y1,
+ box->x2 - box->x1, box->y2 - box->y1));
+
+ assert(box->x1 + src_dx >= 0);
+ assert(box->y1 + src_dy >= 0);
+ assert(box->x1 + src_dx <= INT16_MAX);
+ assert(box->y1 + src_dy <= INT16_MAX);
+
+ assert(box->x1 >= 0);
+ assert(box->y1 >= 0);
+
+ *(uint64_t *)&b[0] = hdr;
+ *(uint64_t *)&b[2] = *(const uint64_t *)box;
+ b[4] = kgem_add_reloc(kgem, kgem->nbatch + 4, dst_bo,
+ I915_GEM_DOMAIN_RENDER << 16 |
+ I915_GEM_DOMAIN_RENDER |
+ KGEM_RELOC_FENCED,
+ 0);
+ b[5] = add2(b[2], src_dx, src_dy);
+ b[6] = src_pitch;
+ b[7] = kgem_add_reloc(kgem, kgem->nbatch + 7, src_bo,
+ I915_GEM_DOMAIN_RENDER << 16 |
+ KGEM_RELOC_FENCED,
+ 0);
+ kgem->nbatch += 8;
+ assert(kgem->nbatch < kgem->surface);
+ box++;
+ } while (--nbox_this_time);
+
+ if (!nbox)
+ break;
+
+ _kgem_submit(kgem);
+ _kgem_set_mode(kgem, KGEM_BLT);
+ } while (1);
+ } else {
+ do {
+ int nbox_this_time;
+
+ nbox_this_time = nbox;
+ if (8*nbox_this_time > kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED)
+ nbox_this_time = (kgem->surface - kgem->nbatch - KGEM_BATCH_RESERVED) / 8;
+ if (2*nbox_this_time > KGEM_RELOC_SIZE(kgem) - kgem->nreloc)
+ nbox_this_time = (KGEM_RELOC_SIZE(kgem) - kgem->nreloc)/2;
+ assert(nbox_this_time);
+ nbox -= nbox_this_time;
+
+ do {
+ uint32_t *b = kgem->batch + kgem->nbatch;
+
+ DBG((" %s: box=(%d, %d)x(%d, %d)\n",
+ __FUNCTION__,
+ box->x1, box->y1,
+ box->x2 - box->x1, box->y2 - box->y1));
+
+ assert(box->x1 + src_dx >= 0);
+ assert(box->y1 + src_dy >= 0);
+
+ assert(box->x1 + dst_dx >= 0);
+ assert(box->y1 + dst_dy >= 0);
+
+ b[0] = cmd;
+ b[1] = br13;
+ b[2] = ((box->y1 + dst_dy) << 16) | (box->x1 + dst_dx);
+ b[3] = ((box->y2 + dst_dy) << 16) | (box->x2 + dst_dx);
+ b[4] = kgem_add_reloc(kgem, kgem->nbatch + 4, dst_bo,
+ I915_GEM_DOMAIN_RENDER << 16 |
+ I915_GEM_DOMAIN_RENDER |
+ KGEM_RELOC_FENCED,
+ 0);
+ b[5] = ((box->y1 + src_dy) << 16) | (box->x1 + src_dx);
+ b[6] = src_pitch;
+ b[7] = kgem_add_reloc(kgem, kgem->nbatch + 7, src_bo,
+ I915_GEM_DOMAIN_RENDER << 16 |
+ KGEM_RELOC_FENCED,
+ 0);
+ kgem->nbatch += 8;
+ assert(kgem->nbatch < kgem->surface);
+ box++;
+ } while (--nbox_this_time);
+
+ if (!nbox)
+ break;
+
+ _kgem_submit(kgem);
+ _kgem_set_mode(kgem, KGEM_BLT);
+ } while (1);
+ }
+ sna_vertex_unlock(&sna->render);
+}
+
fastcall static void
blt_composite_copy_with_alpha(struct sna *sna,
const struct sna_composite_op *op,
@@ -1245,7 +1545,7 @@ prepare_blt_copy(struct sna *sna,
}
if (!kgem_check_many_bo_fenced(&sna->kgem, op->dst.bo, bo, NULL)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
if (!kgem_check_many_bo_fenced(&sna->kgem,
op->dst.bo, bo, NULL)) {
DBG(("%s: fallback -- no room in aperture\n", __FUNCTION__));
@@ -1256,7 +1556,7 @@ prepare_blt_copy(struct sna *sna,
DBG(("%s\n", __FUNCTION__));
- if (sna->kgem.gen >= 60)
+ if (sna->kgem.gen >= 060 && op->dst.bo == bo)
op->done = gen6_blt_copy_done;
else
op->done = nop_done;
@@ -1274,6 +1574,7 @@ prepare_blt_copy(struct sna *sna,
op->blt = blt_composite_copy;
op->box = blt_composite_copy_box;
op->boxes = blt_composite_copy_boxes;
+ op->thread_boxes = blt_composite_copy_boxes__thread;
if (!sna_blt_copy_init(sna, &op->u.blt, bo, op->dst.bo,
src->drawable.bitsPerPixel,
@@ -1715,8 +2016,14 @@ sna_blt_composite(struct sna *sna,
was_clear = sna_drawable_is_clear(dst->pDrawable);
tmp->dst.pixmap = get_drawable_pixmap(dst->pDrawable);
- dst_box.x1 = dst_x; dst_box.x2 = dst_x + width;
- dst_box.y1 = dst_y; dst_box.y2 = dst_y + height;
+ if (width | height) {
+ dst_box.x1 = dst_x;
+ dst_box.x2 = bound(dst_x, width);
+ dst_box.y1 = dst_y;
+ dst_box.y2 = bound(dst_y, height);
+ } else
+ sna_render_picture_extents(dst, &dst_box);
+
bo = sna_drawable_use_bo(dst->pDrawable, PREFER_GPU, &dst_box, &tmp->damage);
if (bo && !kgem_bo_can_blt(&sna->kgem, bo)) {
DBG(("%s: can not blit to dst, tiling? %d, pitch? %d\n",
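The new width|height guard above clamps the destination box with bound(), which saturates at MAXSHORT rather than letting a 16-bit coordinate wrap. The helper itself (removed from sna_composite.c later in this diff, presumably now shared from a header) reads:

static int16_t bound(int16_t a, uint16_t b)
{
	int v = (int)a + (int)b;
	if (v > MAXSHORT)
		return MAXSHORT;	/* e.g. bound(30000, 40000) == 32767 */
	return v;
}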
@@ -1834,10 +2141,21 @@ clear:
if (x < 0 || y < 0 ||
x + width > src->pDrawable->width ||
y + height > src->pDrawable->height) {
- DBG(("%s: source extends outside (%d, %d), (%d, %d) of valid drawable %dx%d\n",
+ DBG(("%s: source extends outside (%d, %d), (%d, %d) of valid drawable %dx%d, repeat=%d\n",
__FUNCTION__,
- x, y, x+width, y+width, src->pDrawable->width, src->pDrawable->height));
- return false;
+		     x, y, x+width, y+height, src->pDrawable->width, src->pDrawable->height, src->repeatType));
+ if (src->repeat && src->repeatType == RepeatNormal) {
+ x = x % src->pDrawable->width;
+ y = y % src->pDrawable->height;
+ if (x < 0)
+ x += src->pDrawable->width;
+ if (y < 0)
+ y += src->pDrawable->height;
+ if (x + width > src->pDrawable->width ||
+ y + height > src->pDrawable->height)
+ return false;
+ } else
+ return false;
}
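For RepeatNormal sources the origin is folded back into the tile before giving up: C's % truncates toward zero, so a negative remainder needs one corrective addition. Worked example with a 100-pixel-wide source:

int x = -130 % 100;	/* -30: truncated division in C */
if (x < 0)
	x += 100;	/* 70: the equivalent in-tile origin */
/* still only usable if the copy fits one tile: 70 + width <= 100 */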
src_pixmap = get_drawable_pixmap(src->pDrawable);
@@ -1899,6 +2217,7 @@ put:
if (tmp->dst.bo == priv->cpu_bo) {
DBG(("%s: forcing the stall to overwrite a busy CPU bo\n", __FUNCTION__));
tmp->dst.bo = NULL;
+ tmp->damage = NULL;
}
}
}
@@ -1924,7 +2243,7 @@ static void convert_done(struct sna *sna, const struct sna_composite_op *op)
{
struct kgem *kgem = &sna->kgem;
- if (kgem->gen >= 60 && kgem_check_batch(kgem, 3)) {
+ if (kgem->gen >= 060 && op->src.bo == op->dst.bo && kgem_check_batch(kgem, 3)) {
uint32_t *b = kgem->batch + kgem->nbatch;
b[0] = XY_SETUP_CLIP;
b[1] = b[2] = 0;
@@ -1943,6 +2262,7 @@ sna_blt_composite__convert(struct sna *sna,
struct sna_composite_op *tmp)
{
uint32_t alpha_fixup;
+ int sx, sy;
uint8_t op;
#if DEBUG_NO_BLT || NO_BLT_COMPOSITE
@@ -1993,19 +2313,36 @@ sna_blt_composite__convert(struct sna *sna,
return false;
}
- x += tmp->src.offset[0];
- y += tmp->src.offset[1];
+ sx = tmp->src.offset[0];
+ sy = tmp->src.offset[1];
+
+ x += sx;
+ y += sy;
if (x < 0 || y < 0 ||
x + width > tmp->src.width ||
y + height > tmp->src.height) {
DBG(("%s: source extends outside (%d, %d), (%d, %d) of valid drawable %dx%d\n",
__FUNCTION__,
 x, y, x+width, y+height, tmp->src.width, tmp->src.height));
- return false;
+ if (tmp->src.repeat == RepeatNormal) {
+ int xx = x % tmp->src.width;
+ int yy = y % tmp->src.height;
+ if (xx < 0)
+ xx += tmp->src.width;
+ if (yy < 0)
+ yy += tmp->src.height;
+ if (xx + width > tmp->src.width ||
+ yy + height > tmp->src.height)
+ return false;
+
+ sx += xx - x;
+ sy += yy - y;
+ } else
+ return false;
}
if (!kgem_check_many_bo_fenced(&sna->kgem, tmp->dst.bo, tmp->src.bo, NULL)) {
- _kgem_submit(&sna->kgem);
+ kgem_submit(&sna->kgem);
if (!kgem_check_many_bo_fenced(&sna->kgem,
tmp->dst.bo, tmp->src.bo, NULL)) {
DBG(("%s: fallback -- no room in aperture\n", __FUNCTION__));
@@ -2014,13 +2351,14 @@ sna_blt_composite__convert(struct sna *sna,
_kgem_set_mode(&sna->kgem, KGEM_BLT);
}
- tmp->u.blt.src_pixmap = NULL;
- tmp->u.blt.sx = tmp->src.offset[0];
- tmp->u.blt.sy = tmp->src.offset[1];
DBG(("%s: blt dst offset (%d, %d), source offset (%d, %d), with alpha fixup? %x\n",
__FUNCTION__,
tmp->dst.x, tmp->dst.y, tmp->u.blt.sx, tmp->u.blt.sy, alpha_fixup));
+ tmp->u.blt.src_pixmap = NULL;
+ tmp->u.blt.sx = sx;
+ tmp->u.blt.sy = sy;
+
if (alpha_fixup) {
tmp->blt = blt_composite_copy_with_alpha;
tmp->box = blt_composite_copy_box_with_alpha;
@@ -2035,6 +2373,7 @@ sna_blt_composite__convert(struct sna *sna,
tmp->blt = blt_composite_copy;
tmp->box = blt_composite_copy_box;
tmp->boxes = blt_composite_copy_boxes;
+ tmp->thread_boxes = blt_composite_copy_boxes__thread;
if (!sna_blt_copy_init(sna, &tmp->u.blt,
tmp->src.bo, tmp->dst.bo,
@@ -2148,7 +2487,7 @@ bool sna_blt_copy(struct sna *sna, uint8_t alu,
return false;
op->blt = sna_blt_copy_op_blt;
- if (sna->kgem.gen >= 60)
+ if (sna->kgem.gen >= 060 && src == dst)
op->done = gen6_blt_copy_op_done;
else
op->done = sna_blt_copy_op_done;
@@ -2174,7 +2513,7 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
cmd = XY_COLOR_BLT;
br13 = bo->pitch;
- if (kgem->gen >= 40 && bo->tiling) {
+ if (kgem->gen >= 040 && bo->tiling) {
cmd |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -2194,7 +2533,7 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
if (overwrites && kgem->nbatch >= 6 &&
kgem->batch[kgem->nbatch-6] == cmd &&
*(uint64_t *)&kgem->batch[kgem->nbatch-4] == *(const uint64_t *)box &&
- kgem->reloc[kgem->nreloc-1].target_handle == bo->handle) {
+ kgem->reloc[kgem->nreloc-1].target_handle == bo->target_handle) {
DBG(("%s: replacing last fill\n", __FUNCTION__));
kgem->batch[kgem->nbatch-5] = br13;
kgem->batch[kgem->nbatch-1] = color;
@@ -2203,7 +2542,7 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
if (overwrites && kgem->nbatch >= 8 &&
(kgem->batch[kgem->nbatch-8] & 0xffc0000f) == XY_SRC_COPY_BLT_CMD &&
*(uint64_t *)&kgem->batch[kgem->nbatch-6] == *(const uint64_t *)box &&
- kgem->reloc[kgem->nreloc-2].target_handle == bo->handle) {
+ kgem->reloc[kgem->nreloc-2].target_handle == bo->target_handle) {
DBG(("%s: replacing last copy\n", __FUNCTION__));
kgem->batch[kgem->nbatch-8] = cmd;
kgem->batch[kgem->nbatch-7] = br13;
@@ -2226,11 +2565,11 @@ static bool sna_blt_fill_box(struct sna *sna, uint8_t alu,
return false;
}
- kgem_set_mode(kgem, KGEM_BLT);
+ kgem_set_mode(kgem, KGEM_BLT, bo);
if (!kgem_check_batch(kgem, 6) ||
!kgem_check_reloc(kgem, 1) ||
!kgem_check_bo_fenced(kgem, bo)) {
- _kgem_submit(kgem);
+ kgem_submit(kgem);
assert(kgem_check_bo_fenced(&sna->kgem, bo));
_kgem_set_mode(kgem, KGEM_BLT);
}
@@ -2288,7 +2627,7 @@ bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
br13 = bo->pitch;
cmd = XY_SCANLINE_BLT;
- if (kgem->gen >= 40 && bo->tiling) {
+ if (kgem->gen >= 040 && bo->tiling) {
cmd |= 1 << 11;
br13 >>= 2;
}
@@ -2302,11 +2641,12 @@ bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
case 8: break;
}
- kgem_set_mode(kgem, KGEM_BLT);
+ kgem_set_mode(kgem, KGEM_BLT, bo);
if (!kgem_check_batch(kgem, 12) ||
!kgem_check_bo_fenced(kgem, bo)) {
- _kgem_submit(kgem);
- assert(kgem_check_bo_fenced(&sna->kgem, bo));
+ kgem_submit(kgem);
+ if (!kgem_check_bo_fenced(&sna->kgem, bo))
+ return false;
_kgem_set_mode(kgem, KGEM_BLT);
}
@@ -2405,13 +2745,6 @@ bool sna_blt_fill_boxes(struct sna *sna, uint8_t alu,
return true;
}
-static inline uint32_t add2(uint32_t v, int16_t x, int16_t y)
-{
- x += v & 0xffff;
- y += v >> 16;
- return (uint16_t)y << 16 | x;
-}
-
bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
@@ -2442,14 +2775,14 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
cmd |= BLT_WRITE_ALPHA | BLT_WRITE_RGB;
src_pitch = src_bo->pitch;
- if (kgem->gen >= 40 && src_bo->tiling) {
+ if (kgem->gen >= 040 && src_bo->tiling) {
cmd |= BLT_SRC_TILED;
src_pitch >>= 2;
}
assert(src_pitch <= MAXSHORT);
br13 = dst_bo->pitch;
- if (kgem->gen >= 40 && dst_bo->tiling) {
+ if (kgem->gen >= 040 && dst_bo->tiling) {
cmd |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -2466,7 +2799,7 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
/* Compare first box against a previous fill */
if (kgem->nbatch >= 6 &&
(alu == GXcopy || alu == GXclear || alu == GXset) &&
- kgem->reloc[kgem->nreloc-1].target_handle == dst_bo->handle &&
+ kgem->reloc[kgem->nreloc-1].target_handle == dst_bo->target_handle &&
kgem->batch[kgem->nbatch-6] == ((cmd & ~XY_SRC_COPY_BLT_CMD) | XY_COLOR_BLT) &&
kgem->batch[kgem->nbatch-4] == ((uint32_t)(box->y1 + dst_dy) << 16 | (uint16_t)(box->x1 + dst_dx)) &&
kgem->batch[kgem->nbatch-3] == ((uint32_t)(box->y2 + dst_dy) << 16 | (uint16_t)(box->x2 + dst_dx))) {
@@ -2475,11 +2808,11 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
kgem->nreloc--;
}
- kgem_set_mode(kgem, KGEM_BLT);
+ kgem_set_mode(kgem, KGEM_BLT, dst_bo);
if (!kgem_check_batch(kgem, 8) ||
!kgem_check_reloc(kgem, 2) ||
!kgem_check_many_bo_fenced(kgem, dst_bo, src_bo, NULL)) {
- _kgem_submit(kgem);
+ kgem_submit(kgem);
if (!kgem_check_many_bo_fenced(kgem, dst_bo, src_bo, NULL))
return sna_tiling_blt_copy_boxes(sna, alu,
src_bo, src_dx, src_dy,
@@ -2595,7 +2928,7 @@ bool sna_blt_copy_boxes(struct sna *sna, uint8_t alu,
} while (1);
}
- if (kgem->gen >= 60 && kgem_check_batch(kgem, 3)) {
+ if (kgem->gen >= 060 && kgem_check_batch(kgem, 3)) {
uint32_t *b = kgem->batch + kgem->nbatch;
b[0] = XY_SETUP_CLIP;
b[1] = b[2] = 0;
diff --git a/src/sna/sna_composite.c b/src/sna/sna_composite.c
index 60d39cdea..a4b85fe58 100644
--- a/src/sna/sna_composite.c
+++ b/src/sna/sna_composite.c
@@ -399,8 +399,8 @@ static void _assert_pixmap_contains_box(PixmapPtr pixmap, BoxPtr box, const char
static void apply_damage(struct sna_composite_op *op, RegionPtr region)
{
- DBG(("%s: damage=%p, region=%d [(%d, %d), (%d, %d) + (%d, %d)]\n",
- __FUNCTION__, op->damage, REGION_NUM_RECTS(region),
+ DBG(("%s: damage=%p, region=%ld [(%d, %d), (%d, %d) + (%d, %d)]\n",
+	     __FUNCTION__, op->damage, (long)RegionNumRects(region),
region->extents.x1, region->extents.y1,
region->extents.x2, region->extents.y2,
op->dst.x, op->dst.y));
@@ -438,6 +438,92 @@ static inline bool use_cpu(PixmapPtr pixmap, struct sna_pixmap *priv,
return (priv->create & KGEM_CAN_CREATE_GPU) == 0;
}
+static void validate_source(PicturePtr picture)
+{
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,10,99,901,0)
+ miCompositeSourceValidate(picture);
+#else
+ miCompositeSourceValidate(picture,
+ 0, 0,
+ picture->pDrawable ? picture->pDrawable->width : 0,
+ picture->pDrawable ? picture->pDrawable->height : 0);
+#endif
+}
+
+void
+sna_composite_fb(CARD8 op,
+ PicturePtr src,
+ PicturePtr mask,
+ PicturePtr dst,
+ RegionPtr region,
+ INT16 src_x, INT16 src_y,
+ INT16 mask_x, INT16 mask_y,
+ INT16 dst_x, INT16 dst_y,
+ CARD16 width, CARD16 height)
+{
+ pixman_image_t *src_image, *mask_image, *dest_image;
+ int src_xoff, src_yoff;
+ int msk_xoff, msk_yoff;
+ int dst_xoff, dst_yoff;
+ unsigned flags;
+
+ DBG(("%s: fallback -- move dst to cpu\n", __FUNCTION__));
+ if (op <= PictOpSrc && !dst->alphaMap)
+ flags = MOVE_WRITE | MOVE_INPLACE_HINT;
+ else
+ flags = MOVE_WRITE | MOVE_READ;
+ if (!sna_drawable_move_region_to_cpu(dst->pDrawable, region, flags))
+ return;
+ if (dst->alphaMap &&
+ !sna_drawable_move_to_cpu(dst->alphaMap->pDrawable, flags))
+ return;
+
+ if (src->pDrawable) {
+ DBG(("%s: fallback -- move src to cpu\n", __FUNCTION__));
+ if (!sna_drawable_move_to_cpu(src->pDrawable,
+ MOVE_READ))
+ return;
+
+ if (src->alphaMap &&
+ !sna_drawable_move_to_cpu(src->alphaMap->pDrawable,
+ MOVE_READ))
+ return;
+ }
+
+ if (mask && mask->pDrawable) {
+ DBG(("%s: fallback -- move mask to cpu\n", __FUNCTION__));
+ if (!sna_drawable_move_to_cpu(mask->pDrawable,
+ MOVE_READ))
+ return;
+
+ if (mask->alphaMap &&
+ !sna_drawable_move_to_cpu(mask->alphaMap->pDrawable,
+ MOVE_READ))
+ return;
+ }
+
+ DBG(("%s: fallback -- fbComposite\n", __FUNCTION__));
+
+ validate_source(src);
+ if (mask)
+ validate_source(mask);
+
+ src_image = image_from_pict(src, FALSE, &src_xoff, &src_yoff);
+ mask_image = image_from_pict(mask, FALSE, &msk_xoff, &msk_yoff);
+ dest_image = image_from_pict(dst, TRUE, &dst_xoff, &dst_yoff);
+
+ if (src_image && dest_image && !(mask && !mask_image))
+ sna_image_composite(op, src_image, mask_image, dest_image,
+ src_x + src_xoff, src_y + src_yoff,
+ mask_x + msk_xoff, mask_y + msk_yoff,
+ dst_x + dst_xoff, dst_y + dst_yoff,
+ width, height);
+
+ free_pixman_pict(src, src_image);
+ free_pixman_pict(mask, mask_image);
+ free_pixman_pict(dst, dest_image);
+}
+
void
sna_composite(CARD8 op,
PicturePtr src,
@@ -510,7 +596,7 @@ sna_composite(CARD8 op,
}
if (use_cpu(pixmap, priv, op, width, height) &&
- !picture_is_gpu(src) && !picture_is_gpu(mask)) {
+ !picture_is_gpu(sna, src) && !picture_is_gpu(sna, mask)) {
DBG(("%s: fallback, dst pixmap=%ld is too small (or completely damaged)\n",
__FUNCTION__, pixmap->drawable.serialNumber));
goto fallback;
@@ -561,8 +647,8 @@ sna_composite(CARD8 op,
tmp.box(sna, &tmp, &region.extents);
else
tmp.boxes(sna, &tmp,
- REGION_BOXPTR(&region),
- REGION_NUM_RECTS(&region));
+ RegionBoxptr(&region),
+ RegionNumRects(&region));
apply_damage(&tmp, &region);
tmp.done(sna, &tmp);
@@ -610,23 +696,15 @@ fallback:
}
DBG(("%s: fallback -- fbComposite\n", __FUNCTION__));
- fbComposite(op, src, mask, dst,
- src_x, src_y,
- mask_x, mask_y,
- dst_x, dst_y,
- width, height);
+ sna_composite_fb(op, src, mask, dst, &region,
+ src_x, src_y,
+ mask_x, mask_y,
+ dst_x, dst_y,
+ width, height);
out:
REGION_UNINIT(NULL, &region);
}
-static int16_t bound(int16_t a, uint16_t b)
-{
- int v = (int)a + (int)b;
- if (v > MAXSHORT)
- return MAXSHORT;
- return v;
-}
-
static bool
_pixman_region_init_clipped_rectangles(pixman_region16_t *region,
unsigned int num_rects,
@@ -778,11 +856,11 @@ sna_composite_rectangles(CARD8 op,
return;
}
- DBG(("%s: drawable extents (%d, %d),(%d, %d) x %d\n",
+ DBG(("%s: drawable extents (%d, %d),(%d, %d) x %ld\n",
__FUNCTION__,
RegionExtents(&region)->x1, RegionExtents(&region)->y1,
RegionExtents(&region)->x2, RegionExtents(&region)->y2,
- RegionNumRects(&region)));
+ (long)RegionNumRects(&region)));
if (dst->pCompositeClip->data &&
(!pixman_region_intersect(&region, &region, dst->pCompositeClip) ||
@@ -793,11 +871,11 @@ sna_composite_rectangles(CARD8 op,
return;
}
- DBG(("%s: clipped extents (%d, %d),(%d, %d) x %d\n",
+ DBG(("%s: clipped extents (%d, %d),(%d, %d) x %ld\n",
__FUNCTION__,
RegionExtents(&region)->x1, RegionExtents(&region)->y1,
RegionExtents(&region)->x2, RegionExtents(&region)->y2,
- RegionNumRects(&region)));
+ (long)RegionNumRects(&region)));
pixmap = get_drawable_pixmap(dst->pDrawable);
get_drawable_deltas(dst->pDrawable, pixmap, &dst_x, &dst_y);
@@ -827,6 +905,10 @@ sna_composite_rectangles(CARD8 op,
goto fallback;
}
+	/* XXX xserver-1.8: CompositeRects is not tracked by Damage, so we
+	 * must append the damaged regions ourselves.
+	 */
+ DamageRegionAppend(&pixmap->drawable, &region);
boxes = pixman_region_rectangles(&region, &num_boxes);
/* If we going to be overwriting any CPU damage with a subsequent
@@ -849,10 +931,10 @@ sna_composite_rectangles(CARD8 op,
box_inplace(pixmap, &region.extents)) {
DBG(("%s: promoting to full GPU\n", __FUNCTION__));
if (priv->gpu_bo && priv->cpu_damage == NULL) {
+ assert(priv->gpu_bo->proxy == NULL);
sna_damage_all(&priv->gpu_damage,
pixmap->drawable.width,
pixmap->drawable.height);
- priv->undamaged = false;
}
}
if (priv->cpu_damage == NULL) {
@@ -890,7 +972,6 @@ sna_composite_rectangles(CARD8 op,
pixmap->drawable.height);
sna_damage_destroy(damage == &priv->gpu_damage ?
&priv->cpu_damage : &priv->gpu_damage);
- priv->undamaged = false;
}
if (op <= PictOpSrc && bo == priv->gpu_bo) {
@@ -927,9 +1008,11 @@ fallback:
!sna_drawable_move_to_cpu(dst->alphaMap->pDrawable, error))
goto done;
+ assert(pixmap->devPrivate.ptr);
+
if (op <= PictOpSrc) {
- int nbox = REGION_NUM_RECTS(&region);
- BoxPtr box = REGION_RECTS(&region);
+ int nbox = RegionNumRects(&region);
+ BoxPtr box = RegionRects(&region);
uint32_t pixel;
if (op == PictOpClear)
@@ -984,10 +1067,6 @@ fallback_composite:
}
done:
- /* XXX xserver-1.8: CompositeRects is not tracked by Damage, so we must
- * manually append the damaged regions ourselves.
- */
- DamageRegionAppend(&pixmap->drawable, &region);
DamageRegionProcessPending(&pixmap->drawable);
pixman_region_fini(&region);
diff --git a/src/sna/sna_damage.c b/src/sna/sna_damage.c
index a870cbf5c..ab693af3a 100644
--- a/src/sna/sna_damage.c
+++ b/src/sna/sna_damage.c
@@ -507,6 +507,7 @@ static void __sna_damage_reduce(struct sna_damage *damage)
} else {
pixman_region16_t tmp;
+ assert(n == nboxes);
pixman_region_init_rects(&tmp, boxes, nboxes);
pixman_region_subtract(region, region, &tmp);
pixman_region_fini(&tmp);
@@ -1033,7 +1034,7 @@ static struct sna_damage *__sna_damage_subtract(struct sna_damage *damage,
if (damage == NULL)
return NULL;
- if (!RegionNotEmpty(&damage->region)) {
+ if (RegionNil(&damage->region)) {
no_damage:
__sna_damage_destroy(damage);
return NULL;
@@ -1126,7 +1127,7 @@ inline static struct sna_damage *__sna_damage_subtract_box(struct sna_damage *da
if (damage == NULL)
return NULL;
- if (!RegionNotEmpty(&damage->region)) {
+ if (RegionNil(&damage->region)) {
__sna_damage_destroy(damage);
return NULL;
}
@@ -1198,7 +1199,7 @@ static struct sna_damage *__sna_damage_subtract_boxes(struct sna_damage *damage,
if (damage == NULL)
return NULL;
- if (!RegionNotEmpty(&damage->region)) {
+ if (RegionNil(&damage->region)) {
__sna_damage_destroy(damage);
return NULL;
}
diff --git a/src/sna/sna_damage.h b/src/sna/sna_damage.h
index 5e800b7dc..03a54a3d0 100644
--- a/src/sna/sna_damage.h
+++ b/src/sna/sna_damage.h
@@ -2,7 +2,6 @@
#define SNA_DAMAGE_H
#include <regionstr.h>
-#include <list.h>
#include "compiler.h"
@@ -259,20 +258,22 @@ static inline void sna_damage_reduce_all(struct sna_damage **_damage,
if (damage == NULL || DAMAGE_IS_ALL(damage))
return;
- if (damage->mode == DAMAGE_ADD &&
- damage->extents.x1 <= 0 &&
- damage->extents.y1 <= 0 &&
- damage->extents.x2 >= width &&
- damage->extents.y2 >= height) {
- if (damage->dirty) {
- damage = *_damage = _sna_damage_reduce(damage);
- if (damage == NULL)
- return;
+ if (damage->mode == DAMAGE_ADD) {
+ if (damage->extents.x1 <= 0 &&
+ damage->extents.y1 <= 0 &&
+ damage->extents.x2 >= width &&
+ damage->extents.y2 >= height) {
+ if (damage->dirty) {
+ damage = *_damage = _sna_damage_reduce(damage);
+ if (damage == NULL)
+ return;
+ }
+
+ if (damage->region.data == NULL)
+ *_damage = _sna_damage_all(damage, width, height);
}
-
- if (damage->region.data == NULL)
- *_damage = _sna_damage_all(damage, width, height);
- }
+ } else
+ *_damage = _sna_damage_reduce(damage);
}
void __sna_damage_destroy(struct sna_damage *damage);
diff --git a/src/sna/sna_display.c b/src/sna/sna_display.c
index ed3237558..558d70626 100644
--- a/src/sna/sna_display.c
+++ b/src/sna/sna_display.c
@@ -113,7 +113,7 @@ static inline uint32_t fb_id(struct kgem_bo *bo)
return bo->delta;
}
-int sna_crtc_id(xf86CrtcPtr crtc)
+uint32_t sna_crtc_id(xf86CrtcPtr crtc)
{
return to_sna_crtc(crtc)->id;
}
@@ -123,7 +123,7 @@ int sna_crtc_to_pipe(xf86CrtcPtr crtc)
return to_sna_crtc(crtc)->pipe;
}
-int sna_crtc_to_plane(xf86CrtcPtr crtc)
+uint32_t sna_crtc_to_plane(xf86CrtcPtr crtc)
{
return to_sna_crtc(crtc)->plane;
}
@@ -164,6 +164,7 @@ static unsigned get_fb(struct sna *sna, struct kgem_bo *bo,
scrn->depth, scrn->bitsPerPixel, bo->pitch, errno);
return 0;
}
+ assert(arg.fb_id != 0);
bo->scanout = true;
return bo->delta = arg.fb_id;
@@ -197,13 +198,15 @@ sna_output_backlight_set(xf86OutputPtr output, int level)
char path[1024], val[BACKLIGHT_VALUE_LEN];
int fd, len, ret;
- DBG(("%s: level=%d\n", __FUNCTION__, level));
+ DBG(("%s: level=%d, max=%d\n", __FUNCTION__,
+ level, sna_output->backlight_max));
- if (level > sna_output->backlight_max)
- level = sna_output->backlight_max;
- if (!sna_output->backlight_iface || level < 0)
+ if (!sna_output->backlight_iface)
return;
+ if ((unsigned)level > sna_output->backlight_max)
+ level = sna_output->backlight_max;
+
len = snprintf(val, BACKLIGHT_VALUE_LEN, "%d\n", level);
sprintf(path, "%s/%s/brightness",
BACKLIGHT_CLASS, sna_output->backlight_iface);
@@ -886,6 +889,9 @@ void sna_copy_fbcon(struct sna *sna)
DBG(("%s\n", __FUNCTION__));
+ priv = sna_pixmap(sna->front);
+ assert(priv && priv->gpu_bo);
+
/* Scan the connectors for a framebuffer and assume that is the fbcon */
VG_CLEAR(fbcon);
fbcon.fb_id = 0;
@@ -912,6 +918,11 @@ void sna_copy_fbcon(struct sna *sna)
return;
}
+ if (fbcon.fb_id == priv->gpu_bo->delta) {
+ DBG(("%s: fb already installed as scanout\n", __FUNCTION__));
+ return;
+ }
+
/* Wrap the fbcon in a pixmap so that we select the right formats
* in the render copy in case we need to preserve the fbcon
* across a depth change upon starting X.
@@ -933,9 +944,6 @@ void sna_copy_fbcon(struct sna *sna)
DBG(("%s: fbcon handle=%d\n", __FUNCTION__, bo->handle));
- priv = sna_pixmap(sna->front);
- assert(priv && priv->gpu_bo);
-
sx = dx = 0;
if (box.x2 < (uint16_t)fbcon.width)
sx = (fbcon.width - box.x2) / 2;
@@ -957,7 +965,9 @@ void sna_copy_fbcon(struct sna *sna)
kgem_bo_destroy(&sna->kgem, bo);
+#if ABI_VIDEODRV_VERSION >= SET_ABI_VERSION(10, 0)
sna->scrn->pScreen->canDoBGNoneRoot = ok;
+#endif
cleanup_scratch:
FreeScratchPixmapHeader(scratch);
@@ -1223,6 +1233,9 @@ sna_crtc_set_mode_major(xf86CrtcPtr crtc, DisplayModePtr mode,
struct drm_mode_modeinfo saved_kmode;
bool saved_transform;
+ if (mode->HDisplay == 0 || mode->VDisplay == 0)
+ return FALSE;
+
xf86DrvMsg(crtc->scrn->scrnIndex, X_INFO,
"switch to mode %dx%d on crtc %d (pipe %d)\n",
mode->HDisplay, mode->VDisplay,
@@ -1251,6 +1264,8 @@ retry: /* Attach per-crtc pixmap or direct */
if (bo == NULL)
return FALSE;
+ kgem_bo_submit(&sna->kgem, bo);
+
sna_crtc->bo = bo;
mode_to_kmode(&sna_crtc->kmode, mode);
if (!sna_crtc_apply(crtc)) {
@@ -1307,9 +1322,12 @@ sna_crtc_dpms(xf86CrtcPtr crtc, int mode)
void sna_mode_adjust_frame(struct sna *sna, int x, int y)
{
xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(sna->scrn);
- xf86OutputPtr output = config->output[config->compat_output];
- xf86CrtcPtr crtc = output->crtc;
+ xf86CrtcPtr crtc;
+ if ((unsigned)config->compat_output >= config->num_output)
+ return;
+
+ crtc = config->output[config->compat_output]->crtc;
if (crtc && crtc->enabled) {
int saved_x = crtc->x;
int saved_y = crtc->y;
@@ -1456,9 +1474,10 @@ static const xf86CrtcFuncsRec sna_crtc_funcs = {
#endif
};
-static uint32_t
+static int
sna_crtc_find_plane(struct sna *sna, int pipe)
{
+#ifdef DRM_IOCTL_MODE_GETPLANERESOURCES
struct drm_mode_get_plane_res r;
uint32_t *planes, id = 0;
int i;
@@ -1494,7 +1513,11 @@ sna_crtc_find_plane(struct sna *sna, int pipe)
}
free(planes);
+ assert(id);
return id;
+#else
+ return 0;
+#endif
}
static void
@@ -1691,41 +1714,40 @@ static DisplayModePtr
sna_output_panel_edid(xf86OutputPtr output, DisplayModePtr modes)
{
xf86MonPtr mon = output->MonInfo;
+ DisplayModePtr i, m, preferred = NULL;
+ int max_x = 0, max_y = 0;
+ float max_vrefresh = 0.0;
- if (!mon || !GTF_SUPPORTED(mon->features.msc)) {
- DisplayModePtr i, m, p = NULL;
- int max_x = 0, max_y = 0;
- float max_vrefresh = 0.0;
-
- for (m = modes; m; m = m->next) {
- if (m->type & M_T_PREFERRED)
- p = m;
- max_x = max(max_x, m->HDisplay);
- max_y = max(max_y, m->VDisplay);
- max_vrefresh = max(max_vrefresh, xf86ModeVRefresh(m));
- }
+ if (mon && GTF_SUPPORTED(mon->features.msc))
+ return modes;
- max_vrefresh = max(max_vrefresh, 60.0);
- max_vrefresh *= (1 + SYNC_TOLERANCE);
-
- m = xf86GetDefaultModes();
- xf86ValidateModesSize(output->scrn, m, max_x, max_y, 0);
+ for (m = modes; m; m = m->next) {
+ if (m->type & M_T_PREFERRED)
+ preferred = m;
+ max_x = max(max_x, m->HDisplay);
+ max_y = max(max_y, m->VDisplay);
+ max_vrefresh = max(max_vrefresh, xf86ModeVRefresh(m));
+ }
- for (i = m; i; i = i->next) {
- if (xf86ModeVRefresh(i) > max_vrefresh)
- i->status = MODE_VSYNC;
- if (p && i->HDisplay >= p->HDisplay &&
- i->VDisplay >= p->VDisplay &&
- xf86ModeVRefresh(i) >= xf86ModeVRefresh(p))
- i->status = MODE_VSYNC;
- }
+ max_vrefresh = max(max_vrefresh, 60.0);
+ max_vrefresh *= (1 + SYNC_TOLERANCE);
- xf86PruneInvalidModes(output->scrn, &m, FALSE);
+ m = xf86GetDefaultModes();
+ xf86ValidateModesSize(output->scrn, m, max_x, max_y, 0);
- modes = xf86ModesAdd(modes, m);
+ for (i = m; i; i = i->next) {
+ if (xf86ModeVRefresh(i) > max_vrefresh)
+ i->status = MODE_VSYNC;
+ if (preferred &&
+ i->HDisplay >= preferred->HDisplay &&
+ i->VDisplay >= preferred->VDisplay &&
+ xf86ModeVRefresh(i) >= xf86ModeVRefresh(preferred))
+ i->status = MODE_PANEL;
}
- return modes;
+ xf86PruneInvalidModes(output->scrn, &m, FALSE);
+
+ return xf86ModesAdd(modes, m);
}
static DisplayModePtr
@@ -1759,6 +1781,7 @@ sna_output_get_modes(xf86OutputPtr output)
*/
sna_output->has_panel_limits = false;
if (is_panel(koutput->connector_type)) {
+ sna_output->panel_hdisplay = sna_output->panel_vdisplay = 0;
for (i = 0; i < koutput->count_modes; i++) {
drmModeModeInfo *mode_ptr;
@@ -1768,7 +1791,6 @@ sna_output_get_modes(xf86OutputPtr output)
if (mode_ptr->vdisplay > sna_output->panel_vdisplay)
sna_output->panel_vdisplay = mode_ptr->vdisplay;
}
-
sna_output->has_panel_limits =
sna_output->panel_hdisplay &&
sna_output->panel_vdisplay;
@@ -2459,13 +2481,13 @@ sna_crtc_resize(ScrnInfoPtr scrn, int width, int height)
sna_crtc_disable(crtc);
}
- if (screen->root) {
+ if (root(screen)) {
struct sna_visit_set_pixmap_window visit;
visit.old = old_front;
visit.new = sna->front;
- TraverseTree(screen->root, sna_visit_set_window_pixmap, &visit);
- assert(screen->GetWindowPixmap(screen->root) == sna->front);
+ TraverseTree(root(screen), sna_visit_set_window_pixmap, &visit);
+ assert(screen->GetWindowPixmap(root(screen)) == sna->front);
}
screen->SetScreenPixmap(sna->front);
assert(screen->GetScreenPixmap(screen) == sna->front);
@@ -2522,6 +2544,12 @@ static int do_page_flip(struct sna *sna, struct kgem_bo *bo,
DBG(("%s: flip [fb=%d] on crtc %d [%d] failed - %d\n",
__FUNCTION__, arg.fb_id, i, crtc->id, errno));
disable:
+ if (count == 0)
+ return 0;
+
+ xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+ "%s: page flipping failed, disabling CRTC:%d (pipe=%d)\n",
+ __FUNCTION__, crtc->id, crtc->pipe);
sna_crtc_disable(config->crtc[i]);
continue;
}
@@ -2614,6 +2642,31 @@ bool sna_mode_pre_init(ScrnInfoPtr scrn, struct sna *sna)
return true;
}
+static Bool sna_mode_has_pending_events(struct sna *sna)
+{
+ struct pollfd pfd;
+ pfd.fd = sna->kgem.fd;
+ pfd.events = POLLIN;
+ return poll(&pfd, 1, 0) == 1;
+}
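+/*
+ * Editor's note (illustrative, not part of the patch): with a zero
+ * timeout, poll() is a pure readiness test -- it returns 1 immediately
+ * iff a drm event is already queued on the fd and never blocks, e.g.
+ *
+ *	struct pollfd pfd = { .fd = drm_fd, .events = POLLIN };
+ *	if (poll(&pfd, 1, 0) == 1)
+ *		drain_events(drm_fd);	<- hypothetical helper
+ *
+ * which is what lets sna_mode_close() drain events safely even on
+ * kernels that ignore O_NONBLOCK on the drm fd.
+ */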
+
+void
+sna_mode_close(struct sna *sna)
+{
+ xf86CrtcConfigPtr xf86_config = XF86_CRTC_CONFIG_PTR(sna->scrn);
+ int i;
+
+ /* In order to workaround a kernel bug in not honouring O_NONBLOCK,
+ * check that the fd is readable before attempting to read the next
+ * event from drm.
+ */
+ if (sna_mode_has_pending_events(sna))
+ sna_mode_wakeup(sna);
+
+ for (i = 0; i < xf86_config->num_crtc; i++)
+ sna_crtc_disable_shadow(sna, to_sna_crtc(xf86_config->crtc[i]));
+}
+
void
sna_mode_fini(struct sna *sna)
{
@@ -2689,6 +2742,11 @@ sna_covering_crtc(ScrnInfoPtr scrn,
__FUNCTION__, c,
crtc->bounds.x1, crtc->bounds.y1,
crtc->bounds.x2, crtc->bounds.y2));
+ if (*(const uint64_t *)box == *(uint64_t *)&crtc->bounds) {
+ DBG(("%s: box exactly matches crtc [%d]\n",
+ __FUNCTION__, c));
+ return crtc;
+ }
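+		/* Editor's note: a BoxRec is four INT16s (8 bytes), so the
+		 * single uint64_t comparison above tests all four
+		 * coordinates for an exact match at once.
+		 */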
if (!sna_box_intersect(&cover_box, &crtc->bounds, box))
continue;
@@ -2717,41 +2775,123 @@ sna_covering_crtc(ScrnInfoPtr scrn,
return best_crtc;
}
-/* Gen6 wait for scan line support */
#define MI_LOAD_REGISTER_IMM (0x22<<23)
-/* gen6: Scan lines register */
-#define GEN6_PIPEA_SLC (0x70004)
-#define GEN6_PIPEB_SLC (0x71004)
-
-static void sna_emit_wait_for_scanline_gen6(struct sna *sna,
+static bool sna_emit_wait_for_scanline_gen7(struct sna *sna,
+ xf86CrtcPtr crtc,
int pipe, int y1, int y2,
bool full_height)
{
- uint32_t event;
uint32_t *b;
+ uint32_t event;
- assert (y2 > 0);
+ if (!sna->kgem.has_secure_batches)
+ return false;
- /* We just wait until the trace passes the roi */
- if (pipe == 0) {
- pipe = GEN6_PIPEA_SLC;
- event = MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW;
- } else {
- pipe = GEN6_PIPEB_SLC;
- event = MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
+ assert(y1 >= 0);
+ assert(y2 > y1);
+ assert(sna->kgem.mode);
+
+ /* Always program one less than the desired value */
+ if (--y1 < 0)
+ y1 = crtc->bounds.y2;
+ y2--;
+
+ switch (pipe) {
+ default:
+ assert(0);
+ case 0:
+ event = 1 << (full_height ? 3 : 0);
+ break;
+ case 1:
+ event = 1 << (full_height ? 11 : 8);
+ break;
+ case 2:
+ event = 1 << (full_height ? 21 : 14);
+ break;
+ }
+
+ b = kgem_get_batch(&sna->kgem);
+
+ /* Both the LRI and WAIT_FOR_EVENT must be in the same cacheline */
+	if (((sna->kgem.nbatch + 6) >> 4) != ((sna->kgem.nbatch + 10) >> 4)) {
+ int dw = sna->kgem.nbatch + 6;
+ dw = ALIGN(dw, 16) - dw;
+ while (dw--)
+ *b++ = MI_NOOP;
}
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
- b = kgem_get_batch(&sna->kgem, 4);
b[0] = MI_LOAD_REGISTER_IMM | 1;
- b[1] = pipe;
- b[2] = y2 - 1;
- b[3] = MI_WAIT_FOR_EVENT | event;
- kgem_advance_batch(&sna->kgem, 4);
+ b[1] = 0x44050; /* DERRMR */
+ b[2] = ~event;
+ b[3] = MI_LOAD_REGISTER_IMM | 1;
+ b[4] = 0xa188; /* FORCEWAKE_MT */
+ b[5] = 2 << 16 | 2;
+ b[6] = MI_LOAD_REGISTER_IMM | 1;
+ b[7] = 0x70068 + 0x1000 * pipe;
+ b[8] = (1 << 31) | (1 << 30) | (y1 << 16) | y2;
+ b[9] = MI_WAIT_FOR_EVENT | event;
+ b[10] = MI_LOAD_REGISTER_IMM | 1;
+ b[11] = 0xa188; /* FORCEWAKE_MT */
+ b[12] = 2 << 16;
+ b[13] = MI_LOAD_REGISTER_IMM | 1;
+ b[14] = 0x44050; /* DERRMR */
+ b[15] = ~0;
+
+ sna->kgem.nbatch = b - sna->kgem.batch + 16;
+
+ sna->kgem.batch_flags |= I915_EXEC_SECURE;
+ return true;
+}
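+/*
+ * Worked example for the cacheline fixup above (editor's note): the
+ * batch is u32s, so one 64-byte cacheline holds 16 dwords.  With
+ * nbatch == 8, dwords +6 and +10 of the sequence would sit at offsets
+ * 14 and 18 (14 >> 4 != 18 >> 4), straddling the boundary at 16;
+ * ALIGN(14, 16) - 14 == 2 MI_NOOPs are emitted first so that the LRI
+ * and the WAIT_FOR_EVENT both land inside dwords 16..31.
+ */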
+
+static bool sna_emit_wait_for_scanline_gen6(struct sna *sna,
+ xf86CrtcPtr crtc,
+ int pipe, int y1, int y2,
+ bool full_height)
+{
+ uint32_t *b;
+ uint32_t event;
+
+ if (!sna->kgem.has_secure_batches)
+ return false;
+
+ assert(y1 >= 0);
+ assert(y2 > y1);
+ assert(sna->kgem.mode == KGEM_RENDER);
+
+ /* Always program one less than the desired value */
+ if (--y1 < 0)
+ y1 = crtc->bounds.y2;
+ y2--;
+
+	/* The scanline window has 3-bit granularity, i.e. is specified in steps of 8 lines */
+ y1 &= ~7;
+ y2 &= ~7;
+ if (y2 == y1)
+ return false;
+
+ event = 1 << (3*full_height + pipe*8);
+
+ b = kgem_get_batch(&sna->kgem);
+ sna->kgem.nbatch += 10;
+
+ b[0] = MI_LOAD_REGISTER_IMM | 1;
+ b[1] = 0x44050; /* DERRMR */
+ b[2] = ~event;
+ b[3] = MI_LOAD_REGISTER_IMM | 1;
+ b[4] = 0x4f100; /* magic */
+ b[5] = (1 << 31) | (1 << 30) | pipe << 29 | (y1 << 16) | y2;
+ b[6] = MI_WAIT_FOR_EVENT | event;
+ b[7] = MI_LOAD_REGISTER_IMM | 1;
+ b[8] = 0x44050; /* DERRMR */
+ b[9] = ~0;
+
+ sna->kgem.batch_flags |= I915_EXEC_SECURE;
+ return true;
}
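+/*
+ * Editor's sanity check of the event encoding above: for the scanline
+ * window, 1 << (3*full_height + pipe*8) gives bit 0 (pipe A) or bit 8
+ * (pipe B), and bits 3/11 for the full-height (vblank) case -- the
+ * same bits the gen7 variant selects explicitly in its switch.
+ */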
-static void sna_emit_wait_for_scanline_gen4(struct sna *sna,
+static bool sna_emit_wait_for_scanline_gen4(struct sna *sna,
+ xf86CrtcPtr crtc,
int pipe, int y1, int y2,
bool full_height)
{
@@ -2770,17 +2910,20 @@ static void sna_emit_wait_for_scanline_gen4(struct sna *sna,
event = MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
}
- kgem_set_mode(&sna->kgem, KGEM_BLT);
- b = kgem_get_batch(&sna->kgem, 5);
+ b = kgem_get_batch(&sna->kgem);
+ sna->kgem.nbatch += 5;
+
/* The documentation says that the LOAD_SCAN_LINES command
* always comes in pairs. Don't ask me why. */
b[2] = b[0] = MI_LOAD_SCAN_LINES_INCL | pipe << 20;
b[3] = b[1] = (y1 << 16) | (y2-1);
b[4] = MI_WAIT_FOR_EVENT | event;
- kgem_advance_batch(&sna->kgem, 5);
+
+ return true;
}
-static void sna_emit_wait_for_scanline_gen2(struct sna *sna,
+static bool sna_emit_wait_for_scanline_gen2(struct sna *sna,
+ xf86CrtcPtr crtc,
int pipe, int y1, int y2,
bool full_height)
{
@@ -2794,17 +2937,16 @@ static void sna_emit_wait_for_scanline_gen2(struct sna *sna,
if (full_height)
y2 -= 2;
- kgem_set_mode(&sna->kgem, KGEM_BLT);
- b = kgem_get_batch(&sna->kgem, 5);
+ b = kgem_get_batch(&sna->kgem);
+ sna->kgem.nbatch += 5;
+
/* The documentation says that the LOAD_SCAN_LINES command
* always comes in pairs. Don't ask me why. */
b[2] = b[0] = MI_LOAD_SCAN_LINES_INCL | pipe << 20;
b[3] = b[1] = (y1 << 16) | (y2-1);
- if (pipe == 0)
- b[4] = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_PIPEA_SCAN_LINE_WINDOW;
- else
- b[4] = MI_WAIT_FOR_EVENT | MI_WAIT_FOR_PIPEB_SCAN_LINE_WINDOW;
- kgem_advance_batch(&sna->kgem, 5);
+ b[4] = MI_WAIT_FOR_EVENT | 1 << (1 + 4*pipe);
+
+ return true;
}
bool
@@ -2815,15 +2957,12 @@ sna_wait_for_scanline(struct sna *sna,
{
bool full_height;
int y1, y2, pipe;
+ bool ret;
assert(crtc);
assert(to_sna_crtc(crtc)->bo != NULL);
assert(pixmap == sna->front);
- /* XXX WAIT_EVENT is still causing hangs on SNB */
- if (sna->kgem.gen >= 60)
- return false;
-
/*
* Make sure we don't wait for a scanline that will
* never occur
@@ -2850,14 +2989,20 @@ sna_wait_for_scanline(struct sna *sna,
DBG(("%s: pipe=%d, y1=%d, y2=%d, full_height?=%d\n",
__FUNCTION__, pipe, y1, y2, full_height));
- if (sna->kgem.gen >= 60)
- sna_emit_wait_for_scanline_gen6(sna, pipe, y1, y2, full_height);
- else if (sna->kgem.gen >= 40)
- sna_emit_wait_for_scanline_gen4(sna, pipe, y1, y2, full_height);
+ if (sna->kgem.gen >= 0100)
+ ret = false;
+ else if (sna->kgem.gen == 071)
+		ret = sna_emit_wait_for_scanline_gen6(sna, crtc, pipe, y1, y2, full_height);
+ else if (sna->kgem.gen >= 070)
+ ret = sna_emit_wait_for_scanline_gen7(sna, crtc, pipe, y1, y2, full_height);
+ else if (sna->kgem.gen >= 060)
+		ret = sna_emit_wait_for_scanline_gen6(sna, crtc, pipe, y1, y2, full_height);
+ else if (sna->kgem.gen >= 040)
+ ret = sna_emit_wait_for_scanline_gen4(sna, crtc, pipe, y1, y2, full_height);
else
- sna_emit_wait_for_scanline_gen2(sna, pipe, y1, y2, full_height);
+ ret = sna_emit_wait_for_scanline_gen2(sna, crtc, pipe, y1, y2, full_height);
- return true;
+ return ret;
}
void sna_mode_update(struct sna *sna)
@@ -3028,7 +3173,7 @@ sna_crtc_redisplay__composite(xf86CrtcPtr crtc, RegionPtr region)
0, 0,
0, 0,
0, 0,
- 0, 0,
+ crtc->mode.HDisplay, crtc->mode.VDisplay,
memset(&tmp, 0, sizeof(tmp)))) {
DBG(("%s: unsupported operation!\n", __FUNCTION__));
sna_crtc_redisplay__fallback(crtc, region);
@@ -3118,10 +3263,11 @@ void sna_mode_redisplay(struct sna *sna)
assert(sna->mode.shadow_active);
region = DamageRegion(sna->mode.shadow_damage);
- if (!RegionNotEmpty(region))
+ if (RegionNil(region))
return;
- if (!sna_pixmap_move_to_gpu(sna->front, MOVE_READ)) {
+ if (!can_render(sna) ||
+ !sna_pixmap_move_to_gpu(sna->front, MOVE_READ)) {
if (!sna_pixmap_move_to_cpu(sna->front, MOVE_READ))
return;
@@ -3164,7 +3310,7 @@ void sna_mode_redisplay(struct sna *sna)
RegionIntersect(&damage, &damage, region);
if (RegionNotEmpty(&damage)) {
sna_crtc_redisplay(crtc, &damage);
- __kgem_flush(&sna->kgem, sna_crtc->bo);
+ kgem_bo_flush(&sna->kgem, sna_crtc->bo);
}
RegionUninit(&damage);
}
@@ -3184,6 +3330,7 @@ void sna_mode_redisplay(struct sna *sna)
for (i = 0; i < config->num_crtc; i++) {
struct sna_crtc *crtc = config->crtc[i]->driver_private;
+ struct drm_mode_crtc_page_flip arg;
DBG(("%s: crtc %d [%d, pipe=%d] active? %d\n",
__FUNCTION__, i, crtc->id, crtc->pipe, crtc->bo != NULL));
@@ -3191,41 +3338,36 @@ void sna_mode_redisplay(struct sna *sna)
continue;
assert(config->crtc[i]->enabled);
-
- if (crtc->dpms_mode == DPMSModeOn) {
- struct drm_mode_crtc_page_flip arg;
- arg.crtc_id = crtc->id;
- arg.fb_id = get_fb(sna, new,
- sna->scrn->virtualX,
- sna->scrn->virtualY);
- if (arg.fb_id == 0)
- goto disable;
-
- /* Only the reference crtc will finally deliver its page flip
- * completion event. All other crtc's events will be discarded.
- */
- arg.user_data = 0;
- arg.flags = DRM_MODE_PAGE_FLIP_EVENT;
- arg.reserved = 0;
-
- if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_PAGE_FLIP, &arg)) {
- DBG(("%s: flip [fb=%d] on crtc %d [%d, pipe=%d] failed - %d\n",
- __FUNCTION__, arg.fb_id, i, crtc->id, crtc->pipe, errno));
+ assert(crtc->dpms_mode == DPMSModeOn);
+
+ arg.crtc_id = crtc->id;
+ arg.fb_id = get_fb(sna, new,
+ sna->scrn->virtualX,
+ sna->scrn->virtualY);
+ if (arg.fb_id == 0)
+ goto disable;
+
+ arg.user_data = 0;
+ arg.flags = DRM_MODE_PAGE_FLIP_EVENT;
+ arg.reserved = 0;
+
+ if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_PAGE_FLIP, &arg)) {
+ DBG(("%s: flip [fb=%d] on crtc %d [%d, pipe=%d] failed - %d\n",
+ __FUNCTION__, arg.fb_id, i, crtc->id, crtc->pipe, errno));
disable:
- sna_crtc_disable(config->crtc[i]);
- continue;
- }
- sna->mode.shadow_flip++;
+ xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
+ "%s: page flipping failed, disabling CRTC:%d (pipe=%d)\n",
+ __FUNCTION__, crtc->id, crtc->pipe);
+ sna_crtc_disable(config->crtc[i]);
+ continue;
}
+ sna->mode.shadow_flip++;
kgem_bo_destroy(&sna->kgem, old);
crtc->bo = kgem_bo_reference(new);
}
if (sna->mode.shadow) {
- /* XXX only works if the kernel stalls fwrites to the current
- * scanout whilst the flip is pending
- */
while (sna->mode.shadow_flip)
sna_mode_wakeup(sna);
(void)sna->render.copy_boxes(sna, GXcopy,
@@ -3237,8 +3379,9 @@ disable:
kgem_submit(&sna->kgem);
sna_pixmap(sna->front)->gpu_bo = old;
- sna->mode.shadow = new;
+ sna_dri_pixmap_update_bo(sna, sna->front);
+ sna->mode.shadow = new;
new->flush = old->flush;
}
diff --git a/src/sna/sna_dri.c b/src/sna/sna_dri.c
index 15ac46a3e..f04f1afeb 100644
--- a/src/sna/sna_dri.c
+++ b/src/sna/sna_dri.c
@@ -41,6 +41,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "sna.h"
#include "sna_reg.h"
+#include "intel_options.h"
#include <xf86drm.h>
#include <i915_drm.h>
@@ -51,18 +52,17 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#endif
#if DRI2INFOREC_VERSION < 10
+#undef USE_ASYNC_SWAP
#define USE_ASYNC_SWAP 0
#endif
#define COLOR_PREFER_TILING_Y 0
-#define FLIP_OFF_DELAY 5
enum frame_event_type {
DRI2_SWAP,
DRI2_SWAP_WAIT,
DRI2_SWAP_THROTTLE,
DRI2_XCHG_THROTTLE,
- DRI2_ASYNC_FLIP,
DRI2_FLIP,
DRI2_FLIP_THROTTLE,
DRI2_WAITMSC,
@@ -91,15 +91,17 @@ struct sna_dri_frame_event {
struct dri_bo {
struct kgem_bo *bo;
uint32_t name;
- } old_front, next_front, cache;
+ } scanout[2], cache;
- int off_delay;
+ int mode;
};
struct sna_dri_private {
- int refcnt;
PixmapPtr pixmap;
struct kgem_bo *bo;
+ bool scanout;
+ uint32_t size;
+ int refcnt;
};
static inline struct sna_dri_frame_event *
@@ -144,7 +146,8 @@ static uint32_t color_tiling(struct sna *sna, DrawablePtr draw)
static uint32_t other_tiling(struct sna *sna, DrawablePtr draw)
{
/* XXX Can mix color X / depth Y? */
- return kgem_choose_tiling(&sna->kgem, -I915_TILING_Y,
+ return kgem_choose_tiling(&sna->kgem,
+				  sna->kgem.gen >= 040 ? -I915_TILING_Y : -I915_TILING_X,
draw->width,
draw->height,
draw->bitsPerPixel);
@@ -173,6 +176,7 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
}
assert(priv->cpu_damage == NULL);
+ assert(priv->gpu_bo->proxy == NULL);
if (priv->flush++)
return priv->gpu_bo;
@@ -198,13 +202,38 @@ static struct kgem_bo *sna_pixmap_set_dri(struct sna *sna,
static inline void *sna_pixmap_get_buffer(PixmapPtr pixmap)
{
assert(pixmap->refcnt);
- return ((void **)dixGetPrivateAddr(&pixmap->devPrivates, &sna_pixmap_key))[2];
+ return ((void **)__get_private(pixmap, sna_pixmap_key))[2];
}
static inline void sna_pixmap_set_buffer(PixmapPtr pixmap, void *ptr)
{
assert(pixmap->refcnt);
- ((void **)dixGetPrivateAddr(&pixmap->devPrivates, &sna_pixmap_key))[2] = ptr;
+ ((void **)__get_private(pixmap, sna_pixmap_key))[2] = ptr;
+}
+
+void
+sna_dri_pixmap_update_bo(struct sna *sna, PixmapPtr pixmap)
+{
+ DRI2Buffer2Ptr buffer;
+ struct sna_dri_private *private;
+ struct kgem_bo *bo;
+
+ buffer = sna_pixmap_get_buffer(pixmap);
+ if (buffer == NULL)
+ return;
+
+ private = get_private(buffer);
+ assert(private->pixmap == pixmap);
+
+ bo = sna_pixmap(pixmap)->gpu_bo;
+ if (private->bo == bo)
+ return;
+
+ kgem_bo_destroy(&sna->kgem, private->bo);
+ buffer->name = kgem_bo_flink(&sna->kgem, bo);
+ private->bo = ref(bo);
+
+ /* XXX DRI2InvalidateDrawable(&pixmap->drawable); */
}
static DRI2Buffer2Ptr
@@ -217,12 +246,15 @@ sna_dri_create_buffer(DrawablePtr draw,
struct sna_dri_private *private;
PixmapPtr pixmap;
struct kgem_bo *bo;
+ unsigned flags = CREATE_EXACT;
+ uint32_t size;
int bpp;
DBG(("%s(attachment=%d, format=%d, drawable=%dx%d)\n",
__FUNCTION__, attachment, format, draw->width, draw->height));
pixmap = NULL;
+ size = (uint32_t)draw->height << 16 | draw->width;
switch (attachment) {
case DRI2BufferFrontLeft:
pixmap = get_drawable_pixmap(draw);
@@ -236,7 +268,10 @@ sna_dri_create_buffer(DrawablePtr draw,
assert(private->pixmap == pixmap);
assert(sna_pixmap(pixmap)->gpu_bo == private->bo);
+ assert(sna_pixmap(pixmap)->pinned & PIN_DRI);
assert(kgem_bo_flink(&sna->kgem, private->bo) == buffer->name);
+ assert(8*private->bo->pitch >= pixmap->drawable.width * pixmap->drawable.bitsPerPixel);
+ assert(private->bo->pitch * pixmap->drawable.height <= kgem_bo_size(private->bo));
private->refcnt++;
return buffer;
@@ -252,6 +287,9 @@ sna_dri_create_buffer(DrawablePtr draw,
__FUNCTION__,
pixmap->drawable.width, pixmap->drawable.height,
pixmap, pixmap->refcnt));
+ if (pixmap == sna->front)
+ flags |= CREATE_SCANOUT;
+ size = (uint32_t)pixmap->drawable.height << 16 | pixmap->drawable.width;
break;
case DRI2BufferBackLeft:
@@ -260,12 +298,15 @@ sna_dri_create_buffer(DrawablePtr draw,
case DRI2BufferFakeFrontLeft:
case DRI2BufferFakeFrontRight:
bpp = draw->bitsPerPixel;
+ if (draw->width == sna->front->drawable.width &&
+ draw->height == sna->front->drawable.height)
+ flags |= CREATE_SCANOUT;
bo = kgem_create_2d(&sna->kgem,
draw->width,
draw->height,
draw->bitsPerPixel,
color_tiling(sna, draw),
- CREATE_SCANOUT | CREATE_EXACT);
+ flags);
break;
case DRI2BufferStencil:
@@ -296,7 +337,7 @@ sna_dri_create_buffer(DrawablePtr draw,
bo = kgem_create_2d(&sna->kgem,
ALIGN(draw->width, 64),
ALIGN((draw->height + 1) / 2, 64),
- bpp, I915_TILING_NONE, CREATE_EXACT);
+ bpp, I915_TILING_NONE, flags);
break;
case DRI2BufferDepth:
@@ -307,7 +348,7 @@ sna_dri_create_buffer(DrawablePtr draw,
bo = kgem_create_2d(&sna->kgem,
draw->width, draw->height, bpp,
other_tiling(sna, draw),
- CREATE_EXACT);
+ flags);
break;
default:
@@ -331,6 +372,8 @@ sna_dri_create_buffer(DrawablePtr draw,
private->refcnt = 1;
private->bo = bo;
private->pixmap = pixmap;
+ private->scanout = !!(flags & CREATE_SCANOUT);
+ private->size = size;
if (buffer->name == 0)
goto err;
@@ -415,13 +458,13 @@ damage_all:
pixmap->drawable.width,
pixmap->drawable.height);
sna_damage_destroy(&priv->cpu_damage);
- priv->undamaged = false;
} else {
sna_damage_subtract(&priv->cpu_damage, region);
if (priv->cpu_damage == NULL)
goto damage_all;
sna_damage_add(&priv->gpu_damage, region);
}
+ priv->cpu = false;
}
static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
@@ -430,6 +473,10 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
struct sna_pixmap *priv = sna_pixmap(pixmap);
RegionRec region;
+ assert(pixmap->drawable.width * pixmap->drawable.bitsPerPixel <= 8*bo->pitch);
+ assert(pixmap->drawable.height * bo->pitch <= kgem_bo_size(bo));
+ assert(bo->proxy == NULL);
+
/* Post damage on the new front buffer so that listeners, such
	 * as DisplayLink, know to take a copy and shove it over the USB,
* also for software cursors and the like.
@@ -446,12 +493,17 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
sna_damage_destroy(&priv->cpu_damage);
list_del(&priv->list);
priv->cpu = false;
- priv->undamaged = false;
assert(bo->refcnt);
if (priv->gpu_bo != bo) {
kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
priv->gpu_bo = ref(bo);
+ if (priv->mapped) {
+ assert(!priv->shm && priv->stride);
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
+ pixmap->devKind = priv->stride;
+ priv->mapped = false;
+ }
}
if (bo->domain != DOMAIN_GPU)
bo->domain = DOMAIN_NONE;
@@ -459,17 +511,20 @@ static void set_bo(PixmapPtr pixmap, struct kgem_bo *bo)
DamageRegionProcessPending(&pixmap->drawable);
}
-static void sna_dri_select_mode(struct sna *sna, struct kgem_bo *src, bool sync)
+static void sna_dri_select_mode(struct sna *sna, struct kgem_bo *dst, struct kgem_bo *src, bool sync)
{
struct drm_i915_gem_busy busy;
int mode;
- if (sna->kgem.gen < 60)
+ if (sna->kgem.gen < 060)
return;
if (sync) {
- DBG(("%s: sync, force RENDER ring\n", __FUNCTION__));
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
+ DBG(("%s: sync, force %s ring\n", __FUNCTION__,
+ sna->kgem.gen >= 070 ? "BLT" : "RENDER"));
+ kgem_set_mode(&sna->kgem,
+ sna->kgem.gen >= 070 ? KGEM_BLT : KGEM_RENDER,
+ dst);
return;
}
@@ -478,21 +533,21 @@ static void sna_dri_select_mode(struct sna *sna, struct kgem_bo *src, bool sync)
return;
}
- if (sna->kgem.has_semaphores) {
- DBG(("%s: have sempahores, prefering RENDER\n", __FUNCTION__));
- kgem_set_mode(&sna->kgem, KGEM_RENDER);
- return;
- }
-
VG_CLEAR(busy);
- busy.handle = src->handle;
+ busy.handle = dst->handle;
if (drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy))
return;
- DBG(("%s: src busy?=%x\n", __FUNCTION__, busy.busy));
+ DBG(("%s: dst busy?=%x\n", __FUNCTION__, busy.busy));
if (busy.busy == 0) {
- DBG(("%s: src is idle, using defaults\n", __FUNCTION__));
- return;
+ busy.handle = src->handle;
+ if (drmIoctl(sna->kgem.fd, DRM_IOCTL_I915_GEM_BUSY, &busy))
+ return;
+ DBG(("%s: src busy?=%x\n", __FUNCTION__, busy.busy));
+ if (busy.busy == 0) {
+ DBG(("%s: src/dst is idle, using defaults\n", __FUNCTION__));
+ return;
+ }
}
/* Sandybridge introduced a separate ring which it uses to
@@ -513,6 +568,7 @@ static void sna_dri_select_mode(struct sna *sna, struct kgem_bo *src, bool sync)
mode = KGEM_RENDER;
if (busy.busy & (1 << 17))
mode = KGEM_BLT;
+ kgem_bo_mark_busy(dst, mode);
_kgem_set_mode(&sna->kgem, mode);
}
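+/*
+ * Editor's note: the upper half of busy.busy reports which ring the bo
+ * is active on; the code above reads bit 17 as "busy on the BLT ring".
+ * That bit layout is an inference from this test -- it is a kernel ABI
+ * detail not otherwise stated in the patch.
+ */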
@@ -525,6 +581,9 @@ sna_dri_copy_fallback(struct sna *sna, int bpp,
void *dst = kgem_bo_map__gtt(&sna->kgem, dst_bo);
void *src = kgem_bo_map__gtt(&sna->kgem, src_bo);
+ if (dst == NULL || src == NULL)
+ return;
+
DBG(("%s: src(%d, %d), dst(%d, %d) x %d\n",
__FUNCTION__, sx, sy, dx, dy, n));
@@ -573,7 +632,7 @@ sna_dri_copy_to_front(struct sna *sna, DrawablePtr draw, RegionPtr region,
if (sync)
sync = sna_pixmap_is_scanout(sna, pixmap);
- sna_dri_select_mode(sna, src_bo, sync);
+ sna_dri_select_mode(sna, dst_bo, src_bo, sync);
} else
sync = false;
@@ -628,10 +687,15 @@ sna_dri_copy_to_front(struct sna *sna, DrawablePtr draw, RegionPtr region,
dst_bo, 0, 0,
boxes, n);
} else {
+ unsigned flags;
+
+ flags = COPY_LAST;
+ if (flush)
+ flags |= COPY_SYNC;
sna->render.copy_boxes(sna, GXcopy,
(PixmapPtr)draw, src_bo, -draw->x-dx, -draw->y-dy,
pixmap, dst_bo, 0, 0,
- boxes, n, COPY_LAST);
+ boxes, n, flags);
DBG(("%s: flushing? %d\n", __FUNCTION__, flush));
if (flush) { /* STAT! */
@@ -717,7 +781,7 @@ sna_dri_copy_from_front(struct sna *sna, DrawablePtr draw, RegionPtr region,
dst_bo, -draw->x, -draw->y,
boxes, n);
} else {
- sna_dri_select_mode(sna, src_bo, false);
+ sna_dri_select_mode(sna, dst_bo, src_bo, false);
sna->render.copy_boxes(sna, GXcopy,
pixmap, src_bo, dx, dy,
(PixmapPtr)draw, dst_bo, -draw->x, -draw->y,
@@ -766,7 +830,7 @@ sna_dri_copy(struct sna *sna, DrawablePtr draw, RegionPtr region,
dst_bo, 0, 0,
boxes, n);
} else {
- sna_dri_select_mode(sna, src_bo, false);
+ sna_dri_select_mode(sna, dst_bo, src_bo, false);
sna->render.copy_boxes(sna, GXcopy,
(PixmapPtr)draw, src_bo, 0, 0,
(PixmapPtr)draw, dst_bo, 0, 0,
@@ -777,6 +841,42 @@ sna_dri_copy(struct sna *sna, DrawablePtr draw, RegionPtr region,
pixman_region_fini(&clip);
}
+static bool
+can_blit(struct sna * sna,
+ DrawablePtr draw,
+ DRI2BufferPtr front,
+ DRI2BufferPtr back)
+{
+ RegionPtr clip;
+ int w, h;
+ uint32_t s;
+
+ if (draw->type == DRAWABLE_PIXMAP)
+ return true;
+
+ clip = &((WindowPtr)draw)->clipList;
+ w = clip->extents.x2 - draw->x;
+ h = clip->extents.y2 - draw->y;
+ if ((w|h) < 0)
+ return false;
+
+ s = get_private(front)->size;
+ if ((s>>16) < h || (s&0xffff) < w) {
+ DBG(("%s: reject front size (%dx%d) < (%dx%d)\n", __func__,
+ s&0xffff, s>>16, w, h));
+ return false;
+ }
+
+ s = get_private(back)->size;
+ if ((s>>16) < h || (s&0xffff) < w) {
+		DBG(("%s: reject back size (%dx%d) < (%dx%d)\n", __func__,
+ s&0xffff, s>>16, w, h));
+ return false;
+ }
+
+ return true;
+}
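+/*
+ * Editor's illustration of the size packing checked above: the drawable
+ * size is stashed at buffer-creation time as height in the high 16 bits
+ * and width in the low 16 bits, so a 1920x1080 buffer is stored as
+ * (1080 << 16) | 1920 == 0x04380780 and unpacked with s >> 16 and
+ * s & 0xffff.
+ */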
+
static void
sna_dri_copy_region(DrawablePtr draw,
RegionPtr region,
@@ -789,6 +889,9 @@ sna_dri_copy_region(DrawablePtr draw,
void (*copy)(struct sna *, DrawablePtr, RegionPtr,
struct kgem_bo *, struct kgem_bo *, bool) = sna_dri_copy;
+ if (!can_blit(sna, draw, dst_buffer, src_buffer))
+ return;
+
if (dst_buffer->attachment == DRI2BufferFrontLeft) {
dst = sna_pixmap_get_bo(pixmap);
copy = (void *)sna_dri_copy_to_front;
@@ -860,7 +963,7 @@ sna_dri_get_pipe(DrawablePtr pDraw)
static struct sna_dri_frame_event *
sna_dri_window_get_chain(WindowPtr win)
{
- return ((void **)dixGetPrivateAddr(&win->devPrivates, &sna_window_key))[1];
+ return ((void **)__get_private(win, sna_window_key))[1];
}
static void
@@ -869,7 +972,7 @@ sna_dri_window_set_chain(WindowPtr win,
{
DBG(("%s: head now %p\n", __FUNCTION__, chain));
assert(win->drawable.type == DRAWABLE_WINDOW);
- ((void **)dixGetPrivateAddr(&win->devPrivates, &sna_window_key))[1] = chain;
+ ((void **)__get_private(win, sna_window_key))[1] = chain;
}
static void
@@ -947,8 +1050,10 @@ sna_dri_frame_event_info_free(struct sna *sna,
_sna_dri_destroy_buffer(sna, info->front);
_sna_dri_destroy_buffer(sna, info->back);
- if (info->old_front.bo)
- kgem_bo_destroy(&sna->kgem, info->old_front.bo);
+ assert(info->scanout[1].bo == NULL);
+
+ if (info->scanout[0].bo)
+ kgem_bo_destroy(&sna->kgem, info->scanout[0].bo);
if (info->cache.bo)
kgem_bo_destroy(&sna->kgem, info->cache.bo);
@@ -959,25 +1064,39 @@ sna_dri_frame_event_info_free(struct sna *sna,
free(info);
}
-static void
+static bool
sna_dri_page_flip(struct sna *sna, struct sna_dri_frame_event *info)
{
struct kgem_bo *bo = get_private(info->back)->bo;
+ struct dri_bo tmp;
DBG(("%s()\n", __FUNCTION__));
assert(sna_pixmap_get_buffer(sna->front) == info->front);
assert(get_drawable_pixmap(info->draw)->drawable.height * bo->pitch <= kgem_bo_size(bo));
+ assert(info->scanout[0].bo);
info->count = sna_page_flip(sna, bo, info, info->pipe);
+ if (!info->count)
+ return false;
+
+ info->scanout[1] = info->scanout[0];
+ info->scanout[0].bo = ref(bo);
+ info->scanout[0].name = info->back->name;
- info->old_front.name = info->front->name;
- info->old_front.bo = get_private(info->front)->bo;
+ tmp.bo = get_private(info->front)->bo;
+ tmp.name = info->front->name;
set_bo(sna->front, bo);
info->front->name = info->back->name;
get_private(info->front)->bo = bo;
+
+ info->back->name = tmp.name;
+ get_private(info->back)->bo = tmp.bo;
+
+ sna->dri.flip_pending = info;
+ return true;
}
static bool
@@ -1031,12 +1150,25 @@ can_flip(struct sna * sna,
if (sna_pixmap_get_buffer(pixmap) != front) {
DBG(("%s: no, DRI2 drawable is no longer attached (old name=%d, new name=%d) to pixmap=%ld\n",
__FUNCTION__, front->name,
- ((DRI2BufferPtr)sna_pixmap_get_buffer(pixmap))->name,
+ sna_pixmap_get_buffer(pixmap) ? ((DRI2BufferPtr)sna_pixmap_get_buffer(pixmap))->name : 0,
pixmap->drawable.serialNumber));
return false;
}
+ if (!get_private(front)->scanout) {
+		DBG(("%s: no, DRI2 drawable not attached at time of creation\n",
+ __FUNCTION__));
+ return false;
+ }
assert(get_private(front)->pixmap == sna->front);
+ assert(sna_pixmap(sna->front)->gpu_bo == get_private(front)->bo);
+
+ if (!get_private(back)->scanout) {
+		DBG(("%s: no, DRI2 drawable was too small at time of creation\n",
+ __FUNCTION__));
+ return false;
+ }
+ assert(get_private(back)->size == get_private(front)->size);
DBG(("%s: window size: %dx%d, clip=(%d, %d), (%d, %d) x %d\n",
__FUNCTION__,
@@ -1094,6 +1226,12 @@ can_exchange(struct sna * sna,
WindowPtr win = (WindowPtr)draw;
PixmapPtr pixmap;
+ /* XXX There is an inherent race between the DRI2 client and the DRI2
+ * compositor which is only masked if we force a blit and serialise
+ * the operations through the kernel command queue. Hopeless.
+ */
+ return false;
+
if (front->format != back->format) {
DBG(("%s: no, format mismatch, front = %d, back = %d\n",
__FUNCTION__, front->format, back->format));
@@ -1127,6 +1265,20 @@ can_exchange(struct sna * sna,
return false;
}
+ if (!get_private(front)->scanout) {
+		DBG(("%s: no, DRI2 drawable not attached at time of creation\n",
+ __FUNCTION__));
+ return false;
+ }
+ assert(get_private(front)->pixmap == sna->front);
+
+ if (!get_private(back)->scanout) {
+		DBG(("%s: no, DRI2 drawable was too small at time of creation\n",
+ __FUNCTION__));
+ return false;
+ }
+ assert(get_private(back)->size == get_private(front)->size);
+
return true;
}
@@ -1167,10 +1319,12 @@ sna_dri_exchange_buffers(DrawablePtr draw,
pixmap->drawable.width,
pixmap->drawable.height));
- DBG(("%s: back_bo pitch=%d, size=%d\n",
- __FUNCTION__, back_bo->pitch, kgem_bo_size(back_bo)));
- DBG(("%s: front_bo pitch=%d, size=%d\n",
- __FUNCTION__, front_bo->pitch, kgem_bo_size(front_bo)));
+ DBG(("%s: back_bo pitch=%d, size=%d, ref=%d\n",
+ __FUNCTION__, back_bo->pitch, kgem_bo_size(back_bo), back_bo->refcnt));
+ DBG(("%s: front_bo pitch=%d, size=%d, ref=%d\n",
+ __FUNCTION__, front_bo->pitch, kgem_bo_size(front_bo), front_bo->refcnt));
+ assert(front_bo->refcnt);
+ assert(back_bo->refcnt);
assert(sna_pixmap_get_buffer(pixmap) == front);
assert(pixmap->drawable.height * back_bo->pitch <= kgem_bo_size(back_bo));
@@ -1188,7 +1342,7 @@ sna_dri_exchange_buffers(DrawablePtr draw,
static void chain_swap(struct sna *sna,
DrawablePtr draw,
- struct drm_event_vblank *event,
+ int frame, unsigned int tv_sec, unsigned int tv_usec,
struct sna_dri_frame_event *chain)
{
drmVBlank vbl;
@@ -1209,7 +1363,7 @@ static void chain_swap(struct sna *sna,
DBG(("%s: performing chained exchange\n", __FUNCTION__));
sna_dri_exchange_buffers(draw, chain->front, chain->back);
type = DRI2_EXCHANGE_COMPLETE;
- } else {
+ } else if (can_blit(sna, draw, chain->front, chain->back)) {
DBG(("%s: emitting chained vsync'ed blit\n", __FUNCTION__));
chain->bo = sna_dri_copy_to_front(sna, draw, NULL,
@@ -1218,10 +1372,16 @@ static void chain_swap(struct sna *sna,
true);
type = DRI2_BLIT_COMPLETE;
+ } else {
+ DRI2SwapComplete(chain->client, draw,
+ 0, 0, 0, DRI2_BLIT_COMPLETE,
+ chain->client ? chain->event_complete : NULL, chain->event_data);
+ sna_dri_frame_event_info_free(sna, draw, chain);
+ return;
}
DRI2SwapComplete(chain->client, draw,
- event->sequence, event->tv_sec, event->tv_usec,
+ frame, tv_sec, tv_usec,
type, chain->client ? chain->event_complete : NULL, chain->event_data);
VG_CLEAR(vbl);
@@ -1273,19 +1433,17 @@ void sna_dri_vblank_handler(struct sna *sna, struct drm_event_vblank *event)
switch (info->type) {
case DRI2_FLIP:
/* If we can still flip... */
- if (can_flip(sna, draw, info->front, info->back)) {
- sna_dri_page_flip(sna, info);
- info->back->name = info->old_front.name;
- get_private(info->back)->bo = info->old_front.bo;
- info->old_front.bo = NULL;
+ if (can_flip(sna, draw, info->front, info->back) &&
+ sna_dri_page_flip(sna, info))
return;
- }
+
/* else fall through to blit */
case DRI2_SWAP:
- info->bo = sna_dri_copy_to_front(sna, draw, NULL,
- get_private(info->front)->bo,
- get_private(info->back)->bo,
- true);
+ if (can_blit(sna, draw, info->front, info->back))
+ info->bo = sna_dri_copy_to_front(sna, draw, NULL,
+ get_private(info->front)->bo,
+ get_private(info->back)->bo,
+ true);
info->type = DRI2_SWAP_WAIT;
/* fall through to SwapComplete */
case DRI2_SWAP_WAIT:
@@ -1325,7 +1483,9 @@ void sna_dri_vblank_handler(struct sna *sna, struct drm_event_vblank *event)
if (info->chain) {
sna_dri_remove_frame_event((WindowPtr)draw, info);
- chain_swap(sna, draw, event, info->chain);
+ chain_swap(sna, draw,
+ event->sequence, event->tv_sec, event->tv_usec,
+ info->chain);
draw = NULL;
}
@@ -1334,22 +1494,91 @@ done:
}
static void
+sna_dri_flip_get_back(struct sna *sna, struct sna_dri_frame_event *info)
+{
+ struct kgem_bo *bo;
+ uint32_t name;
+
+ DBG(("%s: scanout=(%d, %d), back=%d, cache=%d\n",
+ __FUNCTION__,
+ info->scanout[0].bo ? info->scanout[0].bo->handle : 0,
+ info->scanout[1].bo ? info->scanout[1].bo->handle : 0,
+ get_private(info->back)->bo->handle,
+ info->cache.bo ? info->cache.bo->handle : 0));
+
+ bo = get_private(info->back)->bo;
+ if (!(bo == info->scanout[0].bo || bo == info->scanout[1].bo))
+ return;
+
+ bo = info->cache.bo;
+ name = info->cache.name;
+ if (bo == NULL ||
+ bo == info->scanout[0].bo ||
+ bo == info->scanout[1].bo) {
+ if (bo) {
+ DBG(("%s: discarding old backbuffer\n", __FUNCTION__));
+ kgem_bo_destroy(&sna->kgem, bo);
+ }
+ DBG(("%s: allocating new backbuffer\n", __FUNCTION__));
+ bo = kgem_create_2d(&sna->kgem,
+ info->draw->width,
+ info->draw->height,
+ info->draw->bitsPerPixel,
+ get_private(info->front)->bo->tiling,
+ CREATE_SCANOUT | CREATE_EXACT);
+ name = kgem_bo_flink(&sna->kgem, bo);
+ }
+
+ info->cache.bo = get_private(info->back)->bo;
+ info->cache.name = info->back->name;
+
+ get_private(info->back)->bo = bo;
+ info->back->name = name;
+
+ assert(get_private(info->back)->bo != info->scanout[0].bo);
+ assert(get_private(info->back)->bo != info->scanout[1].bo);
+}
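+/*
+ * Editor's summary of the rotation above: if the client's back buffer
+ * is still one of the two live scanout buffers, it is swapped out --
+ * the old back moves into the cache slot and the cached (or a freshly
+ * allocated, scanout-capable) bo becomes the new back, so the client
+ * never renders into a buffer that is still being scanned out.
+ */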
+
+static bool
sna_dri_flip_continue(struct sna *sna, struct sna_dri_frame_event *info)
{
- struct dri_bo tmp;
+ DBG(("%s(mode=%d)\n", __FUNCTION__, info->mode));
- DBG(("%s()\n", __FUNCTION__));
+	if (info->mode > 1) {
+ if (get_private(info->front)->bo != sna_pixmap(sna->front)->gpu_bo)
+ return false;
- assert(sna_pixmap_get_buffer(get_drawable_pixmap(info->draw)) == info->front);
+ info->count = sna_page_flip(sna,
+ get_private(info->front)->bo,
+ info, info->pipe);
+ if (!info->count)
+ return false;
- tmp = info->old_front;
+ info->scanout[1] = info->scanout[0];
+ info->scanout[0].bo = ref(get_private(info->front)->bo);
+ info->scanout[0].name = info->front->name;
+ sna->dri.flip_pending = info;
+ } else {
+ if (!info->draw)
+ return false;
- sna_dri_page_flip(sna, info);
+ assert(sna_pixmap_get_buffer(get_drawable_pixmap(info->draw)) == info->front);
+ if (!can_flip(sna, info->draw, info->front, info->back))
+ return false;
- get_private(info->back)->bo = tmp.bo;
- info->back->name = tmp.name;
+ if (!sna_dri_page_flip(sna, info))
+ return false;
+
+ sna_dri_flip_get_back(sna, info);
+ DRI2SwapComplete(info->client, info->draw,
+ 0, 0, 0,
+ DRI2_FLIP_COMPLETE,
+ info->client ? info->event_complete : NULL,
+ info->event_data);
+ }
- info->next_front.name = 0;
+ info->mode = 0;
+ return true;
}
static void chain_flip(struct sna *sna)
@@ -1367,29 +1596,17 @@ static void chain_flip(struct sna *sna)
}
if (chain->type == DRI2_FLIP &&
- can_flip(sna, chain->draw, chain->front, chain->back)) {
+ can_flip(sna, chain->draw, chain->front, chain->back) &&
+ sna_dri_page_flip(sna, chain)) {
DBG(("%s: performing chained flip\n", __FUNCTION__));
- sna_dri_page_flip(sna, chain);
-
- chain->back->name = chain->old_front.name;
- get_private(chain->back)->bo = chain->old_front.bo;
- chain->old_front.bo = NULL;
-
- if (chain->count == 0) {
- DRI2SwapComplete(chain->client, chain->draw, 0, 0, 0,
- DRI2_EXCHANGE_COMPLETE,
- chain->event_complete,
- chain->event_data);
- sna_dri_frame_event_info_free(sna, chain->draw, chain);
- } else
- sna->dri.flip_pending = chain;
} else {
- DBG(("%s: emitting chained vsync'ed blit\n", __FUNCTION__));
-
- chain->bo = sna_dri_copy_to_front(sna, chain->draw, NULL,
- get_private(chain->front)->bo,
- get_private(chain->back)->bo,
- true);
+ if (can_blit(sna, chain->draw, chain->front, chain->back)) {
+ DBG(("%s: emitting chained vsync'ed blit\n", __FUNCTION__));
+ chain->bo = sna_dri_copy_to_front(sna, chain->draw, NULL,
+ get_private(chain->front)->bo,
+ get_private(chain->back)->bo,
+ true);
+ }
DRI2SwapComplete(chain->client, chain->draw, 0, 0, 0,
DRI2_BLIT_COMPLETE, chain->client ? chain->event_complete : NULL, chain->event_data);
sna_dri_frame_event_info_free(sna, chain->draw, chain);
@@ -1406,6 +1623,14 @@ static void sna_dri_flip_event(struct sna *sna,
flip->fe_tv_usec,
flip->type));
+ if (flip->cache.bo == NULL) {
+ flip->cache = flip->scanout[1];
+ flip->scanout[1].bo = NULL;
+ }
+ if (flip->scanout[1].bo) {
+ kgem_bo_destroy(&sna->kgem, flip->scanout[1].bo);
+ flip->scanout[1].bo = NULL;
+ }
if (sna->dri.flip_pending == flip)
sna->dri.flip_pending = NULL;
@@ -1433,44 +1658,31 @@ static void sna_dri_flip_event(struct sna *sna,
if (sna->dri.flip_pending) {
sna_dri_frame_event_info_free(sna, flip->draw, flip);
chain_flip(sna);
- } else if (!flip->next_front.name) {
- /* Keep the pageflipping running for a couple of frames
- * so we keep the uncached scanouts alive.
- */
- DBG(("%s: flip chain complete, off-delay=%d\n",
- __FUNCTION__, flip->off_delay));
- if (flip->off_delay-- && flip->draw &&
- can_flip(sna, flip->draw, flip->front, flip->front) &&
- (flip->count = sna_page_flip(sna,
- get_private(flip->front)->bo,
- flip, flip->pipe))) {
- assert(flip == sna_dri_window_get_chain((WindowPtr)flip->draw));
- sna->dri.flip_pending = flip;
- } else {
- DBG(("%s: flip chain complete, off\n", __FUNCTION__));
- sna_dri_frame_event_info_free(sna, flip->draw, flip);
+ } else if (!flip->mode) {
+ DBG(("%s: flip chain complete\n", __FUNCTION__));
+
+ if (flip->chain) {
+ sna_dri_remove_frame_event((WindowPtr)flip->draw,
+ flip);
+ chain_swap(sna, flip->draw,
+ flip->fe_frame,
+ flip->fe_tv_sec,
+ flip->fe_tv_usec,
+ flip->chain);
+ flip->draw = NULL;
}
- } else if (flip->draw &&
- can_flip(sna, flip->draw, flip->front, flip->back)) {
- sna_dri_flip_continue(sna, flip);
- DRI2SwapComplete(flip->client, flip->draw,
- 0, 0, 0,
- DRI2_FLIP_COMPLETE,
- flip->client ? flip->event_complete : NULL,
- flip->event_data);
- if (flip->count)
- sna->dri.flip_pending = flip;
- else
- sna_dri_frame_event_info_free(sna, flip->draw, flip);
- flip->off_delay = FLIP_OFF_DELAY;
- } else {
+
+ sna_dri_frame_event_info_free(sna, flip->draw, flip);
+ } else if (!sna_dri_flip_continue(sna, flip)) {
DBG(("%s: no longer able to flip\n", __FUNCTION__));
if (flip->draw) {
- flip->bo = sna_dri_copy_to_front(sna, flip->draw, NULL,
- get_private(flip->front)->bo,
- get_private(flip->back)->bo,
- false);
+ if (can_blit(sna, flip->draw, flip->front, flip->back)) {
+ flip->bo = sna_dri_copy_to_front(sna, flip->draw, NULL,
+ get_private(flip->front)->bo,
+ get_private(flip->back)->bo,
+ false);
+ }
DRI2SwapComplete(flip->client, flip->draw,
0, 0, 0,
DRI2_BLIT_COMPLETE,
@@ -1482,59 +1694,6 @@ static void sna_dri_flip_event(struct sna *sna,
}
break;
-#if USE_ASYNC_SWAP
- case DRI2_ASYNC_FLIP:
- DBG(("%s: async swap flip completed on pipe %d, pending? %d, new? %d\n",
- __FUNCTION__, flip->pipe,
- sna->dri.flip_pending != NULL,
- flip->front->name != flip->next_front.name));
-
- if (sna->dri.flip_pending) {
- chain_flip(sna);
- goto finish_async_flip;
- } else if (flip->front->name != flip->next_front.name) {
- DBG(("%s: async flip continuing\n", __FUNCTION__));
-
- flip->cache = flip->old_front;
- flip->old_front = flip->next_front;
-
- flip->count = sna_page_flip(sna,
- get_private(flip->front)->bo,
- flip, flip->pipe);
- if (flip->count == 0)
- goto finish_async_flip;
-
- flip->next_front.bo = get_private(flip->front)->bo;
- flip->next_front.name = flip->front->name;
- flip->off_delay = FLIP_OFF_DELAY;
-
- sna->dri.flip_pending = flip;
- } else if (flip->draw &&
- can_flip(sna, flip->draw, flip->front, flip->back) &&
- flip->off_delay--) {
- assert(flip == sna_dri_window_get_chain((WindowPtr)flip->draw));
- DBG(("%s: queuing no-flip [delay=%d]\n",
- __FUNCTION__, flip->off_delay));
- /* Just queue a no-op flip to trigger another event */
- flip->count = sna_page_flip(sna,
- get_private(flip->front)->bo,
- flip, flip->pipe);
- if (flip->count == 0)
- goto finish_async_flip;
-
- assert(flip->next_front.bo == get_private(flip->front)->bo);
- assert(flip->next_front.name == flip->front->name);
-
- sna->dri.flip_pending = flip;
- } else {
-finish_async_flip:
- DBG(("%s: async flip completed (drawable gone? %d)\n",
- __FUNCTION__, flip->draw == NULL));
- sna_dri_frame_event_info_free(sna, flip->draw, flip);
- }
- break;
-#endif
-
default:
xf86DrvMsg(sna->scrn->scrnIndex, X_WARNING,
"%s: unknown vblank event received\n", __func__);
@@ -1564,213 +1723,22 @@ sna_dri_page_flip_handler(struct sna *sna,
sna_dri_flip_event(sna, info);
}
-static bool
-sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
- DRI2BufferPtr back, CARD64 *target_msc, CARD64 divisor,
- CARD64 remainder, DRI2SwapEventPtr func, void *data)
-{
- struct sna *sna = to_sna_from_drawable(draw);
- struct sna_dri_frame_event *info;
- drmVBlank vbl;
- int pipe;
- CARD64 current_msc;
-
- DBG(("%s(target_msc=%llu, divisor=%llu, remainder=%llu)\n",
- __FUNCTION__,
- (long long)*target_msc,
- (long long)divisor,
- (long long)remainder));
-
- VG_CLEAR(vbl);
-
- pipe = sna_dri_get_pipe(draw);
- if (pipe == -1) {
- /* XXX WARN_ON(sna->dri.flip_pending) ? */
- if (sna->dri.flip_pending == NULL) {
- sna_dri_exchange_buffers(draw, front, back);
- DRI2SwapComplete(client, draw, 0, 0, 0,
- DRI2_EXCHANGE_COMPLETE, func, data);
- return true;
- } else
- return false;
- }
-
- /* Truncate to match kernel interfaces; means occasional overflow
- * misses, but that's generally not a big deal */
- divisor &= 0xffffffff;
- if (divisor == 0) {
- DBG(("%s: performing immediate swap on pipe %d, pending? %d\n",
- __FUNCTION__, pipe, sna->dri.flip_pending != NULL));
-
- info = sna->dri.flip_pending;
- if (info && info->draw == draw && info->type == DRI2_FLIP_THROTTLE) {
- DBG(("%s: chaining flip\n", __FUNCTION__));
- info->next_front.name = 1;
- return true;
- }
-
- info = calloc(1, sizeof(struct sna_dri_frame_event));
- if (info == NULL)
- return false;
-
- info->type = DRI2_FLIP_THROTTLE;
-
- info->draw = draw;
- info->client = client;
- info->event_complete = func;
- info->event_data = data;
- info->front = front;
- info->back = back;
- info->pipe = pipe;
-
- sna_dri_add_frame_event(draw, info);
- sna_dri_reference_buffer(front);
- sna_dri_reference_buffer(back);
-
- if (sna->dri.flip_pending) {
- /* We need to first wait (one vblank) for the
- * async flips to complete before this client
- * can take over.
- */
- DBG(("%s: queueing flip after pending completion\n",
- __FUNCTION__));
- info->type = DRI2_FLIP;
- sna->dri.flip_pending = info;
- return true;
- }
-
- sna_dri_page_flip(sna, info);
-
- if (info->count == 0) {
- info->back->name = info->old_front.name;
- get_private(info->back)->bo = info->old_front.bo;
- info->old_front.bo = NULL;
-
- DRI2SwapComplete(info->client, draw, 0, 0, 0,
- DRI2_EXCHANGE_COMPLETE,
- info->event_complete,
- info->event_data);
- sna_dri_frame_event_info_free(sna, draw, info);
- } else if (info->type != DRI2_FLIP) {
- get_private(info->back)->bo =
- kgem_create_2d(&sna->kgem,
- draw->width,
- draw->height,
- draw->bitsPerPixel,
- get_private(info->front)->bo->tiling,
- CREATE_SCANOUT | CREATE_EXACT);
- info->back->name = kgem_bo_flink(&sna->kgem,
- get_private(info->back)->bo);
- info->off_delay = FLIP_OFF_DELAY;
- sna->dri.flip_pending = info;
-
- DRI2SwapComplete(info->client, draw, 0, 0, 0,
- DRI2_EXCHANGE_COMPLETE,
- info->event_complete,
- info->event_data);
- } else {
- info->back->name = info->old_front.name;
- get_private(info->back)->bo = info->old_front.bo;
- info->old_front.bo = NULL;
- }
- } else {
- info = calloc(1, sizeof(struct sna_dri_frame_event));
- if (info == NULL)
- return false;
-
- info->draw = draw;
- info->client = client;
- info->event_complete = func;
- info->event_data = data;
- info->front = front;
- info->back = back;
- info->pipe = pipe;
- info->type = DRI2_FLIP;
-
- sna_dri_add_frame_event(draw, info);
- sna_dri_reference_buffer(front);
- sna_dri_reference_buffer(back);
-
- /* Get current count */
- vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
- vbl.request.sequence = 0;
- if (sna_wait_vblank(sna, &vbl)) {
- sna_dri_frame_event_info_free(sna, draw, info);
- return false;
- }
-
- current_msc = vbl.reply.sequence;
- *target_msc &= 0xffffffff;
- remainder &= 0xffffffff;
-
- vbl.request.type =
- DRM_VBLANK_ABSOLUTE |
- DRM_VBLANK_EVENT |
- pipe_select(pipe);
-
- /*
- * If divisor is zero, or current_msc is smaller than target_msc
- * we just need to make sure target_msc passes before initiating
- * the swap.
- */
- if (current_msc < *target_msc) {
- DBG(("%s: waiting for swap: current=%d, target=%d, divisor=%d\n",
- __FUNCTION__,
- (int)current_msc,
- (int)*target_msc,
- (int)divisor));
- vbl.request.sequence = *target_msc;
- } else {
- DBG(("%s: missed target, queueing event for next: current=%d, target=%d, divisor=%d\n",
- __FUNCTION__,
- (int)current_msc,
- (int)*target_msc,
- (int)divisor));
-
- vbl.request.sequence = current_msc - current_msc % divisor + remainder;
-
- /*
- * If the calculated deadline vbl.request.sequence is
- * smaller than or equal to current_msc, it means
- * we've passed the last point when effective onset
- * frame seq could satisfy *seq % divisor == remainder,
- * so we need to wait for the next time this will
- * happen.
- *
- * This comparison takes the 1 frame swap delay
- * in pageflipping mode into account.
- */
- if (vbl.request.sequence <= current_msc)
- vbl.request.sequence += divisor;
-
- /* Adjust returned value for 1 frame pageflip offset */
- *target_msc = vbl.reply.sequence + 1;
- }
-
- /* Account for 1 frame extra pageflip delay */
- vbl.request.sequence -= 1;
- vbl.request.signal = (unsigned long)info;
- if (sna_wait_vblank(sna, &vbl)) {
- sna_dri_frame_event_info_free(sna, draw, info);
- return false;
- }
- }
-
- return true;
-}
-
static void
sna_dri_immediate_xchg(struct sna *sna,
DrawablePtr draw,
- struct sna_dri_frame_event *info)
+ struct sna_dri_frame_event *info,
+ bool sync)
{
drmVBlank vbl;
- DBG(("%s: emitting immediate exchange, throttling client\n",
- __FUNCTION__));
+ if (sna->flags & SNA_NO_WAIT)
+ sync = false;
+
+ DBG(("%s: emitting immediate exchange, throttling client, synced? %d\n",
+ __FUNCTION__, sync));
VG_CLEAR(vbl);
- if ((sna->flags & SNA_NO_WAIT) == 0) {
+ if (sync) {
info->type = DRI2_XCHG_THROTTLE;
if (sna_dri_window_get_chain((WindowPtr)draw) == info) {
DBG(("%s: no pending xchg, starting chain\n",
@@ -1804,16 +1772,20 @@ sna_dri_immediate_xchg(struct sna *sna,
static void
sna_dri_immediate_blit(struct sna *sna,
DrawablePtr draw,
- struct sna_dri_frame_event *info)
+ struct sna_dri_frame_event *info,
+ bool sync)
{
- drmVBlank vbl;
+ if (sna->flags & SNA_NO_WAIT)
+ sync = false;
- DBG(("%s: emitting immediate blit, throttling client\n", __FUNCTION__));
- VG_CLEAR(vbl);
+ DBG(("%s: emitting immediate blit, throttling client, synced? %d\n",
+ __FUNCTION__, sync));
- if ((sna->flags & SNA_NO_WAIT) == 0) {
+ if (sync) {
info->type = DRI2_SWAP_THROTTLE;
if (sna_dri_window_get_chain((WindowPtr)draw) == info) {
+ drmVBlank vbl;
+
DBG(("%s: no pending blit, starting chain\n",
__FUNCTION__));
@@ -1826,6 +1798,7 @@ sna_dri_immediate_blit(struct sna *sna,
info->event_complete,
info->event_data);
+ VG_CLEAR(vbl);
vbl.request.type =
DRM_VBLANK_RELATIVE |
DRM_VBLANK_NEXTONMISS |
@@ -1849,6 +1822,204 @@ sna_dri_immediate_blit(struct sna *sna,
}
}
+static CARD64
+get_current_msc_for_target(struct sna *sna, CARD64 target_msc, int pipe)
+{
+ CARD64 ret = -1;
+
+ if (target_msc && (sna->flags & SNA_NO_WAIT) == 0) {
+ drmVBlank vbl;
+
+ VG_CLEAR(vbl);
+ vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
+ vbl.request.sequence = 0;
+ if (sna_wait_vblank(sna, &vbl) == 0)
+ ret = vbl.reply.sequence;
+ }
+
+ return ret;
+}
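+/*
+ * Editor's note: the (CARD64)-1 returned when no wait is needed is the
+ * maximum unsigned value, so the caller's test
+ * "current_msc >= *target_msc - 1" always holds and such swaps take
+ * the immediate path.
+ */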
+
+static bool
+sna_dri_schedule_flip(ClientPtr client, DrawablePtr draw,
+ DRI2BufferPtr front, DRI2BufferPtr back, int pipe,
+ CARD64 *target_msc, CARD64 divisor, CARD64 remainder,
+ DRI2SwapEventPtr func, void *data)
+{
+ struct sna *sna = to_sna_from_drawable(draw);
+ struct sna_dri_frame_event *info;
+ drmVBlank vbl;
+ CARD64 current_msc;
+
+ current_msc = get_current_msc_for_target(sna, *target_msc, pipe);
+
+ DBG(("%s: target_msc=%u, current_msc=%u, divisor=%u\n", __FUNCTION__,
+ (uint32_t)*target_msc, (uint32_t)current_msc, (uint32_t)divisor));
+
+ if (divisor == 0 && current_msc >= *target_msc - 1) {
+ info = sna->dri.flip_pending;
+
+ DBG(("%s: performing immediate swap on pipe %d, pending? %d, mode: %d\n",
+ __FUNCTION__, pipe, info != NULL, info ? info->mode : 0));
+
+ if (info &&
+ info->draw == draw) {
+ assert(info->type == DRI2_FLIP_THROTTLE);
+ assert(info->front == front);
+ if (info->back != back) {
+ _sna_dri_destroy_buffer(sna, info->back);
+ info->back = back;
+ sna_dri_reference_buffer(back);
+ }
+ if (current_msc >= *target_msc) {
+ DBG(("%s: executing xchg of pending flip\n",
+ __FUNCTION__));
+ sna_dri_exchange_buffers(draw, front, back);
+ info->mode = 2;
+ goto new_back;
+ } else {
+ DBG(("%s: chaining flip\n", __FUNCTION__));
+ info->mode = 1;
+ current_msc++;
+ goto out;
+ }
+ }
+
+ info = calloc(1, sizeof(struct sna_dri_frame_event));
+ if (info == NULL)
+ return false;
+
+	info->type = sna->flags & SNA_TRIPLE_BUFFER ? DRI2_FLIP_THROTTLE : DRI2_FLIP;
+
+ info->draw = draw;
+ info->client = client;
+ info->event_complete = func;
+ info->event_data = data;
+ info->front = front;
+ info->back = back;
+ info->pipe = pipe;
+
+ info->scanout[0].bo = ref(get_private(front)->bo);
+ info->scanout[0].name = info->front->name;
+
+ sna_dri_add_frame_event(draw, info);
+ sna_dri_reference_buffer(front);
+ sna_dri_reference_buffer(back);
+
+ if (sna->dri.flip_pending) {
+ /* We need to first wait (one vblank) for the
+ * async flips to complete before this client
+ * can take over.
+ */
+ DBG(("%s: queueing flip after pending completion\n",
+ __FUNCTION__));
+ info->type = DRI2_FLIP;
+ sna->dri.flip_pending = info;
+ *target_msc = current_msc + 1;
+ return true;
+ }
+
+ if (!sna_dri_page_flip(sna, info)) {
+ sna_dri_frame_event_info_free(sna, draw, info);
+ return false;
+ }
+
+ if (info->type != DRI2_FLIP) {
+ current_msc++;
+new_back:
+ sna_dri_flip_get_back(sna, info);
+ DRI2SwapComplete(client, draw, 0, 0, 0,
+ DRI2_EXCHANGE_COMPLETE,
+ func, data);
+ }
+out:
+ *target_msc = current_msc;
+ return true;
+ }
+
+ info = calloc(1, sizeof(struct sna_dri_frame_event));
+ if (info == NULL)
+ return false;
+
+ info->draw = draw;
+ info->client = client;
+ info->event_complete = func;
+ info->event_data = data;
+ info->front = front;
+ info->back = back;
+ info->pipe = pipe;
+ info->type = DRI2_FLIP;
+
+ info->scanout[0].bo = ref(get_private(front)->bo);
+ info->scanout[0].name = info->front->name;
+
+ sna_dri_add_frame_event(draw, info);
+ sna_dri_reference_buffer(front);
+ sna_dri_reference_buffer(back);
+
+ *target_msc &= 0xffffffff;
+ remainder &= 0xffffffff;
+
+ VG_CLEAR(vbl);
+
+ vbl.request.type =
+ DRM_VBLANK_ABSOLUTE |
+ DRM_VBLANK_EVENT |
+ pipe_select(pipe);
+
+ /*
+ * If divisor is zero, or current_msc is smaller than target_msc
+ * we just need to make sure target_msc passes before initiating
+ * the swap.
+ */
+ if (current_msc <= *target_msc - 1) {
+ DBG(("%s: waiting for swap: current=%d, target=%d, divisor=%d\n",
+ __FUNCTION__,
+ (int)current_msc,
+ (int)*target_msc,
+ (int)divisor));
+ vbl.request.sequence = *target_msc;
+ } else {
+ DBG(("%s: missed target, queueing event for next: current=%d, target=%d, divisor=%d\n",
+ __FUNCTION__,
+ (int)current_msc,
+ (int)*target_msc,
+ (int)divisor));
+
+ if (divisor == 0)
+ divisor = 1;
+
+ vbl.request.sequence = current_msc - current_msc % divisor + remainder;
+
+ /*
+ * If the calculated deadline vbl.request.sequence is
+ * smaller than or equal to current_msc, it means
+ * we've passed the last point when effective onset
+ * frame seq could satisfy *seq % divisor == remainder,
+ * so we need to wait for the next time this will
+ * happen.
+ *
+ * This comparison takes the 1 frame swap delay
+ * in pageflipping mode into account.
+ */
+ if (vbl.request.sequence <= current_msc)
+ vbl.request.sequence += divisor;
+
+ /* Adjust returned value for 1 frame pageflip offset */
+ *target_msc = vbl.reply.sequence;
+ }
+
+ /* Account for 1 frame extra pageflip delay */
+ vbl.request.sequence -= 1;
+ vbl.request.signal = (unsigned long)info;
+ if (sna_wait_vblank(sna, &vbl)) {
+ sna_dri_frame_event_info_free(sna, draw, info);
+ return false;
+ }
+
+ return true;
+}
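+/*
+ * Worked example for the divisor/remainder logic above (editor's note):
+ * with current_msc == 103, divisor == 16 and remainder == 4, the
+ * deadline is 103 - 103 % 16 + 4 == 100; since 100 <= 103 the next
+ * satisfying frame is 100 + 16 == 116, and one frame is subtracted
+ * (115) to account for the pageflip completing on the following vblank.
+ */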
+
/*
* ScheduleSwap is responsible for requesting a DRM vblank event for the
* appropriate frame.
@@ -1889,37 +2060,26 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
(long long)divisor,
(long long)remainder));
- if (can_flip(sna, draw, front, back)) {
- DBG(("%s: try flip\n", __FUNCTION__));
- if (sna_dri_schedule_flip(client, draw, front, back,
- target_msc, divisor, remainder,
- func, data))
- return TRUE;
- }
+ /* Truncate to match kernel interfaces; means occasional overflow
+ * misses, but that's generally not a big deal */
+ *target_msc &= 0xffffffff;
+ divisor &= 0xffffffff;
+ remainder &= 0xffffffff;
/* Drawable not displayed... just complete the swap */
pipe = sna_dri_get_pipe(draw);
if (pipe == -1) {
- if (can_exchange(sna, draw, front, back)) {
- DBG(("%s: unattached, exchange pixmaps\n", __FUNCTION__));
- sna_dri_exchange_buffers(draw, front, back);
-
- DRI2SwapComplete(client, draw, 0, 0, 0,
- DRI2_EXCHANGE_COMPLETE, func, data);
- return TRUE;
- }
-
DBG(("%s: off-screen, immediate update\n", __FUNCTION__));
goto blit_fallback;
}
- VG_CLEAR(vbl);
+ if (can_flip(sna, draw, front, back) &&
+ sna_dri_schedule_flip(client, draw, front, back, pipe,
+ target_msc, divisor, remainder,
+ func, data))
+ return TRUE;
- /* Truncate to match kernel interfaces; means occasional overflow
- * misses, but that's generally not a big deal */
- *target_msc &= 0xffffffff;
- divisor &= 0xffffffff;
- remainder &= 0xffffffff;
+ VG_CLEAR(vbl);
info = calloc(1, sizeof(struct sna_dri_frame_event));
if (!info)
@@ -1938,21 +2098,25 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
sna_dri_reference_buffer(back);
info->type = swap_type;
- if (divisor == 0) {
- if (can_exchange(sna, draw, front, back))
- sna_dri_immediate_xchg(sna, draw, info);
- else
- sna_dri_immediate_blit(sna, draw, info);
- return TRUE;
- }
- /* Get current count */
- vbl.request.type = DRM_VBLANK_RELATIVE | pipe_select(pipe);
- vbl.request.sequence = 0;
- if (sna_wait_vblank(sna, &vbl))
- goto blit_fallback;
+ current_msc = get_current_msc_for_target(sna, *target_msc, pipe);
+ DBG(("%s: target_msc=%u, current_msc=%u, divisor=%u\n", __FUNCTION__,
+ (uint32_t)*target_msc, (uint32_t)current_msc, (uint32_t)divisor));
- current_msc = vbl.reply.sequence;
+ if (divisor == 0 && current_msc >= *target_msc - 1) {
+ bool sync = current_msc < *target_msc;
+ if (can_exchange(sna, draw, front, back)) {
+ sna_dri_immediate_xchg(sna, draw, info, sync);
+ } else if (can_blit(sna, draw, front, back)) {
+ sna_dri_immediate_blit(sna, draw, info, sync);
+ } else {
+ DRI2SwapComplete(client, draw, 0, 0, 0,
+ DRI2_BLIT_COMPLETE, func, data);
+ sna_dri_frame_event_info_free(sna, draw, info);
+ }
+ *target_msc = current_msc + sync;
+ return TRUE;
+ }
/*
* If divisor is zero, or current_msc is smaller than target_msc
@@ -1991,6 +2155,9 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
(int)*target_msc,
(int)divisor));
+ if (divisor == 0)
+ divisor = 1;
+
vbl.request.type =
DRM_VBLANK_ABSOLUTE |
DRM_VBLANK_EVENT |
@@ -2007,27 +2174,27 @@ sna_dri_schedule_swap(ClientPtr client, DrawablePtr draw, DRI2BufferPtr front,
*/
if (vbl.request.sequence < current_msc)
vbl.request.sequence += divisor;
- vbl.request.sequence -= 1;
+ *target_msc = vbl.reply.sequence;
+ vbl.request.sequence -= 1;
vbl.request.signal = (unsigned long)info;
if (sna_wait_vblank(sna, &vbl))
goto blit_fallback;
- *target_msc = vbl.reply.sequence;
return TRUE;
blit_fallback:
+ pipe = DRI2_BLIT_COMPLETE;
if (can_exchange(sna, draw, front, back)) {
DBG(("%s -- xchg\n", __FUNCTION__));
sna_dri_exchange_buffers(draw, front, back);
pipe = DRI2_EXCHANGE_COMPLETE;
- } else {
+ } else if (can_blit(sna, draw, front, back)) {
DBG(("%s -- blit\n", __FUNCTION__));
sna_dri_copy_to_front(sna, draw, NULL,
get_private(front)->bo,
get_private(back)->bo,
false);
- pipe = DRI2_BLIT_COMPLETE;
}
if (info)
sna_dri_frame_event_info_free(sna, draw, info);
@@ -2043,100 +2210,31 @@ sna_dri_async_swap(ClientPtr client, DrawablePtr draw,
DRI2SwapEventPtr func, void *data)
{
struct sna *sna = to_sna_from_drawable(draw);
- struct sna_dri_frame_event *info;
- struct kgem_bo *bo;
- int name;
+ CARD64 target_msc = 0;
+ int pipe;
DBG(("%s()\n", __FUNCTION__));
- if (!can_flip(sna, draw, front, back)) {
-blit:
+ if (!can_flip(sna, draw, front, back) ||
+ (pipe = sna_dri_get_pipe(draw)) < 0 ||
+ !sna_dri_schedule_flip(client, draw, front, back, pipe,
+ &target_msc, 0, 0, func, data)) {
+ pipe = DRI2_BLIT_COMPLETE;
if (can_exchange(sna, draw, front, back)) {
DBG(("%s: unable to flip, so xchg\n", __FUNCTION__));
sna_dri_exchange_buffers(draw, front, back);
- name = DRI2_EXCHANGE_COMPLETE;
- } else {
+ pipe = DRI2_EXCHANGE_COMPLETE;
+ } else if (can_blit(sna, draw, front, back)) {
DBG(("%s: unable to flip, so blit\n", __FUNCTION__));
sna_dri_copy_to_front(sna, draw, NULL,
get_private(front)->bo,
get_private(back)->bo,
false);
- name = DRI2_BLIT_COMPLETE;
- }
-
- DRI2SwapComplete(client, draw, 0, 0, 0, name, func, data);
- return name == DRI2_EXCHANGE_COMPLETE;
- }
-
- bo = NULL;
- name = 0;
-
- info = sna->dri.flip_pending;
- if (info == NULL) {
- int pipe = sna_dri_get_pipe(draw);
- if (pipe == -1)
- goto blit;
-
- DBG(("%s: no pending flip, so updating scanout\n",
- __FUNCTION__));
-
- info = calloc(1, sizeof(struct sna_dri_frame_event));
- if (!info)
- goto blit;
-
- info->client = client;
- info->draw = draw;
- info->type = DRI2_ASYNC_FLIP;
- info->pipe = pipe;
- info->front = front;
- info->back = back;
-
- sna_dri_add_frame_event(draw, info);
- sna_dri_reference_buffer(front);
- sna_dri_reference_buffer(back);
-
- sna_dri_page_flip(sna, info);
-
- info->next_front.name = info->front->name;
- info->next_front.bo = get_private(info->front)->bo;
- info->off_delay = FLIP_OFF_DELAY;
- } else if (info->type != DRI2_ASYNC_FLIP) {
- /* A normal vsync'ed client is finishing, wait for it
- * to unpin the old framebuffer before taking over.
- */
- goto blit;
- } else {
- DBG(("%s: pending flip, chaining next\n", __FUNCTION__));
- if (info->next_front.name == info->front->name) {
- name = info->cache.name;
- bo = info->cache.bo;
- } else {
- name = info->front->name;
- bo = get_private(info->front)->bo;
}
- info->front->name = info->back->name;
- get_private(info->front)->bo = get_private(info->back)->bo;
- }
- if (bo == NULL) {
- DBG(("%s: creating new back buffer\n", __FUNCTION__));
- bo = kgem_create_2d(&sna->kgem,
- draw->width,
- draw->height,
- draw->bitsPerPixel,
- get_private(info->front)->bo->tiling,
- CREATE_SCANOUT | CREATE_EXACT);
- name = kgem_bo_flink(&sna->kgem, bo);
+ DRI2SwapComplete(client, draw, 0, 0, 0, pipe, func, data);
+ return pipe == DRI2_EXCHANGE_COMPLETE;
}
- assert(bo->refcnt);
- get_private(info->back)->bo = bo;
- info->back->name = name;
-
- set_bo(sna->front, get_private(info->front)->bo);
- sna->dri.flip_pending = info;
-
- DRI2SwapComplete(client, draw, 0, 0, 0,
- DRI2_EXCHANGE_COMPLETE, func, data);
return TRUE;
}
#endif
@@ -2291,6 +2389,17 @@ out_complete:
}
#endif
+static const char *dri_driver_name(struct sna *sna)
+{
+ const char *s = xf86GetOptValString(sna->Options, OPTION_DRI);
+ Bool dummy;
+
+ if (s == NULL || xf86getBoolValue(&dummy, s))
+ return sna->kgem.gen < 040 ? "i915" : "i965";
+
+ return s;
+}
+
bool sna_dri_open(struct sna *sna, ScreenPtr screen)
{
DRI2InfoRec info;
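
dri_driver_name() above allows Option "DRI" to carry either a boolean (keep the gen-based default client driver) or an explicit driver name. A sketch of the selection, assuming the octal gen encoding used throughout this release (hypothetical helper, not the xf86 API):

    /* A boolean-looking value keeps the default; any other string,
     * e.g. Option "DRI" "i965", is taken as the driver name. */
    static const char *pick_dri_name(const char *opt, int opt_is_bool, int gen)
    {
            const char *def = gen < 040 ? "i915" : "i965"; /* 040 == gen4 */
            return (opt == NULL || opt_is_bool) ? def : opt;
    }
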
@@ -2318,8 +2427,7 @@ bool sna_dri_open(struct sna *sna, ScreenPtr screen)
sna->deviceName = drmGetDeviceNameFromFd(sna->kgem.fd);
memset(&info, '\0', sizeof(info));
info.fd = sna->kgem.fd;
- info.driverName =
- (sna->kgem.gen && sna->kgem.gen < 40) ? "i915" : "i965";
+ info.driverName = dri_driver_name(sna);
info.deviceName = sna->deviceName;
DBG(("%s: loading dri driver '%s' [gen=%d] for device '%s'\n",
diff --git a/src/sna/sna_driver.c b/src/sna/sna_driver.c
index 1b7e817f5..ffeaead58 100644
--- a/src/sna/sna_driver.c
+++ b/src/sna/sna_driver.c
@@ -62,7 +62,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
#include <sys/ioctl.h>
#include <sys/fcntl.h>
-#include <sys/poll.h>
#include "i915_drm.h"
#ifdef HAVE_VALGRIND
@@ -79,11 +78,6 @@ DevPrivateKeyRec sna_gc_key;
DevPrivateKeyRec sna_window_key;
DevPrivateKeyRec sna_glyph_key;
-static Bool sna_enter_vt(VT_FUNC_ARGS_DECL);
-
-/* temporary */
-extern void xf86SetCursor(ScreenPtr screen, CursorPtr pCurs, int x, int y);
-
static void
sna_load_palette(ScrnInfoPtr scrn, int numColors, int *indices,
LOCO * colors, VisualPtr pVisual)
@@ -150,6 +144,79 @@ sna_load_palette(ScrnInfoPtr scrn, int numColors, int *indices,
}
}
+static void
+sna_set_fallback_mode(ScrnInfoPtr scrn)
+{
+ xf86CrtcConfigPtr config = XF86_CRTC_CONFIG_PTR(scrn);
+ xf86OutputPtr output = NULL;
+ xf86CrtcPtr crtc = NULL;
+ int n;
+
+ if ((unsigned)config->compat_output < config->num_output) {
+ output = config->output[config->compat_output];
+ crtc = output->crtc;
+ }
+
+ for (n = 0; n < config->num_output; n++)
+ config->output[n]->crtc = NULL;
+ for (n = 0; n < config->num_crtc; n++)
+ config->crtc[n]->enabled = FALSE;
+
+ if (output && crtc) {
+ DisplayModePtr mode;
+
+ output->crtc = crtc;
+
+ mode = xf86OutputFindClosestMode(output, scrn->currentMode);
+ if (mode &&
+ xf86CrtcSetModeTransform(crtc, mode, RR_Rotate_0, NULL, 0, 0)) {
+ crtc->desiredMode = *mode;
+ crtc->desiredMode.prev = crtc->desiredMode.next = NULL;
+ crtc->desiredMode.name = NULL;
+ crtc->desiredMode.PrivSize = 0;
+ crtc->desiredMode.PrivFlags = 0;
+ crtc->desiredMode.Private = NULL;
+ crtc->desiredRotation = RR_Rotate_0;
+ crtc->desiredTransformPresent = FALSE;
+ crtc->desiredX = 0;
+ crtc->desiredY = 0;
+ crtc->enabled = TRUE;
+ }
+ }
+
+ xf86DisableUnusedFunctions(scrn);
+#ifdef RANDR_12_INTERFACE
+ if (root(scrn->pScreen))
+ xf86RandR12TellChanged(scrn->pScreen);
+#endif
+}
+
+static Bool sna_become_master(struct sna *sna)
+{
+ ScrnInfoPtr scrn = sna->scrn;
+
+ DBG(("%s\n", __FUNCTION__));
+
+ if (drmSetMaster(sna->kgem.fd)) {
+ sleep(2); /* XXX wait for the current master to decease */
+ if (drmSetMaster(sna->kgem.fd)) {
+ xf86DrvMsg(scrn->scrnIndex, X_ERROR,
+ "drmSetMaster failed: %s\n",
+ strerror(errno));
+ return FALSE;
+ }
+ }
+
+ if (!xf86SetDesiredModes(scrn)) {
+ xf86DrvMsg(scrn->scrnIndex, X_WARNING,
+ "failed to restore desired modes on VT switch\n");
+ sna_set_fallback_mode(scrn);
+ }
+
+ sna_mode_disable_unused(sna);
+ return TRUE;
+}
+
/**
* Adjust the screen pixmap for the current location of the front buffer.
* This is done at EnterVT when buffers are bound as long as the resources
@@ -158,7 +225,6 @@ sna_load_palette(ScrnInfoPtr scrn, int numColors, int *indices,
*/
static Bool sna_create_screen_resources(ScreenPtr screen)
{
- ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
struct sna *sna = to_sna_from_screen(screen);
DBG(("%s(%dx%d@%d)\n", __FUNCTION__,
@@ -197,7 +263,7 @@ static Bool sna_create_screen_resources(ScreenPtr screen)
sna_copy_fbcon(sna);
- if (!sna_enter_vt(VT_FUNC_ARGS(0))) {
+ if (!sna_become_master(sna)) {
xf86DrvMsg(screen->myNum, X_ERROR,
"[intel] Failed to become DRM master\n");
goto cleanup_front;
@@ -363,6 +429,12 @@ static void sna_setup_capabilities(ScrnInfoPtr scrn, int fd)
#endif
}
+static Bool sna_option_cast_to_bool(struct sna *sna, int id, Bool val)
+{
+ xf86getBoolValue(&val, xf86GetOptValString(sna->Options, id));
+ return val;
+}
+
/**
  * This is called before ScreenInit to do any required probing of screen
* configuration.
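
sna_option_cast_to_bool() above depends on xf86getBoolValue() leaving the passed-in default untouched when the string is not a recognised boolean, so Option "AccelMethod" "false" (or "off") disables acceleration while a non-boolean value such as "sna" keeps the default. A standalone sketch of that behaviour (hypothetical parser standing in for the xf86 helper):

    #include <stdbool.h>
    #include <stddef.h>
    #include <strings.h>

    static bool cast_to_bool(const char *s, bool def)
    {
            if (s == NULL)
                    return def;
            if (!strcasecmp(s, "true") || !strcasecmp(s, "on") || !strcasecmp(s, "1"))
                    return true;
            if (!strcasecmp(s, "false") || !strcasecmp(s, "off") || !strcasecmp(s, "0"))
                    return false;
            return def; /* e.g. "sna": not a boolean, keep the default */
    }
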
@@ -468,7 +540,8 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int flags)
intel_detect_chipset(scrn, sna->pEnt, sna->PciInfo);
kgem_init(&sna->kgem, fd, sna->PciInfo, sna->info->gen);
- if (xf86ReturnOptValBool(sna->Options, OPTION_ACCEL_DISABLE, FALSE)) {
+ if (xf86ReturnOptValBool(sna->Options, OPTION_ACCEL_DISABLE, FALSE) ||
+ !sna_option_cast_to_bool(sna, OPTION_ACCEL_METHOD, TRUE)) {
xf86DrvMsg(sna->scrn->scrnIndex, X_CONFIG,
"Disabling hardware acceleration.\n");
sna->kgem.wedged = true;
@@ -493,12 +566,10 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int flags)
sna->tiling &= ~SNA_TILING_FB;
sna->flags = 0;
- if (!xf86ReturnOptValBool(sna->Options, OPTION_THROTTLE, TRUE))
- sna->flags |= SNA_NO_THROTTLE;
- if (!xf86ReturnOptValBool(sna->Options, OPTION_DELAYED_FLUSH, TRUE))
- sna->flags |= SNA_NO_DELAYED_FLUSH;
if (!xf86ReturnOptValBool(sna->Options, OPTION_SWAPBUFFERS_WAIT, TRUE))
sna->flags |= SNA_NO_WAIT;
+ if (xf86ReturnOptValBool(sna->Options, OPTION_TRIPLE_BUFFER, TRUE))
+ sna->flags |= SNA_TRIPLE_BUFFER;
if (has_pageflipping(sna)) {
if (xf86ReturnOptValBool(sna->Options, OPTION_TEAR_FREE, FALSE))
sna->flags |= SNA_TEAR_FREE;
@@ -511,12 +582,6 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int flags)
sna->tiling & SNA_TILING_FB ? "tiled" : "linear");
xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "Pixmaps %s\n",
sna->tiling & SNA_TILING_2D ? "tiled" : "linear");
- xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "3D buffers %s\n",
- sna->tiling & SNA_TILING_3D ? "tiled" : "linear");
- xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "Throttling %sabled\n",
- sna->flags & SNA_NO_THROTTLE ? "dis" : "en");
- xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "Delayed flush %sabled\n",
- sna->flags & SNA_NO_DELAYED_FLUSH ? "dis" : "en");
xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "\"Tear free\" %sabled\n",
sna->flags & SNA_TEAR_FREE ? "en" : "dis");
xf86DrvMsg(scrn->scrnIndex, X_CONFIG, "Forcing per-crtc-pixmaps? %s\n",
@@ -543,7 +608,7 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int flags)
xf86SetDpi(scrn, 0, 0);
sna->dri_available = false;
- if (xf86ReturnOptValBool(sna->Options, OPTION_DRI, TRUE))
+ if (sna_option_cast_to_bool(sna, OPTION_DRI, TRUE))
sna->dri_available = !!xf86LoadSubModule(scrn, "dri2");
return TRUE;
@@ -552,9 +617,11 @@ static Bool sna_pre_init(ScrnInfoPtr scrn, int flags)
static void
sna_block_handler(BLOCKHANDLER_ARGS_DECL)
{
- SCREEN_PTR(arg);
- ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
- struct sna *sna = to_sna(scrn);
+#ifndef XF86_SCRN_INTERFACE
+ struct sna *sna = to_sna(xf86Screens[arg]);
+#else
+ struct sna *sna = to_sna_from_screen(arg);
+#endif
struct timeval **tv = timeout;
DBG(("%s (tv=%ld.%06ld)\n", __FUNCTION__,
@@ -569,9 +636,11 @@ sna_block_handler(BLOCKHANDLER_ARGS_DECL)
static void
sna_wakeup_handler(WAKEUPHANDLER_ARGS_DECL)
{
- SCREEN_PTR(arg);
- ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
- struct sna *sna = to_sna(scrn);
+#ifndef XF86_SCRN_INTERFACE
+ struct sna *sna = to_sna(xf86Screens[arg]);
+#else
+ struct sna *sna = to_sna_from_screen(arg);
+#endif
DBG(("%s\n", __FUNCTION__));
@@ -639,11 +708,14 @@ sna_uevent_init(ScrnInfoPtr scrn)
DBG(("%s\n", __FUNCTION__));
- if (!xf86GetOptValBool(sna->Options, OPTION_HOTPLUG, &hotplug)) {
- from = X_DEFAULT;
- hotplug = TRUE;
- }
+ /* RandR will be disabled if Xinerama is active, and so generating
+ * RR hotplug events is then verboten.
+ */
+ if (!dixPrivateKeyRegistered(rrPrivKey))
+ return;
+ if (!xf86GetOptValBool(sna->Options, OPTION_HOTPLUG, &hotplug))
+ from = X_DEFAULT, hotplug = TRUE;
xf86DrvMsg(scrn->scrnIndex, from, "hotplug detection: \"%s\"\n",
hotplug ? "enabled" : "disabled");
if (!hotplug)
@@ -654,16 +726,14 @@ sna_uevent_init(ScrnInfoPtr scrn)
return;
mon = udev_monitor_new_from_netlink(u, "udev");
-
if (!mon) {
udev_unref(u);
return;
}
if (udev_monitor_filter_add_match_subsystem_devtype(mon,
- "drm",
- "drm_minor") < 0 ||
- udev_monitor_enable_receiving(mon) < 0)
+ "drm", "drm_minor") < 0 ||
+ udev_monitor_enable_receiving(mon) < 0)
{
udev_monitor_unref(mon);
udev_unref(u);
@@ -681,23 +751,29 @@ sna_uevent_init(ScrnInfoPtr scrn)
}
sna->uevent_monitor = mon;
+
+ DBG(("%s: installed uvent handler\n", __FUNCTION__));
}
static void
sna_uevent_fini(ScrnInfoPtr scrn)
{
struct sna *sna = to_sna(scrn);
+ struct udev *u;
- if (sna->uevent_handler) {
- struct udev *u = udev_monitor_get_udev(sna->uevent_monitor);
+ if (sna->uevent_handler == NULL)
+ return;
- xf86RemoveGeneralHandler(sna->uevent_handler);
+ xf86RemoveGeneralHandler(sna->uevent_handler);
- udev_monitor_unref(sna->uevent_monitor);
- udev_unref(u);
- sna->uevent_handler = NULL;
- sna->uevent_monitor = NULL;
- }
+ u = udev_monitor_get_udev(sna->uevent_monitor);
+ udev_monitor_unref(sna->uevent_monitor);
+ udev_unref(u);
+
+ sna->uevent_handler = NULL;
+ sna->uevent_monitor = NULL;
+
+ DBG(("%s: removed uvent handler\n", __FUNCTION__));
}
#else
static void sna_uevent_fini(ScrnInfoPtr scrn) { }
@@ -717,18 +793,6 @@ static void sna_leave_vt(VT_FUNC_ARGS_DECL)
"drmDropMaster failed: %s\n", strerror(errno));
}
-/* In order to workaround a kernel bug in not honouring O_NONBLOCK,
- * check that the fd is readable before attempting to read the next
- * event from drm.
- */
-static Bool sna_mode_has_pending_events(struct sna *sna)
-{
- struct pollfd pfd;
- pfd.fd = sna->kgem.fd;
- pfd.events = POLLIN;
- return poll(&pfd, 1, 0) == 1;
-}
-
static Bool sna_early_close_screen(CLOSE_SCREEN_ARGS_DECL)
{
ScrnInfoPtr scrn = xf86ScreenToScrn(screen);
@@ -739,9 +803,7 @@ static Bool sna_early_close_screen(CLOSE_SCREEN_ARGS_DECL)
xf86_hide_cursors(scrn);
sna_uevent_fini(scrn);
- /* drain the event queues */
- if (sna_mode_has_pending_events(sna))
- sna_mode_wakeup(sna);
+ sna_mode_close(sna);
if (sna->dri_open) {
sna_dri_close(sna, screen);
@@ -793,6 +855,7 @@ static void sna_mode_set(ScrnInfoPtr scrn)
static Bool
sna_register_all_privates(void)
{
+#if HAS_DIXREGISTERPRIVATEKEY
if (!dixRegisterPrivateKey(&sna_pixmap_key, PRIVATE_PIXMAP,
3*sizeof(void *)))
return FALSE;
@@ -808,6 +871,19 @@ sna_register_all_privates(void)
if (!dixRegisterPrivateKey(&sna_window_key, PRIVATE_WINDOW,
2*sizeof(void *)))
return FALSE;
+#else
+ if (!dixRequestPrivate(&sna_pixmap_key, 3*sizeof(void *)))
+ return FALSE;
+
+ if (!dixRequestPrivate(&sna_gc_key, sizeof(FbGCPrivate)))
+ return FALSE;
+
+ if (!dixRequestPrivate(&sna_glyph_key, sizeof(struct sna_glyph)))
+ return FALSE;
+
+ if (!dixRequestPrivate(&sna_window_key, 2*sizeof(void *)))
+ return FALSE;
+#endif
return TRUE;
}
@@ -815,7 +891,7 @@ sna_register_all_privates(void)
static size_t
agp_aperture_size(struct pci_device *dev, int gen)
{
- return dev->regions[gen < 30 ? 0 : 2].size;
+ return dev->regions[gen < 030 ? 0 : 2].size;
}
static Bool
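
The octal literal above ('30' becomes '030') follows this release's generation encoding: the high octal digit is the major gen and the low digit the minor, so 030 is gen3, 040 gen4 and 075 gen7.5, and gen >> 3 recovers the major generation (as in the gen >> 3 test in sna_glyphs.c further down). A compilable illustration:

    #include <assert.h>

    int main(void)
    {
            assert(030 == (3 << 3));        /* gen3.0 */
            assert(040 >> 3 == 4);          /* major generation */
            assert(030 < 040 && 040 < 075); /* ordering still works */
            return 0;
    }
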
@@ -983,24 +1059,9 @@ static void sna_free_screen(FREE_SCREEN_ARGS_DECL)
static Bool sna_enter_vt(VT_FUNC_ARGS_DECL)
{
SCRN_INFO_PTR(arg);
- struct sna *sna = to_sna(scrn);
DBG(("%s\n", __FUNCTION__));
-
- if (drmSetMaster(sna->kgem.fd)) {
- xf86DrvMsg(scrn->scrnIndex, X_ERROR,
- "drmSetMaster failed: %s\n",
- strerror(errno));
- return FALSE;
- }
-
- if (!xf86SetDesiredModes(scrn))
- xf86DrvMsg(scrn->scrnIndex, X_WARNING,
- "failed to restore desired modes on VT switch\n");
-
- sna_mode_disable_unused(sna);
-
- return TRUE;
+ return sna_become_master(to_sna(scrn));
}
static Bool sna_switch_mode(SWITCH_MODE_ARGS_DECL)
@@ -1094,6 +1155,10 @@ Bool sna_init_scrn(ScrnInfoPtr scrn, int entity_num)
xf86DrvMsg(scrn->scrnIndex, X_INFO,
"SNA compiled with assertions enabled\n");
#endif
+#if DEBUG_SYNC
+ xf86DrvMsg(scrn->scrnIndex, X_INFO,
+ "SNA compiled with synchronous rendering\n");
+#endif
#if DEBUG_MEMORY
xf86DrvMsg(scrn->scrnIndex, X_INFO,
"SNA compiled with memory allocation reporting enabled\n");
@@ -1117,11 +1182,15 @@ Bool sna_init_scrn(ScrnInfoPtr scrn, int entity_num)
scrn->ValidMode = sna_valid_mode;
scrn->PMEvent = sna_pm_event;
+#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,9,99,901,0)
scrn->ModeSet = sna_mode_set;
+#endif
xf86SetEntitySharable(entity_num);
xf86SetEntityInstanceForScreen(scrn, entity_num,
xf86GetNumEntityInstances(entity_num)-1);
+ sna_threads_init();
+
return TRUE;
}
diff --git a/src/sna/sna_glyphs.c b/src/sna/sna_glyphs.c
index 9a6ad4b52..5fed8b419 100644
--- a/src/sna/sna_glyphs.c
+++ b/src/sna/sna_glyphs.c
@@ -84,6 +84,8 @@
#define N_STACK_GLYPHS 512
+#define glyph_valid(g) *((uint32_t *)&(g)->info.width)
+
#if HAS_DEBUG_FULL
static void _assert_pixmap_contains_box(PixmapPtr pixmap, BoxPtr box, const char *function)
{
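
The glyph_valid() macro above replaces the pair of 16-bit tests `width && height` with one aligned 32-bit load. This assumes width and height are adjacent 16-bit fields (as in xGlyphInfo) and that degenerate glyphs always have both dimensions zeroed together, which the NULL-picture paths below enforce. A self-contained sketch:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    struct info { int16_t width, height; };

    /* One 32-bit read tests both dimensions at once; memcpy is used
     * here instead of the driver's pointer cast to stay strict-aliasing
     * clean. */
    static uint32_t glyph_valid(const struct info *i)
    {
            uint32_t v;
            memcpy(&v, &i->width, sizeof(v));
            return v;
    }

    int main(void)
    {
            struct info empty = { 0, 0 }, glyph = { 8, 16 };
            assert(!glyph_valid(&empty));
            assert(glyph_valid(&glyph));
            return 0;
    }
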
@@ -108,7 +110,7 @@ extern DevPrivateKeyRec sna_glyph_key;
static inline struct sna_glyph *sna_glyph(GlyphPtr glyph)
{
- return dixGetPrivateAddr(&glyph->devPrivates, &sna_glyph_key);
+ return __get_private(glyph, sna_glyph_key);
}
#define NeedsComponent(f) (PICT_FORMAT_A(f) != 0 && PICT_FORMAT_RGB(f) != 0)
@@ -191,11 +193,17 @@ bool sna_glyphs_create(struct sna *sna)
if (sna->render.white_image == NULL)
goto bail;
- if (!can_render(sna))
+ if (!can_render(sna)) {
+ DBG(("%s: no render acceleration, no render glyph caches\n",
+ __FUNCTION__));
return true;
+ }
- if (xf86IsEntityShared(sna->scrn->entityList[0]))
+ if (xf86IsEntityShared(sna->scrn->entityList[0])) {
+ DBG(("%s: shared GlyphPictures, no render glyph caches\n",
+ __FUNCTION__));
return true;
+ }
for (i = 0; i < ARRAY_SIZE(formats); i++) {
struct sna_glyph_cache *cache = &sna->render.glyph[i];
@@ -215,9 +223,12 @@ bool sna_glyphs_create(struct sna *sna)
CACHE_PICTURE_SIZE,
CACHE_PICTURE_SIZE,
depth,
- SNA_CREATE_SCRATCH);
- if (!pixmap)
+ SNA_CREATE_GLYPHS);
+ if (!pixmap) {
+ DBG(("%s: failed to allocate pixmap for Glyph cache\n",
+ __FUNCTION__));
goto bail;
+ }
priv = sna_pixmap(pixmap);
if (priv != NULL) {
@@ -235,6 +246,7 @@ bool sna_glyphs_create(struct sna *sna)
goto bail;
ValidatePicture(picture);
+ assert(picture->pDrawable == &pixmap->drawable);
cache->count = cache->evict = 0;
cache->picture = picture;
@@ -297,7 +309,7 @@ glyph_extents(int nlist,
while (n--) {
GlyphPtr glyph = *glyphs++;
- if (glyph->info.width && glyph->info.height) {
+ if (glyph_valid(glyph)) {
int v;
v = x - glyph->info.x;
@@ -350,14 +362,20 @@ glyph_cache(ScreenPtr screen,
struct sna_render *render,
GlyphPtr glyph)
{
- PicturePtr glyph_picture = GetGlyphPicture(glyph, screen);
- struct sna_glyph_cache *cache = &render->glyph[PICT_FORMAT_RGB(glyph_picture->format) != 0];
+ PicturePtr glyph_picture;
+ struct sna_glyph_cache *cache;
struct sna_glyph *priv;
int size, mask, pos, s;
if (NO_GLYPH_CACHE)
return false;
+ glyph_picture = GetGlyphPicture(glyph, screen);
+ if (unlikely(glyph_picture == NULL)) {
+ glyph->info.width = glyph->info.height = 0;
+ return false;
+ }
+
if (glyph->info.width > GLYPH_MAX_SIZE ||
glyph->info.height > GLYPH_MAX_SIZE) {
PixmapPtr pixmap = (PixmapPtr)glyph_picture->pDrawable;
@@ -373,6 +391,7 @@ glyph_cache(ScreenPtr screen,
if (glyph->info.width <= size && glyph->info.height <= size)
break;
+ cache = &render->glyph[PICT_FORMAT_RGB(glyph_picture->format) != 0];
s = glyph_size_to_count(size);
mask = glyph_count_to_mask(s);
pos = (cache->count + s - 1) & mask;
@@ -528,7 +547,7 @@ glyphs_to_dst(struct sna *sna,
struct sna_glyph priv;
int i;
- if (glyph->info.width == 0 || glyph->info.height == 0)
+ if (!glyph_valid(glyph))
goto next_glyph;
priv = *sna_glyph(glyph);
@@ -540,6 +559,10 @@ glyphs_to_dst(struct sna *sna,
if (!glyph_cache(screen, &sna->render, glyph)) {
/* no cache for this glyph */
priv.atlas = GetGlyphPicture(glyph, screen);
+ if (unlikely(priv.atlas == NULL)) {
+ glyph->info.width = glyph->info.height = 0;
+ goto next_glyph;
+ }
priv.coordinate.x = priv.coordinate.y = 0;
} else
priv = *sna_glyph(glyph);
@@ -671,7 +694,7 @@ glyphs_slow(struct sna *sna,
BoxPtr rects;
int nrect;
- if (glyph->info.width == 0 || glyph->info.height == 0)
+ if (!glyph_valid(glyph))
goto next_glyph;
priv = *sna_glyph(glyph);
@@ -679,6 +702,10 @@ glyphs_slow(struct sna *sna,
if (!glyph_cache(screen, &sna->render, glyph)) {
/* no cache for this glyph */
priv.atlas = GetGlyphPicture(glyph, screen);
+ if (unlikely(priv.atlas == NULL)) {
+ glyph->info.width = glyph->info.height = 0;
+ goto next_glyph;
+ }
priv.coordinate.x = priv.coordinate.y = 0;
} else
priv = *sna_glyph(glyph);
@@ -780,7 +807,7 @@ __sna_glyph_get_image(GlyphPtr g, ScreenPtr s)
int dx, dy;
p = GetGlyphPicture(g, s);
- if (p == NULL)
+ if (unlikely(p == NULL))
return NULL;
image = image_from_pict(p, FALSE, &dx, &dy);
@@ -917,7 +944,7 @@ glyphs_via_mask(struct sna *sna,
GlyphPtr g = *glyphs++;
const void *ptr;
- if (g->info.width == 0 || g->info.height == 0)
+ if (!glyph_valid(g))
goto next_pglyph;
ptr = pixman_glyph_cache_lookup(cache, g, NULL);
@@ -968,7 +995,7 @@ next_pglyph:
pixman_image_t *glyph_image;
int16_t xi, yi;
- if (g->info.width == 0 || g->info.height == 0)
+ if (!glyph_valid(g))
goto next_image;
/* If the mask has been cropped, it is likely
@@ -984,6 +1011,8 @@ next_pglyph:
glyph_image =
sna_glyph_get_image(g, dst->pDrawable->pScreen);
+ if (glyph_image == NULL)
+ goto next_image;
DBG(("%s: glyph to mask (%d, %d)x(%d, %d)\n",
__FUNCTION__,
@@ -1058,7 +1087,7 @@ next_image:
PicturePtr this_atlas;
struct sna_composite_rectangles r;
- if (glyph->info.width == 0 || glyph->info.height == 0)
+ if (!glyph_valid(glyph))
goto next_glyph;
priv = sna_glyph(glyph);
@@ -1076,6 +1105,10 @@ next_image:
} else {
/* no cache for this glyph */
this_atlas = GetGlyphPicture(glyph, screen);
+ if (unlikely(this_atlas == NULL)) {
+ glyph->info.width = glyph->info.height = 0;
+ goto next_glyph;
+ }
r.src.x = r.src.y = 0;
}
}
@@ -1090,7 +1123,8 @@ next_image:
__FUNCTION__,
(int)this_atlas->format,
(int)(format->depth << 24 | format->format)));
- if (this_atlas->format == (format->depth << 24 | format->format)) {
+ if (this_atlas->format == (format->depth << 24 | format->format) &&
+ (sna->kgem.gen >> 3) != 4) { /* XXX cache corruption? how? */
ok = sna->render.composite(sna, PictOpAdd,
this_atlas, NULL, mask,
0, 0, 0, 0, 0, 0,
@@ -1194,7 +1228,7 @@ glyphs_format(int nlist, GlyphListPtr list, GlyphPtr * glyphs)
while (n--) {
GlyphPtr glyph = *glyphs++;
- if (glyph->info.width == 0 || glyph->info.height == 0) {
+ if (!glyph_valid(glyph)) {
x += glyph->info.xOff;
y += glyph->info.yOff;
continue;
@@ -1333,7 +1367,7 @@ glyphs_fallback(CARD8 op,
__FUNCTION__,
RegionExtents(&region)->x1, RegionExtents(&region)->y1,
RegionExtents(&region)->x2, RegionExtents(&region)->y2));
- if (!RegionNotEmpty(&region))
+ if (RegionNil(&region))
return;
if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &region,
@@ -1391,7 +1425,7 @@ glyphs_fallback(CARD8 op,
GlyphPtr g = *glyphs++;
const void *ptr;
- if (g->info.width == 0 || g->info.height == 0)
+ if (!glyph_valid(g))
goto next;
ptr = pixman_glyph_cache_lookup(cache, g, NULL);
@@ -1517,7 +1551,7 @@ out:
GlyphPtr g = *glyphs++;
pixman_image_t *glyph_image;
- if (g->info.width == 0 || g->info.height == 0)
+ if (!glyph_valid(g))
goto next_glyph;
glyph_image = sna_glyph_get_image(g, screen);
@@ -1654,7 +1688,7 @@ sna_glyphs(CARD8 op,
}
if ((too_small(priv) || DAMAGE_IS_ALL(priv->cpu_damage)) &&
- !picture_is_gpu(src)) {
+ !picture_is_gpu(sna, src)) {
DBG(("%s: fallback -- too small (%dx%d)\n",
__FUNCTION__, dst->pDrawable->width, dst->pDrawable->height));
goto fallback;
@@ -1810,7 +1844,7 @@ glyphs_via_image(struct sna *sna,
GlyphPtr g = *glyphs++;
const void *ptr;
- if (g->info.width == 0 || g->info.height == 0)
+ if (!glyph_valid(g))
goto next_pglyph;
ptr = pixman_glyph_cache_lookup(cache, g, NULL);
@@ -1861,7 +1895,7 @@ next_pglyph:
pixman_image_t *glyph_image;
int16_t xi, yi;
- if (g->info.width == 0 || g->info.height == 0)
+ if (!glyph_valid(g))
goto next_image;
/* If the mask has been cropped, it is likely
@@ -1877,6 +1911,8 @@ next_pglyph:
glyph_image =
sna_glyph_get_image(g, dst->pDrawable->pScreen);
+ if (glyph_image == NULL)
+ goto next_image;
DBG(("%s: glyph to mask (%d, %d)x(%d, %d)\n",
__FUNCTION__,
@@ -1976,7 +2012,7 @@ sna_glyphs__shared(CARD8 op,
}
if ((too_small(priv) || DAMAGE_IS_ALL(priv->cpu_damage)) &&
- !picture_is_gpu(src)) {
+ !picture_is_gpu(sna, src)) {
DBG(("%s: fallback -- too small (%dx%d)\n",
__FUNCTION__, dst->pDrawable->width, dst->pDrawable->height));
goto fallback;
diff --git a/src/sna/sna_gradient.c b/src/sna/sna_gradient.c
index 5f06fbc8d..db09e72db 100644
--- a/src/sna/sna_gradient.c
+++ b/src/sna/sna_gradient.c
@@ -219,11 +219,11 @@ sna_render_flush_solid(struct sna *sna)
DBG(("sna_render_flush_solid(size=%d)\n", cache->size));
assert(cache->dirty);
assert(cache->size);
+ assert(cache->size <= 1024);
kgem_bo_write(&sna->kgem, cache->cache_bo,
cache->color, cache->size*sizeof(uint32_t));
cache->dirty = 0;
- cache->last = 0;
}
static void
@@ -250,21 +250,24 @@ sna_render_finish_solid(struct sna *sna, bool force)
cache->bo[i] = NULL;
}
- old = cache->cache_bo;
-
DBG(("sna_render_finish_solid reset\n"));
-
+ old = cache->cache_bo;
cache->cache_bo = kgem_create_linear(&sna->kgem, sizeof(cache->color), 0);
if (cache->cache_bo == NULL) {
cache->cache_bo = old;
old = NULL;
}
- cache->bo[0] = kgem_create_proxy(&sna->kgem, cache->cache_bo,
- 0, sizeof(uint32_t));
- cache->bo[0]->pitch = 4;
if (force)
- cache->size = 1;
+ cache->size = 0;
+ if (cache->last < cache->size) {
+ cache->bo[cache->last] = kgem_create_proxy(&sna->kgem, cache->cache_bo,
+ cache->last*sizeof(uint32_t), sizeof(uint32_t));
+ if (cache->bo[cache->last])
+ cache->bo[cache->last]->pitch = 4;
+ else
+ cache->last = 1024;
+ }
if (old)
kgem_bo_destroy(&sna->kgem, old);
@@ -283,7 +286,38 @@ sna_render_get_solid(struct sna *sna, uint32_t color)
if (color == 0xffffffff) {
DBG(("%s(white)\n", __FUNCTION__));
- return kgem_bo_reference(cache->bo[0]);
+ return kgem_bo_reference(sna->render.alpha_cache.bo[255+7]);
+ }
+
+ if ((color >> 24) == 0xff) {
+ int v = 0;
+
+ if (((color >> 16) & 0xff) == 0)
+ v |= 0;
+ else if (((color >> 16) & 0xff) == 0xff)
+ v |= 1 << 2;
+ else
+ v = -1;
+
+ if (((color >> 8) & 0xff) == 0)
+ v |= 0;
+ else if (((color >> 8) & 0xff) == 0xff)
+ v |= 1 << 1;
+ else
+ v = -1;
+
+ if (((color >> 0) & 0xff) == 0)
+ v |= 0;
+ else if (((color >> 0) & 0xff) == 0xff)
+ v |= 1 << 0;
+ else
+ v = -1;
+
+ if (v >= 0) {
+ DBG(("%s(primary (%d,%d,%d): %d)\n",
+ __FUNCTION__, v & 4, v & 2, v & 1, v));
+ return kgem_bo_reference(sna->render.alpha_cache.bo[255+v]);
+ }
}
if (cache->color[cache->last] == color) {
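
The block above short-circuits fully saturated opaque colours to pre-baked slots in the alpha cache: each channel equal to 0xff sets one bit (R=4, G=2, B=1), and any in-between value falls through to the ordinary solid cache, so opaque red 0xffff0000 yields v == 4 and reuses alpha_cache.bo[255+4]. A compact sketch of the same classification:

    #include <stdint.h>

    /* Returns the 3-bit primary index, or -1 when the colour is not an
     * opaque combination of saturated channels. */
    static int primary_index(uint32_t argb)
    {
            int v = 0, shift;

            if ((argb >> 24) != 0xff)
                    return -1;
            for (shift = 16; shift >= 0; shift -= 8) {
                    uint8_t c = argb >> shift;
                    if (c == 0xff)
                            v |= 1 << (shift / 8);
                    else if (c != 0)
                            return -1;
            }
            return v;
    }
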
@@ -292,7 +326,7 @@ sna_render_get_solid(struct sna *sna, uint32_t color)
return kgem_bo_reference(cache->bo[cache->last]);
}
- for (i = 1; i < cache->size; i++) {
+ for (i = 0; i < cache->size; i++) {
if (cache->color[i] == color) {
if (cache->bo[i] == NULL) {
DBG(("sna_render_get_solid(%d) = %x (recreate)\n",
@@ -306,7 +340,7 @@ sna_render_get_solid(struct sna *sna, uint32_t color)
}
}
- sna_render_finish_solid(sna, i == ARRAY_SIZE(cache->color));
+ sna_render_finish_solid(sna, i == 1024);
i = cache->size++;
cache->color[i] = color;
@@ -326,7 +360,7 @@ done:
static bool sna_alpha_cache_init(struct sna *sna)
{
struct sna_alpha_cache *cache = &sna->render.alpha_cache;
- uint32_t color[256];
+ uint32_t color[256 + 7];
int i;
DBG(("%s\n", __FUNCTION__));
@@ -346,6 +380,28 @@ static bool sna_alpha_cache_init(struct sna *sna)
cache->bo[i]->pitch = 4;
}
+
+ /* primary */
+ for (i = 1; i < 8; i++) {
+ int j = 255+i;
+
+ color[j] = 0xff << 24;
+ if (i & 1)
+ color[j] |= 0xff << 0;
+ if (i & 2)
+ color[j] |= 0xff << 8;
+ if (i & 4)
+ color[j] |= 0xff << 16;
+ cache->bo[j] = kgem_create_proxy(&sna->kgem,
+ cache->cache_bo,
+ sizeof(uint32_t)*j,
+ sizeof(uint32_t));
+ if (cache->bo[j] == NULL)
+ return false;
+
+ cache->bo[j]->pitch = 4;
+ }
+
return kgem_bo_write(&sna->kgem, cache->cache_bo, color, sizeof(color));
}
@@ -356,24 +412,14 @@ static bool sna_solid_cache_init(struct sna *sna)
DBG(("%s\n", __FUNCTION__));
cache->cache_bo =
- kgem_create_linear(&sna->kgem, sizeof(cache->color), 0);
+ kgem_create_linear(&sna->kgem, 4096, 0);
if (!cache->cache_bo)
return false;
- /*
- * Initialise [0] with white since it is very common and filling the
- * zeroth slot simplifies some of the checks.
- */
- cache->color[0] = 0xffffffff;
- cache->bo[0] = kgem_create_proxy(&sna->kgem, cache->cache_bo,
- 0, sizeof(uint32_t));
- if (cache->bo[0] == NULL)
- return false;
-
- cache->bo[0]->pitch = 4;
- cache->dirty = 1;
- cache->size = 1;
- cache->last = 0;
+ cache->last = 1024;
+ cache->color[cache->last] = 0;
+ cache->dirty = 0;
+ cache->size = 0;
return true;
}
diff --git a/src/sna/sna_io.c b/src/sna/sna_io.c
index 69d920c7d..f89cd89ec 100644
--- a/src/sna/sna_io.c
+++ b/src/sna/sna_io.c
@@ -117,12 +117,18 @@ static void read_boxes_inplace(struct kgem *kgem,
static bool download_inplace(struct kgem *kgem, struct kgem_bo *bo)
{
+ if (unlikely(kgem->wedged))
+ return true;
+
if (!kgem_bo_can_map(kgem, bo))
return false;
if (FORCE_INPLACE)
return FORCE_INPLACE > 0;
+ if (kgem->can_blt_cpu && kgem->max_cpu_size)
+ return false;
+
return !__kgem_bo_is_busy(kgem, bo) || bo->tiling == I915_TILING_NONE;
}
@@ -364,7 +370,7 @@ fallback:
cmd = XY_SRC_COPY_BLT_CMD;
src_pitch = src_bo->pitch;
- if (kgem->gen >= 40 && src_bo->tiling) {
+ if (kgem->gen >= 040 && src_bo->tiling) {
cmd |= BLT_SRC_TILED;
src_pitch >>= 2;
}
@@ -378,11 +384,13 @@ fallback:
case 1: break;
}
- kgem_set_mode(kgem, KGEM_BLT);
- if (!kgem_check_reloc_and_exec(kgem, 2) ||
- !kgem_check_batch(kgem, 8) ||
+ kgem_set_mode(kgem, KGEM_BLT, dst_bo);
+ if (!kgem_check_batch(kgem, 8) ||
+ !kgem_check_reloc_and_exec(kgem, 2) ||
!kgem_check_many_bo_fenced(kgem, dst_bo, src_bo, NULL)) {
- _kgem_submit(kgem);
+ kgem_submit(kgem);
+ if (!kgem_check_many_bo_fenced(kgem, dst_bo, src_bo, NULL))
+ goto fallback;
_kgem_set_mode(kgem, KGEM_BLT);
}
@@ -483,7 +491,16 @@ fallback:
static bool upload_inplace__tiled(struct kgem *kgem, struct kgem_bo *bo)
{
- if (kgem->gen < 50) /* bit17 swizzling :( */
+#ifndef __x86_64__
+	/* Between a register-starved compiler emitting atrocious code
+ * and the extra overhead in the kernel for managing the tight
+ * 32-bit address space, unless we have a 64-bit system,
+ * using memcpy_to_tiled_x() is extremely slow.
+ */
+ return false;
+#endif
+
+ if (kgem->gen < 050) /* bit17 swizzling :( */
return false;
if (bo->tiling != I915_TILING_X)
@@ -579,19 +596,13 @@ static bool write_boxes_inplace(struct kgem *kgem,
return true;
}
-static bool upload_inplace(struct kgem *kgem,
- struct kgem_bo *bo,
- const BoxRec *box,
- int n, int bpp)
+static bool __upload_inplace(struct kgem *kgem,
+ struct kgem_bo *bo,
+ const BoxRec *box,
+ int n, int bpp)
{
unsigned int bytes;
- if (kgem->wedged)
- return true;
-
- if (!kgem_bo_can_map(kgem, bo) && !upload_inplace__tiled(kgem, bo))
- return false;
-
if (FORCE_INPLACE)
return FORCE_INPLACE > 0;
@@ -610,6 +621,20 @@ static bool upload_inplace(struct kgem *kgem,
return bytes * bpp >> 12;
}
+static bool upload_inplace(struct kgem *kgem,
+ struct kgem_bo *bo,
+ const BoxRec *box,
+ int n, int bpp)
+{
+ if (unlikely(kgem->wedged))
+ return true;
+
+ if (!kgem_bo_can_map(kgem, bo) && !upload_inplace__tiled(kgem, bo))
+ return false;
+
+	return __upload_inplace(kgem, bo, box, n, bpp);
+}
+
bool sna_write_boxes(struct sna *sna, PixmapPtr dst,
struct kgem_bo * const dst_bo, int16_t const dst_dx, int16_t const dst_dy,
const void * const src, int const stride, int16_t const src_dx, int16_t const src_dy,
@@ -672,14 +697,18 @@ fallback:
sna->render.max_3d_size, sna->render.max_3d_size));
if (must_tile(sna, tmp.drawable.width, tmp.drawable.height)) {
BoxRec tile, stack[64], *clipped, *c;
- int step;
+ int cpp, step;
tile:
- step = MIN(sna->render.max_3d_size - 4096 / dst->drawable.bitsPerPixel,
- 8*(MAXSHORT&~63) / dst->drawable.bitsPerPixel);
- while (step * step * 4 > sna->kgem.max_upload_tile_size)
+ cpp = dst->drawable.bitsPerPixel / 8;
+ step = MIN(sna->render.max_3d_size,
+ (MAXSHORT&~63) / cpp);
+ while (step * step * cpp > sna->kgem.max_upload_tile_size)
step /= 2;
+ if (step * cpp > 4096)
+ step = 4096 / cpp;
+
DBG(("%s: tiling upload, using %dx%d tiles\n",
__FUNCTION__, step, step));
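
A worked example of the tile-step computation above, under assumed limits (hypothetical values: max_3d_size 8192, max_upload_tile_size 1 MiB, a 32bpp destination):

    #include <stdio.h>

    #define MAXSHORT 32767

    int main(void)
    {
            int max_3d_size = 8192, max_upload_tile_size = 1 << 20;
            int cpp = 32 / 8, step;

            step = (MAXSHORT & ~63) / cpp;       /* 8176 */
            if (step > max_3d_size)
                    step = max_3d_size;
            while (step * step * cpp > max_upload_tile_size)
                    step /= 2;                   /* 8176 -> ... -> 511 */
            if (step * cpp > 4096)
                    step = 4096 / cpp;

            printf("%dx%d tiles\n", step, step); /* 511x511 */
            return 0;
    }
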
@@ -803,7 +832,7 @@ tile:
cmd = XY_SRC_COPY_BLT_CMD;
br13 = dst_bo->pitch;
- if (kgem->gen >= 40 && dst_bo->tiling) {
+ if (kgem->gen >= 040 && dst_bo->tiling) {
cmd |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -816,11 +845,13 @@ tile:
case 8: break;
}
- kgem_set_mode(kgem, KGEM_BLT);
+ kgem_set_mode(kgem, KGEM_BLT, dst_bo);
if (!kgem_check_batch(kgem, 8) ||
!kgem_check_reloc_and_exec(kgem, 2) ||
!kgem_check_bo_fenced(kgem, dst_bo)) {
- _kgem_submit(kgem);
+ kgem_submit(kgem);
+ if (!kgem_check_bo_fenced(kgem, dst_bo))
+ goto fallback;
_kgem_set_mode(kgem, KGEM_BLT);
}
@@ -960,6 +991,20 @@ write_boxes_inplace__xor(struct kgem *kgem,
} while (--n);
}
+static bool upload_inplace__xor(struct kgem *kgem,
+ struct kgem_bo *bo,
+ const BoxRec *box,
+ int n, int bpp)
+{
+ if (unlikely(kgem->wedged))
+ return true;
+
+ if (!kgem_bo_can_map(kgem, bo))
+ return false;
+
+ return __upload_inplace(kgem, bo, box, n, bpp);
+}
+
void sna_write_boxes__xor(struct sna *sna, PixmapPtr dst,
struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
const void *src, int stride, int16_t src_dx, int16_t src_dy,
@@ -976,7 +1021,7 @@ void sna_write_boxes__xor(struct sna *sna, PixmapPtr dst,
DBG(("%s x %d\n", __FUNCTION__, nbox));
- if (upload_inplace(kgem, dst_bo, box, nbox, dst->drawable.bitsPerPixel)) {
+ if (upload_inplace__xor(kgem, dst_bo, box, nbox, dst->drawable.bitsPerPixel)) {
fallback:
write_boxes_inplace__xor(kgem,
src, stride, dst->drawable.bitsPerPixel, src_dx, src_dy,
@@ -1158,7 +1203,7 @@ tile:
cmd = XY_SRC_COPY_BLT_CMD;
br13 = dst_bo->pitch;
- if (kgem->gen >= 40 && dst_bo->tiling) {
+ if (kgem->gen >= 040 && dst_bo->tiling) {
cmd |= BLT_DST_TILED;
br13 >>= 2;
}
@@ -1171,11 +1216,13 @@ tile:
case 8: break;
}
- kgem_set_mode(kgem, KGEM_BLT);
- if (!kgem_check_reloc_and_exec(kgem, 2) ||
- !kgem_check_batch(kgem, 8) ||
+ kgem_set_mode(kgem, KGEM_BLT, dst_bo);
+ if (!kgem_check_batch(kgem, 8) ||
+ !kgem_check_reloc_and_exec(kgem, 2) ||
!kgem_check_bo_fenced(kgem, dst_bo)) {
- _kgem_submit(kgem);
+ kgem_submit(kgem);
+ if (!kgem_check_bo_fenced(kgem, dst_bo))
+ goto fallback;
_kgem_set_mode(kgem, KGEM_BLT);
}
diff --git a/src/sna/sna_render.c b/src/sna/sna_render.c
index 58449228d..69ac21c3b 100644
--- a/src/sna/sna_render.c
+++ b/src/sna/sna_render.c
@@ -87,8 +87,8 @@ no_render_composite(struct sna *sna,
if (mask)
return false;
- if (!is_gpu(dst->pDrawable) &&
- (src->pDrawable == NULL || !is_gpu(src->pDrawable)))
+ if (!is_gpu(sna, dst->pDrawable, PREFER_GPU_BLT) &&
+ (src->pDrawable == NULL || !is_gpu(sna, src->pDrawable, PREFER_GPU_BLT)))
return false;
return sna_blt_composite(sna,
@@ -246,15 +246,14 @@ static void
no_render_context_switch(struct kgem *kgem,
int new_mode)
{
- if (!kgem->mode)
+ if (!kgem->nbatch)
return;
- if (kgem_is_idle(kgem)) {
+ if (kgem_ring_is_idle(kgem, kgem->ring)) {
DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
_kgem_submit(kgem);
}
- (void)kgem;
(void)new_mode;
}
@@ -280,7 +279,9 @@ void no_render_init(struct sna *sna)
{
struct sna_render *render = &sna->render;
- memset (render,0, sizeof (*render));
+ memset (render, 0, sizeof (*render));
+
+ render->prefer_gpu = PREFER_GPU_BLT;
render->vertices = render->vertex_data;
render->vertex_size = ARRAY_SIZE(render->vertex_data);
@@ -305,6 +306,8 @@ void no_render_init(struct sna *sna)
sna->kgem.expire = no_render_expire;
if (sna->kgem.has_blt)
sna->kgem.ring = KGEM_BLT;
+
+ sna_vertex_init(sna);
}
static struct kgem_bo *
@@ -321,6 +324,14 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box, bool blt)
return NULL;
}
+ if (priv->shm) {
+ DBG(("%s: shm CPU bo, avoiding promotion to GPU\n",
+ __FUNCTION__));
+ assert(!priv->flush);
+ sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+ return priv->cpu_bo;
+ }
+
if (priv->cpu_bo->snoop && priv->source_count > SOURCE_BIAS) {
DBG(("%s: promoting snooped CPU bo due to reuse\n",
__FUNCTION__));
@@ -339,6 +350,11 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box, bool blt)
__FUNCTION__));
break;
default:
+		if (kgem_bo_is_busy(priv->gpu_bo)) {
+ DBG(("%s: box is partially damaged on the CPU, and the GPU is busy\n",
+ __FUNCTION__));
+ return NULL;
+ }
if (sna_damage_contains_box(priv->gpu_damage,
box) != PIXMAN_REGION_OUT) {
DBG(("%s: box is damaged on the GPU\n",
@@ -384,11 +400,6 @@ use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box, bool blt)
}
}
- if (priv->shm) {
- assert(!priv->flush);
- sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
- }
-
DBG(("%s for box=(%d, %d), (%d, %d)\n",
__FUNCTION__, box->x1, box->y1, box->x2, box->y2));
++priv->source_count;
@@ -423,12 +434,21 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box, bool blt)
return priv->gpu_bo;
}
+ if (priv->cpu_damage == NULL) {
+ DBG(("%s: not migrating uninitialised pixmap\n",
+ __FUNCTION__));
+ return NULL;
+ }
+
if (pixmap->usage_hint) {
DBG(("%s: not migrating pixmap due to usage_hint=%d\n",
__FUNCTION__, pixmap->usage_hint));
return NULL;
}
+ if (priv->shm)
+ blt = true;
+
if (DBG_FORCE_UPLOAD < 0) {
if (!sna_pixmap_force_to_gpu(pixmap,
blt ? MOVE_READ : MOVE_SOURCE_HINT | MOVE_READ))
@@ -439,7 +459,9 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box, bool blt)
w = box->x2 - box->x1;
h = box->y2 - box->y1;
- if (w == pixmap->drawable.width && h == pixmap->drawable.height) {
+ if (priv->cpu_bo && !priv->cpu_bo->flush) {
+ migrate = true;
+ } else if (w == pixmap->drawable.width && h == pixmap->drawable.height) {
migrate = priv->source_count++ > SOURCE_BIAS;
DBG(("%s: migrating whole pixmap (%dx%d) for source (%d,%d),(%d,%d), count %d? %d\n",
@@ -464,9 +486,15 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box, bool blt)
migrate = count*w*h > pixmap->drawable.width * pixmap->drawable.height;
}
- if (migrate && !sna_pixmap_force_to_gpu(pixmap,
- blt ? MOVE_READ : MOVE_SOURCE_HINT | MOVE_READ))
- return NULL;
+ if (migrate) {
+ if (blt) {
+ if (!sna_pixmap_move_area_to_gpu(pixmap, box, MOVE_READ))
+ return NULL;
+ } else {
+ if (!sna_pixmap_force_to_gpu(pixmap, MOVE_SOURCE_HINT | MOVE_READ))
+ return NULL;
+ }
+ }
return priv->gpu_bo;
}
@@ -474,7 +502,7 @@ move_to_gpu(PixmapPtr pixmap, const BoxRec *box, bool blt)
static struct kgem_bo *upload(struct sna *sna,
struct sna_composite_channel *channel,
PixmapPtr pixmap,
- BoxPtr box)
+ const BoxRec *box)
{
struct sna_pixmap *priv;
struct kgem_bo *bo;
@@ -488,6 +516,9 @@ static struct kgem_bo *upload(struct sna *sna,
priv = sna_pixmap(pixmap);
if (priv) {
+ if (priv->cpu_damage == NULL)
+ return NULL;
+
/* As we know this box is on the CPU just fixup the shadow */
if (priv->mapped) {
pixmap->devPrivate.ptr = NULL;
@@ -497,7 +528,7 @@ static struct kgem_bo *upload(struct sna *sna,
if (priv->ptr == NULL) /* uninitialised */
return NULL;
assert(priv->stride);
- pixmap->devPrivate.ptr = priv->ptr;
+ pixmap->devPrivate.ptr = PTR(priv->ptr);
pixmap->devKind = priv->stride;
}
}
@@ -515,8 +546,11 @@ static struct kgem_bo *upload(struct sna *sna,
if (priv &&
pixmap->usage_hint == 0 &&
channel->width == pixmap->drawable.width &&
- channel->height == pixmap->drawable.height)
+ channel->height == pixmap->drawable.height) {
+ assert(priv->gpu_damage == NULL);
+ assert(priv->gpu_bo == NULL);
kgem_proxy_bo_attach(bo, &priv->gpu_bo);
+ }
}
return bo;
@@ -575,6 +609,10 @@ sna_render_pixmap_bo(struct sna *sna,
!priv->cpu_bo->snoop && priv->cpu_bo->pitch < 4096) {
DBG(("%s: CPU all damaged\n", __FUNCTION__));
channel->bo = priv->cpu_bo;
+ if (priv->shm) {
+ assert(!priv->flush);
+ sna_add_flush_pixmap(sna, priv, priv->cpu_bo);
+ }
goto done;
}
}
@@ -587,8 +625,8 @@ sna_render_pixmap_bo(struct sna *sna,
} else {
box.x1 = x;
box.y1 = y;
- box.x2 = x + w;
- box.y2 = y + h;
+ box.x2 = bound(x, w);
+ box.y2 = bound(y, h);
if (channel->repeat == RepeatNone || channel->repeat == RepeatPad) {
if (box.x1 < 0)
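
Several extent computations in this file now use bound() instead of a raw `x + w`. A sketch of the assumed helper, clamping the sum so that a large width cannot overflow the 16-bit box coordinate (an assumption from usage; the real definition lives elsewhere in the driver):

    #include <limits.h>
    #include <stdint.h>

    static int16_t bound(int16_t start, uint16_t length)
    {
            int v = (int)start + (int)length;
            return v > SHRT_MAX ? SHRT_MAX : (int16_t)v;
    }
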
@@ -661,8 +699,8 @@ static int sna_render_picture_downsample(struct sna *sna,
box.x1 = x;
box.y1 = y;
- box.x2 = x + w;
- box.y2 = y + h;
+ box.x2 = bound(x, w);
+ box.y2 = bound(y, h);
if (channel->transform) {
pixman_vector_t v;
@@ -843,8 +881,8 @@ sna_render_pixmap_partial(struct sna *sna,
box.x1 = x;
box.y1 = y;
- box.x2 = x + w;
- box.y2 = y + h;
+ box.x2 = bound(x, w);
+ box.y2 = bound(y, h);
DBG(("%s: unaligned box (%d, %d), (%d, %d)\n",
__FUNCTION__, box.x1, box.y1, box.x2, box.y2));
@@ -861,6 +899,9 @@ sna_render_pixmap_partial(struct sna *sna,
DBG(("%s: tile size for tiling %d: %dx%d, size=%d\n",
__FUNCTION__, bo->tiling, tile_width, tile_height, tile_size));
+ if (sna->kgem.gen < 033)
+ tile_width = bo->pitch;
+
/* Ensure we align to an even tile row */
box.y1 = box.y1 & ~(2*tile_height - 1);
box.y2 = ALIGN(box.y2, 2*tile_height);
@@ -934,8 +975,8 @@ sna_render_picture_partial(struct sna *sna,
box.x1 = x;
box.y1 = y;
- box.x2 = x + w;
- box.y2 = y + h;
+ box.x2 = bound(x, w);
+ box.y2 = bound(y, h);
if (channel->transform)
pixman_transform_bounds(channel->transform, &box);
@@ -1077,8 +1118,8 @@ sna_render_picture_extract(struct sna *sna,
ox = box.x1 = x;
oy = box.y1 = y;
- box.x2 = x + w;
- box.y2 = y + h;
+ box.x2 = bound(x, w);
+ box.y2 = bound(y, h);
if (channel->transform) {
pixman_vector_t v;
@@ -1147,8 +1188,11 @@ sna_render_picture_extract(struct sna *sna,
box.x2 - box.x1 == pixmap->drawable.width &&
box.y2 - box.y1 == pixmap->drawable.height) {
struct sna_pixmap *priv = sna_pixmap(pixmap);
- if (priv)
+ if (priv) {
+ assert(priv->gpu_damage == NULL);
+ assert(priv->gpu_bo == NULL);
kgem_proxy_bo_attach(bo, &priv->gpu_bo);
+ }
}
}
}
@@ -1334,6 +1378,7 @@ sna_render_picture_flatten(struct sna *sna,
assert(w <= sna->render.max_3d_size && h <= sna->render.max_3d_size);
/* XXX shortcut a8? */
+ DBG(("%s: %dx%d\n", __FUNCTION__, w, h));
pixmap = screen->CreatePixmap(screen, w, h, 32, SNA_CREATE_SCRATCH);
if (pixmap == NullPixmap)
@@ -1346,6 +1391,8 @@ sna_render_picture_flatten(struct sna *sna,
if (tmp == NULL)
return 0;
+ ValidatePicture(tmp);
+
old_format = picture->format;
picture->format = PICT_FORMAT(PICT_FORMAT_BPP(picture->format),
PICT_FORMAT_TYPE(picture->format),
@@ -1445,11 +1492,11 @@ sna_render_picture_approximate_gradient(struct sna *sna,
pixman_transform_multiply(&t, picture->transform, &t);
pixman_image_set_transform(src, &t);
- pixman_image_composite(PictOpSrc, src, NULL, dst,
- x + dx, y + dy,
- 0, 0,
- 0, 0,
- w2, h2);
+ sna_image_composite(PictOpSrc, src, NULL, dst,
+ x+dx, y+dy,
+ 0, 0,
+ 0, 0,
+ w2, h2);
free_pixman_pict(picture, src);
pixman_image_unref(dst);
@@ -1498,7 +1545,8 @@ sna_render_picture_fixup(struct sna *sna,
if (picture->alphaMap) {
DBG(("%s: alphamap\n", __FUNCTION__));
- if (is_gpu(picture->pDrawable) || is_gpu(picture->alphaMap->pDrawable)) {
+ if (is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER) ||
+ is_gpu(sna, picture->alphaMap->pDrawable, PREFER_GPU_RENDER)) {
return sna_render_picture_flatten(sna, picture, channel,
x, y, w, h, dst_x, dst_y);
}
@@ -1508,7 +1556,7 @@ sna_render_picture_fixup(struct sna *sna,
if (picture->filter == PictFilterConvolution) {
DBG(("%s: convolution\n", __FUNCTION__));
- if (is_gpu(picture->pDrawable)) {
+ if (is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER)) {
return sna_render_picture_convolve(sna, picture, channel,
x, y, w, h, dst_x, dst_y);
}
@@ -1541,8 +1589,10 @@ do_fixup:
}
 	/* Composite in the original format to preserve idiosyncrasies */
- if (picture->format == channel->pict_format)
- dst = pixman_image_create_bits(picture->format,
+ if (!kgem_buffer_is_inplace(channel->bo) &&
+ (picture->pDrawable == NULL ||
+ picture->format == channel->pict_format))
+ dst = pixman_image_create_bits(channel->pict_format,
w, h, ptr, channel->bo->pitch);
else
dst = pixman_image_create_bits(picture->format, w, h, NULL, 0);
@@ -1560,15 +1610,15 @@ do_fixup:
DBG(("%s: compositing tmp=(%d+%d, %d+%d)x(%d, %d)\n",
__FUNCTION__, x, dx, y, dy, w, h));
- pixman_image_composite(PictOpSrc, src, NULL, dst,
- x + dx, y + dy,
- 0, 0,
- 0, 0,
- w, h);
+ sna_image_composite(PictOpSrc, src, NULL, dst,
+ x + dx, y + dy,
+ 0, 0,
+ 0, 0,
+ w, h);
free_pixman_pict(picture, src);
/* Then convert to card format */
- if (picture->format != channel->pict_format) {
+ if (pixman_image_get_data(dst) != ptr) {
DBG(("%s: performing post-conversion %08x->%08x (%d, %d)\n",
__FUNCTION__,
picture->format, channel->pict_format,
@@ -1614,11 +1664,10 @@ sna_render_picture_convert(struct sna *sna,
PixmapPtr pixmap,
int16_t x, int16_t y,
int16_t w, int16_t h,
- int16_t dst_x, int16_t dst_y)
+ int16_t dst_x, int16_t dst_y,
+ bool fixup_alpha)
{
- pixman_image_t *src, *dst;
BoxRec box;
- void *ptr;
#if NO_CONVERT
return -1;
@@ -1627,8 +1676,8 @@ sna_render_picture_convert(struct sna *sna,
if (w != 0 && h != 0) {
box.x1 = x;
box.y1 = y;
- box.x2 = x + w;
- box.y2 = y + h;
+ box.x2 = bound(x, w);
+ box.y2 = bound(y, h);
if (channel->transform) {
DBG(("%s: has transform, converting whole surface\n",
@@ -1668,52 +1717,113 @@ sna_render_picture_convert(struct sna *sna,
return 0;
}
- if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
- return 0;
+ if (fixup_alpha && is_gpu(sna, &pixmap->drawable, PREFER_GPU_RENDER)) {
+ ScreenPtr screen = pixmap->drawable.pScreen;
+ PixmapPtr tmp;
+ PicturePtr src, dst;
+ int error;
- src = pixman_image_create_bits(picture->format,
- pixmap->drawable.width,
- pixmap->drawable.height,
- pixmap->devPrivate.ptr,
- pixmap->devKind);
- if (!src)
- return 0;
+ assert(PICT_FORMAT_BPP(picture->format) == pixmap->drawable.bitsPerPixel);
+ channel->pict_format = PICT_FORMAT(PICT_FORMAT_BPP(picture->format),
+ PICT_FORMAT_TYPE(picture->format),
+ PICT_FORMAT_BPP(picture->format) - PIXMAN_FORMAT_DEPTH(picture->format),
+ PICT_FORMAT_R(picture->format),
+ PICT_FORMAT_G(picture->format),
+ PICT_FORMAT_B(picture->format));
- if (PICT_FORMAT_RGB(picture->format) == 0) {
- channel->pict_format = PIXMAN_a8;
- DBG(("%s: converting to a8 from %08x\n",
- __FUNCTION__, picture->format));
+ DBG(("%s: converting to %08x from %08x using composite alpha-fixup\n",
+ __FUNCTION__, (unsigned)picture->format));
+
+ tmp = screen->CreatePixmap(screen, w, h, pixmap->drawable.bitsPerPixel, 0);
+ if (tmp == NULL)
+ return 0;
+
+ dst = CreatePicture(0, &tmp->drawable,
+ PictureMatchFormat(screen,
+ pixmap->drawable.bitsPerPixel,
+ channel->pict_format),
+ 0, NULL, serverClient, &error);
+ if (dst == NULL) {
+ screen->DestroyPixmap(tmp);
+ return 0;
+ }
+
+ src = CreatePicture(0, &pixmap->drawable,
+ PictureMatchFormat(screen,
+ pixmap->drawable.depth,
+ picture->format),
+ 0, NULL, serverClient, &error);
+ if (src == NULL) {
+ FreePicture(dst, 0);
+ screen->DestroyPixmap(tmp);
+ return 0;
+ }
+
+ ValidatePicture(src);
+ ValidatePicture(dst);
+
+ sna_composite(PictOpSrc, src, NULL, dst,
+ box.x1, box.y1,
+ 0, 0,
+ 0, 0,
+ w, h);
+ FreePicture(dst, 0);
+ FreePicture(src, 0);
+
+ channel->bo = sna_pixmap_get_bo(tmp);
+ kgem_bo_reference(channel->bo);
+ screen->DestroyPixmap(tmp);
} else {
- channel->pict_format = PIXMAN_a8r8g8b8;
- DBG(("%s: converting to a8r8g8b8 from %08x\n",
- __FUNCTION__, picture->format));
- }
+ pixman_image_t *src, *dst;
+ void *ptr;
- channel->bo = kgem_create_buffer_2d(&sna->kgem,
- w, h, PIXMAN_FORMAT_BPP(channel->pict_format),
- KGEM_BUFFER_WRITE_INPLACE,
- &ptr);
- if (!channel->bo) {
- pixman_image_unref(src);
- return 0;
- }
+ if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
+ return 0;
- dst = pixman_image_create_bits(channel->pict_format,
- w, h, ptr, channel->bo->pitch);
- if (!dst) {
- kgem_bo_destroy(&sna->kgem, channel->bo);
+ src = pixman_image_create_bits(picture->format,
+ pixmap->drawable.width,
+ pixmap->drawable.height,
+ pixmap->devPrivate.ptr,
+ pixmap->devKind);
+ if (!src)
+ return 0;
+
+ if (PICT_FORMAT_RGB(picture->format) == 0) {
+ channel->pict_format = PIXMAN_a8;
+ DBG(("%s: converting to a8 from %08x\n",
+ __FUNCTION__, picture->format));
+ } else {
+ channel->pict_format = PIXMAN_a8r8g8b8;
+ DBG(("%s: converting to a8r8g8b8 from %08x\n",
+ __FUNCTION__, picture->format));
+ }
+
+ channel->bo = kgem_create_buffer_2d(&sna->kgem,
+ w, h, PIXMAN_FORMAT_BPP(channel->pict_format),
+ KGEM_BUFFER_WRITE_INPLACE,
+ &ptr);
+ if (!channel->bo) {
+ pixman_image_unref(src);
+ return 0;
+ }
+
+ dst = pixman_image_create_bits(channel->pict_format,
+ w, h, ptr, channel->bo->pitch);
+ if (!dst) {
+ kgem_bo_destroy(&sna->kgem, channel->bo);
+ pixman_image_unref(src);
+ return 0;
+ }
+
+ pixman_image_composite(PictOpSrc, src, NULL, dst,
+ box.x1, box.y1,
+ 0, 0,
+ 0, 0,
+ w, h);
+ pixman_image_unref(dst);
pixman_image_unref(src);
- return 0;
}
- pixman_image_composite(PictOpSrc, src, NULL, dst,
- box.x1, box.y1,
- 0, 0,
- 0, 0,
- w, h);
- pixman_image_unref(dst);
- pixman_image_unref(src);
-
channel->width = w;
channel->height = h;
@@ -1722,11 +1832,10 @@ sna_render_picture_convert(struct sna *sna,
channel->offset[0] = x - dst_x - box.x1;
channel->offset[1] = y - dst_y - box.y1;
- DBG(("%s: offset=(%d, %d), size=(%d, %d) ptr[0]=%08x\n",
+ DBG(("%s: offset=(%d, %d), size=(%d, %d)\n",
__FUNCTION__,
channel->offset[0], channel->offset[1],
- channel->width, channel->height,
- *(uint32_t*)ptr));
+ channel->width, channel->height));
return 1;
}
@@ -1764,9 +1873,9 @@ sna_render_composite_redirect(struct sna *sna,
__FUNCTION__, op->dst.bo->pitch, sna->render.max_3d_pitch));
box.x1 = x;
- box.x2 = x + width;
+ box.x2 = bound(x, width);
box.y1 = y;
- box.y2 = y + height;
+ box.y2 = bound(y, height);
/* Ensure we align to an even tile row */
if (op->dst.bo->tiling) {
@@ -1783,7 +1892,7 @@ sna_render_composite_redirect(struct sna *sna,
offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
} else {
- if (sna->kgem.gen < 40) {
+ if (sna->kgem.gen < 040) {
box.y1 = box.y1 & ~3;
box.y2 = ALIGN(box.y2, 4);
@@ -1860,8 +1969,8 @@ sna_render_composite_redirect(struct sna *sna,
t->box.x1 = x + op->dst.x;
t->box.y1 = y + op->dst.y;
- t->box.x2 = t->box.x1 + width;
- t->box.y2 = t->box.y1 + height;
+ t->box.x2 = bound(t->box.x1, width);
+ t->box.y2 = bound(t->box.y1, height);
DBG(("%s: original box (%d, %d), (%d, %d)\n",
__FUNCTION__, t->box.x1, t->box.y1, t->box.x2, t->box.y2));
@@ -1911,11 +2020,13 @@ sna_render_composite_redirect_done(struct sna *sna,
assert(ok);
}
if (t->damage) {
- DBG(("%s: combining damage, offset=(%d, %d)\n",
- __FUNCTION__, t->box.x1, t->box.y1));
- sna_damage_combine(t->real_damage, t->damage,
+ DBG(("%s: combining damage (all? %d), offset=(%d, %d)\n",
+ __FUNCTION__, (int)DAMAGE_IS_ALL(t->damage),
+ t->box.x1, t->box.y1));
+ sna_damage_combine(t->real_damage,
+ DAMAGE_PTR(t->damage),
t->box.x1, t->box.y1);
- __sna_damage_destroy(t->damage);
+ __sna_damage_destroy(DAMAGE_PTR(t->damage));
}
kgem_bo_destroy(&sna->kgem, op->dst.bo);
diff --git a/src/sna/sna_render.h b/src/sna/sna_render.h
index 03a700571..01176c6aa 100644
--- a/src/sna/sna_render.h
+++ b/src/sna/sna_render.h
@@ -5,6 +5,11 @@
#include <picturestr.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <pthread.h>
+#include "atomic.h"
+
#define GRADIENT_CACHE_SIZE 16
#define GXinvalid 0xff
@@ -30,6 +35,8 @@ struct sna_composite_op {
const BoxRec *box);
void (*boxes)(struct sna *sna, const struct sna_composite_op *op,
const BoxRec *box, int nbox);
+ void (*thread_boxes)(struct sna *sna, const struct sna_composite_op *op,
+ const BoxRec *box, int nbox);
void (*done)(struct sna *sna, const struct sna_composite_op *op);
struct sna_damage **damage;
@@ -66,10 +73,10 @@ struct sna_composite_op {
union {
struct {
+ float dx, dy, offset;
+ } linear;
+ struct {
uint32_t pixel;
- float linear_dx;
- float linear_dy;
- float linear_offset;
} gen2;
struct gen3_shader_channel {
int type;
@@ -88,6 +95,9 @@ struct sna_composite_op {
fastcall void (*prim_emit)(struct sna *sna,
const struct sna_composite_op *op,
const struct sna_composite_rectangles *r);
+ fastcall void (*emit_boxes)(const struct sna_composite_op *op,
+ const BoxRec *box, int nbox,
+ float *v);
struct sna_composite_redirect {
struct kgem_bo *real_bo;
@@ -122,8 +132,8 @@ struct sna_composite_op {
} gen4;
struct {
- int wm_kernel;
- int ve_id;
+ int16_t wm_kernel;
+ int16_t ve_id;
} gen5;
struct {
@@ -138,6 +148,11 @@ struct sna_composite_op {
void *priv;
};
+struct sna_opacity_box {
+ BoxRec box;
+ float alpha;
+} __packed__;
+
struct sna_composite_spans_op {
struct sna_composite_op base;
@@ -149,6 +164,12 @@ struct sna_composite_spans_op {
const struct sna_composite_spans_op *op,
const BoxRec *box, int nbox,
float opacity);
+
+ fastcall void (*thread_boxes)(struct sna *sna,
+ const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *box,
+ int nbox);
+
fastcall void (*done)(struct sna *sna,
const struct sna_composite_spans_op *op);
@@ -156,6 +177,9 @@ struct sna_composite_spans_op {
const struct sna_composite_spans_op *op,
const BoxRec *box,
float opacity);
+ fastcall void (*emit_boxes)(const struct sna_composite_spans_op *op,
+ const struct sna_opacity_box *box, int nbox,
+ float *v);
};
struct sna_fill_op {
@@ -184,9 +208,18 @@ struct sna_copy_op {
};
struct sna_render {
+ pthread_mutex_t lock;
+ pthread_cond_t wait;
+ int active;
+
int max_3d_size;
int max_3d_pitch;
+ unsigned prefer_gpu;
+#define PREFER_GPU_BLT 0x1
+#define PREFER_GPU_RENDER 0x2
+#define PREFER_GPU_SPANS 0x4
+
bool (*composite)(struct sna *sna, uint8_t op,
PicturePtr dst, PicturePtr src, PicturePtr mask,
int16_t src_x, int16_t src_y,
@@ -214,6 +247,7 @@ struct sna_render {
RegionPtr dstRegion,
short src_w, short src_h,
short drw_w, short drw_h,
+ short dx, short dy,
PixmapPtr pixmap);
bool (*fill_boxes)(struct sna *sna,
@@ -237,6 +271,7 @@ struct sna_render {
PixmapPtr dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
const BoxRec *box, int n, unsigned flags);
#define COPY_LAST 0x1
+#define COPY_SYNC 0x2
bool (*copy)(struct sna *sna, uint8_t alu,
PixmapPtr src, struct kgem_bo *src_bo,
@@ -249,13 +284,13 @@ struct sna_render {
struct sna_alpha_cache {
struct kgem_bo *cache_bo;
- struct kgem_bo *bo[256];
+ struct kgem_bo *bo[256+7];
} alpha_cache;
struct sna_solid_cache {
struct kgem_bo *cache_bo;
- uint32_t color[1024];
struct kgem_bo *bo[1024];
+ uint32_t color[1025];
int last;
int size;
int dirty;
@@ -282,6 +317,8 @@ struct sna_render {
pixman_glyph_cache_t *glyph_cache;
#endif
+ uint16_t vb_id;
+ uint16_t vertex_offset;
uint16_t vertex_start;
uint16_t vertex_index;
uint16_t vertex_used;
@@ -302,7 +339,6 @@ struct gen2_render_state {
uint32_t ls1, ls2, vft;
uint32_t diffuse;
uint32_t specular;
- uint16_t vertex_offset;
};
struct gen3_render_state {
@@ -318,7 +354,6 @@ struct gen3_render_state {
uint32_t last_diffuse;
uint32_t last_specular;
- uint16_t vertex_offset;
uint16_t last_vertex_offset;
uint16_t floats_per_vertex;
uint16_t last_floats_per_vertex;
@@ -332,16 +367,14 @@ struct gen4_render_state {
struct kgem_bo *general_bo;
uint32_t vs;
- uint32_t sf[2];
+ uint32_t sf;
uint32_t wm;
uint32_t cc;
int ve_id;
uint32_t drawrect_offset;
uint32_t drawrect_limit;
- uint32_t vb_id;
uint32_t last_pipelined_pointers;
- uint16_t vertex_offset;
uint16_t last_primitive;
int16_t floats_per_vertex;
uint16_t surface_table;
@@ -361,8 +394,6 @@ struct gen5_render_state {
int ve_id;
uint32_t drawrect_offset;
uint32_t drawrect_limit;
- uint32_t vb_id;
- uint16_t vertex_offset;
uint16_t last_primitive;
int16_t floats_per_vertex;
uint16_t surface_table;
@@ -402,7 +433,6 @@ struct gen6_render_state {
uint32_t wm_state;
uint32_t wm_kernel[GEN6_KERNEL_COUNT][3];
- uint32_t cc_vp;
uint32_t cc_blend;
uint32_t drawrect_offset;
@@ -412,9 +442,7 @@ struct gen6_render_state {
uint32_t kernel;
uint16_t num_sf_outputs;
- uint16_t vb_id;
uint16_t ve_id;
- uint16_t vertex_offset;
uint16_t last_primitive;
int16_t floats_per_vertex;
uint16_t surface_table;
@@ -454,7 +482,6 @@ struct gen7_render_state {
uint32_t wm_state;
uint32_t wm_kernel[GEN7_WM_KERNEL_COUNT][3];
- uint32_t cc_vp;
uint32_t cc_blend;
uint32_t drawrect_offset;
@@ -464,9 +491,7 @@ struct gen7_render_state {
uint32_t kernel;
uint16_t num_sf_outputs;
- uint16_t vb_id;
uint16_t ve_id;
- uint16_t vertex_offset;
uint16_t last_primitive;
int16_t floats_per_vertex;
uint16_t surface_table;
@@ -690,7 +715,8 @@ sna_render_picture_convert(struct sna *sna,
PixmapPtr pixmap,
int16_t x, int16_t y,
int16_t w, int16_t h,
- int16_t dst_x, int16_t dst_y);
+ int16_t dst_x, int16_t dst_y,
+ bool fixup_alpha);
inline static void sna_render_composite_redirect_init(struct sna_composite_op *op)
{
@@ -717,4 +743,36 @@ sna_render_copy_boxes__overlap(struct sna *sna, uint8_t alu,
bool
sna_composite_mask_is_opaque(PicturePtr mask);
+void sna_vertex_init(struct sna *sna);
+
+static inline void sna_vertex_lock(struct sna_render *r)
+{
+ pthread_mutex_lock(&r->lock);
+}
+
+static inline void sna_vertex_acquire__locked(struct sna_render *r)
+{
+ r->active++;
+}
+
+static inline void sna_vertex_unlock(struct sna_render *r)
+{
+ pthread_mutex_unlock(&r->lock);
+}
+
+static inline void sna_vertex_release__locked(struct sna_render *r)
+{
+ assert(r->active > 0);
+ if (--r->active == 0)
+ pthread_cond_signal(&r->wait);
+}
+
+static inline bool sna_vertex_wait__locked(struct sna_render *r)
+{
+ bool was_active = r->active;
+ while (r->active)
+ pthread_cond_wait(&r->wait, &r->lock);
+ return was_active;
+}
+
#endif /* SNA_RENDER_H */
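
The new inline helpers above form a small counting handshake: render threads raise `active` under the lock while they emit vertices, and a flusher blocks until the count drains back to zero. A self-contained sketch of the same pattern (simplified; the driver keeps this state inside struct sna_render):

    #include <pthread.h>
    #include <stdbool.h>

    struct render_sync {
            pthread_mutex_t lock;
            pthread_cond_t wait;
            int active;
    };
    #define RENDER_SYNC_INIT \
            { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }

    static void worker_begin(struct render_sync *r)
    {
            pthread_mutex_lock(&r->lock);
            r->active++;
            pthread_mutex_unlock(&r->lock);
    }

    static void worker_end(struct render_sync *r)
    {
            pthread_mutex_lock(&r->lock);
            if (--r->active == 0)
                    pthread_cond_signal(&r->wait);
            pthread_mutex_unlock(&r->lock);
    }

    /* Returns true if anyone was still rendering when called. */
    static bool flush_wait(struct render_sync *r)
    {
            bool was_active;
            pthread_mutex_lock(&r->lock);
            was_active = r->active > 0;
            while (r->active)
                    pthread_cond_wait(&r->wait, &r->lock);
            pthread_mutex_unlock(&r->lock);
            return was_active;
    }
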
diff --git a/src/sna/sna_render_inline.h b/src/sna/sna_render_inline.h
index a796903fb..7d9f2cacf 100644
--- a/src/sna/sna_render_inline.h
+++ b/src/sna/sna_render_inline.h
@@ -17,6 +17,17 @@ static inline bool need_redirect(struct sna *sna, PixmapPtr dst)
dst->drawable.height > sna->render.max_3d_size);
}
+static inline float pack_2s(int16_t x, int16_t y)
+{
+ union {
+ struct sna_coordinate p;
+ float f;
+ } u;
+ u.p.x = x;
+ u.p.y = y;
+ return u.f;
+}
+
static inline int vertex_space(struct sna *sna)
{
return sna->render.vertex_size - sna->render.vertex_used;
@@ -28,21 +39,7 @@ static inline void vertex_emit(struct sna *sna, float v)
}
static inline void vertex_emit_2s(struct sna *sna, int16_t x, int16_t y)
{
- int16_t *v = (int16_t *)&sna->render.vertices[sna->render.vertex_used++];
- assert(sna->render.vertex_used <= sna->render.vertex_size);
- v[0] = x;
- v[1] = y;
-}
-
-static inline float pack_2s(int16_t x, int16_t y)
-{
- union {
- struct sna_coordinate p;
- float f;
- } u;
- u.p.x = x;
- u.p.y = y;
- return u.f;
+ vertex_emit(sna, pack_2s(x, y));
}
static inline int batch_space(struct sna *sna)
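
vertex_emit_2s() is now expressed through pack_2s() above, which stores two 16-bit coordinates into the bit pattern of one float so the vertex stream can carry short coordinates inside a float array; the value is never used arithmetically as a float. A sketch assuming the little-endian x-low/y-high layout of struct sna_coordinate, using memcpy in place of the union:

    #include <stdint.h>
    #include <string.h>

    static float pack_2s(int16_t x, int16_t y)
    {
            uint32_t bits = (uint16_t)x | ((uint32_t)(uint16_t)y << 16);
            float f;
            memcpy(&f, &bits, sizeof(f));
            return f;
    }
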
@@ -70,17 +67,18 @@ static inline void batch_emit_float(struct sna *sna, float f)
}
static inline bool
-is_gpu(DrawablePtr drawable)
+is_gpu(struct sna *sna, DrawablePtr drawable, unsigned prefer)
{
struct sna_pixmap *priv = sna_pixmap_from_drawable(drawable);
- if (priv == NULL || priv->clear)
+ if (priv == NULL || priv->clear || priv->cpu)
return false;
if (priv->cpu_damage == NULL)
return true;
- if (priv->gpu_damage && !priv->gpu_bo->proxy)
+ if (priv->gpu_damage && !priv->gpu_bo->proxy &&
+ (sna->render.prefer_gpu & prefer))
return true;
if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
@@ -111,11 +109,20 @@ unattached(DrawablePtr drawable)
}
static inline bool
-picture_is_gpu(PicturePtr picture)
+picture_is_gpu(struct sna *sna, PicturePtr picture)
{
if (!picture || !picture->pDrawable)
return false;
- return is_gpu(picture->pDrawable);
+ return is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER);
+}
+
+static inline bool
+picture_is_cpu(struct sna *sna, PicturePtr picture)
+{
+ if (!picture->pDrawable)
+ return false;
+
+ return !is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER);
}
static inline bool sna_blt_compare_depth(DrawablePtr src, DrawablePtr dst)
@@ -146,8 +153,8 @@ sna_render_picture_extents(PicturePtr p, BoxRec *box)
{
box->x1 = p->pDrawable->x;
box->y1 = p->pDrawable->y;
- box->x2 = p->pDrawable->x + p->pDrawable->width;
- box->y2 = p->pDrawable->y + p->pDrawable->height;
+ box->x2 = bound(box->x1, p->pDrawable->width);
+ box->y2 = bound(box->y1, p->pDrawable->height);
if (box->x1 < p->pCompositeClip->extents.x1)
box->x1 = p->pCompositeClip->extents.x1;
@@ -158,6 +165,8 @@ sna_render_picture_extents(PicturePtr p, BoxRec *box)
box->x2 = p->pCompositeClip->extents.x2;
if (box->y2 > p->pCompositeClip->extents.y2)
box->y2 = p->pCompositeClip->extents.y2;
+
+ assert(box->x2 > box->x1 && box->y2 > box->y1);
}
static inline void
@@ -218,4 +227,44 @@ color_convert(uint32_t pixel,
return pixel;
}
+inline static bool dst_use_gpu(PixmapPtr pixmap)
+{
+ struct sna_pixmap *priv = sna_pixmap(pixmap);
+ if (priv == NULL)
+ return false;
+
+ if (priv->cpu_bo && kgem_bo_is_busy(priv->cpu_bo))
+ return true;
+
+ if (priv->clear)
+ return false;
+
+ if (priv->gpu_bo && kgem_bo_is_busy(priv->gpu_bo))
+ return true;
+
+ return priv->gpu_damage && (!priv->cpu || !priv->cpu_damage);
+}
+
+inline static bool dst_use_cpu(PixmapPtr pixmap)
+{
+ struct sna_pixmap *priv = sna_pixmap(pixmap);
+ if (priv == NULL || priv->shm)
+ return true;
+
+ return priv->cpu_damage && priv->cpu;
+}
+
+inline static bool dst_is_cpu(PixmapPtr pixmap)
+{
+ struct sna_pixmap *priv = sna_pixmap(pixmap);
+ return priv == NULL || DAMAGE_IS_ALL(priv->cpu_damage);
+}
+
+inline static bool
+untransformed(PicturePtr p)
+{
+ return !p->transform || pixman_transform_is_int_translate(p->transform);
+}
+
+
#endif /* SNA_RENDER_INLINE_H */
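
[Note: taken together, this header now carries the placement policy in one place: is_gpu() consults the new per-generation preference mask (sna->render.prefer_gpu tested against bits such as PREFER_GPU_RENDER and PREFER_GPU_SPANS) on top of the damage state, while dst_use_gpu()/dst_use_cpu() report where the destination's valid copy currently lives. A hedged sketch of how a caller might combine them; choose_gpu_spans() is invented for illustration:

    static bool choose_gpu_spans(struct sna *sna, PicturePtr src, PicturePtr dst)
    {
        PixmapPtr pixmap = get_drawable_pixmap(dst->pDrawable);

        if (dst_use_cpu(pixmap))   /* destination is being worked on the CPU */
            return false;

        /* Take the GPU span path only if this generation prefers it,
         * or the source already resides on the GPU anyway. */
        return is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS) ||
               picture_is_gpu(sna, src);
    }
]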
diff --git a/src/sna/sna_threads.c b/src/sna/sna_threads.c
new file mode 100644
index 000000000..f77ddbfe8
--- /dev/null
+++ b/src/sna/sna_threads.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Chris Wilson <chris@chris-wilson.co.uk>
+ *
+ */
+
+#include "sna.h"
+
+#include <unistd.h>
+#include <pthread.h>
+#include <signal.h>
+
+static int max_threads = -1;
+
+static struct thread {
+ pthread_t thread;
+ pthread_mutex_t mutex;
+ pthread_cond_t cond;
+
+ void (*func)(void *arg);
+ void *arg;
+} *threads;
+
+static void *__run__(void *arg)
+{
+ struct thread *t = arg;
+ sigset_t signals;
+
+ /* Disable all signals in the slave threads as X uses them for IO */
+ sigfillset(&signals);
+ pthread_sigmask(SIG_BLOCK, &signals, NULL);
+
+ pthread_mutex_lock(&t->mutex);
+ while (1) {
+ while (t->func == NULL)
+ pthread_cond_wait(&t->cond, &t->mutex);
+ pthread_mutex_unlock(&t->mutex);
+
+ assert(t->func);
+ t->func(t->arg);
+
+ pthread_mutex_lock(&t->mutex);
+ t->func = NULL;
+ pthread_cond_signal(&t->cond);
+ }
+ pthread_mutex_unlock(&t->mutex);
+
+ return NULL;
+}
+
+#if defined(__GNUC__)
+#define popcount(x) __builtin_popcount(x)
+#else
+static int popcount(unsigned int x)
+{
+ int count = 0;
+
+ while (x) {
+ count += x&1;
+ x >>= 1;
+ }
+
+ return count;
+}
+#endif
+
+static int
+num_cores(void)
+{
+ FILE *file = fopen("/proc/cpuinfo", "r");
+ int count = 0;
+ if (file) {
+ size_t len = 0;
+ char *line = NULL;
+ uint32_t processors = 0, cores = 0;
+ while (getline(&line, &len, file) != -1) {
+ int id;
+ if (sscanf(line, "physical id : %d", &id) == 1) {
+ if (id >= 32)
+ return 0;
+ processors |= 1 << id;
+ } else if (sscanf(line, "core id : %d", &id) == 1) {
+ if (id >= 32)
+ return 0;
+ cores |= 1 << id;
+ }
+ }
+ free(line);
+ fclose(file);
+
+ DBG(("%s: processors=0x%08x, cores=0x%08x\n",
+ __FUNCTION__, processors, cores));
+
+ count = popcount(processors) * popcount(cores);
+ }
+ return count;
+}
+
+void sna_threads_init(void)
+{
+ int n;
+
+ if (max_threads != -1)
+ return;
+
+ max_threads = num_cores();
+ if (max_threads == 0)
+ max_threads = sysconf(_SC_NPROCESSORS_ONLN) / 2;
+ if (max_threads <= 1)
+ goto bail;
+
+ DBG(("%s: creating a thread pool of %d threads\n",
+ __func__, max_threads));
+
+ threads = malloc (sizeof(threads[0])*max_threads);
+ if (threads == NULL)
+ goto bail;
+
+ for (n = 0; n < max_threads; n++) {
+ pthread_mutex_init(&threads[n].mutex, NULL);
+ pthread_cond_init(&threads[n].cond, NULL);
+
+ threads[n].func = NULL;
+ if (pthread_create(&threads[n].thread, NULL,
+ __run__, &threads[n]))
+ goto bail;
+ }
+
+ return;
+
+bail:
+ max_threads = 0;
+}
+
+void sna_threads_run(void (*func)(void *arg), void *arg)
+{
+ int n;
+
+ assert(max_threads > 0);
+
+ for (n = 0; n < max_threads; n++) {
+ if (threads[n].func)
+ continue;
+
+ pthread_mutex_lock(&threads[n].mutex);
+ if (threads[n].func) {
+ pthread_mutex_unlock(&threads[n].mutex);
+ continue;
+ }
+
+ goto execute;
+ }
+
+ n = rand() % max_threads;
+ pthread_mutex_lock(&threads[n].mutex);
+ while (threads[n].func)
+ pthread_cond_wait(&threads[n].cond, &threads[n].mutex);
+
+execute:
+ threads[n].func = func;
+ threads[n].arg = arg;
+ pthread_cond_signal(&threads[n].cond);
+ pthread_mutex_unlock(&threads[n].mutex);
+}
+
+void sna_threads_wait(void)
+{
+ int n;
+
+ assert(max_threads > 0);
+
+ for (n = 0; n < max_threads; n++) {
+ if (threads[n].func == NULL)
+ continue;
+
+ pthread_mutex_lock(&threads[n].mutex);
+ while (threads[n].func)
+ pthread_cond_wait(&threads[n].cond, &threads[n].mutex);
+ pthread_mutex_unlock(&threads[n].mutex);
+ }
+}
+
+int sna_use_threads(int width, int height, int threshold)
+{
+ int num_threads;
+
+ if (max_threads <= 0)
+ return 1;
+
+ num_threads = height / (128/width + 1) / threshold-1;
+ if (num_threads <= 0)
+ return 1;
+
+ if (num_threads > max_threads)
+ num_threads = max_threads;
+ return num_threads;
+}
+
+struct thread_composite {
+ pixman_image_t *src, *mask, *dst;
+ pixman_op_t op;
+ int16_t src_x, src_y;
+ int16_t mask_x, mask_y;
+ int16_t dst_x, dst_y;
+ uint16_t width, height;
+};
+
+static void thread_composite(void *arg)
+{
+ struct thread_composite *t = arg;
+ pixman_image_composite(t->op, t->src, t->mask, t->dst,
+ t->src_x, t->src_y,
+ t->mask_x, t->mask_y,
+ t->dst_x, t->dst_y,
+ t->width, t->height);
+}
+
+void sna_image_composite(pixman_op_t op,
+ pixman_image_t *src,
+ pixman_image_t *mask,
+ pixman_image_t *dst,
+ int16_t src_x,
+ int16_t src_y,
+ int16_t mask_x,
+ int16_t mask_y,
+ int16_t dst_x,
+ int16_t dst_y,
+ uint16_t width,
+ uint16_t height)
+{
+ int num_threads;
+
+ num_threads = sna_use_threads(width, height, 16);
+ if (num_threads <= 1) {
+ pixman_image_composite(op, src, mask, dst,
+ src_x, src_y,
+ mask_x, mask_y,
+ dst_x, dst_y,
+ width, height);
+ } else {
+ struct thread_composite data[num_threads];
+ int y, dy, n;
+
+ DBG(("%s: using %d threads for compositing %dx%d\n",
+ __FUNCTION__, num_threads, width, height));
+
+ y = dst_y;
+ dy = (height + num_threads - 1) / num_threads;
+
+ data[0].op = op;
+ data[0].src = src;
+ data[0].mask = mask;
+ data[0].dst = dst;
+ data[0].src_x = src_x;
+ data[0].src_y = src_y;
+ data[0].mask_x = mask_x;
+ data[0].mask_y = mask_y;
+ data[0].dst_x = dst_x;
+ data[0].dst_y = y;
+ data[0].width = width;
+ data[0].height = dy;
+
+ for (n = 1; n < num_threads; n++) {
+ data[n] = data[0];
+ data[n].src_y += y - dst_y;
+ data[n].mask_y += y - dst_y;
+ data[n].dst_y = y;
+ y += dy;
+
+ sna_threads_run(thread_composite, &data[n]);
+ }
+
+ if (y + dy > dst_y + height)
+ dy = dst_y + height - y;
+
+ data[0].src_y += y - dst_y;
+ data[0].mask_y += y - dst_y;
+ data[0].dst_y = y;
+ data[0].height = dy;
+
+ thread_composite(&data[0]);
+
+ sna_threads_wait();
+ }
+}
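
[Note: the new file is a deliberately small pool: threads are spawned once at init and park on a per-slot condvar; sna_threads_run() hands a (func, arg) pair to an idle slot, or waits on a random busy one, and sna_threads_wait() blocks until every slot has drained. Every consumer added in this patch uses it the way sna_image_composite() does: carve the work into horizontal bands, push bands 1..n-1 to the pool, run the final band on the calling thread, then wait. A reduced sketch of that pattern; fill_band() is a stand-in worker, and sna_use_threads() keeps bands tall enough that the last one never underflows in practice:

    struct band { uint8_t *ptr; int stride, width, height; };

    static void fill_band(void *arg)   /* hypothetical worker */
    {
        struct band *b = arg;
        memset(b->ptr, 0xff, (size_t)b->stride * b->height);
    }

    static void fill_image(uint8_t *ptr, int stride, int width, int height)
    {
        int num = sna_use_threads(width, height, 16);

        if (num <= 1) {
            struct band whole = { ptr, stride, width, height };
            fill_band(&whole);
        } else {
            struct band bands[num];   /* VLA, as in sna_image_composite() */
            int n, y = 0, dy = (height + num - 1) / num;

            for (n = 0; n < num; n++) {
                bands[n].ptr = ptr + (size_t)y * stride;
                bands[n].stride = stride;
                bands[n].width = width;
                bands[n].height = n == num - 1 ? height - y : dy;
                if (n < num - 1)   /* the caller keeps the last band */
                    sna_threads_run(fill_band, &bands[n]);
                y += dy;
            }
            fill_band(&bands[num - 1]);
            sna_threads_wait();
        }
    }
]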
diff --git a/src/sna/sna_tiling.c b/src/sna/sna_tiling.c
index 9e70833c6..5bebf0020 100644
--- a/src/sna/sna_tiling.c
+++ b/src/sna/sna_tiling.c
@@ -624,7 +624,7 @@ sna_tiling_fill_boxes(struct sna *sna,
RegionNull(&this);
RegionIntersect(&this, &region, &tile);
- if (!RegionNotEmpty(&this))
+ if (RegionNil(&this))
continue;
tmp.drawable.width = this.extents.x2 - this.extents.x1;
@@ -737,7 +737,7 @@ bool sna_tiling_blt_copy_boxes(struct sna *sna, uint8_t alu,
RegionNull(&this);
RegionIntersect(&this, &region, &tile);
- if (!RegionNotEmpty(&this))
+ if (RegionNil(&this))
continue;
w = this.extents.x2 - this.extents.x1;
diff --git a/src/sna/sna_trapezoids.c b/src/sna/sna_trapezoids.c
index 482abd369..c547fb5aa 100644
--- a/src/sna/sna_trapezoids.c
+++ b/src/sna/sna_trapezoids.c
@@ -49,6 +49,7 @@
#define NO_ALIGNED_BOXES 0
#define NO_UNALIGNED_BOXES 0
#define NO_SCAN_CONVERTER 0
+#define NO_GPU_THREADS 0
/* TODO: Emit unantialiased and MSAA triangles. */
@@ -68,6 +69,9 @@
#define FAST_SAMPLES_Y (1<<FAST_SAMPLES_shift)
#define FAST_SAMPLES_mask ((1<<FAST_SAMPLES_shift)-1)
+#define region_count(r) ((r)->data ? (r)->data->numRects : 1)
+#define region_boxes(r) ((r)->data ? (BoxPtr)((r)->data + 1) : &(r)->extents)
+
typedef void (*span_func_t)(struct sna *sna,
struct sna_composite_spans_op *op,
pixman_region16_t *clip,
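
[Note: region_count()/region_boxes() are open-coded equivalents of REGION_NUM_RECTS()/REGION_RECTS(): a pixman region is either just its extents (data == NULL) or a RegionData header followed immediately by the box array. A small sketch of the iteration idiom the threaded span code uses with them:

    static void for_each_clip_box(const RegionRec *clip,
                                  void (*fn)(void *closure, const BoxRec *box),
                                  void *closure)
    {
        int n = region_count(clip);
        const BoxRec *box = region_boxes(clip);

        while (n--)
            fn(closure, box++);
    }
]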
@@ -328,10 +332,10 @@ floored_divrem(int a, int b)
/* Compute the floored division (x*a)/b. Assumes / and % perform symmetric
* division. */
static struct quorem
-floored_muldivrem(int x, int a, int b)
+floored_muldivrem(int32_t x, int32_t a, int32_t b)
{
struct quorem qr;
- long long xa = (long long)x*a;
+ int64_t xa = (int64_t)x*a;
qr.quo = xa/b;
qr.rem = xa%b;
if (qr.rem && (xa>=0) != (b>=0)) {
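
[Note: the widened floored_muldivrem() is worth a worked example, since the trailing fix-up is easy to misread: C's / truncates toward zero, so whenever the remainder is non-zero and x*a and b have opposite signs, the quotient must be stepped down by one (and the remainder pushed back up by b) to obtain floor semantics. An illustrative out-parameter restatement with the arithmetic traced in comments:

    /* floor((x*a)/b) from truncating division.  For x=-3, a=5, b=4:
     * xa = -15, -15/4 = -3 (truncated), -15%4 = -3; signs differ and
     * rem != 0, so quo -> -4 and rem -> 1, giving xa == quo*b + rem
     * with 0 <= rem < b, i.e. floor(-3.75) == -4. */
    static void floored_muldivrem2(int32_t x, int32_t a, int32_t b,
                                   int32_t *quo, int32_t *rem)
    {
        int64_t xa = (int64_t)x * a;

        *quo = xa / b;
        *rem = xa % b;
        if (*rem && (xa >= 0) != (b >= 0)) {
            *quo -= 1;
            *rem += b;
        }
    }
]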
@@ -471,7 +475,7 @@ cell_list_reset(struct cell_list *cells)
pool_reset(cells->cell_pool.base);
}
-static struct cell *
+inline static struct cell *
cell_list_alloc(struct cell_list *cells,
struct cell *tail,
int x)
@@ -532,6 +536,9 @@ cell_list_add_subspan(struct cell_list *cells,
int ix1, fx1;
int ix2, fx2;
+ if (x1 == x2)
+ return;
+
FAST_SAMPLES_X_TO_INT_FRAC(x1, ix1, fx1);
FAST_SAMPLES_X_TO_INT_FRAC(x2, ix2, fx2);
@@ -671,6 +678,8 @@ polygon_add_edge(struct polygon *polygon,
ybot = bottom <= ymax ? bottom : ymax;
e->ytop = ytop;
e->height_left = ybot - ytop;
+ if (e->height_left <= 0)
+ return;
if (dx == 0) {
e->x.quo = x1;
@@ -733,6 +742,8 @@ polygon_add_line(struct polygon *polygon,
e->ytop = top;
e->height_left = bot - top;
+ if (e->height_left <= 0)
+ return;
if (dx == 0) {
e->x.quo = p1->x;
@@ -797,6 +808,9 @@ merge_sorted_edges(struct edge *head_a, struct edge *head_b)
struct edge *head, **next, *prev;
int32_t x;
+ if (head_b == NULL)
+ return head_a;
+
prev = head_a->prev;
next = &head;
if (head_a->x.quo <= head_b->x.quo) {
@@ -869,11 +883,39 @@ sort_edges(struct edge *list,
return remaining;
}
+static struct edge *filter(struct edge *edges)
+{
+ struct edge *e;
+
+ e = edges;
+ do {
+ struct edge *n = e->next;
+ if (e->dir == -n->dir &&
+ e->height_left == n->height_left &&
+ *(uint64_t *)&e->x == *(uint64_t *)&n->x &&
+ *(uint64_t *)&e->dxdy == *(uint64_t *)&n->dxdy) {
+ if (e->prev)
+ e->prev->next = n->next;
+ else
+ edges = n->next;
+ if (n->next)
+ n->next->prev = e->prev;
+ else
+ break;
+
+ e = n->next;
+ } else
+ e = e->next;
+ } while (e->next);
+
+ return edges;
+}
+
static struct edge *
merge_unsorted_edges (struct edge *head, struct edge *unsorted)
{
sort_edges (unsorted, UINT_MAX, &unsorted);
- return merge_sorted_edges (head, unsorted);
+ return merge_sorted_edges (head, filter(unsorted));
}
/* Test if the edges on the active list can be safely advanced by a
@@ -881,18 +923,18 @@ merge_unsorted_edges (struct edge *head, struct edge *unsorted)
inline static bool
can_full_step(struct active_list *active)
{
- const struct edge *e;
-
/* Recomputes the minimum height of all edges on the active
* list if we have been dropping edges. */
if (active->min_height <= 0) {
+ const struct edge *e;
int min_height = INT_MAX;
int is_vertical = 1;
for (e = active->head.next; &active->tail != e; e = e->next) {
if (e->height_left < min_height)
min_height = e->height_left;
- is_vertical &= e->dy == 0;
+ if (is_vertical)
+ is_vertical = e->dy == 0;
}
active->is_vertical = is_vertical;
@@ -929,7 +971,8 @@ fill_buckets(struct active_list *active,
*b = edge;
if (edge->height_left < min_height)
min_height = edge->height_left;
- is_vertical &= edge->dy == 0;
+ if (is_vertical)
+ is_vertical = edge->dy == 0;
edge = next;
}
@@ -1836,7 +1879,7 @@ static void
mono_add_line(struct mono *mono,
int dst_x, int dst_y,
xFixed top, xFixed bottom,
- xPointFixed *p1, xPointFixed *p2,
+ const xPointFixed *p1, const xPointFixed *p2,
int dir)
{
struct mono_polygon *polygon = &mono->polygon;
@@ -1853,7 +1896,7 @@ mono_add_line(struct mono *mono,
dir));
if (top > bottom) {
- xPointFixed *t;
+ const xPointFixed *t;
y = top;
top = bottom;
@@ -1917,6 +1960,9 @@ mono_merge_sorted_edges(struct mono_edge *head_a, struct mono_edge *head_b)
struct mono_edge *head, **next, *prev;
int32_t x;
+ if (head_b == NULL)
+ return head_a;
+
prev = head_a->prev;
next = &head;
if (head_a->x.quo <= head_b->x.quo) {
@@ -1990,11 +2036,39 @@ mono_sort_edges(struct mono_edge *list,
return remaining;
}
+static struct mono_edge *mono_filter(struct mono_edge *edges)
+{
+ struct mono_edge *e;
+
+ e = edges;
+ do {
+ struct mono_edge *n = e->next;
+ if (e->dir == -n->dir &&
+ e->height_left == n->height_left &&
+ *(uint64_t *)&e->x == *(uint64_t *)&n->x &&
+ *(uint64_t *)&e->dxdy == *(uint64_t *)&n->dxdy) {
+ if (e->prev)
+ e->prev->next = n->next;
+ else
+ edges = n->next;
+ if (n->next)
+ n->next->prev = e->prev;
+ else
+ break;
+
+ e = n->next;
+ } else
+ e = e->next;
+ } while (e->next);
+
+ return edges;
+}
+
static struct mono_edge *
mono_merge_unsorted_edges(struct mono_edge *head, struct mono_edge *unsorted)
{
mono_sort_edges(unsorted, UINT_MAX, &unsorted);
- return mono_merge_sorted_edges(head, unsorted);
+ return mono_merge_sorted_edges(head, mono_filter(unsorted));
}
#if 0
@@ -2079,6 +2153,60 @@ mono_span__fast(struct mono *c, int x1, int x2, BoxPtr box)
c->op.box(c->sna, &c->op, box);
}
+struct mono_span_thread_boxes {
+ const struct sna_composite_op *op;
+#define MONO_SPAN_MAX_BOXES (8192/sizeof(BoxRec))
+ BoxRec boxes[MONO_SPAN_MAX_BOXES];
+ int num_boxes;
+};
+
+inline static void
+thread_mono_span_add_boxes(struct mono *c, const BoxRec *box, int count)
+{
+ struct mono_span_thread_boxes *b = c->op.priv;
+
+ assert(count > 0 && count <= MONO_SPAN_MAX_BOXES);
+ if (b->num_boxes + count > MONO_SPAN_MAX_BOXES) {
+ b->op->thread_boxes(c->sna, b->op, b->boxes, b->num_boxes);
+ b->num_boxes = 0;
+ }
+
+ memcpy(b->boxes + b->num_boxes, box, count*sizeof(BoxRec));
+ b->num_boxes += count;
+ assert(b->num_boxes <= MONO_SPAN_MAX_BOXES);
+}
+
+fastcall static void
+thread_mono_span_clipped(struct mono *c, int x1, int x2, BoxPtr box)
+{
+ pixman_region16_t region;
+
+ __DBG(("%s [%d, %d]\n", __FUNCTION__, x1, x2));
+
+ box->x1 = x1;
+ box->x2 = x2;
+
+ assert(c->clip.data);
+
+ pixman_region_init_rects(&region, box, 1);
+ RegionIntersect(&region, &region, &c->clip);
+ if (REGION_NUM_RECTS(&region))
+ thread_mono_span_add_boxes(c,
+ REGION_RECTS(&region),
+ REGION_NUM_RECTS(&region));
+ pixman_region_fini(&region);
+}
+
+fastcall static void
+thread_mono_span(struct mono *c, int x1, int x2, BoxPtr box)
+{
+ __DBG(("%s [%d, %d]\n", __FUNCTION__, x1, x2));
+
+ box->x1 = x1;
+ box->x2 = x2;
+ thread_mono_span_add_boxes(c, box, 1);
+}
+
inline static void
mono_row(struct mono *c, int16_t y, int16_t h)
{
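
[Note: struct mono_span_thread_boxes above is the worker-side batching buffer: spans accumulate into a fixed 8KiB on-stack array (MONO_SPAN_MAX_BOXES boxes) and are flushed through op->thread_boxes(), the only point of contact with shared GPU state, only when the next addition would overflow. The same flush-on-overflow shape, reduced to its essentials; flush() stands in for op->thread_boxes():

    #define MAX_BOXES (8192/sizeof(BoxRec))

    struct batch {
        BoxRec boxes[MAX_BOXES];
        int num;
    };

    static void batch_add(struct batch *b, const BoxRec *box, int count,
                          void (*flush)(const BoxRec *, int))
    {
        assert(count > 0 && count <= MAX_BOXES);
        if (b->num + count > MAX_BOXES) {   /* would overflow: drain first */
            flush(b->boxes, b->num);
            b->num = 0;
        }
        memcpy(b->boxes + b->num, box, count * sizeof(BoxRec));
        b->num += count;
    }
]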
@@ -2196,10 +2324,7 @@ mono_render(struct mono *mono)
struct mono_polygon *polygon = &mono->polygon;
int i, j, h = mono->clip.extents.y2 - mono->clip.extents.y1;
- if (mono->clip.data == NULL && mono->op.damage == NULL)
- mono->span = mono_span__fast;
- else
- mono->span = mono_span;
+ assert(mono->span);
for (i = 0; i < h; i = j) {
j = i + 1;
@@ -2332,7 +2457,8 @@ is_mono(PicturePtr dst, PictFormatPtr mask)
}
static bool
-trapezoids_inplace_fallback(CARD8 op,
+trapezoids_inplace_fallback(struct sna *sna,
+ CARD8 op,
PicturePtr src, PicturePtr dst, PictFormatPtr mask,
int ntrap, xTrapezoid *traps)
{
@@ -2372,7 +2498,7 @@ trapezoids_inplace_fallback(CARD8 op,
return false;
}
- if (is_gpu(dst->pDrawable)) {
+ if (is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
DBG(("%s: not performing inplace as dst is already on the GPU\n",
__FUNCTION__));
return false;
@@ -2398,8 +2524,66 @@ trapezoids_inplace_fallback(CARD8 op,
return true;
}
+struct rasterize_traps_thread {
+ xTrapezoid *traps;
+ char *ptr;
+ int stride;
+ BoxRec bounds;
+ pixman_format_code_t format;
+ int ntrap;
+};
+
+static void rasterize_traps_thread(void *arg)
+{
+ struct rasterize_traps_thread *thread = arg;
+ pixman_image_t *image;
+ int width, height, n;
+
+ width = thread->bounds.x2 - thread->bounds.x1;
+ height = thread->bounds.y2 - thread->bounds.y1;
+
+ memset(thread->ptr, 0, thread->stride*height);
+ if (PIXMAN_FORMAT_DEPTH(thread->format) < 8)
+ image = pixman_image_create_bits(thread->format,
+ width, height,
+ NULL, 0);
+ else
+ image = pixman_image_create_bits(thread->format,
+ width, height,
+ (uint32_t *)thread->ptr,
+ thread->stride);
+ if (image == NULL)
+ return;
+
+ for (n = 0; n < thread->ntrap; n++)
+ pixman_rasterize_trapezoid(image,
+ (pixman_trapezoid_t *)&thread->traps[n],
+ -thread->bounds.x1, -thread->bounds.y1);
+
+ if (PIXMAN_FORMAT_DEPTH(thread->format) < 8) {
+ pixman_image_t *a8;
+
+ a8 = pixman_image_create_bits(PIXMAN_a8,
+ width, height,
+ (uint32_t *)thread->ptr,
+ thread->stride);
+ if (a8) {
+ pixman_image_composite(PIXMAN_OP_SRC,
+ image, NULL, a8,
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ width, height);
+ pixman_image_unref(a8);
+ }
+ }
+
+ pixman_image_unref(image);
+}
+
static void
-trapezoids_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
+trapezoids_fallback(struct sna *sna,
+ CARD8 op, PicturePtr src, PicturePtr dst,
PictFormatPtr maskFormat, INT16 xSrc, INT16 ySrc,
int ntrap, xTrapezoid * traps)
{
@@ -2441,6 +2625,8 @@ trapezoids_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
height = bounds.y2 - bounds.y1;
bounds.x1 -= dst->pDrawable->x;
bounds.y1 -= dst->pDrawable->y;
+ bounds.x2 -= dst->pDrawable->x;
+ bounds.y2 -= dst->pDrawable->y;
depth = maskFormat->depth;
if (depth == 1) {
format = PIXMAN_a1;
@@ -2452,51 +2638,90 @@ trapezoids_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
DBG(("%s: mask (%dx%d) depth=%d, format=%08x\n",
__FUNCTION__, width, height, depth, format));
- if (is_gpu(dst->pDrawable) || picture_is_gpu(src)) {
+ if (is_gpu(sna, dst->pDrawable, PREFER_GPU_RENDER) ||
+ picture_is_gpu(sna, src)) {
+ int num_threads;
+
scratch = sna_pixmap_create_upload(screen,
width, height, 8,
KGEM_BUFFER_WRITE);
if (!scratch)
return;
- if (depth < 8) {
- image = pixman_image_create_bits(format, width, height,
- NULL, 0);
- } else {
- memset(scratch->devPrivate.ptr, 0, scratch->devKind*height);
- image = pixman_image_create_bits(format, width, height,
- scratch->devPrivate.ptr,
- scratch->devKind);
- }
- if (image) {
- for (; ntrap; ntrap--, traps++)
- pixman_rasterize_trapezoid(image,
- (pixman_trapezoid_t *)traps,
- -bounds.x1, -bounds.y1);
+ num_threads = sna_use_threads(width, height, 4);
+ if (num_threads == 1) {
if (depth < 8) {
- pixman_image_t *a8;
-
- a8 = pixman_image_create_bits(PIXMAN_a8, width, height,
- scratch->devPrivate.ptr,
- scratch->devKind);
- if (a8) {
- pixman_image_composite(PIXMAN_OP_SRC,
- image, NULL, a8,
- 0, 0,
- 0, 0,
- 0, 0,
- width, height);
- format = PIXMAN_a8;
- depth = 8;
- pixman_image_unref (a8);
+ image = pixman_image_create_bits(format, width, height,
+ NULL, 0);
+ } else {
+ memset(scratch->devPrivate.ptr, 0, scratch->devKind*height);
+
+ image = pixman_image_create_bits(format, width, height,
+ scratch->devPrivate.ptr,
+ scratch->devKind);
+ }
+ if (image) {
+ for (; ntrap; ntrap--, traps++)
+ pixman_rasterize_trapezoid(image,
+ (pixman_trapezoid_t *)traps,
+ -bounds.x1, -bounds.y1);
+ if (depth < 8) {
+ pixman_image_t *a8;
+
+ a8 = pixman_image_create_bits(PIXMAN_a8, width, height,
+ scratch->devPrivate.ptr,
+ scratch->devKind);
+ if (a8) {
+ pixman_image_composite(PIXMAN_OP_SRC,
+ image, NULL, a8,
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ width, height);
+ format = PIXMAN_a8;
+ depth = 8;
+ pixman_image_unref(a8);
+ }
}
+
+ pixman_image_unref(image);
+ }
+ if (format != PIXMAN_a8) {
+ sna_pixmap_destroy(scratch);
+ return;
+ }
+ } else {
+ struct rasterize_traps_thread threads[num_threads];
+ int y, dy, n;
+
+ threads[0].ptr = scratch->devPrivate.ptr;
+ threads[0].stride = scratch->devKind;
+ threads[0].traps = traps;
+ threads[0].ntrap = ntrap;
+ threads[0].bounds = bounds;
+ threads[0].format = format;
+
+ y = bounds.y1;
+ dy = (height + num_threads - 1) / num_threads;
+
+ for (n = 1; n < num_threads; n++) {
+ threads[n] = threads[0];
+ threads[n].ptr += (y - bounds.y1) * threads[n].stride;
+ threads[n].bounds.y1 = y;
+ threads[n].bounds.y2 = y += dy;
+
+ sna_threads_run(rasterize_traps_thread, &threads[n]);
}
- pixman_image_unref(image);
- }
- if (format != PIXMAN_a8) {
- sna_pixmap_destroy(scratch);
- return;
+ threads[0].ptr += (y - bounds.y1) * threads[0].stride;
+ threads[0].bounds.y1 = y;
+ threads[0].bounds.y2 = bounds.y2;
+ rasterize_traps_thread(&threads[0]);
+
+ sna_threads_wait();
+
+ format = PIXMAN_a8;
+ depth = 8;
}
} else {
scratch = sna_pixmap_create_unattached(screen,
@@ -2538,7 +2763,7 @@ trapezoids_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
maskFormat = PictureMatchFormat(screen, 8, PICT_a8);
for (; ntrap; ntrap--, traps++)
- trapezoids_fallback(op,
+ trapezoids_fallback(sna, op,
src, dst, maskFormat,
xSrc, ySrc, 1, traps);
}
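
[Note: one subtlety from the top of trapezoids_fallback(): bounds.x2/bounds.y2 are now rebased into drawable space alongside x1/y1, where previously only the origin corner was translated. The threaded rasterizer slices its bands on bounds.y1..bounds.y2, so a y2 left in screen coordinates would push the last band past the scratch buffer. Traced with invented numbers:

    /* pDrawable->x/y = (100, 10); trap extents (110, 20)-(140, 50) on screen.
     * Old code:  bounds = (10, 10)-(140, 50)   x2/y2 still screen-relative
     * New code:  bounds = (10, 10)-( 40, 40)   consistent drawable space
     * With only y1 rebased, slicing toward bounds.y2 == 50 would run
     * 10 rows past the end of the 30-row scratch pixmap. */
    bounds.x1 -= dst->pDrawable->x;
    bounds.y1 -= dst->pDrawable->y;
    bounds.x2 -= dst->pDrawable->x;
    bounds.y2 -= dst->pDrawable->y;
]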
@@ -3073,13 +3298,13 @@ lerp32_unaligned_box_row(PixmapPtr scratch, uint32_t color,
uint8_t covered)
{
int16_t x1 = pixman_fixed_to_int(trap->left.p1.x) + dx;
- int16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
+ uint16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
int16_t x2 = pixman_fixed_to_int(trap->right.p2.x) + dx;
- int16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p2.x);
+ uint16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p2.x);
if (x1 < extents->x1)
x1 = extents->x1, fx1 = 0;
- if (x2 > extents->x2)
+ if (x2 >= extents->x2)
x2 = extents->x2, fx2 = 0;
DBG(("%s: x=(%d.%d, %d.%d), y=%dx%d, covered=%d\n", __FUNCTION__,
@@ -3171,13 +3396,13 @@ pixsolid_unaligned_box_row(struct pixman_inplace *pi,
uint8_t covered)
{
int16_t x1 = pixman_fixed_to_int(trap->left.p1.x);
- int16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
+ uint16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
int16_t x2 = pixman_fixed_to_int(trap->right.p1.x);
- int16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p1.x);
+ uint16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p1.x);
if (x1 < extents->x1)
x1 = extents->x1, fx1 = 0;
- if (x2 > extents->x2)
+ if (x2 >= extents->x2)
x2 = extents->x2, fx2 = 0;
if (x1 < x2) {
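
[Note: the recurring int16_t to uint16_t change for the fx*/fy* locals is a correctness fix rather than churn: pixman_fixed_frac() yields the low 16 bits of a 16.16 value, i.e. 0..0xffff, so any fraction of one half or more turns negative when stored in an int16_t and corrupts the coverage weights. Likewise, x2 >= extents->x2 now also zeroes the partial column when a span lands exactly on the clip edge, since that pixel sits outside the exclusive bound. The overflow in miniature:

    pixman_fixed_t v = pixman_double_to_fixed(0.75);   /* 0x0000c000 */
    int16_t  bad  = pixman_fixed_frac(v);   /* -16384: sign bit set */
    uint16_t good = pixman_fixed_frac(v);   /*  49152, as intended  */
]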
@@ -3198,7 +3423,8 @@ pixsolid_unaligned_box_row(struct pixman_inplace *pi,
}
static bool
-composite_unaligned_boxes_inplace__solid(CARD8 op, uint32_t color,
+composite_unaligned_boxes_inplace__solid(struct sna *sna,
+ CARD8 op, uint32_t color,
PicturePtr dst, int n, xTrapezoid *t,
bool force_fallback)
{
@@ -3206,9 +3432,9 @@ composite_unaligned_boxes_inplace__solid(CARD8 op, uint32_t color,
int16_t dx, dy;
DBG(("%s: force=%d, is_gpu=%d, op=%d, color=%x\n", __FUNCTION__,
- force_fallback, is_gpu(dst->pDrawable), op, color));
+ force_fallback, is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS), op, color));
- if (!force_fallback && is_gpu(dst->pDrawable)) {
+ if (!force_fallback && is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
DBG(("%s: fallback -- can not perform operation in place, destination busy\n",
__FUNCTION__));
@@ -3276,9 +3502,9 @@ composite_unaligned_boxes_inplace__solid(CARD8 op, uint32_t color,
extents = REGION_RECTS(&clip);
while (count--) {
int16_t y1 = dy + pixman_fixed_to_int(t->top);
- int16_t fy1 = pixman_fixed_frac(t->top);
+ uint16_t fy1 = pixman_fixed_frac(t->top);
int16_t y2 = dy + pixman_fixed_to_int(t->bottom);
- int16_t fy2 = pixman_fixed_frac(t->bottom);
+ uint16_t fy2 = pixman_fixed_frac(t->bottom);
DBG(("%s: t=(%d, %d), (%d, %d), extents (%d, %d), (%d, %d)\n",
__FUNCTION__,
@@ -3291,7 +3517,7 @@ composite_unaligned_boxes_inplace__solid(CARD8 op, uint32_t color,
if (y1 < extents->y1)
y1 = extents->y1, fy1 = 0;
- if (y2 > extents->y2)
+ if (y2 >= extents->y2)
y2 = extents->y2, fy2 = 0;
if (y1 < y2) {
@@ -3363,13 +3589,13 @@ pixman:
extents = REGION_RECTS(&clip);
while (count--) {
int16_t y1 = pixman_fixed_to_int(t->top);
- int16_t fy1 = pixman_fixed_frac(t->top);
+ uint16_t fy1 = pixman_fixed_frac(t->top);
int16_t y2 = pixman_fixed_to_int(t->bottom);
- int16_t fy2 = pixman_fixed_frac(t->bottom);
+ uint16_t fy2 = pixman_fixed_frac(t->bottom);
if (y1 < extents->y1)
y1 = extents->y1, fy1 = 0;
- if (y2 > extents->y2)
+ if (y2 >= extents->y2)
y2 = extents->y2, fy2 = 0;
if (y1 < y2) {
if (fy1) {
@@ -3424,18 +3650,18 @@ pixmask_opacity(struct pixman_inplace *pi,
static void
pixmask_unaligned_box_row(struct pixman_inplace *pi,
const BoxRec *extents,
- xTrapezoid *trap,
+ const xTrapezoid *trap,
int16_t y, int16_t h,
uint8_t covered)
{
int16_t x1 = pixman_fixed_to_int(trap->left.p1.x);
- int16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
+ uint16_t fx1 = grid_coverage(SAMPLES_X, trap->left.p1.x);
int16_t x2 = pixman_fixed_to_int(trap->right.p1.x);
- int16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p1.x);
+ uint16_t fx2 = grid_coverage(SAMPLES_X, trap->right.p1.x);
if (x1 < extents->x1)
x1 = extents->x1, fx1 = 0;
- if (x2 > extents->x2)
+ if (x2 >= extents->x2)
x2 = extents->x2, fx2 = 0;
if (x1 < x2) {
@@ -3455,13 +3681,82 @@ pixmask_unaligned_box_row(struct pixman_inplace *pi,
}
}
+struct rectilinear_inplace_thread {
+ pixman_image_t *dst, *src;
+ const RegionRec *clip;
+ const xTrapezoid *trap;
+ int dx, dy, sx, sy;
+ int y1, y2;
+ CARD8 op;
+};
+
+static void rectilinear_inplace_thread(void *arg)
+{
+ struct rectilinear_inplace_thread *thread = arg;
+ const xTrapezoid *t = thread->trap;
+ struct pixman_inplace pi;
+ const BoxRec *extents;
+ int count;
+
+ pi.image = thread->dst;
+ pi.dx = thread->dx;
+ pi.dy = thread->dy;
+
+ pi.source = thread->src;
+ pi.sx = thread->sx;
+ pi.sy = thread->sy;
+
+ pi.mask = pixman_image_create_bits(PIXMAN_a8, 1, 1, &pi.color, 4);
+ pixman_image_set_repeat(pi.mask, PIXMAN_REPEAT_NORMAL);
+ pi.bits = pixman_image_get_data(pi.mask);
+ pi.op = thread->op;
+
+ count = region_count(thread->clip);
+ extents = region_boxes(thread->clip);
+ while (count--) {
+ int16_t y1 = pixman_fixed_to_int(t->top);
+ uint16_t fy1 = pixman_fixed_frac(t->top);
+ int16_t y2 = pixman_fixed_to_int(t->bottom);
+ uint16_t fy2 = pixman_fixed_frac(t->bottom);
+
+ if (y1 < MAX(thread->y1, extents->y1))
+ y1 = MAX(thread->y1, extents->y1), fy1 = 0;
+ if (y2 > MIN(thread->y2, extents->y2))
+ y2 = MIN(thread->y2, extents->y2), fy2 = 0;
+ if (y1 < y2) {
+ if (fy1) {
+ pixmask_unaligned_box_row(&pi, extents, t, y1, 1,
+ SAMPLES_Y - grid_coverage(SAMPLES_Y, fy1));
+ y1++;
+ }
+
+ if (y2 > y1)
+ pixmask_unaligned_box_row(&pi, extents, t, y1, y2 - y1,
+ SAMPLES_Y);
+
+ if (fy2)
+ pixmask_unaligned_box_row(&pi, extents, t, y2, 1,
+ grid_coverage(SAMPLES_Y, fy2));
+ } else if (y1 == y2 && fy2 > fy1) {
+ pixmask_unaligned_box_row(&pi, extents, t, y1, 1,
+ grid_coverage(SAMPLES_Y, fy2) - grid_coverage(SAMPLES_Y, fy1));
+ }
+ extents++;
+ }
+
+ pixman_image_unref(pi.mask);
+}
+
static bool
-composite_unaligned_boxes_inplace(CARD8 op,
+composite_unaligned_boxes_inplace(struct sna *sna,
+ CARD8 op,
PicturePtr src, int16_t src_x, int16_t src_y,
PicturePtr dst, int n, xTrapezoid *t,
bool force_fallback)
{
- if (!force_fallback) {
+ if (!force_fallback &&
+ (is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS) ||
+ picture_is_gpu(sna, src))) {
DBG(("%s: fallback -- not forcing\n", __FUNCTION__));
return false;
}
@@ -3471,10 +3766,10 @@ composite_unaligned_boxes_inplace(CARD8 op,
src_x -= pixman_fixed_to_int(t[0].left.p1.x);
src_y -= pixman_fixed_to_int(t[0].left.p1.y);
do {
- struct pixman_inplace pi;
RegionRec clip;
BoxPtr extents;
int count;
+ int num_threads;
clip.extents.x1 = pixman_fixed_to_int(t->left.p1.x);
clip.extents.x2 = pixman_fixed_to_int(t->right.p1.x + pixman_fixed_1_minus_e);
@@ -3513,59 +3808,100 @@ composite_unaligned_boxes_inplace(CARD8 op,
}
}
- pi.image = image_from_pict(dst, false, &pi.dx, &pi.dy);
- pi.source = image_from_pict(src, false, &pi.sx, &pi.sy);
- pi.sx += src_x;
- pi.sy += src_y;
- pi.mask = pixman_image_create_bits(PIXMAN_a8, 1, 1, NULL, 0);
- pixman_image_set_repeat(pi.mask, PIXMAN_REPEAT_NORMAL);
- pi.bits = pixman_image_get_data(pi.mask);
- pi.op = op;
+ num_threads = sna_use_threads(clip.extents.x2 - clip.extents.x1,
+ clip.extents.y2 - clip.extents.y1,
+ 32);
+ if (num_threads == 1) {
+ struct pixman_inplace pi;
- count = REGION_NUM_RECTS(&clip);
- extents = REGION_RECTS(&clip);
- while (count--) {
- int16_t y1 = pixman_fixed_to_int(t->top);
- int16_t fy1 = pixman_fixed_frac(t->top);
- int16_t y2 = pixman_fixed_to_int(t->bottom);
- int16_t fy2 = pixman_fixed_frac(t->bottom);
+ pi.image = image_from_pict(dst, false, &pi.dx, &pi.dy);
+ pi.source = image_from_pict(src, false, &pi.sx, &pi.sy);
+ pi.sx += src_x;
+ pi.sy += src_y;
+ pi.mask = pixman_image_create_bits(PIXMAN_a8, 1, 1, &pi.color, 4);
+ pixman_image_set_repeat(pi.mask, PIXMAN_REPEAT_NORMAL);
+ pi.bits = pixman_image_get_data(pi.mask);
+ pi.op = op;
- if (y1 < extents->y1)
- y1 = extents->y1, fy1 = 0;
- if (y2 > extents->y2)
- y2 = extents->y2, fy2 = 0;
- if (y1 < y2) {
- if (fy1) {
+ count = REGION_NUM_RECTS(&clip);
+ extents = REGION_RECTS(&clip);
+ while (count--) {
+ int16_t y1 = pixman_fixed_to_int(t->top);
+ uint16_t fy1 = pixman_fixed_frac(t->top);
+ int16_t y2 = pixman_fixed_to_int(t->bottom);
+ uint16_t fy2 = pixman_fixed_frac(t->bottom);
+
+ if (y1 < extents->y1)
+ y1 = extents->y1, fy1 = 0;
+ if (y2 > extents->y2)
+ y2 = extents->y2, fy2 = 0;
+ if (y1 < y2) {
+ if (fy1) {
+ pixmask_unaligned_box_row(&pi, extents, t, y1, 1,
+ SAMPLES_Y - grid_coverage(SAMPLES_Y, fy1));
+ y1++;
+ }
+
+ if (y2 > y1)
+ pixmask_unaligned_box_row(&pi, extents, t, y1, y2 - y1,
+ SAMPLES_Y);
+
+ if (fy2)
+ pixmask_unaligned_box_row(&pi, extents, t, y2, 1,
+ grid_coverage(SAMPLES_Y, fy2));
+ } else if (y1 == y2 && fy2 > fy1) {
pixmask_unaligned_box_row(&pi, extents, t, y1, 1,
- SAMPLES_Y - grid_coverage(SAMPLES_Y, fy1));
- y1++;
+ grid_coverage(SAMPLES_Y, fy2) - grid_coverage(SAMPLES_Y, fy1));
}
+ extents++;
+ }
- if (y2 > y1)
- pixmask_unaligned_box_row(&pi, extents, t, y1, y2 - y1,
- SAMPLES_Y);
+ pixman_image_unref(pi.image);
+ pixman_image_unref(pi.source);
+ pixman_image_unref(pi.mask);
+ } else {
+ struct rectilinear_inplace_thread thread[num_threads];
+ int i, y, dy;
- if (fy2)
- pixmask_unaligned_box_row(&pi, extents, t, y2, 1,
- grid_coverage(SAMPLES_Y, fy2));
- } else if (y1 == y2 && fy2 > fy1) {
- pixmask_unaligned_box_row(&pi, extents, t, y1, 1,
- grid_coverage(SAMPLES_Y, fy2) - grid_coverage(SAMPLES_Y, fy1));
+
+ thread[0].trap = t;
+ thread[0].dst = image_from_pict(dst, false, &thread[0].dx, &thread[0].dy);
+ thread[0].src = image_from_pict(src, false, &thread[0].sx, &thread[0].sy);
+ thread[0].sx += src_x;
+ thread[0].sy += src_y;
+
+ thread[0].clip = &clip;
+ thread[0].op = op;
+
+ y = clip.extents.y1;
+ dy = (clip.extents.y2 - clip.extents.y1 + num_threads - 1) / num_threads;
+
+ for (i = 1; i < num_threads; i++) {
+ thread[i] = thread[0];
+ thread[i].y1 = y;
+ thread[i].y2 = y += dy;
+ sna_threads_run(rectilinear_inplace_thread, &thread[i]);
}
- extents++;
+
+ thread[0].y1 = y;
+ thread[0].y2 = clip.extents.y2;
+ rectilinear_inplace_thread(&thread[0]);
+
+ sna_threads_wait();
+
+ pixman_image_unref(thread[0].dst);
+ pixman_image_unref(thread[0].src);
}
RegionUninit(&clip);
- pixman_image_unref(pi.image);
- pixman_image_unref(pi.source);
- pixman_image_unref(pi.mask);
} while (--n && t++);
return true;
}
static bool
-composite_unaligned_boxes_fallback(CARD8 op,
+composite_unaligned_boxes_fallback(struct sna *sna,
+ CARD8 op,
PicturePtr src,
PicturePtr dst,
INT16 src_x, INT16 src_y,
@@ -3579,12 +3915,12 @@ composite_unaligned_boxes_fallback(CARD8 op,
int n;
if (sna_picture_is_solid(src, &color) &&
- composite_unaligned_boxes_inplace__solid(op, color, dst,
+ composite_unaligned_boxes_inplace__solid(sna, op, color, dst,
ntrap, traps,
force_fallback))
return true;
- if (composite_unaligned_boxes_inplace(op, src, src_x, src_y,
+ if (composite_unaligned_boxes_inplace(sna, op, src, src_x, src_y,
dst, ntrap, traps,
force_fallback))
return true;
@@ -3708,7 +4044,7 @@ composite_unaligned_boxes(struct sna *sna,
!sna->render.check_composite_spans(sna, op, src, dst, 0, 0,
COMPOSITE_SPANS_RECTILINEAR)) {
fallback:
- return composite_unaligned_boxes_fallback(op, src, dst,
+ return composite_unaligned_boxes_fallback(sna, op, src, dst,
src_x, src_y,
ntrap, traps,
force_fallback);
@@ -3860,14 +4196,13 @@ static span_func_t
choose_span(struct sna_composite_spans_op *tmp,
PicturePtr dst,
PictFormatPtr maskFormat,
- uint8_t op,
RegionPtr clip)
{
span_func_t span;
if (is_mono(dst, maskFormat)) {
/* XXX An imprecise approximation */
- if (maskFormat && !operator_is_bounded(op)) {
+ if (maskFormat && !operator_is_bounded(tmp->base.op)) {
span = tor_blt_span_mono_unbounded;
if (REGION_NUM_RECTS(clip) > 1)
span = tor_blt_span_mono_unbounded_clipped;
@@ -3888,8 +4223,77 @@ choose_span(struct sna_composite_spans_op *tmp,
return span;
}
+struct mono_span_thread {
+ struct sna *sna;
+ const xTrapezoid *traps;
+ const struct sna_composite_op *op;
+ RegionPtr clip;
+ int ntrap;
+ BoxRec extents;
+ int dx, dy;
+};
+
+static void
+mono_span_thread(void *arg)
+{
+ struct mono_span_thread *thread = arg;
+ struct mono mono;
+ struct mono_span_thread_boxes boxes;
+ const xTrapezoid *t;
+ int n;
+
+ mono.sna = thread->sna;
+
+ mono.clip.extents = thread->extents;
+ mono.clip.data = NULL;
+ if (thread->clip->data) {
+ RegionIntersect(&mono.clip, &mono.clip, thread->clip);
+ if (RegionNil(&mono.clip))
+ return;
+ }
+
+ boxes.op = thread->op;
+ boxes.num_boxes = 0;
+ mono.op.priv = &boxes;
+
+ if (!mono_init(&mono, 2*thread->ntrap)) {
+ RegionUninit(&mono.clip);
+ return;
+ }
+
+ for (n = thread->ntrap, t = thread->traps; n--; t++) {
+ if (!xTrapezoidValid(t))
+ continue;
+
+ if (pixman_fixed_to_int(t->top) + thread->dy >= thread->extents.y2 ||
+ pixman_fixed_to_int(t->bottom) + thread->dy <= thread->extents.y1)
+ continue;
+
+ mono_add_line(&mono, thread->dx, thread->dy,
+ t->top, t->bottom,
+ &t->left.p1, &t->left.p2, 1);
+ mono_add_line(&mono, thread->dx, thread->dy,
+ t->top, t->bottom,
+ &t->right.p1, &t->right.p2, -1);
+ }
+
+ if (mono.clip.data == NULL)
+ mono.span = thread_mono_span;
+ else
+ mono.span = thread_mono_span_clipped;
+
+ mono_render(&mono);
+ mono_fini(&mono);
+
+ if (boxes.num_boxes)
+ thread->op->thread_boxes(thread->sna, thread->op,
+ boxes.boxes, boxes.num_boxes);
+ RegionUninit(&mono.clip);
+}
+
static bool
-mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
+mono_trapezoids_span_converter(struct sna *sna,
+ CARD8 op, PicturePtr src, PicturePtr dst,
INT16 src_x, INT16 src_y,
int ntrap, xTrapezoid *traps)
{
@@ -3897,8 +4301,8 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
BoxRec extents;
int16_t dst_x, dst_y;
int16_t dx, dy;
- bool was_clear;
- int n;
+ bool unbounded;
+ int num_threads, n;
if (NO_SCAN_CONVERTER)
return false;
@@ -3937,11 +4341,69 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
src_x + mono.clip.extents.x1 - dst_x - dx,
src_y + mono.clip.extents.y1 - dst_y - dy));
- mono.sna = to_sna_from_drawable(dst->pDrawable);
- if (!mono_init(&mono, 2*ntrap))
+ unbounded = (!sna_drawable_is_clear(dst->pDrawable) &&
+ !operator_is_bounded(op));
+
+ mono.sna = sna;
+ if (!mono.sna->render.composite(mono.sna, op, src, NULL, dst,
+ src_x + mono.clip.extents.x1 - dst_x - dx,
+ src_y + mono.clip.extents.y1 - dst_y - dy,
+ 0, 0,
+ mono.clip.extents.x1, mono.clip.extents.y1,
+ mono.clip.extents.x2 - mono.clip.extents.x1,
+ mono.clip.extents.y2 - mono.clip.extents.y1,
+ memset(&mono.op, 0, sizeof(mono.op))))
return false;
- was_clear = sna_drawable_is_clear(dst->pDrawable);
+ num_threads = 1;
+ if (!NO_GPU_THREADS &&
+ mono.op.thread_boxes &&
+ mono.op.damage == NULL &&
+ !unbounded)
+ num_threads = sna_use_threads(mono.clip.extents.x2 - mono.clip.extents.x1,
+ mono.clip.extents.y2 - mono.clip.extents.y1,
+ 16);
+ if (num_threads > 1) {
+ struct mono_span_thread threads[num_threads];
+ int y, h;
+
+ DBG(("%s: using %d threads for mono span compositing %dx%d\n",
+ __FUNCTION__, num_threads,
+ mono.clip.extents.x2 - mono.clip.extents.x1,
+ mono.clip.extents.y2 - mono.clip.extents.y1));
+
+ threads[0].sna = mono.sna;
+ threads[0].op = &mono.op;
+ threads[0].traps = traps;
+ threads[0].ntrap = ntrap;
+ threads[0].extents = mono.clip.extents;
+ threads[0].clip = &mono.clip;
+ threads[0].dx = dx;
+ threads[0].dy = dy;
+
+ y = extents.y1;
+ h = extents.y2 - extents.y1;
+ h = (h + num_threads - 1) / num_threads;
+
+ for (n = 1; n < num_threads; n++) {
+ threads[n] = threads[0];
+ threads[n].extents.y1 = y;
+ threads[n].extents.y2 = y += h;
+
+ sna_threads_run(mono_span_thread, &threads[n]);
+ }
+
+ threads[0].extents.y1 = y;
+ threads[0].extents.y2 = extents.y2;
+ mono_span_thread(&threads[0]);
+
+ sna_threads_wait();
+ mono.op.done(mono.sna, &mono.op);
+ return true;
+ }
+
+ if (!mono_init(&mono, 2*ntrap))
+ return false;
for (n = 0; n < ntrap; n++) {
if (!xTrapezoidValid(&traps[n]))
@@ -3959,23 +4421,16 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
&traps[n].right.p1, &traps[n].right.p2, -1);
}
- memset(&mono.op, 0, sizeof(mono.op));
- if (!mono.sna->render.composite(mono.sna, op, src, NULL, dst,
- src_x + mono.clip.extents.x1 - dst_x - dx,
- src_y + mono.clip.extents.y1 - dst_y - dy,
- 0, 0,
- mono.clip.extents.x1, mono.clip.extents.y1,
- mono.clip.extents.x2 - mono.clip.extents.x1,
- mono.clip.extents.y2 - mono.clip.extents.y1,
- &mono.op)) {
- mono_fini(&mono);
- return false;
- }
+ if (mono.clip.data == NULL && mono.op.damage == NULL)
+ mono.span = mono_span__fast;
+ else
+ mono.span = mono_span;
+
mono_render(&mono);
mono.op.done(mono.sna, &mono.op);
mono_fini(&mono);
- if (!was_clear && !operator_is_bounded(op)) {
+ if (unbounded) {
xPointFixed p1, p2;
if (!mono_init(&mono, 2+2*ntrap))
@@ -4027,26 +4482,171 @@ mono_trapezoids_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
return true;
}
+struct span_thread {
+ struct sna *sna;
+ const struct sna_composite_spans_op *op;
+ const xTrapezoid *traps;
+ RegionPtr clip;
+ span_func_t span;
+ BoxRec extents;
+ int dx, dy, draw_y;
+ int ntrap;
+ bool unbounded;
+};
+
+#define SPAN_THREAD_MAX_BOXES (8192/sizeof(struct sna_opacity_box))
+struct span_thread_boxes {
+ const struct sna_composite_spans_op *op;
+ struct sna_opacity_box boxes[SPAN_THREAD_MAX_BOXES];
+ int num_boxes;
+};
+
+static void span_thread_add_boxes(struct sna *sna, void *data,
+ const BoxRec *box, int count, float alpha)
+{
+ struct span_thread_boxes *b = data;
+
+ __DBG(("%s: adding %d boxes with alpha=%f\n",
+ __FUNCTION__, count, alpha));
+
+ assert(count > 0 && count <= SPAN_THREAD_MAX_BOXES);
+ if (b->num_boxes + count > SPAN_THREAD_MAX_BOXES) {
+ DBG(("%s: flushing %d boxes, adding %d\n", __FUNCTION__, b->num_boxes, count));
+ assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
+ b->op->thread_boxes(sna, b->op, b->boxes, b->num_boxes);
+ b->num_boxes = 0;
+ }
+
+ do {
+ b->boxes[b->num_boxes].box = *box++;
+ b->boxes[b->num_boxes].alpha = alpha;
+ b->num_boxes++;
+ } while (--count);
+ assert(b->num_boxes <= SPAN_THREAD_MAX_BOXES);
+}
+
+static void
+span_thread_box(struct sna *sna,
+ struct sna_composite_spans_op *op,
+ pixman_region16_t *clip,
+ const BoxRec *box,
+ int coverage)
+{
+ __DBG(("%s: %d -> %d @ %d\n", __FUNCTION__, box->x1, box->x2, coverage));
+ span_thread_add_boxes(sna, op, box, 1, AREA_TO_ALPHA(coverage));
+}
+
+static void
+span_thread_clipped_box(struct sna *sna,
+ struct sna_composite_spans_op *op,
+ pixman_region16_t *clip,
+ const BoxRec *box,
+ int coverage)
+{
+ pixman_region16_t region;
+
+ __DBG(("%s: %d -> %d @ %f\n", __FUNCTION__, box->x1, box->x2,
+ AREA_TO_ALPHA(coverage)));
+
+ pixman_region_init_rects(&region, box, 1);
+ RegionIntersect(&region, &region, clip);
+ if (REGION_NUM_RECTS(&region)) {
+ span_thread_add_boxes(sna, op,
+ REGION_RECTS(&region),
+ REGION_NUM_RECTS(&region),
+ AREA_TO_ALPHA(coverage));
+ }
+ pixman_region_fini(&region);
+}
+
+static span_func_t
+thread_choose_span(struct sna_composite_spans_op *tmp,
+ PicturePtr dst,
+ PictFormatPtr maskFormat,
+ RegionPtr clip)
+{
+ span_func_t span;
+
+ if (tmp->base.damage)
+ return NULL;
+
+ if (is_mono(dst, maskFormat)) {
+ return NULL;
+ } else {
+ if (REGION_NUM_RECTS(clip) > 1)
+ span = span_thread_clipped_box;
+ else
+ span = span_thread_box;
+ }
+
+ return span;
+}
+
+static void
+span_thread(void *arg)
+{
+ struct span_thread *thread = arg;
+ struct span_thread_boxes boxes;
+ struct tor tor;
+ const xTrapezoid *t;
+ int n, y1, y2;
+
+ if (tor_init(&tor, &thread->extents, 2*thread->ntrap))
+ return;
+
+ boxes.op = thread->op;
+ boxes.num_boxes = 0;
+
+ y1 = thread->extents.y1 - thread->draw_y;
+ y2 = thread->extents.y2 - thread->draw_y;
+ for (n = thread->ntrap, t = thread->traps; n--; t++) {
+ xTrapezoid tt;
+
+ if (pixman_fixed_to_int(t->top) >= y2 ||
+ pixman_fixed_to_int(t->bottom) < y1)
+ continue;
+
+ if (!project_trapezoid_onto_grid(t, thread->dx, thread->dy, &tt))
+ continue;
+
+ tor_add_edge(&tor, &tt, &tt.left, 1);
+ tor_add_edge(&tor, &tt, &tt.right, -1);
+ }
+
+ tor_render(thread->sna, &tor,
+ (struct sna_composite_spans_op *)&boxes, thread->clip,
+ thread->span, thread->unbounded);
+
+ tor_fini(&tor);
+
+ if (boxes.num_boxes) {
+ DBG(("%s: flushing %d boxes\n", __FUNCTION__, boxes.num_boxes));
+ assert(boxes.num_boxes <= SPAN_THREAD_MAX_BOXES);
+ thread->op->thread_boxes(thread->sna, thread->op,
+ boxes.boxes, boxes.num_boxes);
+ }
+}
+
static bool
-trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
+trapezoid_span_converter(struct sna *sna,
+ CARD8 op, PicturePtr src, PicturePtr dst,
PictFormatPtr maskFormat, unsigned int flags,
INT16 src_x, INT16 src_y,
int ntrap, xTrapezoid *traps)
{
- struct sna *sna;
struct sna_composite_spans_op tmp;
- struct tor tor;
BoxRec extents;
pixman_region16_t clip;
int16_t dst_x, dst_y;
bool was_clear;
int dx, dy, n;
+ int num_threads;
if (NO_SCAN_CONVERTER)
return false;
if (is_mono(dst, maskFormat))
- return mono_trapezoids_span_converter(op, src, dst,
+ return mono_trapezoids_span_converter(sna, op, src, dst,
src_x, src_y,
ntrap, traps);
@@ -4057,7 +4657,6 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
return false;
}
- sna = to_sna_from_drawable(dst->pDrawable);
if (!sna->render.check_composite_spans(sna, op, src, dst, 0, 0, flags)) {
DBG(("%s: fallback -- composite spans not supported\n",
__FUNCTION__));
@@ -4144,29 +4743,78 @@ trapezoid_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
dx *= FAST_SAMPLES_X;
dy *= FAST_SAMPLES_Y;
- if (tor_init(&tor, &extents, 2*ntrap))
- goto skip;
- for (n = 0; n < ntrap; n++) {
- xTrapezoid t;
+ num_threads = 1;
+ if (!NO_GPU_THREADS && tmp.thread_boxes &&
+ thread_choose_span(&tmp, dst, maskFormat, &clip))
+ num_threads = sna_use_threads(extents.x2-extents.x1,
+ extents.y2-extents.y1,
+ 16);
+ if (num_threads == 1) {
+ struct tor tor;
- if (!project_trapezoid_onto_grid(&traps[n], dx, dy, &t))
- continue;
+ if (tor_init(&tor, &extents, 2*ntrap))
+ goto skip;
- if (pixman_fixed_to_int(traps[n].top) + dst->pDrawable->y >= extents.y2 ||
- pixman_fixed_to_int(traps[n].bottom) + dst->pDrawable->y < extents.y1)
- continue;
+ for (n = 0; n < ntrap; n++) {
+ xTrapezoid t;
- tor_add_edge(&tor, &t, &t.left, 1);
- tor_add_edge(&tor, &t, &t.right, -1);
- }
+ if (!project_trapezoid_onto_grid(&traps[n], dx, dy, &t))
+ continue;
- tor_render(sna, &tor, &tmp, &clip,
- choose_span(&tmp, dst, maskFormat, op, &clip),
- !was_clear && maskFormat && !operator_is_bounded(op));
+ if (pixman_fixed_to_int(traps[n].top) + dst->pDrawable->y >= extents.y2 ||
+ pixman_fixed_to_int(traps[n].bottom) + dst->pDrawable->y < extents.y1)
+ continue;
+
+ tor_add_edge(&tor, &t, &t.left, 1);
+ tor_add_edge(&tor, &t, &t.right, -1);
+ }
+
+ tor_render(sna, &tor, &tmp, &clip,
+ choose_span(&tmp, dst, maskFormat, &clip),
+ !was_clear && maskFormat && !operator_is_bounded(op));
skip:
- tor_fini(&tor);
+ tor_fini(&tor);
+ } else {
+ struct span_thread threads[num_threads];
+ int y, h;
+
+ DBG(("%s: using %d threads for span compositing %dx%d\n",
+ __FUNCTION__, num_threads,
+ extents.x2 - extents.x1,
+ extents.y2 - extents.y1));
+
+ threads[0].sna = sna;
+ threads[0].op = &tmp;
+ threads[0].traps = traps;
+ threads[0].ntrap = ntrap;
+ threads[0].extents = extents;
+ threads[0].clip = &clip;
+ threads[0].dx = dx;
+ threads[0].dy = dy;
+ threads[0].draw_y = dst->pDrawable->y;
+ threads[0].unbounded = !was_clear && maskFormat && !operator_is_bounded(op);
+ threads[0].span = thread_choose_span(&tmp, dst, maskFormat, &clip);
+
+ y = extents.y1;
+ h = extents.y2 - extents.y1;
+ h = (h + num_threads - 1) / num_threads;
+
+ for (n = 1; n < num_threads; n++) {
+ threads[n] = threads[0];
+ threads[n].extents.y1 = y;
+ threads[n].extents.y2 = y += h;
+
+ sna_threads_run(span_thread, &threads[n]);
+ }
+
+ threads[0].extents.y1 = y;
+ threads[0].extents.y2 = extents.y2;
+ span_thread(&threads[0]);
+
+ sna_threads_wait();
+ }
tmp.done(sna, &tmp);
REGION_UNINIT(NULL, &clip);
@@ -4351,7 +4999,8 @@ struct inplace {
static force_inline uint8_t coverage_opacity(int coverage, uint8_t opacity)
{
coverage = coverage * 256 / FAST_SAMPLES_XY;
- return mul_8_8(coverage - (coverage >> 8), opacity);
+ coverage -= coverage >> 8;
+ return opacity == 255 ? coverage : mul_8_8(coverage, opacity);
}
static void
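
[Note: the rewritten coverage_opacity() is a small arithmetic refinement: the sample count (0..FAST_SAMPLES_XY) is first scaled into 0..256, then folded onto 0..255 with coverage -= coverage >> 8; 256 is the only value in range with bit 8 set, so full coverage becomes exactly 255 while the division stays a power-of-two shift, and the mul_8_8() multiply is skipped entirely for the common fully-opaque case. Traced with FAST_SAMPLES_XY taken as 16 for illustration:

    /* c in [0, 16] -> alpha in [0, 255] */
    static uint8_t scale_coverage(int c)
    {
        c = c * 256 / 16;   /* 0, 16, 32, ..., 240, 256 */
        c -= c >> 8;        /* only 256 has bit 8 set, so 256 -> 255 */
        return c;
    }
    /* scale_coverage(16) == 255, scale_coverage(8) == 128, scale_coverage(0) == 0 */
]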
@@ -4673,7 +5322,8 @@ mono_inplace_composite_boxes(struct sna *sna,
}
static bool
-trapezoid_spans_maybe_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
+trapezoid_spans_maybe_inplace(struct sna *sna,
+ CARD8 op, PicturePtr src, PicturePtr dst,
PictFormatPtr maskFormat)
{
struct sna_pixmap *priv;
@@ -4706,7 +5356,7 @@ trapezoid_spans_maybe_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
case PICT_x8r8g8b8:
case PICT_a8r8g8b8:
- if (picture_is_gpu(src))
+ if (picture_is_gpu(sna, src))
return false;
switch (op) {
@@ -4753,7 +5403,8 @@ out:
}
static bool
-trapezoid_span_mono_inplace(CARD8 op,
+trapezoid_span_mono_inplace(struct sna *sna,
+ CARD8 op,
PicturePtr src,
PicturePtr dst,
INT16 src_x, INT16 src_y,
@@ -4799,7 +5450,7 @@ trapezoid_span_mono_inplace(CARD8 op,
MOVE_WRITE | MOVE_READ))
return true;
- mono.sna = to_sna_from_drawable(dst->pDrawable);
+ mono.sna = sna;
if (!mono_init(&mono, 2*ntrap))
return false;
@@ -4855,6 +5506,20 @@ unbounded_pass:
op = 0;
} else {
+ if (src->pDrawable) {
+ if (!sna_drawable_move_to_cpu(src->pDrawable,
+ MOVE_READ)) {
+ mono_fini(&mono);
+ return false;
+ }
+ if (src->alphaMap &&
+ !sna_drawable_move_to_cpu(src->alphaMap->pDrawable,
+ MOVE_READ)) {
+ mono_fini(&mono);
+ return false;
+ }
+ }
+
inplace.composite.dst = image_from_pict(dst, false,
&inplace.composite.dx,
&inplace.composite.dy);
@@ -4871,6 +5536,11 @@ unbounded_pass:
mono.op.box = mono_inplace_composite_box;
mono.op.boxes = mono_inplace_composite_boxes;
}
+
+ if (mono.clip.data == NULL && mono.op.damage == NULL)
+ mono.span = mono_span__fast;
+ else
+ mono.span = mono_span;
mono_render(&mono);
mono_fini(&mono);
@@ -4922,6 +5592,45 @@ unbounded_pass:
}
static void
+pixmask_span_solid(struct sna *sna,
+ struct sna_composite_spans_op *op,
+ pixman_region16_t *clip,
+ const BoxRec *box,
+ int coverage)
+{
+ struct pixman_inplace *pi = (struct pixman_inplace *)op;
+ if (coverage != FAST_SAMPLES_XY) {
+ coverage = coverage * 256 / FAST_SAMPLES_XY;
+ coverage -= coverage >> 8;
+ *pi->bits = mul_4x8_8(pi->color, coverage);
+ } else
+ *pi->bits = pi->color;
+ pixman_image_composite(pi->op, pi->source, NULL, pi->image,
+ box->x1, box->y1,
+ 0, 0,
+ pi->dx + box->x1, pi->dy + box->y1,
+ box->x2 - box->x1, box->y2 - box->y1);
+}
+static void
+pixmask_span_solid__clipped(struct sna *sna,
+ struct sna_composite_spans_op *op,
+ pixman_region16_t *clip,
+ const BoxRec *box,
+ int coverage)
+{
+ pixman_region16_t region;
+ int n;
+
+ pixman_region_init_rects(&region, box, 1);
+ RegionIntersect(&region, &region, clip);
+ n = REGION_NUM_RECTS(&region);
+ box = REGION_RECTS(&region);
+ while (n--)
+ pixmask_span_solid(sna, op, NULL, box++, coverage);
+ pixman_region_fini(&region);
+}
+
+static void
pixmask_span(struct sna *sna,
struct sna_composite_spans_op *op,
pixman_region16_t *clip,
@@ -4961,6 +5670,113 @@ pixmask_span__clipped(struct sna *sna,
pixman_region_fini(&region);
}
+struct inplace_x8r8g8b8_thread {
+ xTrapezoid *traps;
+ PicturePtr dst, src;
+ BoxRec extents;
+ int dx, dy;
+ int ntrap;
+ bool lerp, is_solid;
+ uint32_t color;
+ int16_t src_x, src_y;
+ uint8_t op;
+};
+
+static void inplace_x8r8g8b8_thread(void *arg)
+{
+ struct inplace_x8r8g8b8_thread *thread = arg;
+ struct tor tor;
+ span_func_t span;
+ RegionPtr clip;
+ int y1, y2, n;
+
+ if (tor_init(&tor, &thread->extents, 2*thread->ntrap))
+ return;
+
+ y1 = thread->extents.y1 - thread->dst->pDrawable->y;
+ y2 = thread->extents.y2 - thread->dst->pDrawable->y;
+ for (n = 0; n < thread->ntrap; n++) {
+ xTrapezoid t;
+
+ if (!project_trapezoid_onto_grid(&thread->traps[n], thread->dx, thread->dy, &t))
+ continue;
+
+ if (pixman_fixed_to_int(thread->traps[n].top) >= y2 ||
+ pixman_fixed_to_int(thread->traps[n].bottom) < y1)
+ continue;
+
+ tor_add_edge(&tor, &t, &t.left, 1);
+ tor_add_edge(&tor, &t, &t.right, -1);
+ }
+
+ clip = thread->dst->pCompositeClip;
+ if (thread->lerp) {
+ struct inplace inplace;
+ int16_t dst_x, dst_y;
+ PixmapPtr pixmap;
+
+ pixmap = get_drawable_pixmap(thread->dst->pDrawable);
+ get_drawable_deltas(thread->dst->pDrawable, pixmap, &dst_x, &dst_y);
+
+ inplace.ptr = pixmap->devPrivate.ptr;
+ inplace.ptr += dst_y * pixmap->devKind + dst_x * 4;
+ inplace.stride = pixmap->devKind;
+ inplace.color = thread->color;
+
+ if (clip->data)
+ span = tor_blt_lerp32_clipped;
+ else
+ span = tor_blt_lerp32;
+
+ tor_render(NULL, &tor, (void*)&inplace, clip, span, false);
+ } else if (thread->is_solid) {
+ struct pixman_inplace pi;
+
+ pi.image = image_from_pict(thread->dst, false, &pi.dx, &pi.dy);
+ pi.op = thread->op;
+ pi.color = thread->color;
+
+ pi.bits = (uint32_t *)&pi.sx;
+ pi.source = pixman_image_create_bits(PIXMAN_a8r8g8b8,
+ 1, 1, pi.bits, 0);
+ pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
+
+ if (clip->data)
+ span = pixmask_span_solid__clipped;
+ else
+ span = pixmask_span_solid;
+
+ tor_render(NULL, &tor, (void*)&pi, clip, span, false);
+
+ pixman_image_unref(pi.source);
+ pixman_image_unref(pi.image);
+ } else {
+ struct pixman_inplace pi;
+
+ pi.image = image_from_pict(thread->dst, false, &pi.dx, &pi.dy);
+ pi.source = image_from_pict(thread->src, false, &pi.sx, &pi.sy);
+ pi.sx += thread->src_x - pixman_fixed_to_int(thread->traps[0].left.p1.x);
+ pi.sy += thread->src_y - pixman_fixed_to_int(thread->traps[0].left.p1.y);
+ pi.mask = pixman_image_create_bits(PIXMAN_a8, 1, 1, NULL, 0);
+ pixman_image_set_repeat(pi.mask, PIXMAN_REPEAT_NORMAL);
+ pi.bits = pixman_image_get_data(pi.mask);
+ pi.op = thread->op;
+
+ if (clip->data)
+ span = pixmask_span__clipped;
+ else
+ span = pixmask_span;
+
+ tor_render(NULL, &tor, (void*)&pi, clip, span, false);
+
+ pixman_image_unref(pi.mask);
+ pixman_image_unref(pi.source);
+ pixman_image_unref(pi.image);
+ }
+
+ tor_fini(&tor);
+}
+
static bool
trapezoid_span_inplace__x8r8g8b8(CARD8 op,
PicturePtr dst,
@@ -4968,17 +5784,15 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
PictFormatPtr maskFormat,
int ntrap, xTrapezoid *traps)
{
- struct tor tor;
- span_func_t span;
uint32_t color;
- bool lerp;
+ bool lerp, is_solid;
RegionRec region;
- int16_t dst_x, dst_y;
int dx, dy;
- int n;
+ int num_threads, n;
lerp = false;
- if (sna_picture_is_solid(src, &color)) {
+ is_solid = sna_picture_is_solid(src, &color);
+ if (is_solid) {
if (op == PictOpOver && (color >> 24) == 0xff)
op = PictOpSrc;
if (op == PictOpOver && sna_drawable_is_clear(dst->pDrawable))
@@ -5037,43 +5851,66 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
region.extents.x1, region.extents.y1,
region.extents.x2, region.extents.y2));
- if (tor_init(&tor, &region.extents, 2*ntrap))
+ region.data = NULL;
+ if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &region,
+ MOVE_WRITE | MOVE_READ))
return true;
+ if (!is_solid && src->pDrawable) {
+ if (!sna_drawable_move_to_cpu(src->pDrawable,
+ MOVE_READ))
+ return true;
+
+ if (src->alphaMap &&
+ !sna_drawable_move_to_cpu(src->alphaMap->pDrawable,
+ MOVE_READ))
+ return true;
+ }
+
dx = dst->pDrawable->x * FAST_SAMPLES_X;
dy = dst->pDrawable->y * FAST_SAMPLES_Y;
- for (n = 0; n < ntrap; n++) {
- xTrapezoid t;
+ num_threads = sna_use_threads(4*(region.extents.x2 - region.extents.x1),
+ region.extents.y2 - region.extents.y1,
+ 8);
- if (!project_trapezoid_onto_grid(&traps[n], dx, dy, &t))
- continue;
+ DBG(("%s: %dx%d, format=%x, op=%d, lerp?=%d, num_threads=%d\n",
+ __FUNCTION__,
+ region.extents.x2 - region.extents.x1,
+ region.extents.y2 - region.extents.y1,
+ dst->format, op, lerp, num_threads));
- if (pixman_fixed_to_int(traps[n].top) >= region.extents.y2 - dst->pDrawable->y ||
- pixman_fixed_to_int(traps[n].bottom) < region.extents.y1 - dst->pDrawable->y)
- continue;
+ if (num_threads == 1) {
+ struct tor tor;
+ span_func_t span;
- tor_add_edge(&tor, &t, &t.left, 1);
- tor_add_edge(&tor, &t, &t.right, -1);
- }
+ if (tor_init(&tor, &region.extents, 2*ntrap))
+ return true;
- DBG(("%s: move-to-cpu\n", __FUNCTION__));
- region.data = NULL;
- if (sna_drawable_move_region_to_cpu(dst->pDrawable, &region,
- MOVE_WRITE | MOVE_READ)) {
- PixmapPtr pixmap;
+ for (n = 0; n < ntrap; n++) {
+ xTrapezoid t;
- pixmap = get_drawable_pixmap(dst->pDrawable);
- get_drawable_deltas(dst->pDrawable, pixmap, &dst_x, &dst_y);
+ if (!project_trapezoid_onto_grid(&traps[n], dx, dy, &t))
+ continue;
- DBG(("%s: format=%x, op=%d, color=%x\n",
- __FUNCTION__, dst->format, op, color));
+ if (pixman_fixed_to_int(traps[n].top) >= region.extents.y2 - dst->pDrawable->y ||
+ pixman_fixed_to_int(traps[n].bottom) < region.extents.y1 - dst->pDrawable->y)
+ continue;
+
+ tor_add_edge(&tor, &t, &t.left, 1);
+ tor_add_edge(&tor, &t, &t.right, -1);
+ }
if (lerp) {
struct inplace inplace;
+ PixmapPtr pixmap;
+ int16_t dst_x, dst_y;
+
+ pixmap = get_drawable_pixmap(dst->pDrawable);
+ get_drawable_deltas(dst->pDrawable, pixmap, &dst_x, &dst_y);
inplace.ptr = pixmap->devPrivate.ptr;
- inplace.ptr += dst_y * pixmap->devKind + dst_x;
+ inplace.ptr += dst_y * pixmap->devKind + dst_x * 4;
inplace.stride = pixmap->devKind;
inplace.color = color;
@@ -5087,7 +5924,29 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
tor_render(NULL, &tor, (void*)&inplace,
dst->pCompositeClip, span, false);
- tor_fini(&tor);
+ } else if (is_solid) {
+ struct pixman_inplace pi;
+
+ pi.image = image_from_pict(dst, false, &pi.dx, &pi.dy);
+ pi.op = op;
+ pi.color = color;
+
+ pi.bits = (uint32_t *)&pi.sx;
+ pi.source = pixman_image_create_bits(PIXMAN_a8r8g8b8,
+ 1, 1, pi.bits, 0);
+ pixman_image_set_repeat(pi.source, PIXMAN_REPEAT_NORMAL);
+
+ if (dst->pCompositeClip->data)
+ span = pixmask_span_solid__clipped;
+ else
+ span = pixmask_span_solid;
+
+ tor_render(NULL, &tor, (void*)&pi,
+ dst->pCompositeClip, span,
+ false);
+
+ pixman_image_unref(pi.source);
+ pixman_image_unref(pi.image);
} else {
struct pixman_inplace pi;
@@ -5108,24 +5967,106 @@ trapezoid_span_inplace__x8r8g8b8(CARD8 op,
tor_render(NULL, &tor, (void*)&pi,
dst->pCompositeClip, span,
false);
- tor_fini(&tor);
pixman_image_unref(pi.mask);
pixman_image_unref(pi.source);
pixman_image_unref(pi.image);
}
+
+ tor_fini(&tor);
+ } else {
+ struct inplace_x8r8g8b8_thread threads[num_threads];
+ int y, h;
+
+ DBG(("%s: using %d threads for inplace compositing %dx%d\n",
+ __FUNCTION__, num_threads,
+ region.extents.x2 - region.extents.x1,
+ region.extents.y2 - region.extents.y1));
+
+ threads[0].traps = traps;
+ threads[0].ntrap = ntrap;
+ threads[0].extents = region.extents;
+ threads[0].lerp = lerp;
+ threads[0].is_solid = is_solid;
+ threads[0].color = color;
+ threads[0].dx = dx;
+ threads[0].dy = dy;
+ threads[0].dst = dst;
+ threads[0].src = src;
+ threads[0].op = op;
+ threads[0].src_x = src_x;
+ threads[0].src_y = src_y;
+
+ y = region.extents.y1;
+ h = region.extents.y2 - region.extents.y1;
+ h = (h + num_threads - 1) / num_threads;
+
+ for (n = 1; n < num_threads; n++) {
+ threads[n] = threads[0];
+ threads[n].extents.y1 = y;
+ threads[n].extents.y2 = y += h;
+
+ sna_threads_run(inplace_x8r8g8b8_thread, &threads[n]);
+ }
+
+ threads[0].extents.y1 = y;
+ threads[0].extents.y2 = region.extents.y2;
+ inplace_x8r8g8b8_thread(&threads[0]);
+
+ sna_threads_wait();
}
return true;
}
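
The hunk above parallelises the in-place composite by slicing the damage region into horizontal bands: the band height is the ceiling of the total height over the thread count, workers 1..n-1 each take one full band via sna_threads_run(), and thread 0 keeps the (possibly shorter) tail band, runs it on the calling thread, then blocks in sna_threads_wait(). A minimal sketch of just that split arithmetic, with the patch's thread helpers omitted (the band struct and driver loop below are illustrative, not from the patch):

#include <stdio.h>

struct band { int y1, y2; };

/* Split [y1, y2) into n bands the same way the patch does:
 * h = ceil(total / n); workers get full bands, the caller
 * keeps whatever remains at the tail. */
static void split_bands(int y1, int y2, int n, struct band *out)
{
	int h = (y2 - y1 + n - 1) / n;	/* ceiling division */
	int i, y = y1;

	for (i = 1; i < n; i++) {
		out[i].y1 = y;
		out[i].y2 = y += h;
	}
	out[0].y1 = y;		/* caller's band: the remainder */
	out[0].y2 = y2;
}

int main(void)
{
	struct band b[4];
	int i;

	split_bands(0, 103, 4, b);
	for (i = 0; i < 4; i++)
		printf("band %d: [%d, %d)\n", i, b[i].y1, b[i].y2);
	return 0;
}

For 103 rows over 4 threads this yields bands of 26 rows for the workers and 25 for the caller, matching the y += h walk in the patch.
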
+struct inplace_thread {
+ xTrapezoid *traps;
+ RegionPtr clip;
+ span_func_t span;
+ struct inplace inplace;
+ BoxRec extents;
+ int dx, dy;
+ int draw_x, draw_y;
+ bool unbounded;
+ int ntrap;
+};
+
+static void inplace_thread(void *arg)
+{
+ struct inplace_thread *thread = arg;
+ struct tor tor;
+ int n;
+
+ if (tor_init(&tor, &thread->extents, 2*thread->ntrap))
+ return;
+
+ for (n = 0; n < thread->ntrap; n++) {
+ xTrapezoid t;
+
+ if (!project_trapezoid_onto_grid(&thread->traps[n], thread->dx, thread->dy, &t))
+ continue;
+
+ if (pixman_fixed_to_int(thread->traps[n].top) >= thread->extents.y2 - thread->draw_y ||
+ pixman_fixed_to_int(thread->traps[n].bottom) < thread->extents.y1 - thread->draw_y)
+ continue;
+
+ tor_add_edge(&tor, &t, &t.left, 1);
+ tor_add_edge(&tor, &t, &t.right, -1);
+ }
+
+ tor_render(NULL, &tor, (void*)&thread->inplace,
+ thread->clip, thread->span, thread->unbounded);
+
+ tor_fini(&tor);
+}
+
static bool
-trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
+trapezoid_span_inplace(struct sna *sna,
+ CARD8 op, PicturePtr src, PicturePtr dst,
PictFormatPtr maskFormat, INT16 src_x, INT16 src_y,
int ntrap, xTrapezoid *traps,
bool fallback)
{
- struct tor tor;
struct inplace inplace;
span_func_t span;
PixmapPtr pixmap;
@@ -5135,7 +6076,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
bool unbounded;
int16_t dst_x, dst_y;
int dx, dy;
- int n;
+ int num_threads, n;
if (NO_SCAN_CONVERTER)
return false;
@@ -5151,7 +6092,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
return false;
}
- if (!fallback && is_gpu(dst->pDrawable)) {
+ if (!fallback && is_gpu(sna, dst->pDrawable, PREFER_GPU_SPANS)) {
DBG(("%s: fallback -- can not perform operation in place, destination busy\n",
__FUNCTION__));
@@ -5159,7 +6100,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
}
if (is_mono(dst, maskFormat))
- return trapezoid_span_mono_inplace(op, src, dst,
+ return trapezoid_span_mono_inplace(sna, op, src, dst,
src_x, src_y, ntrap, traps);
if (dst->format == PICT_a8r8g8b8 || dst->format == PICT_x8r8g8b8)
@@ -5234,7 +6175,7 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
__FUNCTION__));
do {
/* XXX unwind errors? */
- if (!trapezoid_span_inplace(op, src, dst, NULL,
+ if (!trapezoid_span_inplace(sna, op, src, dst, NULL,
src_x, src_y, 1, traps++,
fallback))
return false;
@@ -5266,26 +6207,6 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
region.extents.x1, region.extents.y1,
region.extents.x2, region.extents.y2));
- if (tor_init(&tor, &region.extents, 2*ntrap))
- return true;
-
- dx = dst->pDrawable->x * FAST_SAMPLES_X;
- dy = dst->pDrawable->y * FAST_SAMPLES_Y;
-
- for (n = 0; n < ntrap; n++) {
- xTrapezoid t;
-
- if (!project_trapezoid_onto_grid(&traps[n], dx, dy, &t))
- continue;
-
- if (pixman_fixed_to_int(traps[n].top) >= region.extents.y2 - dst->pDrawable->y ||
- pixman_fixed_to_int(traps[n].bottom) < region.extents.y1 - dst->pDrawable->y)
- continue;
-
- tor_add_edge(&tor, &t, &t.left, 1);
- tor_add_edge(&tor, &t, &t.right, -1);
- }
-
if (op == PictOpSrc) {
if (dst->pCompositeClip->data)
span = tor_blt_src_clipped;
@@ -5310,6 +6231,9 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
op == PictOpSrc ? MOVE_WRITE | MOVE_INPLACE_HINT : MOVE_WRITE | MOVE_READ))
return true;
+ dx = dst->pDrawable->x * FAST_SAMPLES_X;
+ dy = dst->pDrawable->y * FAST_SAMPLES_Y;
+
get_drawable_deltas(dst->pDrawable, pixmap, &dst_x, &dst_y);
inplace.ptr = pixmap->devPrivate.ptr;
@@ -5317,10 +6241,72 @@ trapezoid_span_inplace(CARD8 op, PicturePtr src, PicturePtr dst,
inplace.stride = pixmap->devKind;
inplace.opacity = color >> 24;
- tor_render(NULL, &tor, (void*)&inplace,
- dst->pCompositeClip, span, unbounded);
+ num_threads = sna_use_threads(region.extents.x2 - region.extents.x1,
+ region.extents.y2 - region.extents.y1,
+ 8);
+ if (num_threads == 1) {
+ struct tor tor;
- tor_fini(&tor);
+ if (tor_init(&tor, &region.extents, 2*ntrap))
+ return true;
+
+ for (n = 0; n < ntrap; n++) {
+ xTrapezoid t;
+
+ if (!project_trapezoid_onto_grid(&traps[n], dx, dy, &t))
+ continue;
+
+ if (pixman_fixed_to_int(traps[n].top) >= region.extents.y2 - dst->pDrawable->y ||
+ pixman_fixed_to_int(traps[n].bottom) < region.extents.y1 - dst->pDrawable->y)
+ continue;
+
+ tor_add_edge(&tor, &t, &t.left, 1);
+ tor_add_edge(&tor, &t, &t.right, -1);
+ }
+
+ tor_render(NULL, &tor, (void*)&inplace,
+ dst->pCompositeClip, span, unbounded);
+
+ tor_fini(&tor);
+ } else {
+ struct inplace_thread threads[num_threads];
+ int y, h;
+
+ DBG(("%s: using %d threads for inplace compositing %dx%d\n",
+ __FUNCTION__, num_threads,
+ region.extents.x2 - region.extents.x1,
+ region.extents.y2 - region.extents.y1));
+
+ threads[0].traps = traps;
+ threads[0].ntrap = ntrap;
+ threads[0].inplace = inplace;
+ threads[0].extents = region.extents;
+ threads[0].clip = dst->pCompositeClip;
+ threads[0].span = span;
+ threads[0].unbounded = unbounded;
+ threads[0].dx = dx;
+ threads[0].dy = dy;
+ threads[0].draw_x = dst->pDrawable->x;
+ threads[0].draw_y = dst->pDrawable->y;
+
+ y = region.extents.y1;
+ h = region.extents.y2 - region.extents.y1;
+ h = (h + num_threads - 1) / num_threads;
+
+ for (n = 1; n < num_threads; n++) {
+ threads[n] = threads[0];
+ threads[n].extents.y1 = y;
+ threads[n].extents.y2 = y += h;
+
+ sna_threads_run(inplace_thread, &threads[n]);
+ }
+
+ threads[0].extents.y1 = y;
+ threads[0].extents.y2 = region.extents.y2;
+ inplace_thread(&threads[0]);
+
+ sna_threads_wait();
+ }
return true;
}
@@ -5441,32 +6427,14 @@ trapezoid_span_fallback(CARD8 op, PicturePtr src, PicturePtr dst,
region.extents.y2 = region.extents.y1 + extents.y2;
region.data = NULL;
- DBG(("%s: move-to-cpu\n", __FUNCTION__));
- if (!sna_drawable_move_region_to_cpu(dst->pDrawable, &region,
- MOVE_READ | MOVE_WRITE))
- goto done;
- if (dst->alphaMap &&
- !sna_drawable_move_to_cpu(dst->alphaMap->pDrawable,
- MOVE_READ | MOVE_WRITE))
- goto done;
- if (src->pDrawable) {
- if (!sna_drawable_move_to_cpu(src->pDrawable,
- MOVE_READ))
- goto done;
- if (src->alphaMap &&
- !sna_drawable_move_to_cpu(src->alphaMap->pDrawable,
- MOVE_READ))
- goto done;
- }
-
DBG(("%s: fbComposite()\n", __FUNCTION__));
- fbComposite(op, src, mask, dst,
- src_x + dst_x - pixman_fixed_to_int(traps[0].left.p1.x),
- src_y + dst_y - pixman_fixed_to_int(traps[0].left.p1.y),
- 0, 0,
- dst_x, dst_y,
- extents.x2, extents.y2);
-done:
+ sna_composite_fb(op, src, mask, dst, &region,
+ src_x + dst_x - pixman_fixed_to_int(traps[0].left.p1.x),
+ src_y + dst_y - pixman_fixed_to_int(traps[0].left.p1.y),
+ 0, 0,
+ dst_x, dst_y,
+ extents.x2, extents.y2);
+
FreePicture(mask, 0);
}
sna_pixmap_destroy(scratch);
@@ -5518,11 +6486,14 @@ sna_composite_trapezoids(CARD8 op,
force_fallback = FORCE_FALLBACK > 0;
if ((too_small(priv) || DAMAGE_IS_ALL(priv->cpu_damage)) &&
- !picture_is_gpu(src)) {
- DBG(("%s: force fallbacks -- dst is too small, %dx%d\n",
+ !picture_is_gpu(sna, src) && untransformed(src)) {
+ DBG(("%s: force fallbacks --too small, %dx%d? %d, all-cpu? %d, src-is-cpu? %d\n",
__FUNCTION__,
dst->pDrawable->width,
- dst->pDrawable->height));
+ dst->pDrawable->height,
+ too_small(priv),
+ (int)DAMAGE_IS_ALL(priv->cpu_damage),
+ !picture_is_gpu(sna, src)));
force_fallback = true;
}
if (FORCE_FALLBACK < 0)
@@ -5589,24 +6560,24 @@ sna_composite_trapezoids(CARD8 op,
goto fallback;
if (is_mono(dst, maskFormat) &&
- mono_trapezoids_span_converter(op, src, dst,
+ mono_trapezoids_span_converter(sna, op, src, dst,
xSrc, ySrc,
ntrap, traps))
return;
- if (trapezoid_spans_maybe_inplace(op, src, dst, maskFormat)) {
+ if (trapezoid_spans_maybe_inplace(sna, op, src, dst, maskFormat)) {
flags |= COMPOSITE_SPANS_INPLACE_HINT;
- if (trapezoid_span_inplace(op, src, dst, maskFormat,
+ if (trapezoid_span_inplace(sna, op, src, dst, maskFormat,
xSrc, ySrc, ntrap, traps,
false))
return;
}
- if (trapezoid_span_converter(op, src, dst, maskFormat, flags,
+ if (trapezoid_span_converter(sna, op, src, dst, maskFormat, flags,
xSrc, ySrc, ntrap, traps))
return;
- if (trapezoid_span_inplace(op, src, dst, maskFormat,
+ if (trapezoid_span_inplace(sna, op, src, dst, maskFormat,
xSrc, ySrc, ntrap, traps,
false))
return;
@@ -5616,7 +6587,7 @@ sna_composite_trapezoids(CARD8 op,
return;
fallback:
- if (trapezoid_span_inplace(op, src, dst, maskFormat,
+ if (trapezoid_span_inplace(sna, op, src, dst, maskFormat,
xSrc, ySrc, ntrap, traps,
true))
return;
@@ -5625,12 +6596,13 @@ fallback:
xSrc, ySrc, ntrap, traps))
return;
- if (trapezoids_inplace_fallback(op, src, dst, maskFormat, ntrap, traps))
+ if (trapezoids_inplace_fallback(sna, op, src, dst, maskFormat,
+ ntrap, traps))
return;
DBG(("%s: fallback mask=%08x, ntrap=%d\n", __FUNCTION__,
maskFormat ? (unsigned)maskFormat->format : 0, ntrap));
- trapezoids_fallback(op, src, dst, maskFormat,
+ trapezoids_fallback(sna, op, src, dst, maskFormat,
xSrc, ySrc,
ntrap, traps);
}
@@ -5652,7 +6624,8 @@ project_trap_onto_grid(const xTrap *in,
}
static bool
-mono_trap_span_converter(PicturePtr dst,
+mono_trap_span_converter(struct sna *sna,
+ PicturePtr dst,
INT16 x, INT16 y,
int ntrap, xTrap *traps)
{
@@ -5677,7 +6650,7 @@ mono_trap_span_converter(PicturePtr dst,
mono.clip.extents.x2, mono.clip.extents.y2,
x, y));
- mono.sna = to_sna_from_drawable(dst->pDrawable);
+ mono.sna = sna;
if (!mono_init(&mono, 2*ntrap))
return false;
@@ -5722,11 +6695,11 @@ mono_trap_span_converter(PicturePtr dst,
}
static bool
-trap_span_converter(PicturePtr dst,
+trap_span_converter(struct sna *sna,
+ PicturePtr dst,
INT16 src_x, INT16 src_y,
int ntrap, xTrap *trap)
{
- struct sna *sna;
struct sna_composite_spans_op tmp;
struct tor tor;
BoxRec extents;
@@ -5740,9 +6713,8 @@ trap_span_converter(PicturePtr dst,
return false;
if (dst->polyEdge == PolyEdgeSharp)
- return mono_trap_span_converter(dst, src_x, src_y, ntrap, trap);
+ return mono_trap_span_converter(sna, dst, src_x, src_y, ntrap, trap);
- sna = to_sna_from_drawable(dst->pDrawable);
if (!sna->render.check_composite_spans(sna, PictOpAdd, sna->render.white_picture, dst,
dst->pCompositeClip->extents.x2 - dst->pCompositeClip->extents.x1,
dst->pCompositeClip->extents.y2 - dst->pCompositeClip->extents.y1,
@@ -5806,7 +6778,7 @@ trap_span_converter(PicturePtr dst,
}
tor_render(sna, &tor, &tmp, clip,
- choose_span(&tmp, dst, NULL, PictOpAdd, clip), false);
+ choose_span(&tmp, dst, NULL, clip), false);
skip:
tor_fini(&tor);
@@ -5827,7 +6799,6 @@ static void mark_damaged(PixmapPtr pixmap, struct sna_pixmap *priv,
pixmap->drawable.width,
pixmap->drawable.height);
list_del(&priv->list);
- priv->undamaged = false;
} else {
sna_damage_add_box(&priv->gpu_damage, box);
sna_damage_subtract_box(&priv->cpu_damage, box);
@@ -5835,11 +6806,11 @@ static void mark_damaged(PixmapPtr pixmap, struct sna_pixmap *priv,
}
static bool
-trap_mask_converter(PicturePtr picture,
+trap_mask_converter(struct sna *sna,
+ PicturePtr picture,
INT16 x, INT16 y,
int ntrap, xTrap *trap)
{
- struct sna *sna;
struct tor tor;
ScreenPtr screen = picture->pDrawable->pScreen;
PixmapPtr scratch, pixmap;
@@ -6033,13 +7004,18 @@ trap_upload(PicturePtr picture,
void
sna_add_traps(PicturePtr picture, INT16 x, INT16 y, int n, xTrap *t)
{
+ struct sna *sna;
+
DBG(("%s (%d, %d) x %d\n", __FUNCTION__, x, y, n));
- if (is_gpu(picture->pDrawable)) {
- if (trap_span_converter(picture, x, y, n, t))
+ sna = to_sna_from_drawable(picture->pDrawable);
+ if (is_gpu(sna, picture->pDrawable, PREFER_GPU_SPANS)) {
+ if (trap_span_converter(sna, picture, x, y, n, t))
return;
+ }
- if (trap_mask_converter(picture, x, y, n, t))
+ if (is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER)) {
+ if (trap_mask_converter(sna, picture, x, y, n, t))
return;
if (trap_upload(picture, x, y, n, t))
@@ -6070,6 +7046,7 @@ project_point_onto_grid(const xPointFixed *in,
out->y = dy + pixman_fixed_to_grid(in->y);
}
+#if HAS_PIXMAN_TRIANGLES
static inline bool
xTriangleValid(const xTriangle *t)
{
@@ -6104,7 +7081,8 @@ project_triangle_onto_grid(const xTriangle *in,
}
static bool
-mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
+mono_triangles_span_converter(struct sna *sna,
+ CARD8 op, PicturePtr src, PicturePtr dst,
INT16 src_x, INT16 src_y,
int count, xTriangle *tri)
{
@@ -6115,7 +7093,7 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
bool was_clear;
int n;
- mono.sna = to_sna_from_drawable(dst->pDrawable);
+ mono.sna = sna;
dst_x = pixman_fixed_to_int(tri[0].p1.x);
dst_y = pixman_fixed_to_int(tri[0].p1.y);
@@ -6177,6 +7155,10 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
mono.clip.extents.x2 - mono.clip.extents.x1,
mono.clip.extents.y2 - mono.clip.extents.y1,
&mono.op)) {
+ if (mono.clip.data == NULL && mono.op.damage == NULL)
+ mono.span = mono_span__fast;
+ else
+ mono.span = mono_span;
mono_render(&mono);
mono.op.done(mono.sna, &mono.op);
}
@@ -6220,6 +7202,10 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
mono.clip.extents.x2 - mono.clip.extents.x1,
mono.clip.extents.y2 - mono.clip.extents.y1,
&mono.op)) {
+ if (mono.clip.data == NULL && mono.op.damage == NULL)
+ mono.span = mono_span__fast;
+ else
+ mono.span = mono_span;
mono_render(&mono);
mono.op.done(mono.sna, &mono.op);
}
@@ -6232,11 +7218,11 @@ mono_triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
}
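
Both mono triangle paths above now select mono_span__fast when there is neither a complex clip (clip.data == NULL) nor damage tracking (op.damage == NULL), and fall back to the general emitter otherwise. A sketch of that function-pointer dispatch in isolation (the span signature and struct layout here are illustrative):

#include <stdio.h>
#include <stddef.h>

struct op { void *damage; };
struct ctx {
	struct { void *data; } clip;
	struct op op;
	void (*span)(const char *what);
};

static void span_fast(const char *what)    { printf("fast: %s\n", what); }
static void span_general(const char *what) { printf("general: %s\n", what); }

static void choose_span(struct ctx *c)
{
	/* Skip per-span clipping/damage bookkeeping when neither applies. */
	if (c->clip.data == NULL && c->op.damage == NULL)
		c->span = span_fast;
	else
		c->span = span_general;
}

int main(void)
{
	struct ctx c = { { NULL }, { NULL }, NULL };

	choose_span(&c);
	c.span("unclipped, undamaged");

	c.op.damage = &c;
	choose_span(&c);
	c.span("damage tracked");
	return 0;
}
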
static bool
-triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
+triangles_span_converter(struct sna *sna,
+ CARD8 op, PicturePtr src, PicturePtr dst,
PictFormatPtr maskFormat, INT16 src_x, INT16 src_y,
int count, xTriangle *tri)
{
- struct sna *sna;
struct sna_composite_spans_op tmp;
struct tor tor;
BoxRec extents;
@@ -6249,7 +7235,7 @@ triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
return false;
if (is_mono(dst, maskFormat))
- return mono_triangles_span_converter(op, src, dst,
+ return mono_triangles_span_converter(sna, op, src, dst,
src_x, src_y,
count, tri);
@@ -6260,7 +7246,6 @@ triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
return false;
}
- sna = to_sna_from_drawable(dst->pDrawable);
if (!sna->render.check_composite_spans(sna, op, src, dst, 0, 0, 0)) {
DBG(("%s: fallback -- composite spans not supported\n",
__FUNCTION__));
@@ -6352,7 +7337,7 @@ triangles_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
}
tor_render(sna, &tor, &tmp, &clip,
- choose_span(&tmp, dst, maskFormat, op, &clip),
+ choose_span(&tmp, dst, maskFormat, &clip),
!was_clear && maskFormat && !operator_is_bounded(op));
skip:
@@ -6585,7 +7570,9 @@ sna_composite_triangles(CARD8 op,
INT16 xSrc, INT16 ySrc,
int n, xTriangle *tri)
{
- if (triangles_span_converter(op, src, dst, maskFormat,
+ struct sna *sna = to_sna_from_drawable(dst->pDrawable);
+
+ if (triangles_span_converter(sna, op, src, dst, maskFormat,
xSrc, ySrc,
n, tri))
return;
@@ -6599,11 +7586,11 @@ sna_composite_triangles(CARD8 op,
}
static bool
-tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
+tristrip_span_converter(struct sna *sna,
+ CARD8 op, PicturePtr src, PicturePtr dst,
PictFormatPtr maskFormat, INT16 src_x, INT16 src_y,
int count, xPointFixed *points)
{
- struct sna *sna;
struct sna_composite_spans_op tmp;
struct tor tor;
BoxRec extents;
@@ -6624,7 +7611,6 @@ tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
return false;
}
- sna = to_sna_from_drawable(dst->pDrawable);
if (!sna->render.check_composite_spans(sna, op, src, dst, 0, 0, 0)) {
DBG(("%s: fallback -- composite spans not supported\n",
__FUNCTION__));
@@ -6726,7 +7712,7 @@ tristrip_span_converter(CARD8 op, PicturePtr src, PicturePtr dst,
assert(tor.polygon->num_edges <= 2*count);
tor_render(sna, &tor, &tmp, &clip,
- choose_span(&tmp, dst, maskFormat, op, &clip),
+ choose_span(&tmp, dst, maskFormat, &clip),
!was_clear && maskFormat && !operator_is_bounded(op));
skip:
@@ -6865,7 +7851,9 @@ sna_composite_tristrip(CARD8 op,
INT16 xSrc, INT16 ySrc,
int n, xPointFixed *points)
{
- if (tristrip_span_converter(op, src, dst, maskFormat, xSrc, ySrc, n, points))
+ struct sna *sna = to_sna_from_drawable(dst->pDrawable);
+
+ if (tristrip_span_converter(sna, op, src, dst, maskFormat, xSrc, ySrc, n, points))
return;
tristrip_fallback(op, src, dst, maskFormat, xSrc, ySrc, n, points);
@@ -7001,3 +7989,4 @@ sna_composite_trifan(CARD8 op,
{
trifan_fallback(op, src, dst, maskFormat, xSrc, ySrc, n, points);
}
+#endif
diff --git a/src/sna/sna_vertex.c b/src/sna/sna_vertex.c
new file mode 100644
index 000000000..6755d9aad
--- /dev/null
+++ b/src/sna/sna_vertex.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ * Chris Wilson <chris@chris-wilson.co.uk>
+ *
+ */
+
+#include "sna.h"
+
+#include <unistd.h>
+
+void sna_vertex_init(struct sna *sna)
+{
+ pthread_mutex_init(&sna->render.lock, NULL);
+ pthread_cond_init(&sna->render.wait, NULL);
+ sna->render.active = 0;
+}
diff --git a/src/sna/sna_video.c b/src/sna/sna_video.c
index 7bf20e96b..07fa829fa 100644
--- a/src/sna/sna_video.c
+++ b/src/sna/sna_video.c
@@ -100,9 +100,16 @@ sna_video_buffer(struct sna *sna,
if (video->buf && __kgem_bo_size(video->buf) < frame->size)
sna_video_free_buffers(sna, video);
- if (video->buf == NULL)
- video->buf = kgem_create_linear(&sna->kgem, frame->size,
- CREATE_GTT_MAP);
+ if (video->buf == NULL) {
+ if (video->tiled) {
+ video->buf = kgem_create_2d(&sna->kgem,
+ frame->width, frame->height, 32,
+ I915_TILING_X, CREATE_EXACT);
+ } else {
+ video->buf = kgem_create_linear(&sna->kgem, frame->size,
+ CREATE_GTT_MAP);
+ }
+ }
return video->buf;
}
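
sna_video_buffer() keeps one cached bo per port, discarding it when a larger frame arrives and recreating it on demand, now either X-tiled 2D for tiled consumers or linear GTT-mapped otherwise. The cache-or-recreate shape in isolation (the allocator below is a stand-in for the kgem calls):

#include <stdlib.h>

struct buf { size_t size; /* ... */ };

static struct buf *alloc_buf(size_t size)
{
	struct buf *b = malloc(sizeof(*b));
	if (b)
		b->size = size;
	return b;
}

/* Reuse the cached buffer when it is big enough; otherwise drop it
 * and allocate afresh for the new frame size. */
static struct buf *get_frame_buffer(struct buf **cache, size_t need)
{
	if (*cache && (*cache)->size < need) {
		free(*cache);
		*cache = NULL;
	}
	if (*cache == NULL)
		*cache = alloc_buf(need);
	return *cache;
}

int main(void)
{
	struct buf *cache = NULL;

	get_frame_buffer(&cache, 4096);
	get_frame_buffer(&cache, 8192);	/* too small: freed, reallocated */
	get_frame_buffer(&cache, 1024);	/* big enough: reused as-is */
	free(cache);
	return 0;
}
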
@@ -166,14 +173,20 @@ sna_video_clip_helper(ScrnInfoPtr scrn,
if (crtc_region != reg)
RegionUninit(crtc_region);
- frame->top = y1 >> 16;
- frame->left = (x1 >> 16) & ~1;
- frame->npixels = ALIGN(((x2 + 0xffff) >> 16), 2) - frame->left;
+ frame->src.x1 = x1 >> 16;
+ frame->src.y1 = y1 >> 16;
+ frame->src.x2 = (x2 + 0xffff) >> 16;
+ frame->src.y2 = (y2 + 0xffff) >> 16;
+
+ frame->image.x1 = frame->src.x1 & ~1;
+ frame->image.x2 = ALIGN(frame->src.x2, 2);
if (is_planar_fourcc(frame->id)) {
- frame->top &= ~1;
- frame->nlines = ALIGN(((y2 + 0xffff) >> 16), 2) - frame->top;
- } else
- frame->nlines = ((y2 + 0xffff) >> 16) - frame->top;
+ frame->image.y1 = frame->src.y1 & ~1;
+ frame->image.y2 = ALIGN(frame->src.y2, 2);
+ } else {
+ frame->image.y1 = frame->src.y1;
+ frame->image.y2 = frame->src.y2;
+ }
return ret;
}
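
The clip helper converts the 16.16 fixed-point source rectangle into integer boxes: floor for the top-left (x >> 16), ceiling for the bottom-right ((x + 0xffff) >> 16), then widens image.x/y to even coordinates for planar formats whose chroma is subsampled 2x2. The rounding in isolation, with made-up sample values:

#include <assert.h>
#include <stdint.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

static int fixed_floor(int32_t v) { return v >> 16; }
static int fixed_ceil(int32_t v)  { return (v + 0xffff) >> 16; }

int main(void)
{
	int32_t x1 = 10 << 16 | 0x4000;	/* 10.25 */
	int32_t x2 = 17 << 16 | 0x0001;	/* just past 17.0 */

	assert(fixed_floor(x1) == 10);
	assert(fixed_ceil(x2) == 18);

	/* Planar chroma is subsampled 2x2, so the copied image box
	 * is widened to even bounds: x1 & ~1 and ALIGN(x2, 2). */
	assert((fixed_floor(x1) & ~1) == 10);
	assert(ALIGN(fixed_ceil(x2), 2) == 18);
	return 0;
}
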
@@ -186,51 +199,38 @@ sna_video_frame_init(struct sna *sna,
{
int align;
+ DBG(("%s: id=%d [planar? %d], width=%d, height=%d, align=%d\n",
+ __FUNCTION__, id, is_planar_fourcc(id), width, height, video->alignment));
+ assert(width && height);
+
frame->bo = NULL;
frame->id = id;
frame->width = width;
frame->height = height;
- /* Only needs to be DWORD-aligned for textured on i915, but overlay has
- * stricter requirements.
- */
- if (video->textured) {
- align = 4;
- } else {
- if (sna->kgem.gen >= 40)
- /* Actually the alignment is 64 bytes, too. But the
- * stride must be at least 512 bytes. Take the easy fix
- * and align on 512 bytes unconditionally. */
- align = 512;
- else if (sna->kgem.gen < 21)
- /* Harsh, errata on these chipsets limit the stride
- * to be a multiple of 256 bytes.
- */
- align = 256;
- else
- align = 64;
- }
-
+ align = video->alignment;
#if SNA_XVMC
/* for i915 xvmc, hw requires 1kb aligned surfaces */
- if (id == FOURCC_XVMC && sna->kgem.gen < 40)
+ if (id == FOURCC_XVMC && sna->kgem.gen < 040 && align < 1024)
align = 1024;
#endif
-
- /* Determine the desired destination pitch (representing the chroma's pitch,
- * in the planar case.
+ /* Determine the desired destination pitch (representing the
+ * chroma's pitch in the planar case).
*/
if (is_planar_fourcc(id)) {
+ assert((width & 1) == 0);
+ assert((height & 1) == 0);
if (video->rotation & (RR_Rotate_90 | RR_Rotate_270)) {
frame->pitch[0] = ALIGN((height / 2), align);
frame->pitch[1] = ALIGN(height, align);
- frame->size = 3U * frame->pitch[0] * width;
+ frame->size = width;
} else {
frame->pitch[0] = ALIGN((width / 2), align);
frame->pitch[1] = ALIGN(width, align);
- frame->size = 3U * frame->pitch[0] * height;
+ frame->size = height;
}
+ frame->size *= frame->pitch[0] + frame->pitch[1];
} else {
if (video->rotation & (RR_Rotate_90 | RR_Rotate_270)) {
frame->pitch[0] = ALIGN((height << 1), align);
@@ -251,51 +251,68 @@ sna_video_frame_init(struct sna *sna,
frame->VBufOffset =
frame->UBufOffset + (int)frame->pitch[0] * height / 2;
}
+
+ assert(frame->size);
}
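
For planar FOURCCs the frame is now laid out as a full-pitch luma plane followed by two half-pitch chroma planes, so the total is height * (pitch[0] + pitch[1]), with U and V each occupying pitch[0] * height / 2. Worked numbers for an unrotated 320x240 YV12 frame at 64-byte alignment; the pitch and size formulas are the patch's, while the U offset following directly after the luma plane is an assumption based on the surrounding code:

#include <stdio.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

int main(void)
{
	int width = 320, height = 240, align = 64;
	int pitch0 = ALIGN(width / 2, align);	/* chroma: 192 */
	int pitch1 = ALIGN(width, align);	/* luma:   320 */
	int size = height * (pitch0 + pitch1);	/* 240 * 512 = 122880 */
	int UBufOffset = pitch1 * height;		  /* 76800 */
	int VBufOffset = UBufOffset + pitch0 * height / 2; /* 99840 */

	printf("pitches=%d/%d size=%d U@%d V@%d\n",
	       pitch1, pitch0, size, UBufOffset, VBufOffset);
	return 0;
}
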
-static void sna_memcpy_plane(uint8_t *dst, const uint8_t *src,
- int height, int width,
- int dstPitch, int srcPitch,
- Rotation rotation)
+static void sna_memcpy_plane(struct sna_video *video,
+ uint8_t *dst, const uint8_t *src,
+ const struct sna_video_frame *frame, int sub)
{
+ int dstPitch = frame->pitch[!sub], srcPitch;
const uint8_t *s;
int i, j = 0;
+ int x, y, w, h;
+
+ x = frame->image.x1;
+ y = frame->image.y1;
+ w = frame->image.x2 - frame->image.x1;
+ h = frame->image.y2 - frame->image.y1;
+ if (sub) {
+ x >>= 1; w >>= 1;
+ y >>= 1; h >>= 1;
+ srcPitch = ALIGN((frame->width >> 1), 4);
+ } else
+ srcPitch = ALIGN(frame->width, 4);
+
+ src += y * srcPitch + x;
+ if (!video->textured)
+ x = y = 0;
- switch (rotation) {
+ switch (video->rotation) {
case RR_Rotate_0:
- /* optimise for the case of no clipping */
- if (srcPitch == dstPitch && srcPitch == width)
- memcpy(dst, src, srcPitch * height);
- else while (height--) {
- memcpy(dst, src, width);
+ dst += y * dstPitch + x;
+ if (srcPitch == dstPitch && srcPitch == w)
+ memcpy(dst, src, srcPitch * h);
+ else while (h--) {
+ memcpy(dst, src, w);
src += srcPitch;
dst += dstPitch;
}
break;
case RR_Rotate_90:
- for (i = 0; i < height; i++) {
+ for (i = 0; i < h; i++) {
s = src;
- for (j = 0; j < width; j++) {
- dst[(i) + ((width - j - 1) * dstPitch)] = *s++;
- }
+ for (j = 0; j < w; j++)
+ dst[i + ((x + w - j - 1) * dstPitch)] = *s++;
src += srcPitch;
}
break;
case RR_Rotate_180:
- for (i = 0; i < height; i++) {
+ for (i = 0; i < h; i++) {
s = src;
- for (j = 0; j < width; j++) {
- dst[(width - j - 1) +
- ((height - i - 1) * dstPitch)] = *s++;
+ for (j = 0; j < w; j++) {
+ dst[(x + w - j - 1) +
+ ((h - i - 1) * dstPitch)] = *s++;
}
src += srcPitch;
}
break;
case RR_Rotate_270:
- for (i = 0; i < height; i++) {
+ for (i = 0; i < h; i++) {
s = src;
- for (j = 0; j < width; j++) {
- dst[(height - i - 1) + (j * dstPitch)] = *s++;
+ for (j = 0; j < w; j++) {
+ dst[(h - i - 1) + ((x + j) * dstPitch)] = *s++;
}
src += srcPitch;
}
@@ -309,36 +326,22 @@ sna_copy_planar_data(struct sna_video *video,
const uint8_t *src, uint8_t *dst)
{
uint8_t *d;
- int w = frame->npixels;
- int h = frame->nlines;
- int pitch;
- pitch = ALIGN(frame->width, 4);
- sna_memcpy_plane(dst, src + frame->top * pitch + frame->left,
- h, w, frame->pitch[1], pitch, video->rotation);
-
- src += frame->height * pitch; /* move over Luma plane */
-
- /* align to beginning of chroma planes */
- pitch = ALIGN((frame->width >> 1), 0x4);
- src += (frame->top >> 1) * pitch + (frame->left >> 1);
- w >>= 1;
- h >>= 1;
+ sna_memcpy_plane(video, dst, src, frame, 0);
+ src += frame->height * ALIGN(frame->width, 4);
if (frame->id == FOURCC_I420)
d = dst + frame->UBufOffset;
else
d = dst + frame->VBufOffset;
-
- sna_memcpy_plane(d, src, h, w, frame->pitch[0], pitch, video->rotation);
- src += (frame->height >> 1) * pitch; /* move over Chroma plane */
+ sna_memcpy_plane(video, d, src, frame, 1);
+ src += (frame->height >> 1) * ALIGN(frame->width >> 1, 4);
if (frame->id == FOURCC_I420)
d = dst + frame->VBufOffset;
else
d = dst + frame->UBufOffset;
-
- sna_memcpy_plane(d, src, h, w, frame->pitch[0], pitch, video->rotation);
+ sna_memcpy_plane(video, d, src, frame, 1);
}
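
sna_copy_planar_data() copies Y first, then whichever chroma plane comes first in the source: I420 stores U before V, YV12 stores V before U, so the first half-size plane lands at UBufOffset for I420 and at VBufOffset for YV12, and the second at the other offset. The selection in isolation (the fourcc macros are the standard little-endian encoding, not from the patch):

#include <assert.h>
#include <stdint.h>

#define FOURCC(a,b,c,d) ((uint32_t)(a) | ((uint32_t)(b) << 8) | \
			 ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24))
#define FOURCC_I420 FOURCC('I','4','2','0')
#define FOURCC_YV12 FOURCC('Y','V','1','2')

struct frame { uint32_t id; uint32_t UBufOffset, VBufOffset; };

/* Offset of the n-th chroma plane (n = 0 or 1) in the destination:
 * I420 carries U first, YV12 carries V first. */
static uint32_t chroma_offset(const struct frame *f, int n)
{
	if ((f->id == FOURCC_I420) ^ n)
		return f->UBufOffset;
	return f->VBufOffset;
}

int main(void)
{
	struct frame f = { FOURCC_YV12, 100, 200 };

	assert(chroma_offset(&f, 0) == f.VBufOffset);
	assert(chroma_offset(&f, 1) == f.UBufOffset);
	return 0;
}
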
static void
@@ -349,11 +352,22 @@ sna_copy_packed_data(struct sna_video *video,
{
int pitch = frame->width << 1;
const uint8_t *src, *s;
- int w = frame->npixels;
- int h = frame->nlines;
+ int x, y, w, h;
int i, j;
- src = buf + (frame->top * pitch) + (frame->left << 1);
+ if (video->textured) {
+ /* XXX support copying cropped extents */
+ x = y = 0;
+ w = frame->width;
+ h = frame->height;
+ } else {
+ x = frame->image.x1;
+ y = frame->image.y1;
+ w = frame->image.x2 - frame->image.x1;
+ h = frame->image.y2 - frame->image.y1;
+ }
+
+ src = buf + (y * pitch) + (x << 1);
switch (video->rotation) {
case RR_Rotate_0:
@@ -376,7 +390,7 @@ sna_copy_packed_data(struct sna_video *video,
src += pitch;
}
h >>= 1;
- src = buf + (frame->top * pitch) + (frame->left << 1);
+ src = buf + (y * pitch) + (x << 1);
for (i = 0; i < h; i += 2) {
for (j = 0; j < w; j += 2) {
/* Copy U */
@@ -412,7 +426,7 @@ sna_copy_packed_data(struct sna_video *video,
src += pitch;
}
h >>= 1;
- src = buf + (frame->top * pitch) + (frame->left << 1);
+ src = buf + (y * pitch) + (x << 1);
for (i = 0; i < h; i += 2) {
for (j = 0; j < w; j += 2) {
/* Copy U */
@@ -435,27 +449,28 @@ sna_video_copy_data(struct sna *sna,
{
uint8_t *dst;
- DBG(("%s: handle=%d, size=%dx%d, rotation=%d\n",
+ DBG(("%s: handle=%d, size=%dx%d [%d], rotation=%d, is-texture=%d\n",
__FUNCTION__, frame->bo ? frame->bo->handle : 0,
- frame->width, frame->height, video->rotation));
- DBG(("%s: top=%d, left=%d\n", __FUNCTION__, frame->top, frame->left));
+ frame->width, frame->height, frame->size,
+ video->rotation, video->textured));
+ DBG(("%s: image=(%d, %d), (%d, %d), source=(%d, %d), (%d, %d)\n",
+ __FUNCTION__,
+ frame->image.x1, frame->image.y1, frame->image.x2, frame->image.y2,
+ frame->src.x1, frame->src.y1, frame->src.x2, frame->src.y2));
+ assert(frame->width && frame->height);
+ assert(frame->size);
/* In the common case, we can simply do the upload in a single pwrite */
- if (video->rotation == RR_Rotate_0) {
+ if (video->rotation == RR_Rotate_0 && !video->tiled) {
if (is_planar_fourcc(frame->id)) {
- uint16_t pitch[2] = {
- ALIGN((frame->width >> 1), 0x4),
- ALIGN(frame->width, 0x4),
- };
- if (pitch[0] == frame->pitch[0] &&
- pitch[1] == frame->pitch[1] &&
- frame->top == 0 && frame->left == 0) {
- uint32_t len =
- (uint32_t)pitch[1]*frame->height +
- (uint32_t)pitch[0]*frame->height;
+ int w = frame->image.x2 - frame->image.x1;
+ int h = frame->image.y2 - frame->image.y1;
+ if (ALIGN(h, 2) == frame->height &&
+ ALIGN(w >> 1, 4) == frame->pitch[0] &&
+ ALIGN(w, 4) == frame->pitch[1]) {
if (frame->bo) {
kgem_bo_write(&sna->kgem, frame->bo,
- buf, len);
+ buf, frame->size);
} else {
frame->bo = kgem_create_buffer(&sna->kgem, frame->size,
KGEM_BUFFER_WRITE | KGEM_BUFFER_WRITE_INPLACE,
@@ -463,7 +478,7 @@ sna_video_copy_data(struct sna *sna,
if (frame->bo == NULL)
return false;
- memcpy(dst, buf, len);
+ memcpy(dst, buf, frame->size);
}
if (frame->id != FOURCC_I420) {
uint32_t tmp;
@@ -477,8 +492,8 @@ sna_video_copy_data(struct sna *sna,
if (frame->width*2 == frame->pitch[0]) {
if (frame->bo) {
kgem_bo_write(&sna->kgem, frame->bo,
- buf + (2U*frame->top * frame->width) + (frame->left << 1),
- 2U*frame->nlines*frame->width);
+ buf + (2U*frame->image.y1 * frame->width) + (frame->image.x1 << 1),
+ 2U*(frame->image.y2-frame->image.y1)*frame->width);
} else {
frame->bo = kgem_create_buffer(&sna->kgem, frame->size,
KGEM_BUFFER_WRITE | KGEM_BUFFER_WRITE_INPLACE,
@@ -487,8 +502,8 @@ sna_video_copy_data(struct sna *sna,
return false;
memcpy(dst,
- buf + (frame->top * frame->width*2) + (frame->left << 1),
- 2U*frame->nlines*frame->width);
+ buf + (frame->image.y1 * frame->width*2) + (frame->image.x1 << 1),
+ 2U*(frame->image.y2-frame->image.y1)*frame->width);
}
return true;
}
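
The rewritten fast path uploads planar data with a single write only when the destination pitches equal the natural 4-byte-aligned pitches of the cropped image and the image height matches the even-aligned frame height, i.e. when source and destination layouts coincide byte for byte. The shape of that check (field names follow the patch; the helper itself is illustrative):

#include <stdbool.h>

#define ALIGN(v, a) (((v) + (a) - 1) & ~((a) - 1))

struct box { int x1, y1, x2, y2; };

/* True when the cropped planar image can be pushed to the bo with a
 * single contiguous write, i.e. no per-line repacking is needed. */
static bool planar_fits_single_write(const struct box *image,
				     int frame_height,
				     int pitch_chroma, int pitch_luma)
{
	int w = image->x2 - image->x1;
	int h = image->y2 - image->y1;

	return ALIGN(h, 2) == frame_height &&
	       ALIGN(w >> 1, 4) == pitch_chroma &&
	       ALIGN(w, 4) == pitch_luma;
}

int main(void)
{
	struct box img = { 0, 0, 320, 240 };

	return !planar_fits_single_write(&img, 240, 160, 320);
}
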
diff --git a/src/sna/sna_video.h b/src/sna/sna_video.h
index 3ce72c009..c0c023cf9 100644
--- a/src/sna/sna_video.h
+++ b/src/sna/sna_video.h
@@ -57,6 +57,8 @@ struct sna_video {
struct kgem_bo *old_buf[2];
struct kgem_bo *buf;
+ int alignment;
+ bool tiled;
bool textured;
Rotation rotation;
int plane;
@@ -75,8 +77,8 @@ struct sna_video_frame {
uint16_t pitch[2];
/* extents */
- uint16_t top, left;
- uint16_t npixels, nlines;
+ BoxRec image;
+ BoxRec src;
};
void sna_video_init(struct sna *sna, ScreenPtr screen);
diff --git a/src/sna/sna_video_hwmc.c b/src/sna/sna_video_hwmc.c
index b0e8d25d2..b3e065d95 100644
--- a/src/sna/sna_video_hwmc.c
+++ b/src/sna/sna_video_hwmc.c
@@ -36,63 +36,72 @@
#include <X11/extensions/XvMC.h>
#include <fourcc.h>
-static int create_subpicture(ScrnInfoPtr scrn, XvMCSubpicturePtr subpicture,
- int *num_priv, CARD32 ** priv)
+extern DevPrivateKey XF86XvScreenKey;
+
+static int create_subpicture(XvMCSubpicturePtr sub, int *size, CARD32 **priv)
{
return Success;
}
-static void destroy_subpicture(ScrnInfoPtr scrn, XvMCSubpicturePtr subpicture)
+static void destroy_subpicture(XvMCSubpicturePtr sub)
{
}
-static int create_surface(ScrnInfoPtr scrn, XvMCSurfacePtr surface,
- int *num_priv, CARD32 ** priv)
+static int create_surface(XvMCSurfacePtr surface, int *size, CARD32 **priv)
{
return Success;
}
-static void destroy_surface(ScrnInfoPtr scrn, XvMCSurfacePtr surface)
+static void destroy_surface(XvMCSurfacePtr surface)
{
}
-static int create_context(ScrnInfoPtr scrn, XvMCContextPtr pContext,
- int *num_priv, CARD32 **priv)
+static int create_context(XvPortPtr port, XvMCContextPtr ctx,
+ int *size, CARD32 **out)
{
- struct sna *sna = to_sna(scrn);
- struct sna_xvmc_hw_context *contextRec;
+ struct sna *sna = to_sna_from_screen(ctx->pScreen);
+ struct intel_xvmc_hw_context {
+ unsigned int type;
+ union {
+ struct {
+ unsigned int use_phys_addr : 1;
+ } i915;
+ struct {
+ unsigned int is_g4x:1;
+ unsigned int is_965_q:1;
+ unsigned int is_igdng:1;
+ } i965;
+ };
+ } *priv;
- *priv = calloc(1, sizeof(struct sna_xvmc_hw_context));
- contextRec = (struct sna_xvmc_hw_context *) *priv;
- if (!contextRec) {
- *num_priv = 0;
- return BadAlloc;
- }
+ ctx->port_priv = port->devPriv.ptr;
- *num_priv = sizeof(struct sna_xvmc_hw_context) >> 2;
+ priv = calloc(1, sizeof(*priv));
+ if (priv == NULL)
+ return BadAlloc;
- if (sna->kgem.gen >= 40) {
- if (sna->kgem.gen >= 45)
- contextRec->type = XVMC_I965_MPEG2_VLD;
+ if (sna->kgem.gen >= 040) {
+ if (sna->kgem.gen >= 045)
+ priv->type = XVMC_I965_MPEG2_VLD;
else
- contextRec->type = XVMC_I965_MPEG2_MC;
- contextRec->i965.is_g4x = sna->kgem.gen == 45;
- contextRec->i965.is_965_q = IS_965_Q(sna);
- contextRec->i965.is_igdng = sna->kgem.gen == 50;
- } else {
- contextRec->type = XVMC_I915_MPEG2_MC;
- contextRec->i915.use_phys_addr = 0;
- }
+ priv->type = XVMC_I965_MPEG2_MC;
+ priv->i965.is_g4x = sna->kgem.gen == 045;
+ priv->i965.is_965_q = IS_965_Q(sna);
+ priv->i965.is_igdng = sna->kgem.gen == 050;
+ } else
+ priv->type = XVMC_I915_MPEG2_MC;
+ *size = sizeof(*priv) >> 2;
+ *out = (CARD32 *)priv;
return Success;
}
-static void destroy_context(ScrnInfoPtr scrn, XvMCContextPtr context)
+static void destroy_context(XvMCContextPtr ctx)
{
}
/* i915 hwmc support */
-static XF86MCSurfaceInfoRec i915_YV12_mpg2_surface = {
+static XvMCSurfaceInfoRec i915_YV12_mpg2_surface = {
FOURCC_YV12,
XVMC_CHROMA_FORMAT_420,
0,
@@ -107,7 +116,7 @@ static XF86MCSurfaceInfoRec i915_YV12_mpg2_surface = {
NULL,
};
-static XF86MCSurfaceInfoRec i915_YV12_mpg1_surface = {
+static XvMCSurfaceInfoRec i915_YV12_mpg1_surface = {
FOURCC_YV12,
XVMC_CHROMA_FORMAT_420,
0,
@@ -121,9 +130,9 @@ static XF86MCSurfaceInfoRec i915_YV12_mpg1_surface = {
NULL,
};
-static XF86MCSurfaceInfoPtr surface_info_i915[2] = {
- (XF86MCSurfaceInfoPtr) & i915_YV12_mpg2_surface,
- (XF86MCSurfaceInfoPtr) & i915_YV12_mpg1_surface
+static XvMCSurfaceInfoPtr surface_info_i915[2] = {
+ &i915_YV12_mpg2_surface,
+ &i915_YV12_mpg1_surface
};
/* i965 and later hwmc support */
@@ -131,7 +140,7 @@ static XF86MCSurfaceInfoPtr surface_info_i915[2] = {
#define XVMC_VLD 0x00020000
#endif
-static XF86MCSurfaceInfoRec yv12_mpeg2_vld_surface = {
+static XvMCSurfaceInfoRec yv12_mpeg2_vld_surface = {
FOURCC_YV12,
XVMC_CHROMA_FORMAT_420,
0,
@@ -144,7 +153,7 @@ static XF86MCSurfaceInfoRec yv12_mpeg2_vld_surface = {
NULL
};
-static XF86MCSurfaceInfoRec yv12_mpeg2_i965_surface = {
+static XvMCSurfaceInfoRec yv12_mpeg2_i965_surface = {
FOURCC_YV12,
XVMC_CHROMA_FORMAT_420,
0,
@@ -159,7 +168,7 @@ static XF86MCSurfaceInfoRec yv12_mpeg2_i965_surface = {
NULL
};
-static XF86MCSurfaceInfoRec yv12_mpeg1_i965_surface = {
+static XvMCSurfaceInfoRec yv12_mpeg1_i965_surface = {
FOURCC_YV12,
XVMC_CHROMA_FORMAT_420,
0,
@@ -176,12 +185,12 @@ static XF86MCSurfaceInfoRec yv12_mpeg1_i965_surface = {
NULL
};
-static XF86MCSurfaceInfoPtr surface_info_i965[] = {
+static XvMCSurfaceInfoPtr surface_info_i965[] = {
&yv12_mpeg2_i965_surface,
&yv12_mpeg1_i965_surface
};
-static XF86MCSurfaceInfoPtr surface_info_vld[] = {
+static XvMCSurfaceInfoPtr surface_info_vld[] = {
&yv12_mpeg2_vld_surface,
&yv12_mpeg2_i965_surface,
};
@@ -191,63 +200,76 @@ Bool sna_video_xvmc_setup(struct sna *sna,
ScreenPtr screen,
XF86VideoAdaptorPtr target)
{
- XF86MCAdaptorRec *pAdapt;
+ XvMCAdaptorRec *adaptors;
+ XvScreenPtr xv;
const char *name;
- char buf[64];
+ char bus[64];
+ int i;
+
+ if (!xf86LoaderCheckSymbol("XvMCScreenInit"))
+ return FALSE;
/* Needs KMS support. */
- if (sna->kgem.gen < 31)
+ if (sna->kgem.gen < 031)
return FALSE;
/* Not implemented */
- if (sna->kgem.gen >= 60)
+ if (sna->kgem.gen >= 060)
return FALSE;
- pAdapt = calloc(1, sizeof(XF86MCAdaptorRec));
- if (!pAdapt)
+ adaptors = calloc(1, sizeof(XvMCAdaptorRec));
+ if (adaptors == NULL)
return FALSE;
- pAdapt->name = target->name;
- pAdapt->num_subpictures = 0;
- pAdapt->subpictures = NULL;
- pAdapt->CreateContext = create_context;
- pAdapt->DestroyContext = destroy_context;
- pAdapt->CreateSurface = create_surface;
- pAdapt->DestroySurface = destroy_surface;
- pAdapt->CreateSubpicture = create_subpicture;
- pAdapt->DestroySubpicture = destroy_subpicture;
-
- if (sna->kgem.gen >= 45) {
+ xv = dixLookupPrivate(&screen->devPrivates, XF86XvScreenKey);
+ for (i = 0; i< xv->nAdaptors;i++) {
+ if (strcmp(xv->pAdaptors[i].name, target->name) == 0) {
+ adaptors->xv_adaptor = &xv->pAdaptors[i];
+ break;
+ }
+ }
+ assert(adaptors->xv_adaptor);
+
+ adaptors->num_subpictures = 0;
+ adaptors->subpictures = NULL;
+ adaptors->CreateContext = create_context;
+ adaptors->DestroyContext = destroy_context;
+ adaptors->CreateSurface = create_surface;
+ adaptors->DestroySurface = destroy_surface;
+ adaptors->CreateSubpicture = create_subpicture;
+ adaptors->DestroySubpicture = destroy_subpicture;
+
+ if (sna->kgem.gen >= 045) {
name = "xvmc_vld",
- pAdapt->num_surfaces = ARRAY_SIZE(surface_info_vld);
- pAdapt->surfaces = surface_info_vld;
- } else if (sna->kgem.gen >= 40) {
+ adaptors->num_surfaces = ARRAY_SIZE(surface_info_vld);
+ adaptors->surfaces = surface_info_vld;
+ } else if (sna->kgem.gen >= 040) {
name = "i965_xvmc",
- pAdapt->num_surfaces = ARRAY_SIZE(surface_info_i965);
- pAdapt->surfaces = surface_info_i965;
+ adaptors->num_surfaces = ARRAY_SIZE(surface_info_i965);
+ adaptors->surfaces = surface_info_i965;
} else {
name = "i915_xvmc",
- pAdapt->num_surfaces = ARRAY_SIZE(surface_info_i915);
- pAdapt->surfaces = surface_info_i915;
+ adaptors->num_surfaces = ARRAY_SIZE(surface_info_i915);
+ adaptors->surfaces = surface_info_i915;
}
- if (xf86XvMCScreenInit(screen, 1, &pAdapt)) {
- xf86DrvMsg(sna->scrn->scrnIndex, X_INFO,
- "[XvMC] %s driver initialized.\n",
- name);
- } else {
+ if (XvMCScreenInit(screen, 1, adaptors) != Success) {
xf86DrvMsg(sna->scrn->scrnIndex, X_INFO,
"[XvMC] Failed to initialize XvMC.\n");
+ free(adaptors);
return FALSE;
}
- sprintf(buf, "pci:%04x:%02x:%02x.%d",
+ sprintf(bus, "pci:%04x:%02x:%02x.%d",
sna->PciInfo->domain,
sna->PciInfo->bus, sna->PciInfo->dev, sna->PciInfo->func);
- xf86XvMCRegisterDRInfo(screen, SNA_XVMC_LIBNAME,
- buf,
+ xf86XvMCRegisterDRInfo(screen, SNA_XVMC_LIBNAME, bus,
SNA_XVMC_MAJOR, SNA_XVMC_MINOR,
SNA_XVMC_PATCHLEVEL);
+
+ xf86DrvMsg(sna->scrn->scrnIndex, X_INFO,
+ "[XvMC] %s driver initialized.\n",
+ name);
return TRUE;
}
diff --git a/src/sna/sna_video_hwmc.h b/src/sna/sna_video_hwmc.h
index 2494d44bd..44de456e9 100644
--- a/src/sna/sna_video_hwmc.h
+++ b/src/sna/sna_video_hwmc.h
@@ -32,38 +32,12 @@
#define SNA_XVMC_MINOR 1
#define SNA_XVMC_PATCHLEVEL 0
-/*
- * Commands that client submits through XvPutImage:
- */
-
-#define SNA_XVMC_COMMAND_DISPLAY 0x00
-#define SNA_XVMC_COMMAND_UNDISPLAY 0x01
-
/* hw xvmc support type */
#define XVMC_I915_MPEG2_MC 0x01
#define XVMC_I965_MPEG2_MC 0x02
#define XVMC_I945_MPEG2_VLD 0x04
#define XVMC_I965_MPEG2_VLD 0x08
-struct sna_xvmc_hw_context {
- unsigned int type;
- union {
- struct {
- unsigned int use_phys_addr : 1;
- } i915;
- struct {
- unsigned int is_g4x:1;
- unsigned int is_965_q:1;
- unsigned int is_igdng:1;
- } i965;
- };
-};
-
-/* Intel private XvMC command to DDX driver */
-struct sna_xvmc_command {
- uint32_t handle;
-};
-
#ifdef _SNA_XVMC_SERVER_
#include <xf86xvmc.h>
Bool sna_video_xvmc_setup(struct sna *sna,
diff --git a/src/sna/sna_video_overlay.c b/src/sna/sna_video_overlay.c
index b73e9ddf8..3655b8763 100644
--- a/src/sna/sna_video_overlay.c
+++ b/src/sna/sna_video_overlay.c
@@ -41,7 +41,7 @@
#define MAKE_ATOM(a) MakeAtom(a, sizeof(a) - 1, TRUE)
-#define HAS_GAMMA(sna) ((sna)->kgem.gen >= 30)
+#define HAS_GAMMA(sna) ((sna)->kgem.gen >= 030)
static Atom xvBrightness, xvContrast, xvSaturation, xvColorKey, xvPipe;
static Atom xvGamma0, xvGamma1, xvGamma2, xvGamma3, xvGamma4, xvGamma5;
@@ -296,7 +296,7 @@ sna_video_overlay_query_best_size(ScrnInfoPtr scrn,
drw_h = vid_h >> 1;
}
- if (sna->kgem.gen < 21) {
+ if (sna->kgem.gen < 021) {
max_w = IMAGE_MAX_WIDTH_LEGACY;
max_h = IMAGE_MAX_HEIGHT_LEGACY;
} else {
@@ -532,6 +532,7 @@ sna_video_overlay_put_image(ScrnInfoPtr scrn,
return BadAlloc;
}
+ frame.bo->domain = DOMAIN_NONE;
sna_video_buffer_fini(sna, video);
/* update cliplist */
@@ -554,7 +555,7 @@ sna_video_overlay_query_video_attributes(ScrnInfoPtr scrn,
DBG(("%s: w is %d, h is %d\n", __FUNCTION__, *w, *h));
- if (sna->kgem.gen < 21) {
+ if (sna->kgem.gen < 021) {
if (*w > IMAGE_MAX_WIDTH_LEGACY)
*w = IMAGE_MAX_WIDTH_LEGACY;
if (*h > IMAGE_MAX_HEIGHT_LEGACY)
@@ -664,7 +665,7 @@ XF86VideoAdaptorPtr sna_video_overlay_setup(struct sna *sna,
adaptor->nEncodings = 1;
adaptor->pEncodings = xnfalloc(sizeof(DummyEncoding));
memcpy(adaptor->pEncodings, DummyEncoding, sizeof(DummyEncoding));
- if (sna->kgem.gen < 21) {
+ if (sna->kgem.gen < 021) {
adaptor->pEncodings->width = IMAGE_MAX_WIDTH_LEGACY;
adaptor->pEncodings->height = IMAGE_MAX_HEIGHT_LEGACY;
}
@@ -701,6 +702,18 @@ XF86VideoAdaptorPtr sna_video_overlay_setup(struct sna *sna,
adaptor->PutImage = sna_video_overlay_put_image;
adaptor->QueryImageAttributes = sna_video_overlay_query_video_attributes;
+ if (sna->kgem.gen >= 040)
+ /* Actually the alignment is 64 bytes, too. But the
+ * stride must be at least 512 bytes. Take the easy fix
+ * and align on 512 bytes unconditionally. */
+ video->alignment = 512;
+ else if (sna->kgem.gen < 021)
+ /* Harsh, errata on these chipsets limit the stride
+ * to be a multiple of 256 bytes.
+ */
+ video->alignment = 256;
+ else
+ video->alignment = 64;
video->textured = false;
video->color_key = sna_video_overlay_color_key(sna);
video->brightness = -19; /* (255/219) * -16 */
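
The default brightness of -19 comes from rescaling the video-range black offset: BT.601 narrow range spans 219 luma levels (16..235), so mapping a -16 video-level bias onto the full 0..255 range gives -16 * 255/219, roughly -18.6, which rounds to -19. The arithmetic:

#include <stdio.h>

int main(void)
{
	/* BT.601 narrow range: 219 levels of luma (16..235) mapped
	 * onto the full 0..255 range; a -16 video-level bias becomes: */
	double bias = -16.0 * 255.0 / 219.0;	/* ~ -18.63 */

	printf("brightness default = %.2f -> %d\n", bias, (int)(bias - 0.5));
	return 0;
}
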
diff --git a/src/sna/sna_video_sprite.c b/src/sna/sna_video_sprite.c
index a912590fe..7737460b5 100644
--- a/src/sna/sna_video_sprite.c
+++ b/src/sna/sna_video_sprite.c
@@ -37,8 +37,11 @@
#include <xf86xv.h>
#include <X11/extensions/Xv.h>
#include <fourcc.h>
-#include <drm_fourcc.h>
#include <i915_drm.h>
+#include <errno.h>
+
+#ifdef DRM_IOCTL_MODE_GETPLANERESOURCES
+#include <drm_fourcc.h>
#define IMAGE_MAX_WIDTH 2048
#define IMAGE_MAX_HEIGHT 2048
@@ -60,13 +63,14 @@ static XF86AttributeRec attribs[] = {
static void sna_video_sprite_off(struct sna *sna, struct sna_video *video)
{
+ struct drm_mode_set_plane s;
+
if (video->plane == 0)
return;
- if (drmModeSetPlane(sna->kgem.fd,
- video->plane, 0, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0, 0))
+ memset(&s, 0, sizeof(s));
+ s.plane_id = video->plane;
+ if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_SETPLANE, &s))
xf86DrvMsg(sna->scrn->scrnIndex, X_ERROR,
"failed to disable plane\n");
@@ -114,8 +118,15 @@ static void sna_video_sprite_best_size(ScrnInfoPtr scrn, Bool motion,
unsigned int *p_w, unsigned int *p_h,
pointer data)
{
- *p_w = vid_w;
- *p_h = vid_h;
+ struct sna *sna = to_sna(scrn);
+
+ if (sna->kgem.gen == 075) {
+ *p_w = vid_w;
+ *p_h = vid_h;
+ } else {
+ *p_w = drw_w;
+ *p_h = drw_h;
+ }
}
static void
@@ -174,7 +185,10 @@ sna_video_sprite_show(struct sna *sna,
xf86CrtcPtr crtc,
BoxPtr dstBox)
{
- int plane = sna_crtc_to_plane(crtc);
+ struct drm_mode_set_plane s;
+
+ VG_CLEAR(s);
+ s.plane_id = sna_crtc_to_plane(crtc);
update_dst_box_to_crtc_coords(sna, crtc, dstBox);
if (crtc->rotation & (RR_Rotate_90 | RR_Rotate_270)) {
@@ -184,13 +198,13 @@ sna_video_sprite_show(struct sna *sna,
}
#if defined(DRM_I915_SET_SPRITE_DESTKEY)
- if (video->color_key_changed || video->plane != plane) {
+ if (video->color_key_changed || video->plane != s.plane_id) {
struct drm_intel_set_sprite_destkey set;
DBG(("%s: updating color key: %x\n",
__FUNCTION__, video->color_key));
- set.plane_id = plane;
+ set.plane_id = s.plane_id;
set.value = video->color_key;
if (drmIoctl(sna->kgem.fd,
@@ -221,8 +235,9 @@ sna_video_sprite_show(struct sna *sna,
pitches[0] = frame->pitch[0];
offsets[0] = 0;
- DBG(("%s: creating new fb for handle=%d\n",
- __FUNCTION__, frame->bo->handle));
+ DBG(("%s: creating new fb for handle=%d, width=%d, height=%d, stride=%d\n",
+ __FUNCTION__, frame->bo->handle,
+ frame->width, frame->height, frame->pitch[0]));
if (drmModeAddFB2(sna->kgem.fd,
frame->width, frame->height, pixel_format,
@@ -236,21 +251,33 @@ sna_video_sprite_show(struct sna *sna,
frame->bo->scanout = true;
}
- DBG(("%s: updating plane=%d, handle=%d [fb %d], dst=(%d,%d)x(%d,%d)\n",
- __FUNCTION__, plane, frame->bo->handle, frame->bo->delta,
- dstBox->x1, dstBox->y1,
- dstBox->x2 - dstBox->x1, dstBox->y2 - dstBox->y1));
assert(frame->bo->scanout);
assert(frame->bo->delta);
- if (drmModeSetPlane(sna->kgem.fd,
- plane, sna_crtc_id(crtc), frame->bo->delta, 0,
- dstBox->x1, dstBox->y1,
- dstBox->x2 - dstBox->x1, dstBox->y2 - dstBox->y1,
- 0, 0, frame->width << 16, frame->height << 16))
+ s.crtc_id = sna_crtc_id(crtc);
+ s.fb_id = frame->bo->delta;
+ s.flags = 0;
+ s.crtc_x = dstBox->x1;
+ s.crtc_y = dstBox->y1;
+ s.crtc_w = dstBox->x2 - dstBox->x1;
+ s.crtc_h = dstBox->y2 - dstBox->y1;
+ s.src_x = 0;
+ s.src_y = 0;
+ s.src_w = (frame->image.x2 - frame->image.x1) << 16;
+ s.src_h = (frame->image.y2 - frame->image.y1) << 16;
+
+ DBG(("%s: updating crtc=%d, plane=%d, handle=%d [fb %d], dst=(%d,%d)x(%d,%d), src=(%d,%d)x(%d,%d)\n",
+ __FUNCTION__, s.crtc_id, s.plane_id, frame->bo->handle, s.fb_id,
+ s.crtc_x, s.crtc_y, s.crtc_w, s.crtc_h,
+ s.src_x >> 16, s.src_y >> 16, s.src_w >> 16, s.src_h >> 16));
+
+ if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_SETPLANE, &s)) {
+ DBG(("SET_PLANE failed: ret=%d\n", errno));
return false;
+ }
- video->plane = plane;
+ frame->bo->domain = DOMAIN_NONE;
+ video->plane = s.plane_id;
return true;
}
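
The sprite path now issues DRM_IOCTL_MODE_SETPLANE directly; note the asymmetry in struct drm_mode_set_plane: the crtc_* fields take integer pixels while src_* are 16.16 fixed point, hence the << 16 on the source extents above. A hedged sketch of the call with error handling trimmed (header paths depend on the libdrm include directory; fb/crtc/plane ids are assumed already known, and a raw ioctl stands in for drmIoctl, which additionally restarts on EINTR):

#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <drm.h>	/* pulls in drm_mode.h via libdrm's include path */

/* Show a w x h source image full-size at (x, y) on the given CRTC.
 * Destination coordinates are whole pixels; source is 16.16 fixed. */
static int show_sprite(int fd, uint32_t plane_id, uint32_t crtc_id,
		       uint32_t fb_id, int x, int y, int w, int h)
{
	struct drm_mode_set_plane s;

	memset(&s, 0, sizeof(s));
	s.plane_id = plane_id;
	s.crtc_id = crtc_id;
	s.fb_id = fb_id;
	s.crtc_x = x;
	s.crtc_y = y;
	s.crtc_w = w;
	s.crtc_h = h;
	s.src_x = 0;
	s.src_y = 0;
	s.src_w = (uint32_t)w << 16;	/* 16.16 fixed-point source size */
	s.src_h = (uint32_t)h << 16;

	return ioctl(fd, DRM_IOCTL_MODE_SETPLANE, &s) ? -errno : 0;
}
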
@@ -278,7 +305,7 @@ static int sna_video_sprite_put_image(ScrnInfoPtr scrn,
clip))
return Success;
- if (!crtc || !sna_crtc_to_plane(crtc)) {
+ if (!crtc || sna_crtc_to_plane(crtc) == 0) {
/* If the video isn't visible on any CRTC, turn it off */
sna_video_sprite_off(sna, video);
return Success;
@@ -370,6 +397,7 @@ XF86VideoAdaptorPtr sna_video_sprite_setup(struct sna *sna,
memset(&r, 0, sizeof(struct drm_mode_get_plane_res));
if (drmIoctl(sna->kgem.fd, DRM_IOCTL_MODE_GETPLANERESOURCES, &r))
return NULL;
+ DBG(("%s: %d sprite planes\n", __FUNCTION__, r.count_planes));
if (r.count_planes == 0)
return NULL;
@@ -411,7 +439,7 @@ XF86VideoAdaptorPtr sna_video_sprite_setup(struct sna *sna,
adaptor->PutImage = sna_video_sprite_put_image;
adaptor->QueryImageAttributes = sna_video_sprite_query_attrs;
- video->textured = false;
+ video->alignment = 64;
video->color_key = sna_video_sprite_color_key(sna);
video->color_key_changed = true;
video->brightness = -19; /* (255/219) * -16 */
@@ -433,3 +461,9 @@ XF86VideoAdaptorPtr sna_video_sprite_setup(struct sna *sna,
return adaptor;
}
+#else
+XF86VideoAdaptorPtr sna_video_sprite_setup(struct sna *sna, ScreenPtr screen)
+{
+ return NULL;
+}
+#endif
diff --git a/src/sna/sna_video_textured.c b/src/sna/sna_video_textured.c
index 27fc09f47..e5cae859e 100644
--- a/src/sna/sna_video_textured.c
+++ b/src/sna/sna_video_textured.c
@@ -254,7 +254,7 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
DBG(("%s: using passthough, name=%d\n",
__FUNCTION__, *(uint32_t *)buf));
- if (sna->kgem.gen < 31) {
+ if (sna->kgem.gen < 031) {
/* XXX: i915 is not supported and needs some
* serious care. grep for KMS in i915_hwmc.c */
return BadAlloc;
@@ -267,6 +267,10 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
}
assert(kgem_bo_size(frame.bo) >= frame.size);
+ frame.image.x1 = 0;
+ frame.image.y1 = 0;
+ frame.image.x2 = frame.width;
+ frame.image.y2 = frame.height;
} else {
if (!sna_video_copy_data(sna, video, &frame, buf)) {
DBG(("%s: failed to copy frame\n", __FUNCTION__));
@@ -276,15 +280,17 @@ sna_video_textured_put_image(ScrnInfoPtr scrn,
}
if (crtc && video->SyncToVblank != 0 &&
- sna_pixmap_is_scanout(sna, pixmap))
+ sna_pixmap_is_scanout(sna, pixmap)) {
+ kgem_set_mode(&sna->kgem, KGEM_RENDER, sna_pixmap(pixmap)->gpu_bo);
flush = sna_wait_for_scanline(sna, pixmap, crtc,
&clip->extents);
+ }
ret = Success;
if (!sna->render.video(sna, video, &frame, clip,
- src_w, src_h,
- drw_w, drw_h,
- pixmap)) {
+ src_w, src_h, drw_w, drw_h,
+ drw_x - src_x, drw_y - src_y,
+ pixmap)) {
DBG(("%s: failed to render video\n", __FUNCTION__));
ret = BadAlloc;
} else
@@ -355,7 +361,7 @@ sna_video_textured_query(ScrnInfoPtr scrn,
#ifdef SNA_XVMC
case FOURCC_XVMC:
*h = (*h + 1) & ~1;
- size = sizeof(struct sna_xvmc_command);
+ size = sizeof(uint32_t);
if (pitches)
pitches[0] = size;
break;
@@ -447,6 +453,7 @@ XF86VideoAdaptorPtr sna_video_textured_setup(struct sna *sna,
struct sna_video *v = &video[i];
v->textured = true;
+ v->alignment = 4;
v->rotation = RR_Rotate_0;
v->SyncToVblank = 1;
diff --git a/src/xvmc/Makefile.am b/src/xvmc/Makefile.am
index d3ed4499a..51c98b0c8 100644
--- a/src/xvmc/Makefile.am
+++ b/src/xvmc/Makefile.am
@@ -17,7 +17,8 @@ libIntelXvMC_la_SOURCES = intel_xvmc.c \
intel_batchbuffer.h
AM_CFLAGS = @XORG_CFLAGS@ @DRM_CFLAGS@ @DRI_CFLAGS@ \
- @XVMCLIB_CFLAGS@ -I$(top_srcdir)/src -DTRUE=1 -DFALSE=0
+ @XVMCLIB_CFLAGS@ @XCB_CFLAGS@ \
+ -I$(top_srcdir)/src -DTRUE=1 -DFALSE=0
libIntelXvMC_la_LDFLAGS = -version-number 1:0:0
-libIntelXvMC_la_LIBADD = @DRI_LIBS@ @DRM_LIBS@ @XVMCLIB_LIBS@ -lpthread -ldrm_intel
+libIntelXvMC_la_LIBADD = @DRI_LIBS@ @DRM_LIBS@ @XVMCLIB_LIBS@ @XCB_LIBS@ @DRMINTEL_LIBS@ -lpthread
diff --git a/src/xvmc/Makefile.in b/src/xvmc/Makefile.in
index 028ea3729..36f565528 100644
--- a/src/xvmc/Makefile.in
+++ b/src/xvmc/Makefile.in
@@ -245,7 +245,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -284,6 +283,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -357,15 +358,16 @@ libIntelXvMC_la_SOURCES = intel_xvmc.c \
intel_batchbuffer.h
AM_CFLAGS = @XORG_CFLAGS@ @DRM_CFLAGS@ @DRI_CFLAGS@ \
- @XVMCLIB_CFLAGS@ -I$(top_srcdir)/src -DTRUE=1 -DFALSE=0
+ @XVMCLIB_CFLAGS@ @XCB_CFLAGS@ \
+ -I$(top_srcdir)/src -DTRUE=1 -DFALSE=0
libIntelXvMC_la_LDFLAGS = -version-number 1:0:0
-libIntelXvMC_la_LIBADD = @DRI_LIBS@ @DRM_LIBS@ @XVMCLIB_LIBS@ -lpthread -ldrm_intel
+libIntelXvMC_la_LIBADD = @DRI_LIBS@ @DRM_LIBS@ @XVMCLIB_LIBS@ @XCB_LIBS@ @DRMINTEL_LIBS@ -lpthread
all: all-recursive
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -390,9 +392,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
install-libLTLIBRARIES: $(lib_LTLIBRARIES)
diff --git a/src/xvmc/shader/Makefile.in b/src/xvmc/shader/Makefile.in
index 04fe1979c..1910c289b 100644
--- a/src/xvmc/shader/Makefile.in
+++ b/src/xvmc/shader/Makefile.in
@@ -183,7 +183,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -222,6 +221,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -284,7 +285,7 @@ SUBDIRS = mc vld
all: all-recursive
.SUFFIXES:
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -309,9 +310,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
diff --git a/src/xvmc/shader/mc/Makefile.am b/src/xvmc/shader/mc/Makefile.am
index c1bff77dd..8d6576917 100644
--- a/src/xvmc/shader/mc/Makefile.am
+++ b/src/xvmc/shader/mc/Makefile.am
@@ -109,9 +109,9 @@ if HAVE_GEN4ASM
SUFFIXES = .g4a .g4b
.g4a.g4b:
- m4 -I$(srcdir) $(srcdir)/$*.g4a > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
+ $(AM_V_GEN)m4 -I$(srcdir) $(srcdir)/$*.g4a > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
-$(INTEL_G4B): $(INTEL_G4I)
+$(INTEL_G4B): $(INTEL_GEN4ASM) $(INTEL_G4I)
BUILT_SOURCES= $(INTEL_G4B)
diff --git a/src/xvmc/shader/mc/Makefile.in b/src/xvmc/shader/mc/Makefile.in
index 165ebc807..1a196be3e 100644
--- a/src/xvmc/shader/mc/Makefile.in
+++ b/src/xvmc/shader/mc/Makefile.in
@@ -143,7 +143,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -182,6 +181,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -353,7 +354,7 @@ all: $(BUILT_SOURCES)
.SUFFIXES:
.SUFFIXES: .g4a .g4b
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -378,9 +379,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
@@ -545,9 +546,9 @@ uninstall-am:
uninstall uninstall-am
@HAVE_GEN4ASM_TRUE@.g4a.g4b:
-@HAVE_GEN4ASM_TRUE@ m4 -I$(srcdir) $(srcdir)/$*.g4a > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
+@HAVE_GEN4ASM_TRUE@ $(AM_V_GEN)m4 -I$(srcdir) $(srcdir)/$*.g4a > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
-@HAVE_GEN4ASM_TRUE@$(INTEL_G4B): $(INTEL_G4I)
+@HAVE_GEN4ASM_TRUE@$(INTEL_G4B): $(INTEL_GEN4ASM) $(INTEL_G4I)
@HAVE_GEN4ASM_TRUE@clean-local:
@HAVE_GEN4ASM_TRUE@ -rm -f $(INTEL_G4B)
diff --git a/src/xvmc/shader/vld/Makefile.am b/src/xvmc/shader/vld/Makefile.am
index 9280f1513..8f1047e02 100644
--- a/src/xvmc/shader/vld/Makefile.am
+++ b/src/xvmc/shader/vld/Makefile.am
@@ -62,9 +62,9 @@ if HAVE_GEN4ASM
SUFFIXES = .g4a .g4b
.g4a.g4b:
- m4 $*.g4a > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
+ $(AM_V_GEN)m4 $*.g4a > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
-$(INTEL_G4B): $(INTEL_G4I)
+$(INTEL_G4B): $(INTEL_GEN4ASM) $(INTEL_G4I)
BUILT_SOURCES= $(INTEL_G4B)
diff --git a/src/xvmc/shader/vld/Makefile.in b/src/xvmc/shader/vld/Makefile.in
index 5cf44c18f..595948ebc 100644
--- a/src/xvmc/shader/vld/Makefile.in
+++ b/src/xvmc/shader/vld/Makefile.in
@@ -143,7 +143,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -182,6 +181,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -306,7 +307,7 @@ all: $(BUILT_SOURCES)
.SUFFIXES:
.SUFFIXES: .g4a .g4b
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -331,9 +332,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
@@ -498,9 +499,9 @@ uninstall-am:
uninstall uninstall-am
@HAVE_GEN4ASM_TRUE@.g4a.g4b:
-@HAVE_GEN4ASM_TRUE@ m4 $*.g4a > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
+@HAVE_GEN4ASM_TRUE@ $(AM_V_GEN)m4 $*.g4a > $*.g4m && @INTEL_GEN4ASM@ -o $@ $*.g4m && @INTEL_GEN4ASM@ -g 5 -o $@.gen5 $*.g4m && rm $*.g4m
-@HAVE_GEN4ASM_TRUE@$(INTEL_G4B): $(INTEL_G4I)
+@HAVE_GEN4ASM_TRUE@$(INTEL_G4B): $(INTEL_GEN4ASM) $(INTEL_G4I)
@HAVE_GEN4ASM_TRUE@clean-local:
@HAVE_GEN4ASM_TRUE@ -rm -f $(INTEL_G4B)
diff --git a/test/Makefile.am b/test/Makefile.am
index 96c87f824..0f9bd7d09 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -21,6 +21,8 @@ stress_TESTS = \
check_PROGRAMS = $(stress_TESTS)
+noinst_PROGRAMS = lowlevel-blt-bench
+
AM_CFLAGS = @CWARNFLAGS@ @X11_CFLAGS@ @DRM_CFLAGS@
LDADD = libtest.la @X11_LIBS@ -lXfixes @DRM_LIBS@ -lrt
@@ -35,4 +37,11 @@ libtest_la_SOURCES = \
dri2.h \
$(NULL)
-EXTRA_DIST = README
+vsync.avi: mkvsync.sh
+ ./mkvsync.sh $@
+
+clean-vsync-avi:
+ rm -rf vsync.avi .build.tmp
+
+EXTRA_DIST = README mkvsync.sh
+clean-local: clean-vsync-avi
diff --git a/test/Makefile.in b/test/Makefile.in
index 315802172..b462d6f4b 100644
--- a/test/Makefile.in
+++ b/test/Makefile.in
@@ -15,6 +15,7 @@
@SET_MAKE@
+
VPATH = @srcdir@
am__make_dryrun = \
{ \
@@ -52,6 +53,7 @@ POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
check_PROGRAMS = $(am__EXEEXT_1)
+noinst_PROGRAMS = lowlevel-blt-bench$(EXEEXT)
subdir = test
DIST_COMMON = README $(srcdir)/Makefile.am $(srcdir)/Makefile.in
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
@@ -79,6 +81,7 @@ am__EXEEXT_1 = basic-fillrect$(EXEEXT) basic-rectangle$(EXEEXT) \
render-composite-solid$(EXEEXT) render-copyarea$(EXEEXT) \
render-copyarea-size$(EXEEXT) render-copy-alphaless$(EXEEXT) \
mixed-stress$(EXEEXT) dri2-swap$(EXEEXT)
+PROGRAMS = $(noinst_PROGRAMS)
basic_copyarea_SOURCES = basic-copyarea.c
basic_copyarea_OBJECTS = basic-copyarea.$(OBJEXT)
basic_copyarea_LDADD = $(LDADD)
@@ -115,6 +118,10 @@ dri2_swap_SOURCES = dri2-swap.c
dri2_swap_OBJECTS = dri2-swap.$(OBJEXT)
dri2_swap_LDADD = $(LDADD)
dri2_swap_DEPENDENCIES = libtest.la
+lowlevel_blt_bench_SOURCES = lowlevel-blt-bench.c
+lowlevel_blt_bench_OBJECTS = lowlevel-blt-bench.$(OBJEXT)
+lowlevel_blt_bench_LDADD = $(LDADD)
+lowlevel_blt_bench_DEPENDENCIES = libtest.la
mixed_stress_SOURCES = mixed-stress.c
mixed_stress_OBJECTS = mixed-stress.$(OBJEXT)
mixed_stress_LDADD = $(LDADD)
@@ -180,14 +187,14 @@ am__v_GEN_0 = @echo " GEN " $@;
SOURCES = $(libtest_la_SOURCES) basic-copyarea.c basic-copyarea-size.c \
basic-fillrect.c basic-lines.c basic-putimage.c \
basic-rectangle.c basic-stress.c basic-string.c dri2-swap.c \
- mixed-stress.c render-composite-solid.c \
+ lowlevel-blt-bench.c mixed-stress.c render-composite-solid.c \
render-copy-alphaless.c render-copyarea.c \
render-copyarea-size.c render-fill.c render-fill-copy.c \
render-trapezoid.c render-trapezoid-image.c
DIST_SOURCES = $(libtest_la_SOURCES) basic-copyarea.c \
basic-copyarea-size.c basic-fillrect.c basic-lines.c \
basic-putimage.c basic-rectangle.c basic-stress.c \
- basic-string.c dri2-swap.c mixed-stress.c \
+ basic-string.c dri2-swap.c lowlevel-blt-bench.c mixed-stress.c \
render-composite-solid.c render-copy-alphaless.c \
render-copyarea.c render-copyarea-size.c render-fill.c \
render-fill-copy.c render-trapezoid.c render-trapezoid-image.c
@@ -268,7 +275,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -307,6 +313,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -399,12 +407,12 @@ libtest_la_SOURCES = \
dri2.h \
$(NULL)
-EXTRA_DIST = README
+EXTRA_DIST = README mkvsync.sh
all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -429,9 +437,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
@@ -454,6 +462,15 @@ clean-checkPROGRAMS:
list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
echo " rm -f" $$list; \
rm -f $$list
+
+clean-noinstPROGRAMS:
+ @list='$(noinst_PROGRAMS)'; test -n "$$list" || exit 0; \
+ echo " rm -f" $$list; \
+ rm -f $$list || exit $$?; \
+ test -n "$(EXEEXT)" || exit 0; \
+ list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
+ echo " rm -f" $$list; \
+ rm -f $$list
basic-copyarea$(EXEEXT): $(basic_copyarea_OBJECTS) $(basic_copyarea_DEPENDENCIES) $(EXTRA_basic_copyarea_DEPENDENCIES)
@rm -f basic-copyarea$(EXEEXT)
$(AM_V_CCLD)$(LINK) $(basic_copyarea_OBJECTS) $(basic_copyarea_LDADD) $(LIBS)
@@ -481,6 +498,9 @@ basic-string$(EXEEXT): $(basic_string_OBJECTS) $(basic_string_DEPENDENCIES) $(EX
dri2-swap$(EXEEXT): $(dri2_swap_OBJECTS) $(dri2_swap_DEPENDENCIES) $(EXTRA_dri2_swap_DEPENDENCIES)
@rm -f dri2-swap$(EXEEXT)
$(AM_V_CCLD)$(LINK) $(dri2_swap_OBJECTS) $(dri2_swap_LDADD) $(LIBS)
+lowlevel-blt-bench$(EXEEXT): $(lowlevel_blt_bench_OBJECTS) $(lowlevel_blt_bench_DEPENDENCIES) $(EXTRA_lowlevel_blt_bench_DEPENDENCIES)
+ @rm -f lowlevel-blt-bench$(EXEEXT)
+ $(AM_V_CCLD)$(LINK) $(lowlevel_blt_bench_OBJECTS) $(lowlevel_blt_bench_LDADD) $(LIBS)
mixed-stress$(EXEEXT): $(mixed_stress_OBJECTS) $(mixed_stress_DEPENDENCIES) $(EXTRA_mixed_stress_DEPENDENCIES)
@rm -f mixed-stress$(EXEEXT)
$(AM_V_CCLD)$(LINK) $(mixed_stress_OBJECTS) $(mixed_stress_LDADD) $(LIBS)
@@ -525,6 +545,7 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/basic-string.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dri2-swap.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/dri2.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lowlevel-blt-bench.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/mixed-stress.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/render-composite-solid.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/render-copy-alphaless.Po@am__quote@
@@ -651,7 +672,7 @@ distdir: $(DISTFILES)
check-am: all-am
$(MAKE) $(AM_MAKEFLAGS) $(check_PROGRAMS)
check: check-am
-all-am: Makefile $(LTLIBRARIES)
+all-am: Makefile $(LTLIBRARIES) $(PROGRAMS)
installdirs:
install: install-am
install-exec: install-exec-am
@@ -685,8 +706,8 @@ maintainer-clean-generic:
@echo "it deletes files that may require special tools to rebuild."
clean: clean-am
-clean-am: clean-checkPROGRAMS clean-generic clean-libtool \
- clean-noinstLTLIBRARIES mostlyclean-am
+clean-am: clean-checkPROGRAMS clean-generic clean-libtool clean-local \
+ clean-noinstLTLIBRARIES clean-noinstPROGRAMS mostlyclean-am
distclean: distclean-am
-rm -rf ./$(DEPDIR)
@@ -757,19 +778,26 @@ uninstall-am:
.MAKE: check-am install-am install-strip
.PHONY: CTAGS GTAGS all all-am check check-am clean \
- clean-checkPROGRAMS clean-generic clean-libtool \
- clean-noinstLTLIBRARIES ctags distclean distclean-compile \
- distclean-generic distclean-libtool distclean-tags distdir dvi \
- dvi-am html html-am info info-am install install-am \
- install-data install-data-am install-dvi install-dvi-am \
- install-exec install-exec-am install-html install-html-am \
- install-info install-info-am install-man install-pdf \
- install-pdf-am install-ps install-ps-am install-strip \
- installcheck installcheck-am installdirs maintainer-clean \
- maintainer-clean-generic mostlyclean mostlyclean-compile \
- mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
- tags uninstall uninstall-am
-
+ clean-checkPROGRAMS clean-generic clean-libtool clean-local \
+ clean-noinstLTLIBRARIES clean-noinstPROGRAMS ctags distclean \
+ distclean-compile distclean-generic distclean-libtool \
+ distclean-tags distdir dvi dvi-am html html-am info info-am \
+ install install-am install-data install-data-am install-dvi \
+ install-dvi-am install-exec install-exec-am install-html \
+ install-html-am install-info install-info-am install-man \
+ install-pdf install-pdf-am install-ps install-ps-am \
+ install-strip installcheck installcheck-am installdirs \
+ maintainer-clean maintainer-clean-generic mostlyclean \
+ mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
+ pdf pdf-am ps ps-am tags uninstall uninstall-am
+
+
+vsync.avi: mkvsync.sh
+ ./mkvsync.sh $@
+
+clean-vsync-avi:
+ rm -rf vsync.avi .build.tmp
+clean-local: clean-vsync-avi
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/test/lowlevel-blt-bench.c b/test/lowlevel-blt-bench.c
new file mode 100644
index 000000000..0cea0a81a
--- /dev/null
+++ b/test/lowlevel-blt-bench.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright © 2009 Nokia Corporation
+ * Copyright © 2010 Movial Creative Technologies Oy
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <X11/X.h>
+#include <X11/Xutil.h> /* for XDestroyImage */
+#include <pixman.h> /* for pixman blt functions */
+
+#include "test.h"
+
+static const struct format {
+ const char *name;
+ pixman_format_code_t pixman_format;
+} formats[] = {
+ { "a8r8g8b8", PIXMAN_a8r8g8b8 },
+ { "x8r8g8b8", PIXMAN_x8r8g8b8 },
+ { "a8", PIXMAN_a8 },
+ { "a4", PIXMAN_a4 },
+ { "a1", PIXMAN_a1 },
+};
+
+static const struct op {
+ const char *name;
+} ops[] = {
+ [PictOpClear] = { "Clear" },
+ [PictOpSrc] = { "Src" },
+ [PictOpDst] = { "Dst" },
+ [PictOpOver] = { "Over" },
+ [PictOpOverReverse] = { "OverReverse" },
+ [PictOpIn] = { "In" },
+ [PictOpInReverse] = { "InReverse" },
+ [PictOpOut] = { "Out" },
+ [PictOpOutReverse] = { "OutReverse" },
+ [PictOpAtop] = { "Atop" },
+ [PictOpAtopReverse] = { "AtopReverse" },
+ [PictOpXor] = { "Xor" },
+ [PictOpAdd] = { "Add" },
+ [PictOpSaturate] = { "Saturate" },
+};
+
+static double _bench(struct test_display *t, enum target target_type,
+ int op, int src_format,
+ int loops)
+{
+ XRenderColor render_color = { 0x8000, 0x8000, 0x8000, 0x8000 };
+ struct test_target target;
+ Pixmap pixmap;
+ Picture picture;
+ struct timespec tv;
+ double elapsed;
+
+ test_target_create_render(t, target_type, &target);
+ XRenderFillRectangle(t->dpy, PictOpClear, target.picture, &render_color,
+ 0, 0, target.width, target.height);
+
+ pixmap = XCreatePixmap(t->dpy, t->root,
+ target.width, target.height,
+ PIXMAN_FORMAT_DEPTH(formats[src_format].pixman_format));
+
+ picture = XRenderCreatePicture(t->dpy, pixmap,
+ XRenderFindStandardFormat(t->dpy, src_format),
+ 0, NULL);
+ XRenderFillRectangle(t->dpy, PictOpSrc, picture, &render_color,
+ 0, 0, target.width, target.height);
+
+ test_timer_start(t, &tv);
+ while (loops--)
+ XRenderComposite(t->dpy, op,
+ picture, 0, target.picture,
+ 0, 0,
+ 0, 0,
+ 0, 0,
+ target.width, target.height);
+ elapsed = test_timer_stop(t, &tv);
+
+ XRenderFreePicture(t->dpy, picture);
+ XFreePixmap(t->dpy, pixmap);
+ test_target_destroy_render(t, &target);
+
+ return elapsed;
+}
+
+static void bench(struct test *t, enum target target, int op, int sf)
+{
+ double real, ref;
+
+ ref = _bench(&t->ref, target, op, sf, 1000);
+ real = _bench(&t->real, target, op, sf, 1000);
+
+ fprintf (stdout, "Testing %s with %s: ref=%f, real=%f\n",
+ formats[sf].name, ops[op].name, ref, real);
+}
+
+int main(int argc, char **argv)
+{
+ struct test test;
+ int op, sf;
+
+ test_init(&test, argc, argv);
+
+ for (op = 0; op < sizeof(ops)/sizeof(ops[0]); op++) {
+ for (sf = 0; sf < sizeof(formats)/sizeof(formats[0]); sf++)
+ bench(&test, ROOT, op, sf);
+ fprintf (stdout, "\n");
+ }
+
+ return 0;
+}
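
[Note: the new lowlevel-blt-bench test above reports raw elapsed seconds for each (op, format) pair on both the real and reference displays. A minimal standalone sketch of how such a time could be converted into a throughput figure; the helper throughput_mpix is hypothetical and not part of the patch, and the 640x480/1000-iteration numbers merely mirror the loop count used in bench() above.]

/* Hypothetical helper, not part of the patch: converts the elapsed
 * seconds returned by _bench() into megapixels per second. Assumes
 * 'width' and 'height' match the test target and 'loops' matches the
 * iteration count passed to _bench(). */
#include <stdio.h>

static double throughput_mpix(int width, int height, int loops, double elapsed)
{
	double pixels = (double)width * height * loops; /* total pixels composited */
	return pixels / elapsed / 1e6;
}

int main(void)
{
	/* e.g. a 640x480 target composited 1000 times in 0.85 seconds */
	printf("%.1f MPix/s\n", throughput_mpix(640, 480, 1000, 0.85));
	return 0;
}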
diff --git a/test/mkvsync.sh b/test/mkvsync.sh
new file mode 100755
index 000000000..dd96ad8df
--- /dev/null
+++ b/test/mkvsync.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+OUT="$1"
+[ -n "$OUT" ] || OUT="vsync.avi"
+
+TMP=".build.tmp"
+
+rm -rf ${TMP}
+mkdir ${TMP}
+convert -size 640x480 -depth 24 canvas:black png24:${TMP}/black.png
+convert -size 640x480 -depth 24 canvas:white png24:${TMP}/white.png
+
+mkdir ${TMP}/anim
+
+for ((a=0; $a < 1000; a=$a+2)); do
+ ln -s ../black.png ${TMP}/anim/$a.png
+done
+
+for ((a=1; $a < 1000; a=$a+2)); do
+ ln -s ../white.png ${TMP}/anim/$a.png
+done
+
+mencoder "mf://${TMP}/anim/*.png" -v -vf-clr -mf fps=60 -o "${OUT}" -ovc lavc
+exitcode=$?
+rm -rf ${TMP}
+
+exit ${exitcode}
diff --git a/test/test.h b/test/test.h
index 1e3995bbf..7ef4dca4c 100644
--- a/test/test.h
+++ b/test/test.h
@@ -2,6 +2,8 @@
#define TEST_H
#include <stdint.h>
+#include <time.h>
+
#include <X11/Xlib.h>
#include <X11/extensions/XShm.h>
#include <X11/extensions/Xrender.h>
@@ -107,6 +109,9 @@ static inline uint32_t color(uint8_t red, uint8_t green, uint8_t blue, uint8_t a
return alpha << 24 | ra >> 8 << 16 | ga >> 8 << 8 | ba >> 8;
}
+void test_timer_start(struct test_display *t, struct timespec *tv);
+double test_timer_stop(struct test_display *t, struct timespec *tv);
+
#ifndef MAX
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#endif
diff --git a/test/test_display.c b/test/test_display.c
index ad3e40bc7..b5e7e06ed 100644
--- a/test/test_display.c
+++ b/test/test_display.c
@@ -148,3 +148,20 @@ void test_init(struct test *test, int argc, char **argv)
memset(test, 0, sizeof(*test));
test_get_displays(argc, argv, &test->real, &test->ref);
}
+
+void test_timer_start(struct test_display *t, struct timespec *tv)
+{
+ clock_gettime(CLOCK_MONOTONIC, tv);
+}
+
+double test_timer_stop(struct test_display *t, struct timespec *tv)
+{
+ XImage *image;
+ struct timespec now;
+
+ image = XGetImage(t->dpy, t->root, 0, 0, 1, 1, AllPlanes, ZPixmap);
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ XDestroyImage(image);
+
+ return (now.tv_sec - tv->tv_sec) + 1e-9*(now.tv_nsec - tv->tv_nsec);
+}
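
[Note: test_timer_stop() above deliberately fetches a 1x1 XGetImage before reading the clock. Xlib requests are asynchronous, and the synchronous image fetch forces the X server to drain its queue before replying, so the stop timestamp covers all rendering actually performed. A self-contained sketch of the same fencing idea, under the assumption of a running X server; link with -lX11.]

/* Sketch of the round-trip fence used by test_timer_stop(): a tiny
 * synchronous request (1x1 XGetImage) blocks until the server has
 * finished all previously queued rendering. */
#include <stdio.h>
#include <time.h>
#include <X11/Xlib.h>
#include <X11/Xutil.h>	/* for XDestroyImage */

int main(void)
{
	Display *dpy = XOpenDisplay(NULL);
	struct timespec t0, t1;
	XImage *image;

	if (!dpy)
		return 1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	/* ... issue asynchronous rendering requests here ... */
	image = XGetImage(dpy, DefaultRootWindow(dpy), 0, 0, 1, 1,
			  AllPlanes, ZPixmap);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	if (image)
		XDestroyImage(image);

	printf("%.9f s\n", (t1.tv_sec - t0.tv_sec) +
	       1e-9 * (t1.tv_nsec - t0.tv_nsec));
	XCloseDisplay(dpy);
	return 0;
}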
diff --git a/uxa/Makefile.in b/uxa/Makefile.in
index 417eb9cfc..6a01effe9 100644
--- a/uxa/Makefile.in
+++ b/uxa/Makefile.in
@@ -175,7 +175,6 @@ LIB_MAN_SUFFIX = @LIB_MAN_SUFFIX@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
-MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MAN_SUBSTS = @MAN_SUBSTS@
@@ -214,6 +213,8 @@ VALGRIND_LIBS = @VALGRIND_LIBS@
VERSION = @VERSION@
X11_CFLAGS = @X11_CFLAGS@
X11_LIBS = @X11_LIBS@
+XCB_CFLAGS = @XCB_CFLAGS@
+XCB_LIBS = @XCB_LIBS@
XORG_CFLAGS = @XORG_CFLAGS@
XORG_LIBS = @XORG_LIBS@
XORG_MAN_PAGE = @XORG_MAN_PAGE@
@@ -293,7 +294,7 @@ all: all-am
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj
-$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps)
+$(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
@for dep in $?; do \
case '$(am__configure_deps)' in \
*$$dep*) \
@@ -318,9 +319,9 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
+$(top_srcdir)/configure: $(am__configure_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
-$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
+$(ACLOCAL_M4): $(am__aclocal_m4_deps)
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
$(am__aclocal_m4_deps):
diff --git a/uxa/uxa-accel.c b/uxa/uxa-accel.c
index 76425fe9f..27215ddf6 100644
--- a/uxa/uxa-accel.c
+++ b/uxa/uxa-accel.c
@@ -1027,7 +1027,7 @@ uxa_push_pixels(GCPtr pGC, PixmapPtr pBitmap,
ok = glamor_push_pixels_nf(pGC, pBitmap, pDrawable, w, h, x, y);
uxa_finish_access(&pBitmap->drawable, UXA_GLAMOR_ACCESS_RO);
}
- uxa_prepare_access(pDrawable, UXA_GLAMOR_ACCESS_RW);
+ uxa_finish_access(pDrawable, UXA_GLAMOR_ACCESS_RW);
}
if (!ok)
goto fallback;
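
[Note: the uxa-accel.c hunk above is a one-word bug fix. The glamor fallback path closed its uxa_prepare_access(pDrawable, UXA_GLAMOR_ACCESS_RW) bracket with a second prepare instead of a finish, leaving the drawable's access state unbalanced. A tiny illustrative sketch of the invariant being restored; all names here are hypothetical stand-ins, not the UXA API.]

/* Illustrative sketch only: every prepare_access() must be paired
 * with exactly one finish_access(). */
#include <assert.h>
#include <stdio.h>

static int depth; /* nesting depth of outstanding access grants */

static void prepare_access(void) { depth++; }
static void finish_access(void)  { assert(depth > 0); depth--; }

int main(void)
{
	prepare_access();
	/* ... CPU access to the drawable's backing memory ... */
	finish_access(); /* the bug called prepare_access() here again */

	assert(depth == 0); /* balanced: safe to hand back to the GPU */
	printf("access bracket balanced\n");
	return 0;
}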
diff --git a/uxa/uxa-render.c b/uxa/uxa-render.c
index 4463dc2f0..d783ea26c 100644
--- a/uxa/uxa-render.c
+++ b/uxa/uxa-render.c
@@ -962,7 +962,7 @@ uxa_try_driver_composite(CARD8 op,
RegionRec region;
BoxPtr pbox;
int nbox;
- int xDst_copy, yDst_copy;
+ int xDst_copy = 0, yDst_copy = 0;
int src_off_x, src_off_y, mask_off_x, mask_off_y, dst_off_x, dst_off_y;
PixmapPtr pSrcPix, pMaskPix = NULL, pDstPix;
PicturePtr localSrc, localMask = NULL;